From 7d7bb0473a66d51a1a556630cc6a9f66e861f041 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 17 Jul 2018 10:51:01 -0500 Subject: [PATCH 001/168] Initial commit --- CMakeLists.txt | 100 ++ DEBIAN/postinst | 19 + DEBIAN/prerm | 18 + LICENSE | 21 + README.md | 59 + RPM/rpm_post | 1 + RPM/rpm_postun | 1 + _clang-format | 60 + cmake_modules/env.cmake | 125 ++ cmake_modules/utils.cmake | 96 ++ doc/rocprofiler_spec.md | 592 ++++++++++ inc/rocprofiler.h | 364 ++++++ script/rpl_run.sh | 377 ++++++ script/tblextr.py | 119 ++ script/txt2xml.sh | 94 ++ src/CMakeLists.txt | 37 + src/core/context.h | 546 +++++++++ src/core/hsa_proxy_queue.h | 67 ++ src/core/hsa_queue.h | 80 ++ src/core/intercept_queue.cpp | 40 + src/core/intercept_queue.h | 230 ++++ src/core/metrics.cpp | 28 + src/core/metrics.h | 302 +++++ src/core/profile.h | 271 +++++ src/core/proxy_queue.cpp | 63 + src/core/proxy_queue.h | 77 ++ src/core/queue.h | 42 + src/core/rocprofiler.cpp | 522 ++++++++ src/core/simple_proxy_queue.cpp | 40 + src/core/simple_proxy_queue.h | 262 +++++ src/core/tracker.h | 188 +++ src/core/types.h | 37 + src/util/exception.h | 72 ++ src/util/hsa_rsrc_factory.cpp | 562 +++++++++ src/util/hsa_rsrc_factory.h | 288 +++++ src/util/logger.h | 191 +++ src/xml/expr.h | 446 +++++++ src/xml/xml.h | 457 +++++++ test/CMakeLists.txt | 62 + test/app/test.cpp | 40 + test/ctrl/run_kernel.h | 83 ++ test/ctrl/test_aql.h | 77 ++ test/ctrl/test_hsa.cpp | 283 +++++ test/ctrl/test_hsa.h | 124 ++ test/ctrl/test_kernel.h | 134 +++ test/run.sh | 61 + .../gfx8_SimpleConvolution.hsaco | Bin 0 -> 9392 bytes .../gfx9_SimpleConvolution.hsaco | Bin 0 -> 11136 bytes test/simple_convolution/simple_convolution.cl | 76 ++ .../simple_convolution/simple_convolution.cpp | 388 ++++++ test/simple_convolution/simple_convolution.h | 94 ++ test/tool/gfx_metrics.xml | 69 ++ test/tool/input.xml | 14 + test/tool/metrics.xml | 205 ++++ test/tool/tool.cpp | 1048 +++++++++++++++++ test/util/helper_funcs.h | 86 ++ 
test/util/hsa_rsrc_factory.cpp | 556 +++++++++ test/util/hsa_rsrc_factory.h | 284 +++++ test/util/perf_timer.cpp | 179 +++ test/util/perf_timer.h | 83 ++ test/util/test_assert.h | 46 + test/util/xml.h | 457 +++++++ 62 files changed, 11343 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 DEBIAN/postinst create mode 100644 DEBIAN/prerm create mode 100644 LICENSE create mode 100644 README.md create mode 100644 RPM/rpm_post create mode 100644 RPM/rpm_postun create mode 100644 _clang-format create mode 100644 cmake_modules/env.cmake create mode 100644 cmake_modules/utils.cmake create mode 100644 doc/rocprofiler_spec.md create mode 100644 inc/rocprofiler.h create mode 100755 script/rpl_run.sh create mode 100755 script/tblextr.py create mode 100755 script/txt2xml.sh create mode 100644 src/CMakeLists.txt create mode 100644 src/core/context.h create mode 100644 src/core/hsa_proxy_queue.h create mode 100644 src/core/hsa_queue.h create mode 100644 src/core/intercept_queue.cpp create mode 100644 src/core/intercept_queue.h create mode 100644 src/core/metrics.cpp create mode 100644 src/core/metrics.h create mode 100644 src/core/profile.h create mode 100644 src/core/proxy_queue.cpp create mode 100644 src/core/proxy_queue.h create mode 100644 src/core/queue.h create mode 100644 src/core/rocprofiler.cpp create mode 100644 src/core/simple_proxy_queue.cpp create mode 100644 src/core/simple_proxy_queue.h create mode 100644 src/core/tracker.h create mode 100644 src/core/types.h create mode 100644 src/util/exception.h create mode 100644 src/util/hsa_rsrc_factory.cpp create mode 100644 src/util/hsa_rsrc_factory.h create mode 100644 src/util/logger.h create mode 100644 src/xml/expr.h create mode 100644 src/xml/xml.h create mode 100644 test/CMakeLists.txt create mode 100644 test/app/test.cpp create mode 100644 test/ctrl/run_kernel.h create mode 100644 test/ctrl/test_aql.h create mode 100644 test/ctrl/test_hsa.cpp create mode 100644 test/ctrl/test_hsa.h create mode 100644 
test/ctrl/test_kernel.h create mode 100755 test/run.sh create mode 100644 test/simple_convolution/gfx8_SimpleConvolution.hsaco create mode 100755 test/simple_convolution/gfx9_SimpleConvolution.hsaco create mode 100644 test/simple_convolution/simple_convolution.cl create mode 100644 test/simple_convolution/simple_convolution.cpp create mode 100644 test/simple_convolution/simple_convolution.h create mode 100644 test/tool/gfx_metrics.xml create mode 100644 test/tool/input.xml create mode 100644 test/tool/metrics.xml create mode 100644 test/tool/tool.cpp create mode 100644 test/util/helper_funcs.h create mode 100644 test/util/hsa_rsrc_factory.cpp create mode 100644 test/util/hsa_rsrc_factory.h create mode 100644 test/util/perf_timer.cpp create mode 100644 test/util/perf_timer.h create mode 100644 test/util/test_assert.h create mode 100644 test/util/xml.h diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..6249e098 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,100 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +cmake_minimum_required ( VERSION 3.5.0 ) + +## Verbose output. +set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) + +## Set module name and project name. +set ( ROCPROFILER_NAME "rocprofiler" ) +set ( ROCPROFILER_TARGET "${ROCPROFILER_NAME}64" ) +set ( ROCPROFILER_LIBRARY "lib${ROCPROFILER_TARGET}" ) +project ( ${ROCPROFILER_TARGET} ) + +## Adding default path cmake modules +list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" ) +## Include common cmake modules +include ( utils ) +## Set build environment +include ( env ) + +## Setup the package version. +get_version ( "1.0.0" ) +message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" ) + +set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) +set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) +set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) +if ( DEFINED VERSION_BUILD AND NOT "${VERSION_BUILD}" STREQUAL "" ) + message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" ) + set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) +endif () +set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) + +## Set target and root/lib/test directory +set ( TARGET_NAME "${ROCPROFILER_TARGET}" ) +set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) +set ( LIB_DIR "${ROOT_DIR}/src" ) +set ( TEST_DIR "${ROOT_DIR}/test" ) + +## Build library +include ( ${LIB_DIR}/CMakeLists.txt ) + +## Set the VERSION and SOVERSION values +set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION 
"${LIB_VERSION_STRING}" ) +set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" ) + +## If the library is a release, strip the target library +if ( "${CMAKE_BUILD_TYPE}" STREQUAL release ) + add_custom_command ( TARGET ${ROCPROFILER_TARGET} POST_BUILD COMMAND ${CMAKE_STRIP} *.so ) +endif () + +## Build tests +add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) + +## Install information +install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${ROCPROFILER_NAME}/lib ) +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${ROCPROFILER_NAME}/include ) + +## Packaging directives +set ( CPACK_PACKAGE_NAME "${ROCPROFILER_NAME}-dev" ) +set ( CPACK_PACKAGE_VENDOR "AMD" ) +set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) +set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} ) +set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} ) +set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) +set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "ROCPROFILER library for AMD HSA runtime API extension support" ) +set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) + +## Debian package specific variables (postinst/prerm are shipped as maintainer scripts) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/HSA-RocProfiler" ) +set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) + +## RPM package specific variables (rpm_post runs ldconfig, so it must run after the files are installed) +set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) +set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) + +include ( CPack ) diff --git a/DEBIAN/postinst b/DEBIAN/postinst new file mode 100644 index 00000000..3d022884 --- /dev/null +++ 
b/DEBIAN/postinst @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +do_ldconfig() { + echo /opt/rocm/librocprofiler/lib > 
/etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +} + +case "$1" in + configure) + do_ldconfig + ;; + abort-upgrade|abort-remove|abort-deconfigure) + echo "$1" + ;; + *) + exit 0 + ;; +esac diff --git a/DEBIAN/prerm b/DEBIAN/prerm new file mode 100644 index 00000000..b3f509a9 --- /dev/null +++ b/DEBIAN/prerm @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +rm_ldconfig() { + rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +} + +case "$1" in + remove) + rm_ldconfig + ;; + purge) + ;; + *) + exit 0 + ;; +esac diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..fe4ce68b --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ diff --git a/README.md b/README.md new file mode 100644 index 00000000..5492d17d --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +# ROC-profiler + +ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. + +The library source tree: + - doc - Documentation + - inc/rocprofiler.h - Library public API + - src - Library sources + - core - Library API sources + - util - Library utils sources + - xml - XML parser + - test - Library test suite + - ctrl - Test control + - util - Test utils + - simple_convolution - Simple convolution test kernel + +## Build environment: +``` + export CMAKE_PREFIX_PATH=<path to hsa-runtime includes>:<path to hsa-runtime library> + export CMAKE_BUILD_TYPE=<debug|release> # release by default + export CMAKE_DEBUG_TRACE=1 # to enable debug tracing +``` + +## To build with the currently installed ROCM: +``` + cd .../rocprofiler + mkdir build + cd build + cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. + make +``` + +## To run the test: +``` + cd .../rocprofiler/build + export LD_LIBRARY_PATH=.:<other paths> # paths to ROC profiler and other libraries + export HSA_TOOLS_LIB=librocprofiler64.so # ROC profiler library loaded by HSA runtime + export ROCP_TOOL_LIB=test/libtool.so # tool library loaded by ROC profiler + export ROCP_METRICS=metrics.xml # ROC profiler metrics config file + export ROCP_INPUT=input.xml # input file for the tool library + export ROCP_OUTPUT_DIR=./ # output directory for the tool library, for metrics results file 'results.txt' + +``` + +## Internal 'simple_convolution' test run script: +``` + cd .../rocprofiler/build + run.sh +``` + +## To enable error messages logging to '/tmp/rocprofiler_log.txt': +``` + export ROCPROFILER_LOG=1 +``` + +## To enable verbose tracing: +``` + export ROCPROFILER_TRACE=1 +``` diff --git a/RPM/rpm_post b/RPM/rpm_post new file mode 100644 
index 00000000..57c5c811 --- /dev/null +++ b/RPM/rpm_post @@ -0,0 +1 @@ +echo /opt/rocm/librocprofiler/lib 
> /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig diff --git a/RPM/rpm_postun b/RPM/rpm_postun new file mode 100644 index 00000000..6b3c8f28 --- /dev/null +++ b/RPM/rpm_postun @@ -0,0 +1 @@ +rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig diff --git a/_clang-format b/_clang-format new file mode 100644 index 00000000..0c81671e --- /dev/null +++ b/_clang-format @@ -0,0 +1,60 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +ConstructorInitializerIndentWidth: 4 +AlignEscapedNewlinesLeft: false +AlignTrailingComments: true +AlignConsecutiveAssignments: false +AlignOperands: false +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AllowShortFunctionsOnASingleLine: All +AlwaysBreakAfterDefinitionReturnType: false +AlwaysBreakTemplateDeclarations: false +AlwaysBreakBeforeMultilineStrings: true +BreakBeforeBinaryOperators: false +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BinPackParameters: true +ColumnLimit: 100 +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ExperimentalAutoDetectBinPacking: false +IndentCaseLabels: true +IndentWrappedFunctionNames: false +IndentFunctionDeclarationAfterType: false +MaxEmptyLinesToKeep: 2 +KeepEmptyLinesAtTheStartOfBlocks: false +NamespaceIndentation: None +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakString: 1000 +PenaltyBreakFirstLessLess: 120 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +DerivePointerAlignment: false +PointerAlignment: Left +SpacesBeforeTrailingComments: 2 +Cpp11BracedListStyle: true +Standard: Auto +IndentWidth: 2 +TabWidth: 8 +UseTab: Never +BreakBeforeBraces: Attach +SpacesInParentheses: false +SpacesInAngles: false +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false 
+SpacesInContainerLiterals: true +SpaceBeforeAssignmentOperators: true +ContinuationIndentWidth: 4 +CommentPragmas: '^ IWYU pragma:' +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +SpaceBeforeParens: ControlStatements +DisableFormat: false +SortIncludes: false +... diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake new file mode 100644 index 00000000..ca7c4804 --- /dev/null +++ b/cmake_modules/env.cmake @@ -0,0 +1,125 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +## Build is not supported on Windows plaform +if ( WIN32 ) + message ( FATAL_ERROR "Windows build is not supported." ) +endif () + +## Compiler Preprocessor definitions. 
+add_definitions ( -D__linux__ ) +add_definitions ( -DUNIX_OS ) +add_definitions ( -DLINUX ) +add_definitions ( -D__AMD64__ ) +add_definitions ( -D__x86_64__ ) +add_definitions ( -DAMD_INTERNAL_BUILD ) +add_definitions ( -DLITTLEENDIAN_CPU=1 ) +add_definitions ( -DHSA_LARGE_MODEL= ) +add_definitions ( -DHSA_DEPRECATED= ) + +## Linux Compiler options +set ( CMAKE_CXX_FLAGS "-std=c++11") +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) + +set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) + +set ( CMAKE_SKIP_BUILD_RPATH TRUE ) + +## CLANG options +if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000" ) +endif() + +## Enable debug trace +if ( DEFINED ENV{CMAKE_DEBUG_TRACE} ) + add_definitions ( -DDEBUG_TRACE=1 ) +endif() + +## Enable direct loading of AQL-profile HSA extension +if ( DEFINED ENV{CMAKE_LD_AQLPROFILE} ) + add_definitions ( -DROCP_LD_AQLPROFILE=1 ) +endif() + +## Make env vars +if ( NOT DEFINED CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "" ) + if ( DEFINED ENV{CMAKE_BUILD_TYPE} ) + set ( CMAKE_BUILD_TYPE $ENV{CMAKE_BUILD_TYPE} ) + endif() +endif() +if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} ) + set ( CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH} ) +endif() + +## Extend Compiler flags based on build type +string ( TOLOWER 
"${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE ) +if ( "${CMAKE_BUILD_TYPE}" STREQUAL debug ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" ) + set ( CMAKE_BUILD_TYPE "debug" ) +else () + set ( CMAKE_BUILD_TYPE "release" ) +endif () + +## Extend Compiler flags based on Processor architecture +if ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64" ) + set ( NBIT 64 ) + set ( NBITSTR "64" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2" ) +elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) + set ( NBIT 32 ) + set ( NBITSTR "" ) + set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32" ) +endif () + +## Find hsa-runtime headers/lib +find_file ( HSA_RUNTIME_INC "hsa.h" ) +if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) + find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) +endif() +find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" ) +get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) +get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) + +find_library ( HSA_KMT_LIB "libhsakmt.so" ) +get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) + +set ( API_PATH ${HSA_RUNTIME_INC_PATH} ) + +## Basic Tool Chain Information +message ( "----------------NBIT: ${NBIT}" ) +message ( "-----------BuildType: ${CMAKE_BUILD_TYPE}" ) +message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" ) +message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" ) +message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) +message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) +message ( "------------API-path: ${API_PATH}" ) +message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) +message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake new file mode 100644 index 00000000..15865820 --- /dev/null +++ b/cmake_modules/utils.cmake @@ -0,0 +1,96 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro 
Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +## Parses the VERSION_STRING variable and places +## the first, second and third number values in +## the major, minor and patch variables. 
+function( parse_version VERSION_STRING ) + ## Split off the build suffix: everything after the first "-", if any. + string ( FIND ${VERSION_STRING} "-" STRING_INDEX ) + + if ( ${STRING_INDEX} GREATER -1 ) + math ( EXPR STRING_INDEX "${STRING_INDEX} + 1" ) + string ( SUBSTRING ${VERSION_STRING} ${STRING_INDEX} -1 VERSION_BUILD ) + endif () + ## Collect every run of digits; note the match covers the whole string, build suffix included. + string ( REGEX MATCHALL "[0123456789]+" VERSIONS ${VERSION_STRING} ) + list ( LENGTH VERSIONS VERSION_COUNT ) + ## The first three numbers become major, minor and patch; each is exported only if present. + if ( ${VERSION_COUNT} GREATER 0) + list ( GET VERSIONS 0 MAJOR ) + set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${MAJOR}" ) + endif () + + if ( ${VERSION_COUNT} GREATER 1 ) + list ( GET VERSIONS 1 MINOR ) + set ( VERSION_MINOR ${MINOR} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" ) + endif () + + if ( ${VERSION_COUNT} GREATER 2 ) + list ( GET VERSIONS 2 PATCH ) + set ( VERSION_PATCH ${PATCH} PARENT_SCOPE ) + set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" ) + endif () + ## VERSION_BUILD is passed to the caller only when it is defined in this scope. + if ( DEFINED VERSION_BUILD ) + set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) + endif () + ## Rebuild the dotted string from the parsed numbers, dropping any build suffix. + set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE ) + +endfunction () + +## Gets the current version of the repository +## using versioning tags and git describe. +## Passes back a packaging version string +## and a library version string. 
+function ( get_version DEFAULT_VERSION_STRING ) + ## Seed VERSION_* from the default; the values are kept when git or a matching tag is missing. + parse_version ( ${DEFAULT_VERSION_STRING} ) + + find_program ( GIT NAMES git ) + + if ( GIT ) + ## Pass the command as an argument list: a single quoted string is taken as the program name, and execute_process runs no shell, so "2>/dev/null" is replaced by ERROR_QUIET. + execute_process ( COMMAND ${GIT} describe --dirty --long --match "[0-9]*" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE GIT_TAG_STRING + OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET + RESULT_VARIABLE RESULT ) + ## RESULT is the exit code, or an error string when git could not be run. + if ( "${RESULT}" EQUAL 0 AND NOT "${GIT_TAG_STRING}" STREQUAL "" ) + + parse_version ( ${GIT_TAG_STRING} ) + + endif () + + endif () + ## Re-export: parse_version set these in this function's scope, not in the caller's. + set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE ) + set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE ) + set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE ) + set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE ) + set( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) + +endfunction() diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md new file mode 100644 index 00000000..001bcbe1 --- /dev/null +++ b/doc/rocprofiler_spec.md @@ -0,0 +1,592 @@ +# ROC Profiler Library Specification + +## 1. High level overview +``` +The goal of the implementation is to provide a HW specific low-level performance analysis +interface for profiling of GPU compute applications. The profiling includes HW performance +counters with complex performance metrics and HW traces. The implementation distinguishes +two profiling features, metrics and traces. HW performance counters are treated as the basic +metrics and the formulas can be defined for derived complex metrics. 
+The library can be loaded by HSA runtime as a tool plugin and it can be loaded by higher +level HW independent performance analysis API like PAPI. +The library has C API and is based on AQLprofile AMD specific HSA extension. + + 1. The library provides methods to query the list of supported HW features. + 2. The library provides profiling APIs to start, stop, read metrics results and tracing + data. + 3. The library provides a callback API for collecting per-kernel profiling data for + the kernels + dispatched to HSA AQL queues. + 4. 
The library provides mechanism to load profiling tool library plugin by env variable + ROCP_TOOL_LIB. + 5. The library is responsible for allocation of the buffers for profiling and notifying + about output data buffer overflow for traces. + 6. The library is implemented based on AMD specific AQLprofile HSA extension. + 7. The library implementation is abstracted from the specific GFXIP. + 8. The library implementation is extensible: + - Easy adding of counters and metrics + - Counters enumeration + - Counters and metrics can be dynamically configured using XML configuration files with + counters and metrics tables: + o Counters table entry, basic metric: counter name, block name, event id + o Complex metrics table entry: metric name, an expression for calculation the metric + from the counters + +Metrics XML file example: + + + + . . . + + + + . . . + + + + + +``` +## 2. Environment +``` +* HSA_TOOLS_LIB - required to be set to the name of rocprofiler library to be loaded by +HSA runtime +* ROCP_METRICS - path to the metrics XML file +* ROCP_TOOL_LIB - path to profiling tool library loaded by ROC Profiler +* ROCP_HSA_INTERCEPT - if set then HSA dispatches intercepting is enabled +``` +## 3. General API +### 3.1. Description +``` +The library supports method for getting the error number and error string of the last +failed library API call. +To check the conformance of used library APi header and the library binary the version +macros and API methods can be used. + +Returning the error and error string methods: +- rocprofiler_errno - method for returning the error number +- rocprofiler_error_string - method for returning the error string + +Library version: +- ROCPROFILER_VERSION_MAJOR - API major version macro +- ROCPROFILER_VERSION_MINOR - API minor version macro +- rocprofiler_version_major - library major version +- rocprofiler_version_minor - library minor version +``` +### 3.2. 
Returning the error and error string methods +``` +rocprofiler_errno_t rocprofiler_errno(); +const char* rocprofiler_error_string(); +``` +### 3.3. Library version +``` +The library provides back compatibility if the library major version is less than or equal +to the API major version macro. + +API version macros defined in the library API header 'rocprofiler.h': + +ROCPROFILER_VERSION_MAJOR +ROCPROFILER_VERSION_MINOR + +Methods to check library major and minor version: + +uint32_t rocprofiler_version_major(); +uint32_t rocprofiler_version_minor(); +``` +## 4. Backend API +### 4.1. Description +``` +The library provides the methods to open/close profiling context, to start, stop and read +HW performance counters and traces, to intercept kernel dispatches to collect per-kernel +profiling data. Also the library provides methods to calculate complex performance metrics +and to query the list of available metrics. The library distinguishes two profiling features, +metrics and traces, where HW performance counters are treated as the basic metrics. To check +if there was an error the library methods return HSA standard status code. +For a given context the profiling can be started/stopped and counters sampled in standalone +mode or profiling can be initiated by intercepting the kernel dispatches with registering +a dispatch callback. +For counters sampling, which is the usage model of higher level APIs like PAPI, +the start/stop/read APIs should be used. +For collecting per-kernel data for the kernels submitted to HSA queues the dispatch callback +API should be used. +The library provides back compatibility if the library major version is less than or equal +to the API major version macro. 
+ +Returned API status: +- hsa_status_t - HSA status codes are used from hsa.h header + +Info API: +- rocprofiler_info_kind_t - profiling info kind +- rocprofiler_info_query_t - profiling info query +- rocprofiler_info_data_t - profiling info data +- rocprofiler_get_info - return the info for a given info kind +- rocprofiler_iterate_info - iterate over the info for a given info kind +- rocprofiler_query_info - iterate over the info for a given info query + +Context API: +- rocprofiler_t - profiling context handle +- rocprofiler_feature_kind_t - profiling feature kind +- rocprofiler_feature_parameter_t - profiling feature parameter +- rocprofiler_data_kind_t - profiling data kind +- rocprofiler_data_t - profiling data +- rocprofiler_feature_t - profiling feature +- rocprofiler_mode_t - profiling modes +- rocprofiler_properties_t - profiler properties +- rocprofiler_open - open new profiling context +- rocprofiler_close - close profiling context and release all allocated resources +- rocprofiler_group_count - return profiling groups count +- rocprofiler_get_group - return profiling group for a given index +- rocprofiler_get_metrics - method for calculating the metrics data +- rocprofiler_iterate_trace_data - method for iterating output trace data instances + +Sampling API: +- rocprofiler_start - start profiling +- rocprofiler_stop - stop profiling +- rocprofiler_read - read profiling data to the profiling features objects +- rocprofiler_get_data - wait for profiling data + Group versions of start/stop/read/get_data methods: + o rocprofiler_group_start + o rocprofiler_group_stop + o rocprofiler_group_read + o rocprofiler_group_get_data + +Intercepting API: +- rocprofiler_callback_t - profiling callback type +- rocprofiler_callback_data_t - profiling callback data type +- rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks +- rocprofiler_remove_queue_callbacks - remove queue callbacks +``` +### 4.2. 
Info API +``` +The profiling metrics are defined by name and the traces are defined by name and parameters. +All supported features can be iterated using 'iterate_info/query_info' methods. The counter +names are defined in the counters table configuration file; each counter has a unique name and +is defined by block name and event id. The trace and trace parameter names are the same as in +the hardware documentation and the parameter codes are rocprofiler_feature_parameter_t values, +see below in the "Context API" section. +Profiling info kind: + +typedef enum { + ROCPROFILER_INFO_KIND_METRIC = 0, // metric info + ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metrics count + ROCPROFILER_INFO_KIND_TRACE = 2, // trace info + ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // traces count +} rocprofiler_info_kind_t; + +Profiling info data: + +typedef struct { + rocprofiler_info_kind_t kind; // info data kind + union { + struct { + const char* name; // metric name + const char* description; // metric description + } metric; + struct { + const char* name; // trace name + const char* description; // trace description + uint32_t parameter_count; // number of parameters supported + // by the trace + } trace; + }; +} rocprofiler_info_data_t; + +Return info for a given info kind: + +hsa_status_t rocprofiler_get_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + void *data); // data passed to callback + +Iterate over the info for a given info kind, and invoke an application-defined callback on +every iteration: + +hsa_status_t rocprofiler_iterate_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); + +Iterate over the info for a given info query, and invoke an application-defined callback on +every 
iteration. 
The query +fields set to NULL define the query wildcard: + +hsa_status_t rocprofiler_query_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + rocprofiler_info_data_t query, // info query + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); // data passed to callback +``` +### 4.3. Context API +``` +Profiling context accumulates all profiling information including profiling features +which carry profiling data, required buffers for profiling command packets and output data. +The context can be created and deleted by the library open/close methods. By deleting +the context all resources accumulated by the library for this context will be +released. If more than one run is required to collect all requested counters data then +data for all profiling groups should be collected and then the metrics can be calculated by +loading the saved groups' data to the profiling context. Saving and loading of the groups +data is the responsibility of the tool. The groups are automatically identified on the profiling +context open and there is an API to access them, see the "Profiling groups" section below.
+ +Profiling context handle: + +typename rocprofiler_t; + +Profiling feature kind: + +typedef enum { + ROCPROFILER_FEATURE_KIND_METRIC = 0, // metric + ROCPROFILER_FEATURE_KIND_TRACE = 1 // trace +} rocprofiler_feature_kind_t; + +Profiling feature parameter: + +typedef hsa_ven_amd_aqlprofile_parameter_t rocprofiler_feature_parameter_t; + +Profiling data kind: + +typedef enum { + ROCPROFILER_DATA_KIND_UNINIT = 0, // data uninitialized + ROCPROFILER_DATA_KIND_INT32 = 1, // 32bit integer + ROCPROFILER_DATA_KIND_INT64 = 2, // 64bit integer + ROCPROFILER_DATA_KIND_FLOAT = 3, // float single-precision result + ROCPROFILER_DATA_KIND_DOUBLE = 4, // float double-precision result + ROCPROFILER_DATA_KIND_BYTES = 5 // trace output as a bytes array +} rocprofiler_data_kind_t; + + +Profiling data: + +typedef struct { + rocprofiler_data_kind_t kind; // result kind + union { + uint32_t result_int32; // 32bit integer result + uint64_t result_int64; // 64bit integer result + float result_float; // float single-precision result + double result_double; // float double-precision result + typedef struct { + void* ptr; // pointer + uint32_t size; // byte size + uint32_t instances; // number of trace instances + } result_bytes; // data by ptr and byte size + }; +} rocprofiler_data_t; + +Profiling feature: + +typedef struct { + rocprofiler_feature_kind_t type; // feature type + const char* name; // feature name + const rocprofiler_feature_parameter_t* parameters; // feature parameters + uint32_t parameter_count; // feature parameter count + rocprofiler_data_t* data; // profiling data +} rocprofiler_feature_t; + +Profiling mode masks: +There are several modes which can be specified for the profiling context. +STANDALONE mode can be used for the counters sampling in another then application context +to support statistical system wide profiling. In this mode the profiling context supports +its own queue which can be created on the context open if the CREATEQUEUE mode also specified. 
+See also "Profiler properties" section below for the standalone mode queue properties. +The profiler supports several profiling groups for collecting profiling data in several +runs and 'SINGLEGROUP' mode allows only one group and the context open will fail if more +groups are needed. + +typedef enum { + ROCPROFILER_MODE_STANDALONE = 1, // standalone mode when ROC profiler + // supports own AQL queue + ROCPROFILER_MODE_CREATEQUEUE = 2, // profiler creates queue in STANDALONE mode + ROCPROFILER_MODE_SINGLEGROUP = 4 // profiler allows one group only and fails + // if more groups are needed +} rocprofiler_mode_t; + +Context data readiness callback: + +typedef void (*rocprofiler_context_callback_t)( + rocprofiler_group_t* group, // profiling group + void* arg); // callback arg + +Profiler properties: +There are several properties which can be specified for the context. A callback can be +registered which will be called when the context data is ready. In standalone profiling mode +'ROCPROFILER_MODE_STANDALONE' the context supports its own queue and the queue can be set by +the property 'queue' or a queue will be created with the specified depth 'queue_depth' if mode +'ROCPROFILER_MODE_CREATEQUEUE' also specified. 
+ +typedef struct { + rocprofiler_context_callback_t callback; // callback on the context data readiness + void* callback_arg; // callback arg + hsa_queue_t* queue; // HSA queue for standalone mode + uint32_t queue_depth; // created queue depth, for create-queue mode +} rocprofiler_properties_t; + +Open/close profiling context: + +hsa_status_t rocprofiler_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in/out] profiling feature array + uint32_t feature_count, // profiling feature count + rocprofiler_t** context, // [out] profiling context handle + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiler properties + +hsa_status_t rocprofiler_close( + rocprofiler_t* context); // [in] profiling context + +Profiling groups: +The profiler on the context open automatically identifies a required number of the application +runs to collect all data needed for all specified metrics and creates a metric group per each +run. Data for all profiling groups should be collected and then the metrics can be calculated +by loading the saved groups' data to the profiling context. Saving and loading of the groups +data is the responsibility of the tool. + +typedef struct { + uint32_t index; // profiling group index + rocprofiler_feature_t** features; // profiling features array + uint32_t feature_count; // profiling feature count + rocprofiler_t* context; // profiling context handle +} rocprofiler_group_t; + +Return profiling groups count: + +hsa_status_t rocprofiler_group_count( + rocprofiler_t* context, // [in/out] profiling context + uint32_t* count); // [out] profiling groups count + +Return the profiling group for a given index: + +hsa_status_t rocprofiler_get_group( + rocprofiler_t* context, // [in/out] profiling context, + // will be returned as + // a part of the group structure + uint32_t index, // [in] group index + rocprofiler_group_t* group); // [out] profiling group + +Calculate metrics data.
The data will be stored to the registered profiling features data fields: +After all profiling context data is ready the registered metrics can be calculated. The context +data readiness can be checked by 'get_data' API or using the context callback. + +hsa_status_t rocprofiler_get_metrics( + rocprofiler_t* context); // [in/out] profiling context + +Method for iterating trace data instances: +Trace data can have several instances, for example, one instance per Shader Engine. + +hsa_status_t rocprofiler_iterate_trace_data( + const rocprofiler_t* context, // [in] context object + hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate + // the output data + void* callback_data); // [in/out] data passed to callback +``` +### 4.4. Sampling API +``` +The API supports the counters sampling usage model with start/read/stop methods and also allows +waiting for the profiling data in the intercepting usage model with the get_data method. + +Start/stop/read methods: + +hsa_status_t rocprofiler_start( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_stop( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_read( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Wait for profiling data: + +hsa_status_t rocprofiler_get_data( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Group versions of the above start/stop/read/get_data methods: + +hsa_status_t rocprofiler_group_start( + rocprofiler_group_t* group); // [in/out] profiling group + +hsa_status_t rocprofiler_group_stop( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_read( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_get_data( + rocprofiler_group_t* group); //
[in/out] profiling group +``` +### 4.5. Intercepting API +``` +The library provides a callback API for enabling profiling for the kernels dispatched to +HSA AQL queues. The API enables per-kernel profiling data collection. + +ROC profiler callback type: + +typedef hsa_status_t (*rocprofiler_callback_t)( + const rocprofiler_callback_data_t* callback_data, // callback data passed by HSA runtime + void* user_data, // [in/out] user data passed + // to the callback + rocprofiler_group_t* group); // [out] returned profiling group + +Profiling callback data: + +typedef struct { + uint64_t dispatch; // dispatch timestamp + uint64_t begin; // begin timestamp + uint64_t end; // end timestamp + uint64_t complete; // completion signal timestamp +} rocprofiler_dispatch_record_t; + +typedef struct { + hsa_agent_t agent; // GPU agent handle + uint32_t agent_index; // GPU index + const hsa_queue_t* queue; // HSA queue + uint64_t queue_index; // Index in the queue + const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet + const char* kernel_name; // Kernel name + const rocprofiler_dispatch_record_t* record; // Dispatch record +} rocprofiler_callback_data_t; + +Queue callbacks: + +typedef struct { + rocprofiler_callback_t dispatch; // kernel dispatch callback + hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // queue destroy callback +} rocprofiler_queue_callbacks_t; + +Adding/removing kernel dispatch and queue destroy callbacks: + +hsa_status_t rocprofiler_set_queue_callbacks( + rocprofiler_queue_callbacks_t callbacks, // queue callbacks + void* data); // [in/out] passed callbacks data + +hsa_status_t rocprofiler_remove_queue_callbacks(); +``` +## 5. Application code examples +### 5.1.
Querying available metrics +``` +Info data callback: + + hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { + switch (info.kind) { + case ROCPROFILER_INFO_KIND_METRIC: { + printf("metric %s, description %s\n", + info.metric.name, + info.metric.description); + break; + } + default: + printf("wrong info kind %u\n", info.kind); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; + } + +Printing all available metrics: + + hsa_status_t status = rocprofiler_iterate_info( + agent, + ROCPROFILER_INFO_KIND_METRIC, + info_data_callback, + NULL); + +``` +### 5.2. Profiling code example +``` +Profiling of L1 miss ratio, average memory bandwidth. +In the example below rocprofiler_group_get_data group APIs are used for the purpose of a usage +example but in SINGLEGROUP mode when only one group is allowed the context handle itself can be +saved and then direct context method rocprofiler_get_data with default group index equal to 0 +can be used. + +hsa_status_t dispatch_callback( + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t* group) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + // Profiling context + rocprofiler_t* context; + // Profiling info objects + rocprofiler_feature_t* features = new rocprofiler_feature_t[2]; + // Tracing parameters + rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; + + // Setting profiling features + features[0].type = ROCPROFILER_FEATURE_KIND_METRIC; + features[0].name = "L1_MISS_RATIO"; + features[1].type = ROCPROFILER_FEATURE_KIND_METRIC; + features[1].name = "DRAM_BANDWIDTH"; + + // Creating profiling context + status = rocprofiler_open(callback_data->agent, features, 2, &context, + ROCPROFILER_MODE_SINGLEGROUP, NULL); + + + // Get the profiling group + // For general case with many groups there is rocprofiler_group_count() API + const uint32_t group_index = 0; + status = rocprofiler_get_group(context, group_index, group); + + + // In SINGLEGROUP mode
the context handle itself can be saved, because there is just one group + + + return status; +} + +void profiling_library_constructor() { + // Defining callback data, no data in this simple example + void* callback_data = NULL; + + // Adding observers + hsa_status_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); + + + // Dispatching profiled kernel + +} + +void profiling_library_destructor() { + > { + // In SINGLEGROUP mode the rocprofiler_get_data() method with default zero group + // index can be used, if context handle would be saved + status = rocprofiler_group_get_data(entry->group); + + status = rocprofiler_get_metrics(entry->group->context); + + status = rocprofiler_close(entry->group->context); + + + dispatch_data, entry->features, entry->features_count)>; + } +} +``` +### 5.3. Option to use completion callback +``` +Creating profiling context with completion callback: + . . . + rocprofiler_properties_t properties = {}; + properties.callback = completion_callback; + properties.callback_arg = NULL; // no args defined + status = rocprofiler_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, &properties); + + . . . + +Definition of completion callback: + +void completion_callback(rocprofiler_group_t group, void* arg) { + + hsa_status_t status = rocprofiler_close(group.context); + +} +``` diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h new file mode 100644 index 00000000..e7a5a1e0 --- /dev/null +++ b/inc/rocprofiler.h @@ -0,0 +1,364 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +//////////////////////////////////////////////////////////////////////////////// +// +// ROC Profiler API +// +// The goal of the implementation is to provide a HW specific low-level +// performance analysis interface for profiling of GPU compute applications. +// The profiling includes HW performance counters with complex +// performance metrics and HW traces. +// +// The library can be used by a tool library loaded by HSA runtime or by +// higher level HW independent performance analysis API like PAPI. +// +// The library is written on C and will be based on AQLprofile AMD specific +// HSA extension. The library implementation requires HSA API intercepting and +// a profiling queue supporting a submit callback interface. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef INC_ROCPROFILER_H_ +#define INC_ROCPROFILER_H_ + +#include +#include +#include + +#define ROCPROFILER_VERSION_MAJOR 1 +#define ROCPROFILER_VERSION_MINOR 1 + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +//////////////////////////////////////////////////////////////////////////////// +// Returning library version + +uint32_t rocprofiler_version_major(); +uint32_t rocprofiler_version_minor(); + +//////////////////////////////////////////////////////////////////////////////// +// Global properties structure + +typedef struct { + uint32_t intercept_mode; + uint32_t sqtt_size; + uint32_t sqtt_local; + uint64_t timeout; + uint32_t timestamp_on; +} rocprofiler_settings_t; + +//////////////////////////////////////////////////////////////////////////////// +// Returning the error string method + +hsa_status_t rocprofiler_error_string( + const char** str); // [out] returned pointer to the API error string + +//////////////////////////////////////////////////////////////////////////////// +// Profiling features and data +// +// Profiling features objects have profiling feature info, type, parameters and data +// Also profiling data samples can be iterated using a callback + +// Profiling feature kind +typedef enum { + ROCPROFILER_FEATURE_KIND_METRIC = 0, // metric + ROCPROFILER_FEATURE_KIND_TRACE = 1 // trace +} rocprofiler_feature_kind_t; + +// Profiling feature parameter +typedef hsa_ven_amd_aqlprofile_parameter_t rocprofiler_parameter_t; + +// Profiling data kind +typedef enum { + ROCPROFILER_DATA_KIND_UNINIT = 0, // data uninitialized + ROCPROFILER_DATA_KIND_INT32 = 1, // 32bit integer + ROCPROFILER_DATA_KIND_INT64 = 2, // 64bit integer + ROCPROFILER_DATA_KIND_FLOAT = 3, // float single-precision result + ROCPROFILER_DATA_KIND_DOUBLE = 4, // float double-precision result + ROCPROFILER_DATA_KIND_BYTES = 5 // trace output as a bytes array +} rocprofiler_data_kind_t; + +// Profiling data type +typedef struct { + rocprofiler_data_kind_t kind; // result kind + union { + uint32_t result_int32; // 32bit integer result + uint64_t result_int64;
// 64bit integer result + float result_float; // float single-precision result + double result_double; // float double-precision result + struct { + void* ptr; // pointer + uint32_t size; // byte size + uint32_t instance_count; // number of trace instances + bool copy; + } result_bytes; // data by ptr and byte size + }; +} rocprofiler_data_t; + +// Profiling feature type +typedef struct { + rocprofiler_feature_kind_t kind; // feature kind + union { + const char* name; // feature name + struct { + const char* block; // counter block name + uint32_t event; // counter event id + } counter; + }; + const rocprofiler_parameter_t* parameters; // feature parameters array + uint32_t parameter_count; // feature parameters count + rocprofiler_data_t data; // profiling data +} rocprofiler_feature_t; + +// Profiling features set type +typedef void rocprofiler_feature_set_t; + +//////////////////////////////////////////////////////////////////////////////// +// Profiling context +// +// Profiling context object accumulates all profiling information + +// Profiling context object +typedef void rocprofiler_t; + +// Profiling group object +typedef struct { + unsigned index; // group index + rocprofiler_feature_t** features; // profiling info array + uint32_t feature_count; // profiling info count + rocprofiler_t* context; // context object +} rocprofiler_group_t; + +// Profiling mode mask +typedef enum { + ROCPROFILER_MODE_STANDALONE = 1, // standalone mode when ROC profiler supports a queue + ROCPROFILER_MODE_CREATEQUEUE = 2, // ROC profiler creates queue in standalone mode + ROCPROFILER_MODE_SINGLEGROUP = 4 // only one group is allowed, fails otherwise +} rocprofiler_mode_t; + +// Profiling handler, called on profiling completion +typedef bool (*rocprofiler_handler_t)(rocprofiler_group_t group, void* arg); + +// Profiling properties +typedef struct { + hsa_queue_t* queue; // queue for STANDALONE mode + // the queue is created and returned in CREATEQUEUE mode + uint32_t queue_depth; // created queue depth + rocprofiler_handler_t
handler; // handler on completion + void* handler_arg; // the handler arg +} rocprofiler_properties_t; + +// Create new profiling context +hsa_status_t rocprofiler_open(hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling features count + rocprofiler_t** context, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiling properties + +// Add feature to a features set +hsa_status_t rocprofiler_add_feature(const rocprofiler_feature_t* feature, // [in] profiling feature + rocprofiler_feature_set_t* features_set); // [in/out] profiling features set + +// Create new profiling context from a features set +hsa_status_t rocprofiler_features_set_open(hsa_agent_t agent, // GPU handle + rocprofiler_feature_set_t* features_set, // [in] profiling features set + rocprofiler_t** context, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiling properties + +// Close profiling context +hsa_status_t rocprofiler_close(rocprofiler_t* context); // [in] profiling context + +// Context reset before reusing +hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling context + uint32_t group_index); // group index + +//////////////////////////////////////////////////////////////////////////////// +// Queue callbacks +// +// Queue callbacks for initiating profiling per kernel dispatch and to wait for +// the profiling data on the queue destroy.
+ +// Dispatch record +typedef struct { + uint64_t dispatch; // dispatch timestamp + uint64_t begin; // begin timestamp + uint64_t end; // end timestamp + uint64_t complete; // completion signal timestamp +} rocprofiler_dispatch_record_t; + +// Profiling callback data +typedef struct { + hsa_agent_t agent; // GPU agent handle + uint32_t agent_index; // GPU index + const hsa_queue_t* queue; // HSA queue + uint64_t queue_index; // Index in the queue + const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet + const char* kernel_name; // Kernel name + const rocprofiler_dispatch_record_t* record; // Dispatch record +} rocprofiler_callback_data_t; + +// Profiling callback type +typedef hsa_status_t (*rocprofiler_callback_t)( + const rocprofiler_callback_data_t* callback_data, // [in] callback data describing + // the kernel dispatch + void* user_data, // [in/out] user data passed to the callback + rocprofiler_group_t* group); // [out] profiling group + +// Queue callbacks +typedef struct { + rocprofiler_callback_t dispatch; // dispatch callback + hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // destroy callback +} rocprofiler_queue_callbacks_t; + +// Set queue callbacks +hsa_status_t rocprofiler_set_queue_callbacks( + rocprofiler_queue_callbacks_t callbacks, // callbacks + void* data); // [in/out] passed callbacks data + +// Remove queue callbacks +hsa_status_t rocprofiler_remove_queue_callbacks(); + +//////////////////////////////////////////////////////////////////////////////// +// Start/stop profiling +// +// Start/stop the context profiling invocation, have to be as many as +// 'context.invocations' to collect all profiling data + +// Start profiling +hsa_status_t rocprofiler_start(rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index); // group index + +// Stop profiling +hsa_status_t rocprofiler_stop(rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index); // group index + +// Read
profiling +hsa_status_t rocprofiler_read(rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index); // group index + +// Wait for profiling data +hsa_status_t rocprofiler_get_data(rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index); // group index + +// Get profiling groups count +hsa_status_t rocprofiler_group_count(const rocprofiler_t* context, // [in] profiling context + uint32_t* group_count); // [out] profiling groups count + +// Get profiling group for a given index +hsa_status_t rocprofiler_get_group(rocprofiler_t* context, // [in] profiling context + uint32_t group_index, // profiling group index + rocprofiler_group_t* group); // [out] profiling group + +// Start profiling, group version +hsa_status_t rocprofiler_group_start(rocprofiler_group_t* group); // [in/out] profiling group + +// Stop profiling, group version +hsa_status_t rocprofiler_group_stop(rocprofiler_group_t* group); // [in/out] profiling group + +// Read profiling, group version +hsa_status_t rocprofiler_group_read(rocprofiler_group_t* group); // [in/out] profiling group + +// Wait for profiling data, group version +hsa_status_t rocprofiler_group_get_data(rocprofiler_group_t* group); // [in/out] profiling group + +// Get metrics data +hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* context); // [in/out] profiling context + +// Definition of output data iterator callback +typedef hsa_ven_amd_aqlprofile_data_callback_t rocprofiler_trace_data_callback_t; + +// Method for iterating the events output data +hsa_status_t rocprofiler_iterate_trace_data( + rocprofiler_t* context, // [in] profiling context + rocprofiler_trace_data_callback_t callback, // callback to iterate the output data + void* data); // [in/out] callback data + +//////////////////////////////////////////////////////////////////////////////// +// Profiling features and data +// +// Profiling features objects have profiling feature info, type, parameters and data +// Also profiling data samples can be iterated using a callback + +// Profiling
info kind +typedef enum { + ROCPROFILER_INFO_KIND_METRIC = 0, // metric info + ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metric features count, int32 + ROCPROFILER_INFO_KIND_TRACE = 2, // trace info + ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // trace features count, int32 +} rocprofiler_info_kind_t; + +// Profiling info query +typedef union { + rocprofiler_info_kind_t info_kind; // queried profiling info kind + struct { + const char* trace_name; // queried info trace name + } trace_parameter; +} rocprofiler_info_query_t; + +// Profiling info data +typedef struct { + uint32_t agent_index; // GPU HSA agent index + rocprofiler_info_kind_t kind; // info data kind + union { + struct { + const char* name; // metric name + const char* expr; // metric expression, NULL for basic counters + const char* description; // metric description + } metric; + struct { + const char* name; // trace name + const char* description; // trace description + uint32_t parameter_count; // supported by the trace number parameters + } trace; + }; +} rocprofiler_info_data_t; + +// Return the info for a given info kind +hsa_status_t rocprofiler_get_info( + const hsa_agent_t* agent, // [in] GFXIP handle + rocprofiler_info_kind_t kind, // kind of iterated info + void *data); // [in/out] returned data + +// Iterate over the info for a given info kind, and invoke an application-defined callback on every iteration +hsa_status_t rocprofiler_iterate_info( + const hsa_agent_t* agent, // [in] GFXIP handle + rocprofiler_info_kind_t kind, // kind of iterated info + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); // [in/out] data passed to callback + +// Iterate over the info for a given info query, and invoke an application-defined callback on every iteration +hsa_status_t rocprofiler_query_info( + const hsa_agent_t *agent, // [in] GFXIP handle + rocprofiler_info_query_t query, // iterated info query + hsa_status_t (*callback)(const rocprofiler_info_data_t 
info, void *data), // callback + void *data); // [in/out] data passed to callback + +#ifdef __cplusplus +} // extern "C" block +#endif // __cplusplus + +#endif // INC_ROCPROFILER_H_ diff --git a/script/rpl_run.sh b/script/rpl_run.sh new file mode 100755 index 00000000..a8260e77 --- /dev/null +++ b/script/rpl_run.sh @@ -0,0 +1,377 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
################################################################################

#!/bin/sh
# ROC Profiler run script.
# Sets up the profiler environment, converts a .txt input into one .xml input
# per pmc/sqtt line (via txt2xml.sh), runs the application once per input and
# finally merges the per-run 'results.txt' files into a CSV (via tblextr.py).
# NOTE: the '#!' line above is not the first line of the file, so it is an
# ordinary comment; only POSIX sh constructs are used below.
time_stamp=`date +%y%m%d_%H%M%S`
BIN_DIR=`dirname "$0"`
BIN_DIR=`cd "$BIN_DIR"; pwd`
RUN_DIR=`pwd`
TMP_DIR="/tmp"
DATA_PATH=$TMP_DIR
DATA_DIR="rpl_data_${time_stamp}_$$"

# The package root is the script directory with a trailing '/bin' removed
PKG_DIR=`echo "$BIN_DIR" | sed "s/\/bin\/*$//"`
BIN_DIR=$PKG_DIR/bin

# PATH to custom HSA and OpenCl runtimes
HSA_PATH=$PKG_DIR/lib/hsa

export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH
export PATH=.:$PATH

# enable error logging
export HSA_TOOLS_REPORT_LOAD_FAILURE=1
export HSA_VEN_AMD_AQLPROFILE_LOG=1
export ROCPROFILER_LOG=1

# ROC Profiler environment
# Loading of ROC Profiler by HSA runtime
export HSA_TOOLS_LIB=librocprofiler64.so
# Loading of the test tool by ROC Profiler
export ROCP_TOOL_LIB=libtool.so
# Enabling HSA dispatches intercepting by ROC Profiler
export ROCP_HSA_INTERCEPT=1
# Disabling internal ROC Profiler proxy queue (simple version supported for testing purposes)
unset ROCP_PROXY_QUEUE
# ROC Profiler metrics definition
export ROCP_METRICS=$PKG_DIR/lib/metrics.xml
# ROC Profiler package path
export ROCP_PACKAGE_DIR=$PKG_DIR

# error handling
# fatal <message>: report a command line error, print the usage and exit
# (the exit happens inside usage)
fatal() {
  echo "$0: Error: $1"
  echo ""
  usage
}

# error <message>: report a runtime error and exit with status 1
error() {
  echo "$0: Error: $1"
  echo ""
  exit 1
}

# usage method: prints the help text and exits with status 1
usage() {
  bin_name=`basename "$0"`
  echo "ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package."
  echo "Full path: $BIN_DIR/$bin_name"
  echo "Metrics definition: $PKG_DIR/lib/metrics.xml"
  echo ""
  echo "Usage:"
  echo " rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] "
  echo ""
  echo "Options:"
  echo " -h - this help"
  echo " --verbose - verbose mode, dumping all base counters used in the input metrics"
  echo " --list-basic - to print the list of basic HW counters"
  echo " --list-derived - to print the list of derived metrics with formulas"
  echo ""
  echo " -i <.txt|.xml file> - input file"
  echo " Input file .txt format, automatically rerun application for every pmc/sqtt line:"
  echo ""
  echo " # Perf counters group 1"
  echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize"
  echo " # Perf counters group 2"
  echo " pmc : WriteSize L2CacheHit"
  echo " # SQ tread trace"
  echo " sqtt : MASK = 0x0F00 TOKEN_MASK = 0x144B TOKEN_MASK2 = 0xFFFF"
  echo " # Filter by dispatches range, GPU index and kernel names"
  echo " # supported range formats: \"3:9\", \"3:\", \"3\""
  echo " range: 1 : 4"
  echo " gpu: 0 1 2 3"
  echo " kernel: simple Pass1 simpleConvolutionPass2"
  echo ""
  echo " Input file .xml format, for single profiling run:"
  echo ""
  echo " # Metrics list definition, also the form \":\" can be used"
  echo " # All defined metrics can be found in the 'metrics.xml'"
  echo " # There are basic metrics for raw HW counters and high-level metrics for derived counters"
  echo " "
  echo ""
  echo " # Trace enabling and the parameters definition"
  echo " "
  echo " "
  echo " "
  echo ""
  echo " # Filter by dispatches range, GPU index and kernel names"
  echo " "
  echo ""
  echo " Supported by profiler SQTT parameters:"
  echo " TARGET_CU - target Compute Unit, MASK.CU_SEL field"
  echo " VM_ID_MASK - select which VM IDs to capture, MASK.VM_ID_MASK field"
  echo " MASK - MASK register value"
  echo " TOKEN_MASK - TOKEN_MASK register value"
  echo " TOKEN_MASK2 - TOKEN_MASK2 register value, traced instructions mask"
  echo " The parameters defaults:"
  echo " TARGET_CU = 0;"
  echo " VM_ID_MASK = 0;"
  echo " MASK:"
  echo " mask.bits.CU_SEL = param{TARGET_CU};"
  echo " mask.bits.SH_SEL = 0x0;"
  echo " mask.bits.SIMD_EN = 0xF;"
  echo " mask.bits.SQ_STALL_EN = 0x1;"
  echo " mask.bits.SPI_STALL_EN = 0x1;"
  echo " mask.bits.REG_STALL_EN = 0x1;"
  echo " mask.bits.VM_ID_MASK = param{VM_ID_MASK};"
  echo " TOKEN_MASK:"
  echo " token_mask.bits.TOKEN_MASK = 0xFFFF;"
  echo " token_mask.bits.REG_MASK = 0xFF;"
  echo " token_mask.bits.REG_DROP_ON_STALL = 0x1;"
  echo " TOKEN_MASK2:"
  echo " token_mask2.bits.INST_MASK = 0xFFFFFF7F; // INST_PC is disabled because its tracing can cause extra stalling"
  echo " // and it is recommended to disable by SQTT user guide"
  echo " HIWATER = 6; // which is 6/8 fraction of the tread trace fifo"
  echo ""
  echo " -o - output CSV file [.csv]"
  echo " -d - directory where profiler store profiling data including thread treaces [/tmp]"
  echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default."
  echo " -t - to change the temporary directory [/tmp]"
  echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory."
  echo ""
  echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]"
  echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]"
  echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]"
  echo " --heartbeat - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [0 - disabled]"
  echo " --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000]"
  echo " Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively."
  echo " --sqtt-local - to allocate SQTT buffer in local GPU memory [on]"
  echo ""
  echo "Configuration file:"
  echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:"
  echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory."
  echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'."
  echo " An example of 'rpl_rc.xml':"
  echo " "
  echo ""
  exit 1
}

# profiling run method
# run <input xml> <output dir> <app command ...>
#   <output dir> "-"  : results go to a generated '<input>_results_<timestamp>' dir
#   <output dir> "--" : no results dir, the application output is not captured
#   otherwise         : results go to the given directory
# Every run with a results dir appends its 'results.txt' to OUTPUT_LIST.
OUTPUT_LIST=""
run() {
  export ROCP_INPUT="$1"
  OUTPUT_DIR="$2"
  shift
  shift
  APP_CMD=$*

  if [ "$OUTPUT_DIR" = "-" ] ; then
    # strip only the trailing '.xml' extension of the input path
    input_tag=`echo "$ROCP_INPUT" | sed "s/\.xml$//"`
    export ROCP_OUTPUT_DIR="${input_tag}_results_${time_stamp}"
  elif [ "$OUTPUT_DIR" = "--" ] ; then
    unset ROCP_OUTPUT_DIR
  else
    export ROCP_OUTPUT_DIR="$OUTPUT_DIR"
  fi
  echo "RPL: result dir '$ROCP_OUTPUT_DIR'"

  if [ ! -e "$ROCP_INPUT" ] ; then
    error "Input file '$ROCP_INPUT' not found"
  fi

  if [ -n "$ROCP_OUTPUT_DIR" ] ; then
    # a generated directory must be fresh, a user given one may be reused
    if [ "$OUTPUT_DIR" = "-" ] && [ -e "$ROCP_OUTPUT_DIR" ] ; then
      error "generated dir '$ROCP_OUTPUT_DIR' exists"
    fi
    mkdir -p "$ROCP_OUTPUT_DIR"

    OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt"
    eval "$APP_CMD 2>&1 | tee \"$ROCP_OUTPUT_DIR/log.txt\""
  else
    eval "$APP_CMD"
  fi
}

# main
echo "RPL: on '$time_stamp' from '$PKG_DIR' at '$RUN_DIR'"
# Parsing arguments
if [ -z "$1" ] ; then
  usage
fi

INPUT_FILE=""
OUTPUT_DIR="-"
output=""
csv_output=""

ARG_IN=""
while true ; do
  ARG_IN=$1
  # ARG_VAL=1 means the option consumes a value argument as well
  ARG_VAL=1
  if [ "$1" = "-h" ] ; then
    usage
  elif [ "$1" = "-i" ] ; then
    INPUT_FILE="$2"
  elif [ "$1" = "-o" ] ; then
    output="$2"
  elif [ "$1" = "-d" ] ; then
    OUTPUT_DIR="$2"
    DATA_PATH=$OUTPUT_DIR
  elif [ "$1" = "-t" ] ; then
    TMP_DIR="$2"
    if [ "$OUTPUT_DIR" = "-" ] ; then
      DATA_PATH=$TMP_DIR
    fi
  elif [ "$1" = "--list-basic" ] ; then
    export ROCP_INFO=b
    eval "$PKG_DIR/test/SimpleConvolution"
    exit 1
  elif [ "$1" = "--list-derived" ] ; then
    export ROCP_INFO=d
    eval "$PKG_DIR/test/SimpleConvolution"
    exit 1
  elif [ "$1" = "--basenames" ] ; then
    if [ "$2" = "on" ] ; then
      export ROCP_TRUNCATE_NAMES=1
    else
      export ROCP_TRUNCATE_NAMES=0
    fi
  elif [ "$1" = "--timestamp" ] ; then
    if [ "$2" = "on" ] ; then
      export ROCP_TRACKER_ON=1
    else
      export ROCP_TRACKER_ON=0
    fi
  elif [ "$1" = "--ctx-limit" ] ; then
    export ROCP_OUTSTANDING_MAX="$2"
  elif [ "$1" = "--heartbeat" ] ; then
    export ROCP_OUTSTANDING_MON="$2"
  elif [ "$1" = "--sqtt-size" ] ; then
    # accept a plain byte count or a count with a 'K'/'M' suffix
    size_m=`echo "$2" | sed -n "s/^\(.*\)M$/\1/p"`
    size_k=`echo "$2" | sed -n "s/^\(.*\)K$/\1/p"`
    if [ -n "$size_m" ] ; then size_b=$((size_m*1024*1024))
    elif [ -n "$size_k" ] ; then size_b=$((size_k*1024))
    else size_b=$2
    fi
    export ROCP_SQTT_SIZE=$size_b
  elif [ "$1" = "--sqtt-local" ] ; then
    if [ "$2" = "on" ] ; then
      export ROCP_SQTT_LOCAL=1
    else
      export ROCP_SQTT_LOCAL=0
    fi
  elif [ "$1" = "--verbose" ] ; then
    ARG_VAL=0
    export ROCP_VERBOSE_MODE=1
  else
    break
  fi
  shift
  # shifting past the end is an error in POSIX sh, so only drop the value if
  # it is really there (an option given last without its value)
  if [ "$ARG_VAL" = 1 ] && [ "$#" -gt 0 ] ; then shift; fi
done

# the first non-recognized argument starts the application command line and
# therefore must not look like an option
ARG_CK=`echo "$ARG_IN" | sed "s/^-.*$/-/"`
if [ "$ARG_CK" = "-" ] ; then
  fatal "Wrong option '$ARG_IN'"
fi

if [ -z "$INPUT_FILE" ] ; then
  fatal "Need input file"
fi

# split '<base>.<type>' at the last dot
input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"`
input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"`
if [ -z "${input_base}" ] || [ -z "${input_type}" ] ; then
  fatal "Bad input file '$INPUT_FILE'"
fi
input_base=`basename "$input_base"`

if [ "$OUTPUT_DIR" = "--" ] ; then
  fatal "Bad output dir '$OUTPUT_DIR'"
fi

if [ -n "$output" ] ; then
  if [ "$output" = "--" ] ; then
    OUTPUT_DIR="--"
  else
    csv_output=$output
  fi
else
  csv_output=$RUN_DIR/${input_base}.csv
fi

APP_CMD=$*

echo "RPL: profiling '$APP_CMD'"
echo "RPL: input file '$INPUT_FILE'"

input_list=""
RES_DIR=""
if [ "$input_type" = "xml" ] ; then
  input_list=$INPUT_FILE
elif [ "$input_type" = "txt" ] ; then
  OUTPUT_DIR="-"
  RES_DIR=$DATA_PATH/$DATA_DIR
  if [ -e "$RES_DIR" ] ; then
    error "Rundir '$RES_DIR' exists"
  fi
  mkdir -p "$RES_DIR"
  echo "RPL: output dir '$RES_DIR'"
  "$BIN_DIR/txt2xml.sh" "$INPUT_FILE" "$RES_DIR"
  input_list=`/bin/ls "$RES_DIR"/input*.xml`
else
  fatal "Bad input file type '$INPUT_FILE'"
fi

# APP_CMD is intentionally unquoted: it is re-split into the command words
for name in $input_list; do
  run "$name" "$OUTPUT_DIR" $APP_CMD
done

if [ -n "$csv_output" ] ; then
  # any non-zero status is a failure (interpreter missing, traceback, fatal())
  if ! python "$BIN_DIR/tblextr.py" "$csv_output" $OUTPUT_LIST ; then
    error "CSV generation error, profiling results '$RES_DIR'"
  fi
  echo "RPL: '$csv_output' is generated"
fi

# the generated data directory is removed only when it lives in the temporary
# directory
if [ "$DATA_PATH" = "$TMP_DIR" ] ; then
  if [ -e "$RES_DIR" ] ; then
    rm -rf "$RES_DIR"
  fi
fi

exit 0
# diff --git a/script/tblextr.py b/script/tblextr.py
# new file mode 100755
# index 00000000..9a314db4
# --- /dev/null
# +++ b/script/tblextr.py
# @@ -0,0 +1,119 @@
################################################################################
# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
################################################################################

#!/usr/bin/python
"""Merge profiler 'results.txt' files into one CSV table.

Usage: tblextr.py <output CSV file> <input result file> [<input result file> ...]

One CSV row is written per dispatch, one column per variable. Works with both
Python 2 and Python 3.
"""
import os, sys, re

# Parsing results in the format:
#dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503):
#  GRBM_GUI_ACTIVE (74332)
#  SQ_WAVES (4096)
#  SQ_INSTS_VMEM_RD (36864)

# global vars
# var_list  - column names in output order, counters are appended as they are seen
# var_table - dispatch number (string) -> {column name: value string}
var_list = ['Index', 'KernelName', 'DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs']
var_table = {}
#############################################################

def fatal(msg):
  """Print 'msg' to stderr prefixed with the script name and exit with status 1."""
  sys.stderr.write(sys.argv[0] + ": " + msg + "\n")
  sys.exit(1)
#############################################################

# parse results method
def parse_res(infile):
  """Parse one results file and merge its records into var_table/var_list.

  A 'dispatch[N], ...' line starts a record; the following 'NAME (VALUE)' lines
  are stored as columns of that record. Timestamps are taken from the first
  file in which a dispatch number is seen. Exits through fatal() if the file
  does not exist or a counter line precedes the first dispatch line.
  """
  if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found")

  beg_pattern = re.compile(r'^dispatch\[(\d*)\], queue_index\(\d*\), kernel_name\("([^"]*)"\)')
  ts_pattern = re.compile(r', time\((\d*),(\d*),(\d*),(\d*)\)')
  var_pattern = re.compile(r'^\s*([^\s]*)\s+\((\d*)\)')

  dispatch_number = 0
  with open(infile, 'r') as inp:
    for line in inp:
      # strip only the line terminator; the last line may have none
      record = line.rstrip('\n')

      m = var_pattern.match(record)
      if m:
        var = m.group(1)
        val = m.group(2)
        if not dispatch_number in var_table:
          fatal("Error: counter '" + var + "' found before any dispatch record")
        var_table[dispatch_number][var] = val
        if not var in var_list: var_list.append(var)

      m = beg_pattern.match(record)
      if m:
        dispatch_number = m.group(1)
        if not dispatch_number in var_table:
          var_table[dispatch_number] = {
            'Index': dispatch_number,
            'KernelName': "\"" + m.group(2) + "\""
          }
          m = ts_pattern.search(record)
          if m:
            var_table[dispatch_number]['DispatchNs'] = m.group(1)
            var_table[dispatch_number]['BeginNs'] = m.group(2)
            var_table[dispatch_number]['EndNs'] = m.group(3)
            var_table[dispatch_number]['CompleteNs'] = m.group(4)
#############################################################

# print results table method
def print_tbl(outfile):
  """Write var_table to 'outfile' as CSV, rows ordered by dispatch number.

  The columns are the var_list entries present in the first dispatch record
  (var_list is narrowed to them); a value a later record lacks is written as an
  empty field. Every field, including the last, is followed by a comma.
  """
  global var_list

  if not var_table: fatal("Error: no dispatch records found")

  # dispatch numbers are kept as strings, order them numerically
  keys = sorted(var_table, key=int)

  first_entry = var_table[keys[0]]
  var_list = [var for var in var_list if var in first_entry]

  with open(outfile, 'w') as out:
    out.write(''.join(var + ',' for var in var_list) + "\n")

    for ind in keys:
      entry = var_table[ind]
      dispatch_number = entry['Index']
      if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n")
      out.write(''.join(entry.get(var, '') + ',' for var in var_list) + "\n")
#############################################################

# main
if __name__ == '__main__':
  if (len(sys.argv) < 3): fatal("Usage: " + sys.argv[0] + " <output CSV file> <input result files>")

  outfile = sys.argv[1]
  infiles = sys.argv[2:]
  for f in infiles :
    parse_res(f)
  print_tbl(outfile)
  sys.exit(0)
#############################################################
# diff --git a/script/txt2xml.sh b/script/txt2xml.sh
# new file mode 100755
# index 00000000..57cb4be7
# --- /dev/null
# +++ b/script/txt2xml.sh
# @@ -0,0 +1,94 @@
################################################################################
# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +#!/bin/bash +timestamp=`date +%y%m%d_%H%M%S` + +if [ $# = 0 ] ; then + echo "Usage: $0 [output dir]" + exit -1 +fi + +input=$1 +outdir=$2 +if [ -z "$outdir" ] ; then + outdir="." 
+fi + +range="" +kernel="" +gpu_index="" + +parse() { + scan="$1" + index=0 + while read -r line ; do + line=`echo $line | sed "s/\s*#.*$//"` + if [ -z "$line" ] ; then + continue + fi + + feature=`echo $line | sed -n "s/^\s*\([a-z]*\)\s*:.*$/\1/p"` + line=`echo $line | sed "s/^[^:]*:\s*//"` + line=`echo "$line" | sed -e "s/\s*=\s*/=/g" -e "s/\s*:\s*/:/g" -e "s/,\{1,\}/ /g" -e "s/\s\{1,\}/ /g" -e "s/\s*$//"` + + if [ "$scan" = 0 ] ; then + line=`echo "$line" | sed -e "s/ /,/g"` + if [ "$feature" == "range" ] ; then + range=$line + fi + if [ "$feature" == "kernel" ] ; then + kernel=$line + fi + if [ "$feature" == "gpu" ] ; then + gpu_index=$line + fi + else + output=$outdir/input${index}.xml + header="# $timestamp '$output' generated with '$0 $*'" + + if [ "$feature" == "pmc" ] ; then + line=`echo "$line" | sed -e "s/ /,/g"` + cat >> $output < + +EOF + fi + + if [ "$feature" == "sqtt" ] ; then + cat >> $output < + +EOF + fi + fi + + index=$((index + 1)) + done < $input +} + +parse 0 +parse 1 + +exit 0 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..45bc2719 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,37 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
################################################################################

#
# Build dynamic Library object
#
# TARGET_NAME, LIB_DIR, ROOT_DIR, HSA_RUNTIME_INC_PATH and HSA_RUNTIME_LIB are
# read here but not set in this file.
set ( TARGET_LIB "${TARGET_NAME}" )
set ( LIB_SRC
  ${LIB_DIR}/core/rocprofiler.cpp
  ${LIB_DIR}/core/proxy_queue.cpp
  ${LIB_DIR}/core/simple_proxy_queue.cpp
  ${LIB_DIR}/core/intercept_queue.cpp
  ${LIB_DIR}/core/metrics.cpp
  ${LIB_DIR}/util/hsa_rsrc_factory.cpp
)
add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} )
target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} )
target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++)
# diff --git a/src/core/context.h b/src/core/context.h
# new file mode 100644
# index 00000000..966acaef
# --- /dev/null
# +++ b/src/core/context.h
# @@ -0,0 +1,546 @@
# /******************************************************************************
# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_CONTEXT_H_ +#define SRC_CORE_CONTEXT_H_ + +#include "inc/rocprofiler.h" + +#include +#include +#include +#include +#include + +#include "core/metrics.h" +#include "core/profile.h" +#include "core/queue.h" +#include "core/types.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" +#include "util/logger.h" + +namespace rocprofiler { +struct rocprofiler_contex_t; +class Context; + +inline unsigned align_size(unsigned size, unsigned alignment) { + return ((size + alignment - 1) & ~(alignment - 1)); +} + +// Block descriptor +struct block_des_t { + uint32_t id; + uint32_t index; +}; + +// block_des_t less-then functor +struct lt_block_des { + bool operator()(const block_des_t& a1, const block_des_t& a2) const { + return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index)); + } +}; + +// Block status +struct block_status_t { + uint32_t max_counters; + uint32_t counter_index; + uint32_t group_index; +}; + +// Metrics arguments +template class MetricArgs : public xml::args_cache_t { + public: + MetricArgs(const Map& map) : map_(map) {} + bool Lookup(const std::string& name, uint64_t& result) const { + rocprofiler_feature_t* info = NULL; + auto it = map_.find(name); + if (it == map_.end()) EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is not found"); + info = it->second; + if (info) { + result = info->data.result_int64; + if (info->data.kind == ROCPROFILER_DATA_KIND_UNINIT) + 
EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is uninitialized"); + if (info->data.kind != ROCPROFILER_DATA_KIND_INT64) + EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' is of incompatible type, not INT64"); + } else + EXC_RAISING(HSA_STATUS_ERROR, "var '" << name << "' info is NULL"); + return (info != NULL); + } + + private: + const Map& map_; +}; + +// Profiling group +class Group { + public: + Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) + : pmc_profile_(agent_info), + sqtt_profile_(agent_info), + n_profiles_(0), + refs_(1), + context_(context), + index_(index) {} + + void Insert(const profile_info_t& info) { + const rocprofiler_feature_kind_t kind = info.rinfo->kind; + info_vector_.push_back(info.rinfo); + switch (kind) { + case ROCPROFILER_FEATURE_KIND_METRIC: + pmc_profile_.Insert(info); + break; + case ROCPROFILER_FEATURE_KIND_TRACE: + sqtt_profile_.Insert(info); + break; + default: + EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); + } + } + + hsa_status_t Finalize() { + hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_); + if (status == HSA_STATUS_SUCCESS) { + status = sqtt_profile_.Finalize(start_vector_, stop_vector_, read_vector_); + } + if (status == HSA_STATUS_SUCCESS) { + if (!pmc_profile_.Empty()) ++n_profiles_; + if (!sqtt_profile_.Empty()) ++n_profiles_; + } + return status; + } + + void GetProfiles(profile_vector_t& vec) { + pmc_profile_.GetProfiles(vec); + sqtt_profile_.GetProfiles(vec); + } + + void GetTraceProfiles(profile_vector_t& vec) { sqtt_profile_.GetProfiles(vec); } + + info_vector_t& GetInfoVector() { return info_vector_; } + const pkt_vector_t& GetStartVector() const { return start_vector_; } + const pkt_vector_t& GetStopVector() const { return stop_vector_; } + const pkt_vector_t& GetReadVector() const { return read_vector_; } + Context* GetContext() { return context_; } + uint32_t GetIndex() const { return index_; } + + 
void ResetRefs() { refs_ = n_profiles_; } + uint32_t DecrRefs() { + return (refs_ > 0) ? --refs_ : 0; + } + + private: + PmcProfile pmc_profile_; + SqttProfile sqtt_profile_; + info_vector_t info_vector_; + pkt_vector_t start_vector_; + pkt_vector_t stop_vector_; + pkt_vector_t read_vector_; + uint32_t n_profiles_; + uint32_t refs_; + Context* const context_; + const uint32_t index_; +}; + +// Profiling context +class Context { + public: + typedef std::mutex mutex_t; + typedef std::map info_map_t; + + Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) + : agent_(agent_info->dev_id), + agent_info_(agent_info), + queue_(queue), + hsa_rsrc_(&util::HsaRsrcFactory::Instance()), + api_(hsa_rsrc_->AqlProfileApi()), + handler_(handler), + handler_arg_(handler_arg) + { + metrics_ = MetricsDict::Create(agent_info); + if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + Initialize(info, info_count); + Finalize(); + + if (handler != NULL) { + for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { + set_[group_index].ResetRefs(); + const profile_vector_t profile_vector = GetProfiles(group_index); + for (auto& tuple : profile_vector) { + // Handler for stop packet completion + hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, + &set_[group_index]); + } + } + } + } + + ~Context() { + for (const auto& v : info_map_) { + const std::string& name = v.first; + const rocprofiler_feature_t* info = v.second; + if ((info->kind == ROCPROFILER_FEATURE_KIND_METRIC) && + (metrics_map_.find(name) == metrics_map_.end())) { + delete info; + } + } + } + + // Initialize rocprofiler context + void Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { + // Register input features to not duplicate by features referencing + for (unsigned i = 0; i < info_count; ++i) { + 
rocprofiler_feature_t* info = &info_array[i]; + if (!info->name) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); + info_map_[info->name] = info; + } + + // Adding zero group, always present + if (info_count) set_.push_back(Group(agent_info_, this, 0)); + + // Processing input features + for (unsigned i = 0; i < info_count; ++i) { + rocprofiler_feature_t* info = &info_array[i]; + const rocprofiler_feature_kind_t kind = info->kind; + const char* name = info->name; + + if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { // Processing metrics features + const Metric* metric = metrics_->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); +#if 0 + std::cout << " " << name << (metric->GetExpr() ? " = " + metric->GetExpr()->String() : " counter") << std::endl; +#endif + + auto ret = metrics_map_.insert({name, metric}); + if (!ret.second) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name + << "' is registered more then once"); + + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.empty()) + EXC_RAISING(HSA_STATUS_ERROR, "bad metric '" << name << "' is empty"); + + for (const counter_t* counter : counters_vec) { + // For metrics expressions checking that there is no the same counter in the input metrics + // and also that the counter wasn't registered already by another input metric expression + if (metric->GetExpr()) { + if (info_map_.find(counter->name) != info_map_.end()) { + continue; + } else { + info = NewCounterInfo(counter); + info_map_[info->name] = info; + } + } + + const event_t* event = &(counter->event); + const block_des_t block_des = {event->block_name, event->block_index}; + auto ret = groups_map_.insert({block_des, {}}); + block_status_t& block_status = ret.first->second; + if (block_status.max_counters == 0) { + profile_t query = {}; + query.agent = agent_; + query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + query.events = event; + + uint32_t block_counters; 
+ hsa_status_t status = api_->hsa_ven_amd_aqlprofile_get_info( + &query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); + block_status.max_counters = block_counters; + } + if (block_status.counter_index >= block_status.max_counters) { + block_status.counter_index = 0; + block_status.group_index += 1; + } + if (block_status.group_index >= set_.size()) { + set_.push_back(Group(agent_info_, this, block_status.group_index)); + } + const uint32_t group_index = block_status.group_index; + set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); + } + } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features + set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + } else { + EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); + } + } + } + + void Finalize() { + for (unsigned index = 0; index < set_.size(); ++index) { + const hsa_status_t status = set_[index].Finalize(); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); + } + } + + void Reset(const uint32_t& group_index) { set_[group_index].ResetRefs(); } + + uint32_t GetGroupCount() const { return set_.size(); } + + rocprofiler_group_t GetGroupInfo(Group* g) { + rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); + rocprofiler_group_t group = {}; + group.index = g->GetIndex(); + group.context = reinterpret_cast(this); + group.features = &info_vector[0]; + group.feature_count = info_vector.size(); + return group; + } + rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + return GetGroupInfo(&set_[index]); + } + + const pkt_vector_t& StartPackets(const uint32_t& group_index) const { + return set_[group_index].GetStartVector(); + } + const pkt_vector_t& StopPackets(const uint32_t& group_index) const { + return set_[group_index].GetStopVector(); + } + const pkt_vector_t& 
ReadPackets(const uint32_t& group_index) const { + return set_[group_index].GetReadVector(); + } + + void Start(const uint32_t& group_index, Queue* const queue = NULL) { + const pkt_vector_t& start_packets = StartPackets(group_index); + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + submit_queue->Submit(&start_packets[0], start_packets.size()); + } + void Stop(const uint32_t& group_index, Queue* const queue = NULL) { + const pkt_vector_t& stop_packets = StopPackets(group_index); + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + submit_queue->Submit(&stop_packets[0], stop_packets.size()); + } + void Read(const uint32_t& group_index, Queue* const queue = NULL) { + const pkt_vector_t& read_packets = ReadPackets(group_index); + if (read_packets.size() == 0) EXC_RAISING(HSA_STATUS_ERROR, "Read API disabled"); + Queue* const submit_queue = (queue != NULL) ? queue : queue_; + submit_queue->Submit(&read_packets[0], read_packets.size()); + } + void Submit(const uint32_t& group_index, const packet_t* packet, Queue* const queue = NULL) { + Queue* const submit_queue = (queue != NULL) ? 
queue : queue_; + Start(group_index, submit_queue); + submit_queue->Submit(packet); + Stop(group_index, submit_queue); + } + + struct callback_data_t { + const profile_t* profile; + info_vector_t* info_vector; + size_t index; + char* ptr; + }; + + void GetData(const uint32_t& group_index) { + const profile_vector_t profile_vector = GetProfiles(group_index); + for (auto& tuple : profile_vector) { + // Wait for stop packet to complete + const uint64_t timeout = timeout_; + bool complete = false; + while (!complete) { + const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout, + HSA_WAIT_STATE_BLOCKED); + complete = (signal_value < 1); + if (!complete) WARN_LOGGING("timeout"); + } + for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; + callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; + const hsa_status_t status = + api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, DataCallback, &callback_data); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); + } + } + + void GetMetricsData() const { + const MetricArgs args(info_map_); + for (const auto v : metrics_map_) { + const std::string& name = v.first; + const Metric* metric = v.second; + const xml::Expr* expr = metric->GetExpr(); + if (expr) { + auto it = info_map_.find(name); + if (it == info_map_.end()) + EXC_RAISING(HSA_STATUS_ERROR, "metric '" << name << "', rocprofiler info is not found " << this); + rocprofiler_feature_t* info = it->second; + info->data.result_int64 = expr->Eval(args); + info->data.kind = ROCPROFILER_DATA_KIND_INT64; + } + } + } + + void IterateTraceData(rocprofiler_trace_data_callback_t callback, void* data) { + profile_vector_t profile_vector; + set_[0].GetTraceProfiles(profile_vector); + for (auto& tuple : profile_vector) { + const hsa_status_t status = + 
api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data);
+      if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed");
+    }
+  }
+
+  // Stores the profiling data waiting timeout in the static timeout_ shared by all contexts
+  static void SetTimeout(uint64_t timeout) { timeout_ = timeout; }
+  // Returns the profiling data waiting timeout stored in the static timeout_
+  static uint64_t GetTimeout() { return timeout_; }
+
+ private:
+  // Getting profiling packets
+  // Returns the profiles collected from the group set_[index]; raises HSA_STATUS_ERROR
+  // through EXC_RAISING when 'index' is not smaller than set_.size().
+  profile_vector_t GetProfiles(const uint32_t& index) {
+    profile_vector_t vec;
+    if (index >= set_.size()) {
+      EXC_RAISING(HSA_STATUS_ERROR, "index exceeding the maximum " << set_.size());
+    }
+    set_[index].GetProfiles(vec);
+    return vec;
+  }
+
+  // Completion handler; 'arg' is the Group the completion belongs to.
+  // Decrements the group reference counter under the context mutex and, once it
+  // reaches zero, forwards to the context handler_ and returns its result.
+  // Returns false while references remain.
+  // NOTE(review): the target types of the casts in this class appear to be missing
+  // (probably lost when this text was extracted) - confirm against the repository.
+  static bool Handler(hsa_signal_value_t value, void* arg) {
+    Group* group = reinterpret_cast(arg);
+    Context* context = group->GetContext();
+    context->mutex_.lock();
+    uint32_t r = group->DecrRefs();
+    context->mutex_.unlock();
+    if (r == 0) {
+      return context->handler_(context->GetGroupInfo(group), context->handler_arg_);
+    }
+    return false;
+  }
+
+  // Per-sample data callback; 'data' points to a callback_data_t carrying the profile,
+  // the info vector and the running index/pointer state.
+  // PMC samples are accumulated into result_int64 of the current feature.
+  // SQTT samples are either copied into a newly allocated buffer (result_bytes.copy set)
+  // or exposed in place through the profile output buffer.
+  // Returns HSA_STATUS_ERROR when the running index is outside of the info vector and
+  // raises through EXC_RAISING for an unknown data type, buffer overrun or failed copy.
+  static hsa_status_t DataCallback(hsa_ven_amd_aqlprofile_info_type_t ainfo_type,
+                                   hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+    callback_data_t* callback_data = reinterpret_cast(data);
+    const profile_t* profile = callback_data->profile;
+    info_vector_t& info_vector = *(callback_data->info_vector);
+    uint32_t index = callback_data->index;
+    const uint32_t sample_id = ainfo_data->sample_id;
+    // An index equal to the vector size is reset to the first feature; otherwise
+    // sample 0 of a new feature advances the index by one
+    if (info_vector.size() == index) {
+      index = 0;
+    } else {
+      if (sample_id == 0) index += 1;
+    }
+    callback_data->index = index;
+
+    if (index < info_vector.size()) {
+      rocprofiler_feature_t* const rinfo = info_vector[index];
+      rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT;
+
+      if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_PMC_DATA) {
+        // The counter result is the sum over all samples, restarted at sample 0
+        if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0;
+        rinfo->data.result_int64 += ainfo_data->pmc_data.result;
+        rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64;
+      } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) {
+        if (rinfo->data.result_bytes.copy) {
+          const bool sqtt_local = SqttProfile::IsLocal();
+          util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance();
+          // The destination buffer of output_buffer.size bytes is allocated on sample 0
+          if (sample_id == 0) {
+            const uint32_t output_buffer_size = profile->output_buffer.size;
+            const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t);
+            const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent);
+            void* ptr = (sqtt_local) ? hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) :
+                                       calloc(output_buffer_size64, sizeof(uint64_t));
+            rinfo->data.result_bytes.size = output_buffer_size;
+            rinfo->data.result_bytes.ptr = ptr;
+            callback_data->ptr = reinterpret_cast(ptr);
+          }
+          char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr);
+          const char* end = result_bytes_ptr + rinfo->data.result_bytes.size;
+          const char* src = reinterpret_cast(ainfo_data->sqtt_data.ptr);
+          uint32_t size = ainfo_data->sqtt_data.size;
+          // Every sample is stored as a uint32_t size header followed by the sample data
+          char* ptr = callback_data->ptr;
+          uint32_t* header = reinterpret_cast(ptr);
+          char* dest = ptr + sizeof(*header);
+
+          // The sample is truncated to the room which is left in the buffer
+          if ((dest + size) >= end) {
+            if (dest < end) size = end - dest;
+            else EXC_RAISING(HSA_STATUS_ERROR, "SQTT data out of output buffer");
+          }
+
+          bool suc = true;
+          if (sqtt_local) {
+            suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size);
+          } else {
+            memcpy(dest, src, size);
+          }
+          if (suc) {
+            *header = size;
+            // The next sample header follows the data aligned to the header size
+            callback_data->ptr = dest + align_size(size, sizeof(uint32_t));
+            rinfo->data.result_bytes.instance_count = sample_id + 1;
+            rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES;
+          } else
+            EXC_RAISING(HSA_STATUS_ERROR, "Agent Memcpy failed, dst(" << (void*)dest << ") src(" << (void*)src << ") size(" << size << ")");
+        } else {
+          // No copy requested, the result refers to the profile output buffer itself
+          if (sample_id == 0) {
+            rinfo->data.result_bytes.ptr = profile->output_buffer.ptr;
+            rinfo->data.result_bytes.size = profile->output_buffer.size;
+            // UINT32_MAX wraps to zero on the increment below
+            rinfo->data.result_bytes.instance_count = UINT32_MAX;
+          }
+
+          rinfo->data.result_bytes.instance_count += 1;
+          rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES;
+        }
+      } else {
+        EXC_RAISING(HSA_STATUS_ERROR, "unknown data type = " << ainfo_type);
+      }
+    } else
+      status = HSA_STATUS_ERROR;
+
+    return status;
+  }
+
+  // Allocates a value-initialized feature descriptor of the metric kind for 'counter'.
+  // The returned name points into counter->name, so it is only valid while the
+  // counter object is alive.
+  rocprofiler_feature_t* NewCounterInfo(const counter_t* counter) {
+    rocprofiler_feature_t* info = new rocprofiler_feature_t{};
+    info->kind = ROCPROFILER_FEATURE_KIND_METRIC;
+    info->name = counter->name.c_str();
+    return info;
+  }
+
+  // Profiling data waiting timeout
+  static uint64_t timeout_;
+
+  // GPU handle
+  const hsa_agent_t agent_;
+  const util::AgentInfo* agent_info_;
+  // Profiling queue
+  Queue* queue_;
+  // HSA resources factory
+  util::HsaRsrcFactory* hsa_rsrc_;
+  // aqlprofile API table
+  const pfn_t* api_;
+  // Profile group set
+  std::vector set_;
+  // Metrics dictionary
+  const MetricsDict* metrics_;
+  // Groups map
+  std::map groups_map_;
+  // Info map
+  info_map_t info_map_;
+  // Metrics map
+  std::map metrics_map_;
+  // Context completion handler
+  rocprofiler_handler_t handler_;
+  void* handler_arg_;
+  // Guards the group reference counters, see Handler()
+  mutex_t mutex_;
+};
+
+}  // namespace rocprofiler
+
+#endif  // SRC_CORE_CONTEXT_H_
diff --git a/src/core/hsa_proxy_queue.h b/src/core/hsa_proxy_queue.h
new file mode 100644
index 00000000..dd4999b9
--- /dev/null
+++ b/src/core/hsa_proxy_queue.h
@@ -0,0 +1,67 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef _SRC_CORE_HSA_PROXY_QUEUE_H +#define _SRC_CORE_HSA_PROXY_QUEUE_H + +#include +#include +#include +#include + +#include "core/proxy_queue.h" +#include "util/exception.h" + +namespace rocprofiler { +extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; +extern decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; +extern decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; + +class HsaProxyQueue : public ProxyQueue { + public: + hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) { + return hsa_amd_queue_intercept_register_fn(queue_, on_submit_cb, data); + } + + void Submit(const packet_t* packet) { + EXC_RAISING(HSA_STATUS_ERROR, "HsaProxyQueue::Submit() is not supported"); + } + + private: + hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue) { + const auto status = hsa_amd_queue_intercept_create_fn( + agent, size, type, callback, data, private_segment_size, group_segment_size, &queue_); + *queue = queue_; + return status; + } + + hsa_status_t Cleanup() const { return hsa_queue_destroy_fn(queue_); } + + hsa_queue_t* queue_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_HSA_PROXY_QUEUE_H diff --git a/src/core/hsa_queue.h b/src/core/hsa_queue.h new file mode 100644 index 00000000..620f6224 --- /dev/null +++ b/src/core/hsa_queue.h @@ -0,0 +1,80 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef _SRC_CORE_HSA_QUEUE_H +#define _SRC_CORE_HSA_QUEUE_H + +#include + +#include "core/queue.h" +#include "core/types.h" + +namespace rocprofiler { + +class HsaQueue : public Queue { + public: + typedef void (HsaQueue::*submit_fptr_t)(const packet_t* packet); + enum { + LEGACY_SLOT_SIZE_W = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_word_t), + LEGACY_SLOT_SIZE_P = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t) + }; + struct slot_pm4_t { + packet_word_t words[LEGACY_SLOT_SIZE_W]; + }; + + HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : queue_(queue) {} + + void Submit(const packet_t* packet) { + // Compute the write index of queue and copy Aql packet into it + const uint64_t que_idx = hsa_queue_load_write_index_relaxed(queue_); + // Increment the write index + hsa_queue_store_write_index_relaxed(queue_, que_idx + 1); + + const uint32_t mask = queue_->size - 1; + + // Copy packet to the queue + const packet_word_t* src = reinterpret_cast(packet); + packet_t* slot = reinterpret_cast(queue_->base_address) + (que_idx & mask); + packet_word_t* dst = reinterpret_cast(slot); + const uint32_t nwords = sizeof(packet_t) / sizeof(packet_word_t); + for (unsigned i = 1; i < nwords; ++i) { + dst[i] = src[i]; + } + + // To maintain global order to ensure the prior copy of the packet contents is made visible + // before the header is updated. 
+ // With in-order CP it will wait until the first packet in the blob will be valid + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&dst[0]); + header_atomic_ptr->store(src[0], std::memory_order_release); + + // Doorbell signaling + hsa_signal_store_relaxed(queue_->doorbell_signal, que_idx); + } + + private: + hsa_queue_t* queue_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_HSA_QUEUE_H diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp new file mode 100644 index 00000000..a2a289aa --- /dev/null +++ b/src/core/intercept_queue.cpp @@ -0,0 +1,40 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "core/intercept_queue.h" + +namespace rocprofiler { +void InterceptQueue::HsaIntercept(HsaApiTable* table) { + table->core_->hsa_queue_create_fn = rocprofiler::InterceptQueue::QueueCreate; + table->core_->hsa_queue_destroy_fn = rocprofiler::InterceptQueue::QueueDestroy; +} + +InterceptQueue::mutex_t InterceptQueue::mutex_; +rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL; +InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL; +void* InterceptQueue::callback_data_ = NULL; +InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; +const char* InterceptQueue::kernel_none_ = ""; +uint64_t InterceptQueue::timeout_ = UINT64_MAX; +Tracker* InterceptQueue::tracker_ = NULL; +bool InterceptQueue::tracker_on_ = false; +} // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h new file mode 100644 index 00000000..c5376bb9 --- /dev/null +++ b/src/core/intercept_queue.h @@ -0,0 +1,230 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef _SRC_CORE_INTERCEPT_QUEUE_H +#define _SRC_CORE_INTERCEPT_QUEUE_H + +#include +#include +#include + +#include +#include +#include +#include + +#include "core/context.h" +#include "core/proxy_queue.h" +#include "core/tracker.h" +#include "core/types.h" +#include "inc/rocprofiler.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { +extern decltype(hsa_queue_create)* hsa_queue_create_fn; +extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + +class InterceptQueue { + public: + typedef std::recursive_mutex mutex_t; + typedef std::map obj_map_t; + typedef hsa_status_t (*queue_callback_t)(hsa_queue_t*, void* data); + + static void HsaIntercept(HsaApiTable* table); + + static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue) { + hsa_status_t status = HSA_STATUS_ERROR; + std::lock_guard lck(mutex_); + + ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, callback, data, private_segment_size, + group_segment_size, queue, &status); + if (status != HSA_STATUS_SUCCESS) abort(); + + if (tracker_on_ && (tracker_ == NULL)) { + tracker_ = new Tracker(timeout_); + status = hsa_amd_profiling_set_profiler_enabled(*queue, true); + if (status != HSA_STATUS_SUCCESS) abort(); + } + + if (!obj_map_) obj_map_ = new obj_map_t; + InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy); + (*obj_map_)[(uint64_t)(*queue)] = obj; + status = proxy->SetInterceptCB(OnSubmitCB, obj); + + return status; + } + + 
static hsa_status_t QueueDestroy(hsa_queue_t* queue) { + std::lock_guard lck(mutex_); + hsa_status_t status = HSA_STATUS_ERROR; + + if (destroy_callback_ != NULL) { + status = destroy_callback_(queue, callback_data_); + if (status != HSA_STATUS_SUCCESS) return status; + } + + obj_map_t::iterator it = obj_map_->find((uint64_t)queue); + if (it != obj_map_->end()) { + const InterceptQueue* obj = it->second; + assert(queue == obj->queue_); + delete obj; + obj_map_->erase(it); + status = HSA_STATUS_SUCCESS; + } + + return status; + } + + static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, + hsa_amd_queue_intercept_packet_writer writer) { + const packet_t* packets_arr = reinterpret_cast(in_packets); + InterceptQueue* obj = reinterpret_cast(data); + Queue* proxy = obj->proxy_; + + for (uint64_t j = 0; j < count; ++j) { + bool to_submit = true; + const packet_t* packet = &packets_arr[j]; + + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (dispatch_callback_ != NULL)) { + rocprofiler_group_t group = {}; + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + const char* kernel_name = GetKernelName(dispatch_packet); + const rocprofiler_dispatch_record_t* record = NULL; + if (tracker_ != NULL) { + const auto* entry = tracker_->Add(obj->agent_info_->dev_id, dispatch_packet->completion_signal); + const_cast(dispatch_packet)->completion_signal = entry->signal; + record = entry->record; + } + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, + obj->agent_info_->dev_index, + obj->queue_, + user_que_idx, + dispatch_packet, + kernel_name, + record}; + hsa_status_t status = dispatch_callback_(&data, callback_data_, &group); + free(const_cast(kernel_name)); + if ((status == HSA_STATUS_SUCCESS) && (group.context != NULL)) { + Context* context = reinterpret_cast(group.context); + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& 
stop_vector = context->StopPackets(group.index); + + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; + } + } + + if (to_submit) { + if (writer != NULL) { + writer(packet, 1); + } else { + proxy->Submit(packet, 1); + } + } + + packet += 1; + } + } + + static void SetCallbacks(rocprofiler_callback_t dispatch_callback, queue_callback_t destroy_callback, void* data) { + std::lock_guard lck(mutex_); + callback_data_ = data; + dispatch_callback_ = dispatch_callback; + destroy_callback_ = destroy_callback; + } + + static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } + static void TrackerOn(bool on) { tracker_on_ = on; } + static bool IsTrackerOn() { return tracker_on_; } + + private: + InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy) : + queue_(queue), + proxy_(proxy) + { + agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + } + ~InterceptQueue() { ProxyQueue::Destroy(proxy_); } + + static packet_word_t GetHeaderType(const packet_t* packet) { + const packet_word_t* header = reinterpret_cast(packet); + return (*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask; + } + + static const char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + const amd_kernel_code_t* kernel_code = NULL; + hsa_status_t status = + util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( + reinterpret_cast(dispatch_packet->kernel_object), + reinterpret_cast(&kernel_code)); + if (HSA_STATUS_SUCCESS != status) { + kernel_code = reinterpret_cast(dispatch_packet->kernel_object); + } + amd_runtime_loader_debug_info_t* dbg_info = reinterpret_cast( + kernel_code->runtime_loader_kernel_symbol); + const char* kernel_name = (dbg_info != NULL) ? 
dbg_info->kernel_name : NULL; + + // Kernel name is mangled name + // apply __cxa_demangle() to demangle it + const char* funcname = NULL; + if (kernel_name != NULL) { + size_t funcnamesize = 0; + int status; + const char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status); + funcname = (ret != 0) ? ret : strdup(kernel_name); + } + if (funcname == NULL) funcname = strdup(kernel_none_); + + return funcname; + } + + static mutex_t mutex_; + static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; + static rocprofiler_callback_t dispatch_callback_; + static queue_callback_t destroy_callback_; + static void* callback_data_; + static obj_map_t* obj_map_; + static const char* kernel_none_; + static uint64_t timeout_; + static Tracker* tracker_; + static bool tracker_on_; + + hsa_queue_t* const queue_; + ProxyQueue* const proxy_; + const util::AgentInfo* agent_info_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_INTERCEPT_QUEUE_H diff --git a/src/core/metrics.cpp b/src/core/metrics.cpp new file mode 100644 index 00000000..67598632 --- /dev/null +++ b/src/core/metrics.cpp @@ -0,0 +1,28 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#include "core/metrics.h"
+
+namespace rocprofiler {
+// Definitions of the MetricsDict static members:
+// the dictionaries map, which starts out as NULL, and the mutex declared with it
+MetricsDict::map_t* MetricsDict::map_ = NULL;
+MetricsDict::mutex_t MetricsDict::mutex_;
+}
diff --git a/src/core/metrics.h b/src/core/metrics.h
new file mode 100644
index 00000000..8f05a3e7
--- /dev/null
+++ b/src/core/metrics.h
@@ -0,0 +1,302 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_METRICS_H_ +#define SRC_CORE_METRICS_H_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "core/types.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" +#include "xml/expr.h" +#include "xml/xml.h" + +namespace rocprofiler { +struct counter_t { + std::string name; + event_t event; +}; +typedef std::vector counters_vec_t; + +class Metric { + public: + Metric(const std::string& name) : name_(name) {} + virtual ~Metric() {} + std::string GetName() const { return name_; } + virtual void GetCounters(counters_vec_t& vec) const = 0; + counters_vec_t GetCounters() const { + counters_vec_t counters; + GetCounters(counters); + return counters; + } + virtual const xml::Expr* GetExpr() const = 0; + + private: + const std::string name_; +}; + +class BaseMetric : public Metric { + public: + BaseMetric(const std::string& name, const counter_t& counter) : Metric(name), counter_(counter) {} + void GetCounters(counters_vec_t& vec) const { vec.push_back(&counter_); } + const xml::Expr* GetExpr() const { return NULL; } + + private: + const counter_t counter_; +}; + +class ExprMetric : public Metric { + public: + ExprMetric(const std::string& name, const counters_vec_t& counters, const xml::Expr* expr) + : Metric(name), counters_(counters), expr_(expr) {} + ~ExprMetric() { delete expr_; } + void GetCounters(counters_vec_t& vec) const { + vec.insert(vec.end(), counters_.begin(), counters_.end()); + } + const xml::Expr* GetExpr() const { return expr_; } + + private: + const counters_vec_t counters_; + const xml::Expr* expr_; +}; + +class MetricsDict { + public: + 
typedef std::map cache_t; + typedef cache_t::const_iterator const_iterator_t; + typedef std::map map_t; + typedef std::mutex mutex_t; + + class ExprCache : public xml::expr_cache_t { + public: + ExprCache(const cache_t* cache) : cache_(cache) {} + bool Lookup(const std::string& name, std::string& result) const { + bool ret = false; + auto it = cache_->find(name); + if (it != cache_->end()) { + ret = true; + const rocprofiler::ExprMetric* expr_metric = + dynamic_cast(it->second); + if (expr_metric) result = expr_metric->GetExpr()->GetStr(); + } + return ret; + } + + private: + const cache_t* const cache_; + }; + + static MetricsDict* Create(const util::AgentInfo* agent_info) { + std::lock_guard lck(mutex_); + if (map_ == NULL) map_ = new map_t; + auto ret = map_->insert({agent_info->gfxip, NULL}); + if (ret.second) ret.first->second = new MetricsDict(agent_info); + return ret.first->second; + } + + static void Destroy() { + if (map_ != NULL) { + for (auto& entry : *map_) delete entry.second; + delete map_; + map_ = NULL; + } + } + + const Metric* Get(const std::string& name) const { + const Metric* metric = NULL; + + auto it = cache_.find(name); + if (it != cache_.end()) metric = it->second; + else { + const std::size_t pos = name.find(':'); + if (pos != std::string::npos) { + std::string block_name = name.substr(0, pos); + const std::string event_str = name.substr(pos + 1); + + uint32_t block_index = 0; + bool indexed = false; + const std::size_t pos1 = block_name.find('['); + if (pos1 != std::string::npos) { + const std::size_t pos2 = block_name.find(']'); + if (pos2 == std::string::npos) EXC_RAISING(HSA_STATUS_ERROR, "Malformed metric name '" << name << "'"); + block_name = name.substr(0, pos1); + const std::string block_index_str = name.substr(pos1 + 1, pos2 - (pos1 + 1)); + block_index = atol(block_index_str.c_str()); + indexed = true; + } + + const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info_, block_name); + const 
hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id; + if ((query.instance_count > 1) && (indexed == false)) EXC_RAISING(HSA_STATUS_ERROR, "Malformed indexed metric name '" << name << "'"); + const uint32_t event_id = atol(event_str.c_str()); + const counter_t counter = {name, {block_id, block_index, event_id}}; + metric = new BaseMetric(name, counter); + } + } + + return metric; + } + + uint32_t Size() const { return cache_.size(); } + const_iterator_t Begin() const { return cache_.begin(); } + const_iterator_t End() const { return cache_.end(); } + + xml::Xml::nodes_t GetNodes(const std::string& scope) const { + return xml_->GetNodes("top." + scope + ".metric"); + } + + private: + MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL), agent_info_(agent_info) { + const char* xml_name = getenv("ROCP_METRICS"); + if (xml_name != NULL) { + xml_ = xml::Xml::Create(xml_name); + if (xml_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metrics .xml open error '" << xml_name << "'"); + xml_->AddConst("top.const.metric", "MAX_WAVE_SIZE", agent_info->max_wave_size); + xml_->AddConst("top.const.metric", "CU_NUM", agent_info->cu_num); + xml_->AddConst("top.const.metric", "SIMD_NUM", agent_info->simds_per_cu * agent_info->cu_num); + xml_->AddConst("top.const.metric", "SE_NUM", agent_info->se_num); + ImportMetrics(agent_info, "const"); + ImportMetrics(agent_info, agent_info->gfxip); + ImportMetrics(agent_info, "global"); + } + } + + ~MetricsDict() { + xml::Xml::Destroy(xml_); + for (auto& entry : cache_) delete entry.second; + } + + static hsa_ven_amd_aqlprofile_id_query_t Translate(const util::AgentInfo* agent_info, const std::string& block_name) { + hsa_ven_amd_aqlprofile_profile_t profile; + profile.agent = agent_info->dev_id; + hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; + hsa_status_t status = + util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, 
HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(HSA_STATUS_ERROR, "ImportMetrics: bad block name '" << block_name << "'"); + return query; + } + + void ImportMetrics(const util::AgentInfo* agent_info, const std::string& scope) { + auto metrics_list = xml_->GetNodes("top." + scope + ".metric"); + if (!metrics_list.empty()) { + for (auto node : metrics_list) { + const std::string name = node->opts["name"]; + const std::string expr_str = node->opts["expr"]; + std::string descr = node->opts["descr"]; + if (descr.empty()) descr = (expr_str.empty()) ? name : expr_str; + + if (expr_str.empty()) { + const std::string block_name = node->opts["block"]; + const std::string event_str = node->opts["event"]; + const uint32_t event_id = atol(event_str.c_str()); + + const hsa_ven_amd_aqlprofile_id_query_t query = Translate(agent_info, block_name); + const hsa_ven_amd_aqlprofile_block_name_t block_id = (hsa_ven_amd_aqlprofile_block_name_t)query.id; + if (query.instance_count > 1) { + for (unsigned block_index = 0; block_index < query.instance_count; ++block_index) { + std::ostringstream full_name; + full_name << name << '[' << block_index << ']'; + std::ostringstream block_insance; + block_insance << block_name << "[" << block_index << "]"; + std::ostringstream alias; + alias << block_insance.str() << ":" << event_str; + const counter_t counter = {full_name.str(), {block_id, block_index, event_id}}; + AddMetric(full_name.str(), alias.str(), counter); + } + } else { + const std::string alias = block_name + ":" + event_str; + const counter_t counter = {name, {block_id, 0, event_id}}; + AddMetric(name, alias, counter); + } + } else { + xml::Expr* expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); +#if 0 + std::cout << "# " << descr << std::endl; + std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; +#endif + counters_vec_t counters_vec; + for (const std::string var : expr_obj->GetVars()) { + auto it = 
cache_.find(var); + if (it == cache_.end()) + EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var + << "' is not found"); + it->second->GetCounters(counters_vec); + } + AddMetric(name, counters_vec, expr_obj); + } + } + } + } + + const Metric* AddMetric(const std::string& name, const std::string& /*alias*/, const counter_t& counter) { + const Metric* metric = NULL; + const auto ret = cache_.insert({name, NULL}); + if (ret.second) { + metric = new BaseMetric(name, counter); + ret.first->second = metric; + } else EXC_RAISING(HSA_STATUS_ERROR, "metric redefined '" << name << "'"); + return metric; + } + + const Metric* AddMetric(const std::string& name, const counters_vec_t& counters_vec, const xml::Expr* expr_obj) { + const Metric* metric = NULL; + const auto ret = cache_.insert({name, NULL}); + if (ret.second) { + metric = new ExprMetric(name, counters_vec, expr_obj); + ret.first->second = metric; + } else EXC_RAISING(HSA_STATUS_ERROR, "expr-metric redefined '" << name << "'"); + return metric; + } + + void Print() { + for (auto& v : cache_) { + const Metric* metric = v.second; + counters_vec_t counters_vec; + printf("> Metric '%s'\n", metric->GetName().c_str()); + metric->GetCounters(counters_vec); + for (auto c : counters_vec) { + printf(" counter %s, b(%u), i (%u), e (%u)\n", c->name.c_str(), c->event.block_name, c->event.block_index, c->event.counter_id); + } + } + } + + xml::Xml* xml_; + const util::AgentInfo* agent_info_; + cache_t cache_; + + static map_t* map_; + static mutex_t mutex_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_METRICS_H_ diff --git a/src/core/profile.h b/src/core/profile.h new file mode 100644 index 00000000..43d30a21 --- /dev/null +++ b/src/core/profile.h @@ -0,0 +1,271 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#ifndef SRC_CORE_PROFILE_H_
+#define SRC_CORE_PROFILE_H_
+
+#include "inc/rocprofiler.h"
+
+#include
+#include
+
+#include "core/types.h"
+#include "util/exception.h"
+#include "util/hsa_rsrc_factory.h"
+
+namespace rocprofiler {
+// Description of one profiling feature to be inserted into a profile:
+// the event or the parameters and the feature info the result is stored to
+struct profile_info_t {
+  const event_t* event;
+  const parameter_t* parameters;
+  uint32_t parameter_count;
+  rocprofiler_feature_t* rinfo;
+};
+// NOTE(review): the template arguments of the typedefs, templates and casts in this
+// file appear to be missing (probably lost when this text was extracted) - confirm
+// against the repository.
+typedef std::vector info_vector_t;
+typedef std::vector pkt_vector_t;
+// A profile together with its feature infos and completion signal
+struct profile_tuple_t {
+  const profile_t* profile;
+  info_vector_t* info_vector;
+  hsa_signal_t completion_signal;
+};
+typedef std::vector profile_vector_t;
+
+// Accessor to one of the arrays of a profile, specialized per array item type
+template class ConfigBase {};
+
+// Accessor to the events array of the profile
+template <> class ConfigBase {
+ public:
+  ConfigBase(profile_t* profile) : profile_(profile) {}
+
+ protected:
+  void* Array() { return const_cast(profile_->events); }
+  unsigned Count() const { return profile_->event_count; }
+  void Set(event_t* events, const unsigned& count) {
+    profile_->events = events;
+    profile_->event_count = count;
+  }
+  profile_t* profile_;
+};
+
+// Accessor to the parameters array of the profile
+template <> class ConfigBase {
+ public:
+  ConfigBase(profile_t* profile) : profile_(profile) {}
+
+ protected:
+  void* Array() { return const_cast(profile_->parameters); }
+  unsigned Count() const { return profile_->parameter_count; }
+  void Set(parameter_t* parameters, const unsigned& count) {
+    profile_->parameters = parameters;
+    profile_->parameter_count = count;
+  }
+  profile_t* profile_;
+};
+
+// Appends items to the profile array selected by the item type
+template class Config : protected ConfigBase {
+  typedef ConfigBase Parent;
+
+ public:
+  Config(profile_t* profile) : Parent(profile) {}
+  // Grows the array by one item with realloc() and stores 'item' at its end.
+  // The result of realloc() is used without a check for NULL.
+  void Insert(const Item& item) {
+    auto count = Parent::Count();
+    count += 1;
+    Item* array =
+        reinterpret_cast(realloc(const_cast(Parent::Array()), count * sizeof(Item)));
+    array[count - 1] = item;
+    Parent::Set(array, count);
+  }
+};
+
+// Base class of the profile objects: owns the aqlprofile profile descriptor,
+// its buffers and the completion signal
+class Profile {
+ public:
+  // Size of a legacy PM4 slot expressed in AQL packets
+  static const uint32_t LEGACY_SLOT_SIZE_PKT =
+      HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t);
+
+  // Creates an empty profile for the agent; the legacy PM4 mode is selected
+  // for the agents whose name starts with "gfx8"
+  Profile(const util::AgentInfo* agent_info) : agent_info_(agent_info) {
+    profile_ = {};
+    profile_.agent = agent_info->dev_id;
+    completion_signal_ = {};
+    is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0);
+  }
+
+  // Releases the command/output buffers, the events/parameters arrays and
+  // the completion signal; a failing signal destroy goes through EXC_ABORT
+  virtual ~Profile() {
+    info_vector_.clear();
+    if (profile_.command_buffer.ptr) util::HsaRsrcFactory::FreeMemory(profile_.command_buffer.ptr);
+    if (profile_.output_buffer.ptr) util::HsaRsrcFactory::FreeMemory(profile_.output_buffer.ptr);
+    if (profile_.events) free(const_cast(profile_.events));
+    if (profile_.parameters) free(const_cast(profile_.parameters));
+    if (completion_signal_.handle) {
+      hsa_status_t status = hsa_signal_destroy(completion_signal_);
+      if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status);
+    }
+  }
+
+  // Registers the feature info the profile results are reported to
+  virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); }
+
+  // Allocates the profile buffers, generates the start/stop/read packets and
+  // appends them to the given vectors; does nothing if no feature was inserted.
+  // The read packet is appended only if the read API call succeeded.
+  // Raises through AQL_EXC_RAISING/EXC_RAISING on a failing aqlprofile/HSA call.
+  hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector) {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+
+    if (!info_vector_.empty()) {
+      util::HsaRsrcFactory* rsrc = &util::HsaRsrcFactory::Instance();
+      const pfn_t* api = rsrc->AqlProfileApi();
+      packet_t start{};
+      packet_t stop{};
+      packet_t read{};
+
+      // Check the profile buffer sizes
+      status = api->hsa_ven_amd_aqlprofile_start(&profile_, NULL);
+      if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start(NULL)");
+      status = Allocate(rsrc);
+      if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "Allocate()");
+
+      // Generate start/stop/read profiling packets
+      status = api->hsa_ven_amd_aqlprofile_start(&profile_, &start);
+      if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start");
+      status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop);
+      if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop");
+      hsa_status_t rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read);
+#if 0 // Read API returns error if disabled
+      if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read");
+#endif
+
+      // Set completion signal
+      // The start packet gets a null signal, the stop and read packets share
+      // the newly created signal which is kept in completion_signal_
+      hsa_signal_t dummy_signal{};
+      dummy_signal.handle = 0;
+      start.completion_signal = dummy_signal;
+      hsa_signal_t post_signal;
+      status = hsa_signal_create(1, 0, NULL, &post_signal);
+      if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status);
+      stop.completion_signal = post_signal;
+      read.completion_signal = post_signal;
+      completion_signal_ = post_signal;
+
+      // Fill packet vectors
+      if (is_legacy_) {
+        // In the legacy mode every packet is converted to a PM4 blob which
+        // takes LEGACY_SLOT_SIZE_PKT packet slots in the vector
+        const uint32_t start_index = start_vector.size();
+        const uint32_t stop_index = stop_vector.size();
+
+        start_vector.insert(start_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{});
+        stop_vector.insert(stop_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{});
+
+        status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4(
+            &start, reinterpret_cast(&start_vector[start_index]));
+        if (status != HSA_STATUS_SUCCESS)
+          AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
+
+        status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4(
+            &stop, reinterpret_cast(&stop_vector[stop_index]));
+        if (status != HSA_STATUS_SUCCESS)
+          AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
+
+        if (rd_status == HSA_STATUS_SUCCESS) {
+          const uint32_t read_index = read_vector.size();
+          read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{});
+          status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4(
+              &read, reinterpret_cast(&read_vector[read_index]));
+          if (status != HSA_STATUS_SUCCESS)
+            AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
+        }
+      } else {
+        start_vector.push_back(start);
+        stop_vector.push_back(stop);
+        if (rd_status == HSA_STATUS_SUCCESS) {
+          read_vector.push_back(read);
+        }
+      }
+    }
+
+    return status;
+  }
+
+  void GetProfiles(profile_vector_t& vec) {
+    if (!info_vector_.empty()) { +
vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_}); + } + } + + bool Empty() const { return info_vector_.empty(); } + + protected: + virtual hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) = 0; + + const util::AgentInfo* const agent_info_; + bool is_legacy_; + profile_t profile_; + info_vector_t info_vector_; + hsa_signal_t completion_signal_; +}; + +class PmcProfile : public Profile { + public: + PmcProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + } + + void Insert(const profile_info_t& info) { + Profile::Insert(info); + Config(&profile_).Insert(*(info.event)); + } + + hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { + profile_.command_buffer.ptr = + rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + profile_.output_buffer.ptr = rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); + return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS + : HSA_STATUS_ERROR; + } +}; + +class SqttProfile : public Profile { + public: + static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } + static inline uint32_t GetSize() { return output_buffer_size_; } + static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } + static inline bool IsLocal() { return output_buffer_local_; } + + SqttProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_SQTT; + } + + void Insert(const profile_info_t& info) { + Profile::Insert(info); + for (unsigned j = 0; j < info.parameter_count; ++j) { + Config(&profile_).Insert(info.parameters[j]); + } + } + + hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { + profile_.command_buffer.ptr = + rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + profile_.output_buffer.size = output_buffer_size_; + profile_.output_buffer.ptr = (output_buffer_local_) ? 
+ rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size) : + rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); + return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS + : HSA_STATUS_ERROR; + } + + private: + static uint32_t output_buffer_size_; + static bool output_buffer_local_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_PROFILE_H_ diff --git a/src/core/proxy_queue.cpp b/src/core/proxy_queue.cpp new file mode 100644 index 00000000..7a4f4476 --- /dev/null +++ b/src/core/proxy_queue.cpp @@ -0,0 +1,63 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "core/proxy_queue.h" + +#include "core/hsa_proxy_queue.h" +#include "core/simple_proxy_queue.h" + +namespace rocprofiler { +void ProxyQueue::HsaIntercept(HsaApiTable* table) { + if (rocp_type_) SimpleProxyQueue::HsaIntercept(table); +} + +ProxyQueue* ProxyQueue::Create(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue, + hsa_status_t* status) { + hsa_status_t suc = HSA_STATUS_ERROR; + ProxyQueue* instance = + (rocp_type_) ? (ProxyQueue*) new SimpleProxyQueue() : (ProxyQueue*) new HsaProxyQueue(); + if (instance != NULL) { + suc = instance->Init(agent, size, type, callback, data, private_segment_size, + group_segment_size, queue); + if (suc != HSA_STATUS_SUCCESS) { + delete instance; + instance = NULL; + } + } + *status = suc; + assert(*status == HSA_STATUS_SUCCESS); + return instance; +} + +hsa_status_t ProxyQueue::Destroy(const ProxyQueue* obj) { + assert(obj != NULL); + auto suc = obj->Cleanup(); + delete obj; + return suc; +} + +bool ProxyQueue::rocp_type_ = false; +} // namespace rocprofiler diff --git a/src/core/proxy_queue.h b/src/core/proxy_queue.h new file mode 100644 index 00000000..42e6c63b --- /dev/null +++ b/src/core/proxy_queue.h @@ -0,0 +1,77 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef _SRC_CORE_PROXY_QUEUE_H +#define _SRC_CORE_PROXY_QUEUE_H + +#include +#include +#include +#include +#include + +#include "core/queue.h" +#include "core/types.h" + +struct HsaApiTable; + +namespace rocprofiler { +typedef void (*hsa_amd_queue_intercept_packet_writer)(const void* packets, uint64_t count); +typedef void (*on_submit_cb_t)(const void* packet, uint64_t count, uint64_t que_idx, void* data, + hsa_amd_queue_intercept_packet_writer writer); + +class ProxyQueue : public Queue { + public: + static void InitFactory() { + const char* type = getenv("ROCP_PROXY_QUEUE"); + if (type != NULL) { + if (strncmp(type, "rocp", 4) == 0) rocp_type_ = true; + } + } + + static void HsaIntercept(HsaApiTable* table); + + static ProxyQueue* Create(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue, hsa_status_t* status); + + static hsa_status_t Destroy(const ProxyQueue* obj); + + virtual hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue) = 0; + virtual hsa_status_t Cleanup() const = 0; + virtual hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) = 0; + virtual void Submit(const packet_t* packet) = 0; + + protected: + virtual ~ProxyQueue(){}; + + private: + static bool rocp_type_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_PROXY_QUEUE_H diff --git a/src/core/queue.h b/src/core/queue.h new file mode 100644 index 00000000..07e3b45b --- /dev/null +++ b/src/core/queue.h @@ -0,0 +1,42 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced 
Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef _SRC_CORE_QUEUE_H +#define _SRC_CORE_QUEUE_H + +#include "core/types.h" + +namespace rocprofiler { + +class Queue { + public: + Queue() {} + virtual ~Queue() {} + virtual void Submit(const packet_t* packet) = 0; + virtual void Submit(const packet_t* packet, const size_t& count) { + for (const packet_t* p = packet; p < packet + count; ++p) Submit(p); + } +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_QUEUE_H diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp new file mode 100644 index 00000000..a96fadba --- /dev/null +++ b/src/core/rocprofiler.cpp @@ -0,0 +1,522 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "inc/rocprofiler.h" + +#include +#include +#include +#include + +#include "core/context.h" +#include "core/hsa_queue.h" +#include "core/intercept_queue.h" +#include "core/proxy_queue.h" +#include "core/simple_proxy_queue.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" +#include "util/logger.h" + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +#define API_METHOD_PREFIX \ + hsa_status_t status = HSA_STATUS_SUCCESS; \ + try { + +#define API_METHOD_SUFFIX \ + } \ + catch (std::exception & e) { \ + ERR_LOGGING(__FUNCTION__ << "(), " << e.what()); \ + status = rocprofiler::GetExcStatus(e); \ + } \ + return status; + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Internal library methods +// +namespace rocprofiler { +decltype(hsa_queue_create)* hsa_queue_create_fn; +decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; + +decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn; +decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn; + +decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn; +decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn; +decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn; + +decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn; +decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn; +decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn; + +decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; +decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; + +::HsaApiTable* kHsaApiTable; + +void 
SaveHsaApi(::HsaApiTable* table) { + kHsaApiTable = table; + hsa_queue_create_fn = table->core_->hsa_queue_create_fn; + hsa_queue_destroy_fn = table->core_->hsa_queue_destroy_fn; + + hsa_signal_store_relaxed_fn = table->core_->hsa_signal_store_relaxed_fn; + hsa_signal_store_screlease_fn = table->core_->hsa_signal_store_screlease_fn; + + hsa_queue_load_write_index_relaxed_fn = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_queue_store_write_index_relaxed_fn = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_queue_load_read_index_relaxed_fn = table->core_->hsa_queue_load_read_index_relaxed_fn; + + hsa_queue_load_write_index_scacquire_fn = table->core_->hsa_queue_load_write_index_scacquire_fn; + hsa_queue_store_write_index_screlease_fn = table->core_->hsa_queue_store_write_index_screlease_fn; + hsa_queue_load_read_index_scacquire_fn = table->core_->hsa_queue_load_read_index_scacquire_fn; + + hsa_amd_queue_intercept_create_fn = table->amd_ext_->hsa_amd_queue_intercept_create_fn; + hsa_amd_queue_intercept_register_fn = table->amd_ext_->hsa_amd_queue_intercept_register_fn; +} + +void RestoreHsaApi() { + ::HsaApiTable* table = kHsaApiTable; + table->core_->hsa_queue_create_fn = hsa_queue_create_fn; + table->core_->hsa_queue_destroy_fn = hsa_queue_destroy_fn; + + table->core_->hsa_signal_store_relaxed_fn = hsa_signal_store_relaxed_fn; + table->core_->hsa_signal_store_screlease_fn = hsa_signal_store_screlease_fn; + + table->core_->hsa_queue_load_write_index_relaxed_fn = hsa_queue_load_write_index_relaxed_fn; + table->core_->hsa_queue_store_write_index_relaxed_fn = hsa_queue_store_write_index_relaxed_fn; + table->core_->hsa_queue_load_read_index_relaxed_fn = hsa_queue_load_read_index_relaxed_fn; + + table->core_->hsa_queue_load_write_index_scacquire_fn = hsa_queue_load_write_index_scacquire_fn; + table->core_->hsa_queue_store_write_index_screlease_fn = hsa_queue_store_write_index_screlease_fn; + table->core_->hsa_queue_load_read_index_scacquire_fn = 
hsa_queue_load_read_index_scacquire_fn; + + table->amd_ext_->hsa_amd_queue_intercept_create_fn = hsa_amd_queue_intercept_create_fn; + table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; +} + +typedef void (*tool_handler_t)(); +typedef void (*tool_handler_prop_t)(rocprofiler_settings_t*); +void * tool_handle = NULL; + +// Load profiling tool library +// Return true if intercepting mode is enabled +bool LoadTool() { + bool intercept_mode = false; + const char* tool_lib = getenv("ROCP_TOOL_LIB"); + + if (tool_lib) { + intercept_mode = true; + + tool_handle = dlopen(tool_lib, RTLD_NOW); + if (tool_handle == NULL) { + fprintf(stderr, "ROCProfiler: can't load tool library \"%s\"\n", tool_lib); + fprintf(stderr, "%s\n", dlerror()); + abort(); + } + tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnLoadTool")); + tool_handler_prop_t handler_prop = reinterpret_cast(dlsym(tool_handle, "OnLoadToolProp")); + if ((handler == NULL) && (handler_prop == NULL)) { + fprintf(stderr, "ROCProfiler: tool library corrupted, OnLoadTool()/OnLoadToolProp() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + abort(); + } + tool_handler_t on_unload_handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); + if (on_unload_handler == NULL) { + fprintf(stderr, "ROCProfiler: tool library corrupted, OnUnloadTool() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + abort(); + } + + rocprofiler_settings_t settings{}; + settings.intercept_mode = (intercept_mode) ? 1 : 0; + settings.sqtt_size = SqttProfile::GetSize(); + settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; + settings.timeout = Context::GetTimeout(); + settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; + + if (handler) handler(); + else if (handler_prop) handler_prop(&settings); + + intercept_mode = (settings.intercept_mode != 0); + SqttProfile::SetSize(settings.sqtt_size); + SqttProfile::SetLocal(settings.sqtt_local != 0); + Context::SetTimeout(settings.timeout); + InterceptQueue::SetTimeout(settings.timeout); + InterceptQueue::TrackerOn(settings.timestamp_on != 0); + } + + return intercept_mode; +} + +// Unload profiling tool library +void UnloadTool() { + if (tool_handle) { + tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); + if (handler == NULL) { + fprintf(stderr, "ROCProfiler error: tool library corrupted, OnUnloadTool() method is expected\n"); + fprintf(stderr, "%s\n", dlerror()); + abort(); + } + handler(); + dlclose(tool_handle); + } +} + +CONSTRUCTOR_API void constructor() { + util::Logger::Create(); +} + +DESTRUCTOR_API void destructor() { + util::HsaRsrcFactory::Destroy(); + rocprofiler::MetricsDict::Destroy(); + util::Logger::Destroy(); +} + +const MetricsDict* GetMetrics(const hsa_agent_t& agent) { + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + if (agent_info == NULL) { + EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); + } + const MetricsDict* metrics = MetricsDict::Create(agent_info); + if (metrics == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + return metrics; +} + +hsa_status_t GetExcStatus(const std::exception& e) { + const util::exception* rocprofiler_exc_ptr = dynamic_cast(&e); + return (rocprofiler_exc_ptr) ? 
static_cast(rocprofiler_exc_ptr->status()) + : HSA_STATUS_ERROR; +} + +rocprofiler_properties_t rocprofiler_properties; +uint64_t Context::timeout_ = UINT64_MAX; +uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M +bool SqttProfile::output_buffer_local_ = true; +Tracker::mutex_t Tracker::mutex_; +util::Logger::mutex_t util::Logger::mutex_; +util::Logger* util::Logger::instance_ = NULL; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Public library methods +// +extern "C" { + +// HSA-runtime tool on-load method +PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, + const char* const* failed_tool_names) { + rocprofiler::SaveHsaApi(table); + rocprofiler::ProxyQueue::InitFactory(); + bool intercept_mode = false; + const char* intercept_env = getenv("ROCP_HSA_INTERCEPT"); + if (intercept_env != NULL) { + if (strncmp(intercept_env, "1", 1) == 0) intercept_mode = true; + } + if (rocprofiler::LoadTool()) intercept_mode = true; + // HSA intercepting + if (intercept_mode) { + rocprofiler::ProxyQueue::HsaIntercept(table); + rocprofiler::InterceptQueue::HsaIntercept(table); + } + return true; +} + +// HSA-runtime tool on-unload method +PUBLIC_API void OnUnload() { + rocprofiler::UnloadTool(); + rocprofiler::RestoreHsaApi(); +} + +// Returns library version +PUBLIC_API uint32_t rocprofiler_version_major() { return ROCPROFILER_VERSION_MAJOR; } +PUBLIC_API uint32_t rocprofiler_version_minor() { return ROCPROFILER_VERSION_MINOR; } + +// Returns the last error message +PUBLIC_API hsa_status_t rocprofiler_error_string(const char** str) { + API_METHOD_PREFIX + *str = rocprofiler::util::Logger::LastMessage().c_str(); + API_METHOD_SUFFIX +} + +// Create new profiling context +PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_t* features, + uint32_t feature_count, rocprofiler_t** handle, uint32_t mode, + rocprofiler_properties_t* properties) 
{ + API_METHOD_PREFIX + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + if (agent_info == NULL) { + EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); + } + + rocprofiler::Queue* queue = NULL; + if (mode != 0) { + if (mode & ROCPROFILER_MODE_STANDALONE) { + if (mode & ROCPROFILER_MODE_CREATEQUEUE) { + if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == + false) { + EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed"); + } + } + queue = new rocprofiler::HsaQueue(agent_info, properties->queue); + } else { + EXC_RAISING(HSA_STATUS_ERROR, "invalid mode (" << mode << ")"); + } + } + + *handle = new rocprofiler::Context(agent_info, queue, features, feature_count, properties->handler, + properties->handler_arg); + API_METHOD_SUFFIX +} + +// Delete profiling info +PUBLIC_API hsa_status_t rocprofiler_close(rocprofiler_t* handle) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + if (context) delete context; + API_METHOD_SUFFIX +} + +// Reset context +PUBLIC_API hsa_status_t rocprofiler_reset(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Reset(group_index); + API_METHOD_SUFFIX +} + +// Get profiling group count +PUBLIC_API hsa_status_t rocprofiler_group_count(const rocprofiler_t* handle, + uint32_t* group_count) { + API_METHOD_PREFIX + const rocprofiler::Context* context = reinterpret_cast(handle); + *group_count = context->GetGroupCount(); + API_METHOD_SUFFIX +} + +// Get profiling group for a given group index +PUBLIC_API hsa_status_t rocprofiler_get_group(rocprofiler_t* handle, uint32_t group_index, + rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + *group = context->GetGroupInfo(group_index); + API_METHOD_SUFFIX +} + +// 
Start profiling +PUBLIC_API hsa_status_t rocprofiler_start(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Start(group_index); + API_METHOD_SUFFIX +} + +// Stop profiling +PUBLIC_API hsa_status_t rocprofiler_stop(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Stop(group_index); + API_METHOD_SUFFIX +} + +// Read profiling +PUBLIC_API hsa_status_t rocprofiler_read(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->Read(group_index); + API_METHOD_SUFFIX +} + +// Get profiling data +PUBLIC_API hsa_status_t rocprofiler_get_data(rocprofiler_t* handle, uint32_t group_index) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->GetData(group_index); + API_METHOD_SUFFIX +} + +// Start profiling +PUBLIC_API hsa_status_t rocprofiler_group_start(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler_start(group->context, group->index); + API_METHOD_SUFFIX +} + +// Stop profiling +PUBLIC_API hsa_status_t rocprofiler_group_stop(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler_stop(group->context, group->index); + API_METHOD_SUFFIX +} + +// Read profiling +PUBLIC_API hsa_status_t rocprofiler_group_read(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler_read(group->context, group->index); + API_METHOD_SUFFIX +} + +// Get profiling data +PUBLIC_API hsa_status_t rocprofiler_group_get_data(rocprofiler_group_t* group) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(group->context); + context->GetData(group->index); + API_METHOD_SUFFIX +} + +// Get metrics data +PUBLIC_API hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* handle) { + API_METHOD_PREFIX + const rocprofiler::Context* context = reinterpret_cast(handle); 
+  context->GetMetricsData();
+  API_METHOD_SUFFIX
+}
+
+// Set queue callbacks.
+// 'callbacks.dispatch' and 'callbacks.destroy' are passed to InterceptQueue
+// together with the opaque user pointer 'data'.
+PUBLIC_API hsa_status_t rocprofiler_set_queue_callbacks(rocprofiler_queue_callbacks_t callbacks, void* data) {
+  API_METHOD_PREFIX
+  rocprofiler::InterceptQueue::SetCallbacks(callbacks.dispatch, callbacks.destroy, data);
+  API_METHOD_SUFFIX
+}
+
+// Remove queue callbacks (resets both callbacks and the user pointer to NULL)
+PUBLIC_API hsa_status_t rocprofiler_remove_queue_callbacks() {
+  API_METHOD_PREFIX
+  rocprofiler::InterceptQueue::SetCallbacks(NULL, NULL, NULL);
+  API_METHOD_SUFFIX
+}
+
+// Method for iterating the events output data.
+// 'handle' is reinterpreted as the rocprofiler::Context it was created from;
+// 'callback' and 'data' are forwarded to Context::IterateTraceData().
+PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data(
+    rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {
+  API_METHOD_PREFIX
+  rocprofiler::Context* context = reinterpret_cast<rocprofiler::Context*>(handle);
+  context->IterateTraceData(callback, data);
+  API_METHOD_SUFFIX
+}
+
+// Return the info for a given info kind.
+// 'agent' must not be NULL and 'data' must point to a uint32_t which receives:
+//   ROCPROFILER_INFO_KIND_METRIC_COUNT - the size of the agent metrics dictionary
+//   ROCPROFILER_INFO_KIND_TRACE_COUNT  - always 1
+// Any other kind raises HSA_STATUS_ERROR.
+PUBLIC_API hsa_status_t rocprofiler_get_info(
+  const hsa_agent_t *agent,
+  rocprofiler_info_kind_t kind,
+  void *data)
+{
+  API_METHOD_PREFIX
+  if (agent == NULL) EXC_RAISING(HSA_STATUS_ERROR, "NULL agent");
+  // The result is written through 'data', so reject NULL instead of crashing
+  if (data == NULL) EXC_RAISING(HSA_STATUS_ERROR, "NULL data");
+  uint32_t* result_32bit_ptr = reinterpret_cast<uint32_t*>(data);
+
+  switch (kind) {
+    case ROCPROFILER_INFO_KIND_METRIC_COUNT:
+      *result_32bit_ptr = rocprofiler::GetMetrics(*agent)->Size();
+      break;
+    case ROCPROFILER_INFO_KIND_TRACE_COUNT:
+      *result_32bit_ptr = 1;
+      break;
+    default:
+      EXC_RAISING(HSA_STATUS_ERROR, "unknown info kind(" << kind << ")");
+  }
+  API_METHOD_SUFFIX
+}
+
+// Iterate over the info for a given info kind, and invoke an application-defined callback on every iteration.
+// With a non-NULL 'agent' only that agent index is visited, otherwise all GPU agents are visited.
+// The iteration stops as soon as the callback returns anything but HSA_STATUS_SUCCESS;
+// HSA_STATUS_INFO_BREAK is reported to the caller as HSA_STATUS_SUCCESS.
+// NOTE(review): 'status' is not declared in this function; presumably it is provided by
+// API_METHOD_PREFIX - confirm against the macro definition.
+// NOTE(review): the strings placed in 'info' are strdup()'ed and never freed here; ownership
+// appears to pass to the callback - confirm against the API specification.
+PUBLIC_API hsa_status_t rocprofiler_iterate_info(
+  const hsa_agent_t* agent,
+  rocprofiler_info_kind_t kind,
+  hsa_status_t (*callback)(const rocprofiler_info_data_t info, void* data),
+  void* data)
+{
+  API_METHOD_PREFIX
+  if (callback == NULL) EXC_RAISING(HSA_STATUS_ERROR, "NULL callback");
+  rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance();
+  rocprofiler_info_data_t info{};
+  info.kind = kind;
+  uint32_t agent_idx = 0;
+  // Stays 0 when all agents are iterated, so the 'agent_idx == agent_max' check never fires
+  uint32_t agent_max = 0;
+  const rocprofiler::util::AgentInfo* agent_info = NULL;
+
+  if (agent != NULL) {
+    agent_info = hsa_rsrc->GetAgentInfo(*agent);
+    // GetAgentInfo() returns NULL for an agent which is not registered
+    if (agent_info == NULL) EXC_RAISING(HSA_STATUS_ERROR, "unknown agent");
+    agent_idx = agent_info->dev_index;
+    agent_max = agent_idx + 1;
+  }
+
+  while (hsa_rsrc->GetGpuAgentInfo(agent_idx, &agent_info)) {
+    info.agent_index = agent_idx;
+
+    switch (kind) {
+      case ROCPROFILER_INFO_KIND_METRIC:
+      {
+        const rocprofiler::MetricsDict* dict = rocprofiler::GetMetrics(agent_info->dev_id);
+        // Agent specific metrics followed by the "global" ones
+        auto nodes_vec = dict->GetNodes(agent_info->gfxip);
+        auto global_vec = dict->GetNodes("global");
+        nodes_vec.insert(nodes_vec.end(), global_vec.begin(), global_vec.end());
+
+        for (auto* node : nodes_vec) {
+          const std::string& name = node->opts["name"];
+          const std::string& descr = node->opts["descr"];
+          const std::string& expr = node->opts["expr"];
+          info.metric.name = strdup(name.c_str());
+          info.metric.description = strdup(descr.c_str());
+          info.metric.expr = expr.empty() ? NULL : strdup(expr.c_str());
+          status = callback(info, data);
+          if (status != HSA_STATUS_SUCCESS) break;
+        }
+        break;
+      }
+      case ROCPROFILER_INFO_KIND_TRACE:
+      {
+        info.trace.name = strdup("TT");
+        info.trace.description = strdup("Thread Trace");
+        info.trace.parameter_count = 5;
+        status = callback(info, data);
+        break;
+      }
+      default:
+        EXC_RAISING(HSA_STATUS_ERROR, "unknown info kind(" << kind << ")");
+    }
+
+    // A stop/error status from the callback has to end the agents loop as well,
+    // otherwise it would be overwritten by the next agent's callbacks
+    if (status != HSA_STATUS_SUCCESS) break;
+
+    ++agent_idx;
+    if (agent_idx == agent_max) break;
+  }
+
+  if (status == HSA_STATUS_INFO_BREAK) status = HSA_STATUS_SUCCESS;
+  if (status != HSA_STATUS_SUCCESS) ERR_LOGGING("iterate_info error, info kind(" << kind << ")");
+
+  API_METHOD_SUFFIX
+}
+
+// Iterate over the info for a given info query, and invoke an application-defined callback on every iteration.
+// Not implemented yet: always raises HSA_STATUS_ERROR.
+PUBLIC_API hsa_status_t rocprofiler_query_info(
+  const hsa_agent_t *agent,
+  rocprofiler_info_query_t query,
+  hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data),
+  void *data)
+{
+  API_METHOD_PREFIX
+  EXC_RAISING(HSA_STATUS_ERROR, "Not implemented");
+  API_METHOD_SUFFIX
+}
+
+}  // extern "C"
diff --git a/src/core/simple_proxy_queue.cpp b/src/core/simple_proxy_queue.cpp
new file mode 100644
index 00000000..1c3b5ae1
--- /dev/null
+++ b/src/core/simple_proxy_queue.cpp
@@ -0,0 +1,40 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#include "core/simple_proxy_queue.h"
+
+namespace rocprofiler {
+// Points the HSA core API entries for doorbell signal stores and for the queue
+// read/write index accesses at the SimpleProxyQueue static handlers.
+// The relaxed and the scacquire/screlease flavor of an entry share one handler.
+void SimpleProxyQueue::HsaIntercept(HsaApiTable* table) {
+  auto* const core_api = table->core_;
+
+  // Doorbell signal stores
+  core_api->hsa_signal_store_relaxed_fn = SignalStore;
+  core_api->hsa_signal_store_screlease_fn = SignalStore;
+
+  // Queue write index loads/stores
+  core_api->hsa_queue_load_write_index_relaxed_fn = GetQueueIndex;
+  core_api->hsa_queue_load_write_index_scacquire_fn = GetQueueIndex;
+  core_api->hsa_queue_store_write_index_relaxed_fn = SetQueueIndex;
+  core_api->hsa_queue_store_write_index_screlease_fn = SetQueueIndex;
+
+  // Queue read index loads
+  core_api->hsa_queue_load_read_index_relaxed_fn = GetSubmitIndex;
+  core_api->hsa_queue_load_read_index_scacquire_fn = GetSubmitIndex;
+}
+
+// Doorbell signal handle to proxy queue map; starts out unallocated
+SimpleProxyQueue::queue_map_t* SimpleProxyQueue::queue_map_ = NULL;
+}  // namespace rocprofiler
diff --git a/src/core/simple_proxy_queue.h b/src/core/simple_proxy_queue.h
new file mode 100644
index 00000000..8bad833a
--- /dev/null
+++ b/src/core/simple_proxy_queue.h
@@ -0,0 +1,262 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#ifndef _SRC_CORE_SIMPLE_PROXY_QUEUE_H
+#define _SRC_CORE_SIMPLE_PROXY_QUEUE_H
+
+// NOTE(review): the four include targets were lost from this copy of the patch; the list
+// below is reconstructed from the names the code uses - confirm against upstream.
+#include <hsa.h>
+#include <atomic>
+#include <map>
+#include <mutex>
+
+#include "core/proxy_queue.h"
+#include "core/types.h"
+#include "util/hsa_rsrc_factory.h"
+
+// Set to 0 to compile out the per-queue mutex
+#ifndef ROCP_PROXY_LOCK
+# define ROCP_PROXY_LOCK 1
+#endif
+
+namespace rocprofiler {
+// HSA entry points the proxy falls through to (defined outside of this header)
+extern decltype(hsa_queue_create)* hsa_queue_create_fn;
+extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn;
+
+extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed_fn;
+extern decltype(hsa_signal_store_relaxed)* hsa_signal_store_screlease_fn;
+
+extern decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed_fn;
+extern decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed_fn;
+extern decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed_fn;
+
+extern decltype(hsa_queue_load_write_index_scacquire)* hsa_queue_load_write_index_scacquire_fn;
+extern decltype(hsa_queue_store_write_index_screlease)* hsa_queue_store_write_index_screlease_fn;
+extern decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacquire_fn;
+
+typedef decltype(hsa_signal_t::handle) signal_handle_t;
+
+
+// Proxy queue which shows the application a host side packet buffer and a substitute
+// doorbell signal in place of the ones of the real HSA queue. Stores to the substitute
+// doorbell are caught by SignalStore(), which either passes every newly written packet
+// to the registered on-submit callback or copies it to the real queue with Submit().
+// Proxy queues are looked up by the handle of their substitute doorbell signal.
+class SimpleProxyQueue : public ProxyQueue {
+ public:
+  // Installs the static handlers below into the HSA API table
+  static void HsaIntercept(HsaApiTable* table);
+
+  // Doorbell store handler. For a proxy queue doorbell the packets with the indexes
+  // [submit_index_, que_idx] are submitted, any other signal is stored as usual.
+  static void SignalStore(hsa_signal_t signal, hsa_signal_value_t que_idx) {
+    SimpleProxyQueue* instance = FindInstance(signal.handle);
+    if (instance != NULL) {
+      instance->mutex_lock();
+      const uint64_t begin = instance->submit_index_;
+      const uint64_t end = que_idx + 1;
+      instance->submit_index_ = end;
+      instance->mutex_unlock();
+      for (uint64_t j = begin; j < end; ++j) {
+        // Submitted packet
+        const uint32_t idx = j & instance->queue_mask_;
+        packet_t* packet = reinterpret_cast<packet_t*>(instance->queue_->base_address) + idx;
+        if (instance->on_submit_cb_ != NULL)
+          instance->on_submit_cb_(packet, 1, j, instance->on_submit_cb_data_, NULL);
+        else
+          instance->Submit(packet);
+      }
+    } else {
+      hsa_signal_store_relaxed_fn(signal, que_idx);
+    }
+  }
+
+  // Read index load handler: the proxy submit index for a proxy queue,
+  // the real read index otherwise
+  static uint64_t GetSubmitIndex(const hsa_queue_t* queue) {
+    uint64_t index = 0;
+    SimpleProxyQueue* instance = FindInstance(queue->doorbell_signal.handle);
+    if (instance != NULL) {
+      index = instance->submit_index_;
+    } else {
+      index = hsa_queue_load_read_index_relaxed_fn(queue);
+    }
+    return index;
+  }
+
+  // Write index load handler. For a proxy queue the mutex is taken here and is
+  // released by the following SetQueueIndex() call on the same queue.
+  static uint64_t GetQueueIndex(const hsa_queue_t* queue) {
+    uint64_t index = 0;
+    SimpleProxyQueue* instance = FindInstance(queue->doorbell_signal.handle);
+    if (instance != NULL) {
+      instance->mutex_lock();
+      index = instance->queue_index_;
+    } else {
+      index = hsa_queue_load_write_index_relaxed_fn(queue);
+    }
+    return index;
+  }
+
+  // Write index store handler. For a proxy queue it releases the mutex which was
+  // taken by the preceding GetQueueIndex() call.
+  static void SetQueueIndex(const hsa_queue_t* queue, uint64_t value) {
+    SimpleProxyQueue* instance = FindInstance(queue->doorbell_signal.handle);
+    if (instance != NULL) {
+      instance->queue_index_ = value;
+      instance->mutex_unlock();
+    } else {
+      hsa_queue_store_write_index_relaxed_fn(queue, value);
+    }
+  }
+
+  // Registers the callback invoked by SignalStore() for every submitted packet
+  hsa_status_t SetInterceptCB(on_submit_cb_t on_submit_cb, void* data) {
+    on_submit_cb_ = on_submit_cb;
+    on_submit_cb_data_ = data;
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // Copies one packet to the real HSA queue and rings the real doorbell
+  void Submit(const packet_t* packet) {
+    // Compute the write index of queue
+    const uint64_t que_idx = hsa_queue_load_write_index_relaxed_fn(queue_);
+
+    // Waiting until there is a free space in the queue
+    while (que_idx >= (hsa_queue_load_read_index_relaxed_fn(queue_) + size_));
+
+    // Increment the write index
+    hsa_queue_store_write_index_relaxed_fn(queue_, que_idx + 1);
+
+    const uint32_t mask = queue_->size - 1;
+    const uint32_t idx = que_idx & mask;
+
+    // Copy packet to the queue, all the words but the first (header) one
+    const packet_word_t* src = reinterpret_cast<const packet_word_t*>(packet);
+    packet_word_t* dst = reinterpret_cast<packet_word_t*>(base_address_ + idx);
+    for (unsigned i = 1; i < sizeof(packet_t) / sizeof(packet_word_t); ++i) {
+      dst[i] = src[i];
+    }
+
+    // To maintain global order to ensure the prior copy of the packet contents is made visible
+    // before the header is updated.
+    // With in-order CP it will wait until the first packet in the blob will be valid.
+    std::atomic<packet_word_t>* header_atomic_ptr =
+        reinterpret_cast<std::atomic<packet_word_t>*>(&dst[0]);
+    header_atomic_ptr->store(src[0], std::memory_order_release);
+
+    // Doorbell signaling to submit the packet
+    hsa_signal_store_relaxed_fn(doorbell_signal_, que_idx);
+  }
+
+  SimpleProxyQueue()
+      : size_(0),
+        agent_info_(NULL),
+        queue_(NULL),
+        base_address_(NULL),
+        doorbell_signal_({}),
+        queue_index_(0),
+        queue_mask_(0),
+        submit_index_(0),
+        on_submit_cb_(NULL),
+        on_submit_cb_data_(NULL),
+        data_array_(NULL)
+  {
+    printf("ROCProfiler: SimpleProxyQueue is enabled\n");
+    fflush(stdout);
+  }
+
+  ~SimpleProxyQueue() {}
+
+ private:
+  typedef std::map<signal_handle_t, SimpleProxyQueue*> queue_map_t;
+
+  // Returns the proxy queue registered for the doorbell signal handle, or NULL if there is
+  // none. The map is allocated by the first Init(), so the handlers can be called while it
+  // is still NULL.
+  static SimpleProxyQueue* FindInstance(signal_handle_t handle) {
+    if (queue_map_ == NULL) return NULL;
+    auto it = queue_map_->find(handle);
+    return (it != queue_map_->end()) ? it->second : NULL;
+  }
+
+  // Creates the proxy queue and returns the HSA queue through 'queue'.
+  // 'type', 'callback', 'data' and the segment sizes are not used by this implementation.
+  hsa_status_t Init(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type,
+                    void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data),
+                    void* data, uint32_t private_segment_size, uint32_t group_segment_size,
+                    hsa_queue_t** queue) {
+    size_ = size;
+    auto status = Init(agent, size);
+    *queue = queue_;
+    return status;
+  }
+
+  // Creates the real HSA queue on a GPU agent, then replaces its packet buffer with a host
+  // allocation and its doorbell with a new signal. Aborts the process on any failure.
+  hsa_status_t Init(hsa_agent_t agent, uint32_t size) {
+    hsa_status_t status = HSA_STATUS_ERROR;
+    agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent);
+    if (agent_info_ != NULL) {
+      if (agent_info_->dev_type == HSA_DEVICE_TYPE_GPU) {
+        status = hsa_queue_create_fn(agent, size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX,
+                                     UINT32_MAX, &queue_);
+        if (status == HSA_STATUS_SUCCESS) {
+          // Keep the real buffer and doorbell, they are needed by Submit() and Cleanup()
+          base_address_ = reinterpret_cast<packet_t*>(queue_->base_address);
+          doorbell_signal_ = queue_->doorbell_signal;
+          // One extra packet leaves room for aligning the buffer to the packet size
+          data_array_ = calloc(size + 1, sizeof(packet_t));
+          if (data_array_ == NULL) abort();
+          uintptr_t addr = (uintptr_t)data_array_;
+          queue_->base_address = (void*)((addr + align_mask_) & ~align_mask_);
+          status = hsa_signal_create(1, 0, NULL, &(queue_->doorbell_signal));
+          if (status != HSA_STATUS_SUCCESS) abort();
+          queue_mask_ = size - 1;
+
+          if (queue_map_ == NULL) queue_map_ = new queue_map_t;
+          (*queue_map_)[queue_->doorbell_signal.handle] = this;
+        }
+        else abort();
+      }
+    }
+    if (status != HSA_STATUS_SUCCESS) abort();
+    return status;
+  }
+
+  // Restores the real buffer and doorbell, destroys the HSA queue and releases the
+  // substitute buffer and signal. Aborts the process on any failure.
+  hsa_status_t Cleanup() const {
+    hsa_status_t status = HSA_STATUS_ERROR;
+    hsa_signal_t queue_signal = queue_->doorbell_signal;
+
+    // The substitute doorbell is going away, a stale map entry would point to this object
+    if (queue_map_ != NULL) queue_map_->erase(queue_signal.handle);
+
+    // Destroy original HSA queue
+    queue_->base_address = base_address_;
+    queue_->doorbell_signal = doorbell_signal_;
+    status = hsa_queue_destroy_fn(queue_);
+    if (status != HSA_STATUS_SUCCESS) abort();
+
+    // Destroy overloaded virtual queue data and signal
+    free(data_array_);
+    status = hsa_signal_destroy(queue_signal);
+    if (status != HSA_STATUS_SUCCESS) abort();
+
+    return status;
+  }
+
+  void mutex_lock() {
+#if ROCP_PROXY_LOCK
+    mutex_.lock();
+#endif
+  }
+
+  void mutex_unlock() {
+#if ROCP_PROXY_LOCK
+    mutex_.unlock();
+#endif
+  }
+
+  // Queue size in packets as passed to Init()
+  uint32_t size_;
+  // Substitute doorbell signal handle to proxy queue map
+  static queue_map_t* queue_map_;
+  const util::AgentInfo* agent_info_;
+  // Real HSA queue with overloaded base_address and doorbell_signal
+  hsa_queue_t* queue_;
+  static const uintptr_t align_mask_ = sizeof(packet_t) - 1;
+  // Real queue packet buffer and doorbell signal
+  packet_t* base_address_;
+  hsa_signal_t doorbell_signal_;
+  // Write index seen by the application
+  uint64_t queue_index_;
+  uint64_t queue_mask_;
+  // Index of the next packet to submit, reported as the read index
+  uint64_t submit_index_;
+  std::mutex mutex_;
+  on_submit_cb_t on_submit_cb_;
+  void* on_submit_cb_data_;
+  // Unaligned allocation behind the substitute packet buffer
+  void* data_array_;
+};
+
+}  // namespace rocprofiler
+
+#endif  // _SRC_CORE_SIMPLE_PROXY_QUEUE_H
diff --git a/src/core/tracker.h b/src/core/tracker.h
new file mode 100644
index 00000000..eae0c112
--- /dev/null
+++ b/src/core/tracker.h
@@ -0,0 +1,188 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#ifndef SRC_CORE_TRACKER_H_
+#define SRC_CORE_TRACKER_H_
+
+// NOTE(review): the include targets were lost from this copy of the patch; the list
+// below is reconstructed from the names the code uses - confirm against upstream.
+#include <amd_hsa_signal.h>
+#include <assert.h>
+#include <hsa.h>
+#include <hsa_ext_amd.h>
+
+#include <list>
+#include <mutex>
+
+#include "inc/rocprofiler.h"
+#include "util/exception.h"
+#include "util/logger.h"
+
+namespace rocprofiler {
+
+// Tracks dispatch completion. Add() creates a profiling signal and a dispatch record for
+// an original completion signal; the asynchronous Handler() fills the record timestamps
+// when the profiling signal completes, decrements the original signal and removes the entry.
+class Tracker {
+ public:
+  typedef uint64_t timestamp_t;
+  typedef long double freq_t;
+  typedef std::mutex mutex_t;
+  typedef rocprofiler_dispatch_record_t record_t;
+  struct entry_t;
+  typedef std::list<entry_t*> sig_list_t;
+  struct entry_t {
+    Tracker* tracker;
+    // Position of the entry in the tracked signals list
+    sig_list_t::iterator it;
+    hsa_agent_t agent;
+    // Original completion signal
+    hsa_signal_t orig;
+    // Profiling signal created by the tracker
+    hsa_signal_t signal;
+    record_t* record;
+  };
+
+  // 'timeout' is the per-wait timeout used by the destructor
+  Tracker(uint64_t timeout = UINT64_MAX) : timeout_(timeout), outstanding(0) {
+    timestamp_t timestamp_hz = 0;
+    hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &timestamp_hz);
+    if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)");
+    // Factor converting system timestamp ticks to nanoseconds
+    timestamp_factor_ = (freq_t)1000000000 / (freq_t)timestamp_hz;
+  }
+
+  // Waits for every entry still tracked and deletes it.
+  // The mutex is held only to read the list head: Del() takes the same non-recursive
+  // mutex and erases from the list, so it can neither be called with the mutex held
+  // nor from inside an iteration over the list.
+  // NOTE(review): Handler() deletes the entry of a completed signal too, so an entry can
+  // be deleted concurrently with the wait below - confirm the intended ownership.
+  ~Tracker() {
+    while (true) {
+      mutex_.lock();
+      entry_t* entry = sig_list_.empty() ? NULL : sig_list_.front();
+      mutex_.unlock();
+      if (entry == NULL) break;
+
+      while (1) {
+        const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(
+          entry->signal,
+          HSA_SIGNAL_CONDITION_LT,
+          1,
+          timeout_,
+          HSA_WAIT_STATE_BLOCKED);
+        if (signal_value < 1) break;
+        else WARN_LOGGING("tracker timeout");
+      }
+      Del(entry);
+    }
+  }
+
+  // Add tracker entry.
+  // Returns the new entry: 'entry->signal' is the profiling signal to dispatch with and
+  // 'entry->record' is the record which is filled on completion.
+  // Raises an exception if an HSA call fails.
+  entry_t* Add(const hsa_agent_t& agent, const hsa_signal_t& orig) {
+    hsa_status_t status = HSA_STATUS_ERROR;
+    entry_t* entry = new entry_t{};
+    assert(entry);
+    entry->tracker = this;
+    mutex_.lock();
+    entry->it = sig_list_.insert(sig_list_.begin(), entry);
+    mutex_.unlock();
+
+    entry->agent = agent;
+    entry->orig = orig;
+    status = hsa_signal_create(1, 0, NULL, &(entry->signal));
+    if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create");
+
+    record_t* record = new record_t{};
+    assert(record);
+    entry->record = record;
+    status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &record->dispatch);
+    if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)");
+
+    // The returned status has to be captured, otherwise the check below tests the
+    // status of the previous call
+    status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry);
+    if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler");
+
+    if (trace_on_) {
+      mutex_.lock();
+      entry->tracker->outstanding++;
+      fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding);
+      fflush(stdout);
+      mutex_.unlock();
+    }
+
+    return entry;
+  }
+
+ private:
+  // Delete tracker entry: destroys the profiling signal, unlinks and frees the entry.
+  // The record is not freed here. Must be called without the mutex held.
+  void Del(entry_t* entry) {
+    hsa_signal_destroy(entry->signal);
+    mutex_.lock();
+    sig_list_.erase(entry->it);
+    mutex_.unlock();
+    delete entry;
+  }
+
+  // Handler for packet completion.
+  // Fills the record timestamps in nanoseconds, propagates the profiling timestamps and
+  // the completion to the original signal and deletes the entry.
+  // Returns false, so the handler is not re-armed.
+  // NOTE(review): EXC_RAISING throws out of a callback invoked by the HSA runtime - confirm
+  // that this is acceptable.
+  static bool Handler(hsa_signal_value_t value, void* arg) {
+    entry_t* entry = reinterpret_cast<entry_t*>(arg);
+    record_t* record = entry->record;
+
+    if (trace_on_) {
+      mutex_.lock();
+      entry->tracker->outstanding--;
+      fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding);
+      fflush(stdout);
+      mutex_.unlock();
+    }
+
+    timestamp_t complete_timestamp = 0;
+    hsa_amd_profiling_dispatch_time_t dispatch_time{};
+
+    hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &complete_timestamp);
+    if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)");
+    status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time);
+    if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time");
+
+    record->complete = entry->tracker->timestamp2ns(complete_timestamp);
+    record->begin = entry->tracker->timestamp2ns(dispatch_time.start);
+    record->end = entry->tracker->timestamp2ns(dispatch_time.end);
+
+    hsa_signal_t orig = entry->orig;
+    if (orig.handle) {
+      // Copy the profiling timestamps to the original signal
+      amd_signal_t* orig_signal_ptr = reinterpret_cast<amd_signal_t*>(orig.handle);
+      amd_signal_t* prof_signal_ptr = reinterpret_cast<amd_signal_t*>(entry->signal.handle);
+      orig_signal_ptr->start_ts = prof_signal_ptr->start_ts;
+      orig_signal_ptr->end_ts = prof_signal_ptr->end_ts;
+
+      // Complete the original signal
+      const hsa_signal_value_t orig_value = hsa_signal_load_relaxed(orig);
+      hsa_signal_store_screlease(orig, orig_value - 1);
+    }
+    entry->tracker->Del(entry);
+
+    return false;
+  }
+
+  // Converts system timestamp ticks to nanoseconds
+  inline timestamp_t timestamp2ns(const timestamp_t& timestamp) const {
+    const freq_t timestamp_ns = (freq_t)timestamp * timestamp_factor_;
+    return (timestamp_t)timestamp_ns;
+  }
+
+  // Timestamp frequency factor
+  freq_t timestamp_factor_;
+  // Timeout for wait on destruction
+  timestamp_t timeout_;
+  // Tracked signals list
+  sig_list_t sig_list_;
+  // Inter-thread synchronization
+  static mutex_t mutex_;
+  // Outstanding dispatches
+  uint64_t outstanding;
+  // Enable tracing
+  static const bool trace_on_ = false;
+};
+
+}  // namespace rocprofiler
+
+#endif  // SRC_CORE_TRACKER_H_
diff --git a/src/core/types.h b/src/core/types.h
new file mode 100644
index 00000000..fd8bae33
--- /dev/null
+++ b/src/core/types.h
@@ -0,0 +1,37 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef SRC_CORE_TYPES_H_
+#define SRC_CORE_TYPES_H_
+
+// NOTE(review): the include target was lost from this copy of the patch; the AQL profile
+// extension header is presumably the one which declares the types below - confirm.
+#include <hsa_ven_amd_aqlprofile.h>
+#include <stdint.h>
+
+namespace rocprofiler {
+// Short names for the AQL profile extension types
+typedef hsa_ven_amd_aqlprofile_1_00_pfn_t pfn_t;
+typedef hsa_ven_amd_aqlprofile_event_t event_t;
+typedef hsa_ven_amd_aqlprofile_parameter_t parameter_t;
+typedef hsa_ven_amd_aqlprofile_profile_t profile_t;
+// Queue packet type and the word type packets are copied with
+typedef hsa_ext_amd_aql_pm4_packet_t packet_t;
+typedef uint32_t packet_word_t;
+}  // namespace rocprofiler
+
+#endif  // SRC_CORE_TYPES_H_
diff --git a/src/util/exception.h b/src/util/exception.h
new file mode 100644
index 00000000..8af5f980
--- /dev/null
+++ b/src/util/exception.h
@@ -0,0 +1,72 @@
+/******************************************************************************
+Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*******************************************************************************/
+
+#ifndef SRC_UTIL_EXCEPTION_H_
+#define SRC_UTIL_EXCEPTION_H_
+
+// NOTE(review): the include targets were lost from this copy of the patch; the list below
+// covers the names this header uses (abort, std::cout, std::ostringstream, ...) - confirm
+// against upstream.
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <exception>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+// The macros below expand to a braced block and are meant to be used as a statement,
+// 'stream' is a chain of '<<' operands.
+
+// Prints "<function>(), <stream>" to stdout and aborts the process.
+// 'error' is not used.
+#define EXC_ABORT(error, stream)                                                                   \
+  {                                                                                                \
+    std::ostringstream oss;                                                                        \
+    oss << __FUNCTION__ << "(), " << stream;                                                       \
+    std::cout << oss.str() << std::endl;                                                           \
+    abort();                                                                                       \
+  }
+
+// Throws rocprofiler::util::exception with the status 'error' and the message
+// "<function>(), <stream>".
+#define EXC_RAISING(error, stream)                                                                 \
+  {                                                                                                \
+    std::ostringstream oss;                                                                        \
+    oss << __FUNCTION__ << "(), " << stream;                                                       \
+    throw rocprofiler::util::exception(error, oss.str());                                          \
+  }
+
+// Same as EXC_RAISING with the AQL profile library error string appended to the message.
+// The factory name is fully qualified to make the macro usable outside of the
+// rocprofiler namespace as well.
+#define AQL_EXC_RAISING(error, stream)                                                             \
+  {                                                                                                \
+    const char* error_string = NULL;                                                               \
+    const rocprofiler::pfn_t* api = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi(); \
+    api->hsa_ven_amd_aqlprofile_error_string(&error_string);                                       \
+    EXC_RAISING(error, stream << ", " << error_string);                                            \
+  }
+
+namespace rocprofiler {
+namespace util {
+
+// Exception carrying a numeric status together with the message
+class exception : public std::exception {
+ public:
+  explicit exception(const uint32_t& status, const std::string& msg) : status_(status), str_(msg) {}
+  // Message passed to the constructor
+  const char* what() const noexcept override { return str_.c_str(); }
+  // Status passed to the constructor
+  uint32_t status() const noexcept { return status_; }
+
+ protected:
+  const uint32_t status_;
+  const std::string str_;
+};
+
+}  // namespace util
+}  // namespace rocprofiler
+
+#endif  // SRC_UTIL_EXCEPTION_H_
diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp
new file mode 100644
index 00000000..ff749d15
--- /dev/null
+++ b/src/util/hsa_rsrc_factory.cpp
@@ -0,0 +1,562 @@
+/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/
+
+#include "util/hsa_rsrc_factory.h"
+
+// NOTE(review): the include targets were lost from this copy of the patch; the two groups
+// below are a reconstruction - confirm against upstream.
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <hsa.h>
+#include <hsa_ext_amd.h>
+#include <hsa_ext_finalize.h>
+#include <hsa_ven_amd_aqlprofile.h>
+#include <hsa_ven_amd_loader.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace rocprofiler {
+namespace util {
+
+// Callback function to get available in the system agents.
+// 'data' is the HsaRsrcFactory instance the agents are registered with.
+// AddAgentInfo() registers CPU and GPU agents only and returns NULL for any other device
+// type; such agents are skipped, since an error status returned from here would stop
+// hsa_iterate_agents() and fail the whole enumeration.
+hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) {
+  HsaRsrcFactory* hsa_rsrc = reinterpret_cast<HsaRsrcFactory*>(data);
+  if (hsa_rsrc == NULL) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  hsa_rsrc->AddAgentInfo(agent);
+  return HSA_STATUS_SUCCESS;
+}
+
+// This function checks to see if the provided
+// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true,
+// the function adds an additional requirement that the pool have the
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. If kern_arg is false,
+// pools must NOT have this property.
+// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is
+// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but
+// no pool was found meeting the requirements. If an error is encountered, we
+// return that error.
+static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) {
+  hsa_status_t err;
+  hsa_amd_segment_t segment;
+  uint32_t flag;
+
+  if (nullptr == data) {
+    return HSA_STATUS_ERROR_INVALID_ARGUMENT;
+  }
+
+  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment);
+  CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
+  if (HSA_AMD_SEGMENT_GLOBAL != segment) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag);
+  CHECK_STATUS("hsa_amd_memory_pool_get_info", err);
+
+  uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT;
+
+  if ((karg_st == 0 && kern_arg) || (karg_st != 0 && !kern_arg)) {
+    return HSA_STATUS_SUCCESS;
+  }
+
+  // 'data' is the output pool handle
+  *(reinterpret_cast<hsa_amd_memory_pool_t*>(data)) = pool;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
+// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
+hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) {
+  return FindGlobalPool(pool, data, false);
+}
+
+// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that
+// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS
+// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
+hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) {
+  return FindGlobalPool(pool, data, true);
+}
+
+// Constructor of the class.
+// Optionally initializes the HSA runtime, enumerates the agents and loads
+// the AqlProfile and Loader API tables.
+HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) {
+  hsa_status_t status;
+  // Initialize the Hsa Runtime
+  if (initialize_hsa_) {
+    status = hsa_init();
+    CHECK_STATUS("Error in hsa_init", status);
+  }
+  // Discover the set of Gpu devices available on the platform
+  status = hsa_iterate_agents(GetHsaAgentsCallback, this);
+  CHECK_STATUS("Error Calling hsa_iterate_agents", status);
+
+  // Get AqlProfile API table
+  aqlprofile_api_ = {0};
+#ifdef ROCP_LD_AQLPROFILE
+  status = LoadAqlProfileLib(&aqlprofile_api_);
+#else
+  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_);
+#endif
+  CHECK_STATUS("aqlprofile API table load failed", status);
+
+  // Get Loader API table
+  loader_api_ = {0};
+  status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_);
+  CHECK_STATUS("loader API table query failed", status);
+}
+
+// Destructor of the class.
+// Frees the agent info objects and shuts the HSA runtime down if it was
+// initialized by the constructor.
+HsaRsrcFactory::~HsaRsrcFactory() {
+  for (auto p : cpu_list_) delete p;
+  for (auto p : gpu_list_) delete p;
+  if (initialize_hsa_) {
+    hsa_status_t status = hsa_shut_down();
+    CHECK_STATUS("Error in hsa_shut_down", status);
+  }
+}
+
+// Resolves 'symbol' in the library opened as 'handle'. A symbol which is not found is
+// reported to stderr and NULL is returned; 'lib_name' is used for the message only.
+static void* ResolveLibSymbol(void* handle, const char* lib_name, const char* symbol) {
+  void* addr = dlsym(handle, symbol);
+  if (addr == NULL) {
+    const char* reason = dlerror();
+    fprintf(stderr, "Resolving '%s' in '%s' failed, %s\n", symbol, lib_name,
+            (reason != NULL) ? reason : "symbol address is NULL");
+  }
+  return addr;
+}
+
+// Loads the AqlProfile library and fills the API table with its entry points.
+// Returns HSA_STATUS_ERROR if the library cannot be opened.
+// An entry point missing in the library is reported to stderr and left NULL in the table;
+// it is deliberately not made a load failure, to keep libraries which lack an entry
+// working as before.
+hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) {
+  void* handle = dlopen(kAqlProfileLib, RTLD_NOW);
+  if (handle == NULL) {
+    fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror());
+    return HSA_STATUS_ERROR;
+  }
+  dlerror(); /* Clear any existing error */
+
+  api->hsa_ven_amd_aqlprofile_error_string =
+      (decltype(::hsa_ven_amd_aqlprofile_error_string)*)ResolveLibSymbol(
+          handle, kAqlProfileLib, "hsa_ven_amd_aqlprofile_error_string");
+  api->hsa_ven_amd_aqlprofile_validate_event =
+      (decltype(::hsa_ven_amd_aqlprofile_validate_event)*)ResolveLibSymbol(
+          handle, kAqlProfileLib, "hsa_ven_amd_aqlprofile_validate_event");
+  api->hsa_ven_amd_aqlprofile_start =
+      (decltype(::hsa_ven_amd_aqlprofile_start)*)ResolveLibSymbol(
+          handle, kAqlProfileLib, "hsa_ven_amd_aqlprofile_start");
+  api->hsa_ven_amd_aqlprofile_stop =
+      (decltype(::hsa_ven_amd_aqlprofile_stop)*)ResolveLibSymbol(
+          handle, kAqlProfileLib, "hsa_ven_amd_aqlprofile_stop");
+  api->hsa_ven_amd_aqlprofile_read =
+      (decltype(::hsa_ven_amd_aqlprofile_read)*)ResolveLibSymbol(
+          handle, kAqlProfileLib, "hsa_ven_amd_aqlprofile_read");
+  api->hsa_ven_amd_aqlprofile_legacy_get_pm4 =
+      (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)ResolveLibSymbol(
+          handle, kAqlProfileLib, "hsa_ven_amd_aqlprofile_legacy_get_pm4");
+  api->hsa_ven_amd_aqlprofile_get_info =
+      (decltype(::hsa_ven_amd_aqlprofile_get_info)*)ResolveLibSymbol(
+          handle, kAqlProfileLib, "hsa_ven_amd_aqlprofile_get_info");
+  api->hsa_ven_amd_aqlprofile_iterate_data =
+      (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)ResolveLibSymbol(
+          handle, kAqlProfileLib, "hsa_ven_amd_aqlprofile_iterate_data");
+
+  return HSA_STATUS_SUCCESS;
+}
+
+// Add system agent info.
+// Creates and registers the AgentInfo of a CPU or a GPU agent and returns it;
+// returns NULL for an agent of any other device type.
+const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) {
+  // Determine if device is a Gpu agent
+  hsa_status_t status;
+  AgentInfo* agent_info = NULL;
+
+  hsa_device_type_t type;
+  status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type);
+  CHECK_STATUS("Error Calling hsa_agent_get_info", status);
+
+  if (type == HSA_DEVICE_TYPE_CPU) {
+    agent_info = new AgentInfo{};
+    agent_info->dev_id = agent;
+    agent_info->dev_type = HSA_DEVICE_TYPE_CPU;
+    agent_info->dev_index = cpu_list_.size();
+
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status);
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status);
+    agent_info->gpu_pool = {};
+
+    cpu_list_.push_back(agent_info);
+    cpu_agents_.push_back(agent);
+  }
+
+  if (type == HSA_DEVICE_TYPE_GPU) {
+    agent_info = new AgentInfo{};
+    agent_info->dev_id = agent;
+    agent_info->dev_type = HSA_DEVICE_TYPE_GPU;
+    // The statuses of the attribute queries below are not checked: a failed query
+    // leaves the value-initialized field as is
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name);
+    // The gfxip is the first four characters of the agent name
+    strncpy(agent_info->gfxip, agent_info->name, 4);
+    agent_info->gfxip[4] = '\0';
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size);
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size);
+    hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile);
+    agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false;
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT),
+                       &agent_info->cu_num);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU),
+                       &agent_info->waves_per_cu);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU),
+                       &agent_info->simds_per_cu);
+    hsa_agent_get_info(agent, static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES),
+                       &agent_info->se_num);
+    hsa_agent_get_info(agent,
+                       static_cast<hsa_agent_info_t>(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE),
+                       &agent_info->shader_arrays_per_se);
+
+    agent_info->cpu_pool = {};
+    agent_info->kern_arg_pool = {};
+    status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool);
+    CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status);
+
+    // Set GPU index
+    agent_info->dev_index = gpu_list_.size();
+    gpu_list_.push_back(agent_info);
+    gpu_agents_.push_back(agent);
+  }
+
+  if (agent_info) agent_map_[agent.handle] = agent_info;
+
+  return agent_info;
+}
+
+// Return system agent info, NULL if the agent is not registered
+const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) {
+  const AgentInfo* agent_info = NULL;
+  auto it = agent_map_.find(agent.handle);
+  if (it != agent_map_.end()) {
+    agent_info = it->second;
+  }
+  return agent_info;
+}
+
+// Get the count of Hsa Gpu Agents available on the platform
+//
+// @return uint32_t Number of Gpu agents on platform
+//
+uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); }
+
+// Get the count of Hsa Cpu Agents available on the platform
+//
+// @return uint32_t Number of Cpu agents on platform
+//
+uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); }
+
+// Get the AgentInfo handle of a Gpu device
+//
+// @param idx Gpu Agent at specified index
+//
+// @param agent_info Output parameter updated with AgentInfo
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
+  // Determine if request is valid
+  uint32_t size = uint32_t(gpu_list_.size());
+  if (idx >= size) {
+    return false;
+  }
+
+  // Copy AgentInfo from specified index
+  *agent_info = gpu_list_[idx];
+
+  return true;
+}
+
+// Get the AgentInfo handle of a Cpu device
+//
+// @param idx Cpu Agent at specified index
+//
+// @param agent_info Output parameter updated with AgentInfo
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) {
+  // Determine if request is valid
+  uint32_t size = uint32_t(cpu_list_.size());
+  if (idx >= size) {
+    return false;
+  }
+
+  // Copy AgentInfo from specified index
+  *agent_info = cpu_list_[idx];
+  return true;
+}
+
+// Create a Queue object and return its handle. The queue object is expected
+// to support user requested number of Aql dispatch packets.
+//
+// @param agent_info Gpu Agent on which to create a queue object
+//
+// @param num_Pkts Number of packets to be held by queue
+//
+// @param queue Output parameter updated with handle of queue object
+//
+// @return bool true if successful, false otherwise
+//
+bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts,
+                                 hsa_queue_t** queue) {
+  // The agent lookups return NULL for an unknown agent
+  if (agent_info == NULL) return false;
+  hsa_status_t status;
+  status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL,
+                            UINT32_MAX, UINT32_MAX, queue);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Create a Signal object and return its handle.
+// @param value Initial value of signal object
+// @param signal Output parameter updated with handle of signal object
+// @return bool true if successful, false otherwise
+bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) {
+  hsa_status_t status;
+  status = hsa_signal_create(value, 0, NULL, signal);
+  return (status == HSA_STATUS_SUCCESS);
+}
+
+// Allocate memory for use by a kernel of specified size in specified
+// agent's memory region.
+// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate memory to pass kernel parameters. +// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + if (!cpu_agents_.empty()) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, reinterpret_cast(&buffer)); + // Both the CPU and GPU can access the kernel arguments + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate system memory accessible by both CPU and GPU +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. 
+uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + if (!cpu_agents_.empty()) { + status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, reinterpret_cast(&buffer)); + // Both the CPU and GPU can access the memory + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate memory for command buffer. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + uint8_t* ptr = (agent_info->is_apu && CMD_MEMORY_MMAP) + ? 
reinterpret_cast( + mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, 0, 0)) + : AllocateSysMemory(agent_info, size); + return ptr; +} + +// Copy data from GPU to host memory +bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + if (!cpu_agents_.empty()) { + hsa_signal_t s = {}; + status = hsa_signal_create(1, 0, NULL, &s); + if (status == HSA_STATUS_SUCCESS) { + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + if (status == HSA_STATUS_SUCCESS) { + if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) { + status = HSA_STATUS_ERROR; + } + } + status = hsa_signal_destroy(s); + } + } + return (status == HSA_STATUS_SUCCESS); +} +bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) { + return Memcpy(agent_info->dev_id, dst, src, size); +} + +// Memory free method +bool HsaRsrcFactory::FreeMemory(void* ptr) { + const hsa_status_t status = hsa_memory_free(ptr); + CHECK_STATUS("hsa_memory_free", status); + return (status == HSA_STATUS_SUCCESS); +} + +// Loads an Assembled Brig file and Finalizes it into Device Isa +// @param agent_info Gpu device for which to finalize +// @param brig_path File path of the Assembled Brig file +// @param kernel_name Name of the kernel to finalize +// @param code_desc Handle of finalized Code Descriptor that could +// be used to submit for execution +// @return bool true if successful, false otherwise +bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, + const char* kernel_name, hsa_executable_t* executable, + hsa_executable_symbol_t* code_desc) { + hsa_status_t status = HSA_STATUS_ERROR; + + // Build the code object filename + std::string filename(brig_path); + std::clog << "Code object filename: " << filename << std::endl; + + // Open the file 
containing code object + hsa_file_t file_handle = open(filename.c_str(), O_RDONLY); + if (file_handle == -1) { + std::cerr << "Error: failed to load '" << filename << "'" << std::endl; + assert(false); + return false; + } + + // Create code object reader + hsa_code_object_reader_t code_obj_rdr = {0}; + status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + if (status != HSA_STATUS_SUCCESS) { + std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; + return false; + } + + // Create executable. + status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + NULL, executable); + CHECK_STATUS("Error in creating executable object", status); + + // Load code object. + status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, + NULL, NULL); + CHECK_STATUS("Error in loading executable object", status); + + // Freeze executable. + status = hsa_executable_freeze(*executable, ""); + CHECK_STATUS("Error in freezing executable object", status); + + // Get symbol handle. 
+ hsa_executable_symbol_t kernelSymbol; + status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, + &kernelSymbol); + CHECK_STATUS("Error in looking up kernel symbol", status); + + // Update output parameter + *code_desc = kernelSymbol; + return true; +} + +// Print the various fields of Hsa Gpu Agents +bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::clog << header << " :" << std::endl; + + const AgentInfo* agent_info; + int size = uint32_t(gpu_list_.size()); + for (int idx = 0; idx < size; idx++) { + agent_info = gpu_list_[idx]; + + std::clog << "> agent[" << idx << "] :" << std::endl; + std::clog << ">> Name : " << agent_info->name << std::endl; + std::clog << ">> APU : " << agent_info->is_apu << std::endl; + std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; + std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; + std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; + std::clog << ">> CU number : " << agent_info->cu_num << std::endl; + std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; + std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; + std::clog << ">> SE number : " << agent_info->se_num << std::endl; + std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl; + } + return true; +} + +uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { + const uint32_t slot_size_b = 0x40; + + // adevance command queue + const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); + hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { + sched_yield(); + } + + uint32_t slot_idx = (uint32_t)(write_idx % queue->size); + uint32_t* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); + const uint32_t* slot_data = 
reinterpret_cast(packet); + + // Copy buffered commands into the queue slot. + // Overwrite the AQL invalid header (first dword) last. + // This prevents the slot from being read until it's fully written. + memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t)); + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&queue_slot[0]); + header_atomic_ptr->store(slot_data[0], std::memory_order_release); + + // ringdoor bell + hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); + + return write_idx; +} + +uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) { + const uint32_t slot_size_b = 0x40; + if ((size_bytes & (slot_size_b - 1)) != 0) { + fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); + abort(); + } + + const char* begin = reinterpret_cast(packet); + const char* end = begin + size_bytes; + uint64_t write_idx = 0; + for (const char* ptr = begin; ptr < end; ptr += slot_size_b) { + write_idx = Submit(queue, ptr); + } + + return write_idx; +} + +HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; +HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; + +} // namespace util +} // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h new file mode 100644 index 00000000..b00ee8ed --- /dev/null +++ b/src/util/hsa_rsrc_factory.h @@ -0,0 +1,288 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_ +#define SRC_UTIL_HSA_RSRC_FACTORY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +#define CHECK_STATUS(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? emsg : ""); \ + exit(1); \ + } + +#define CHECK_ITER_STATUS(msg, status) \ + if (status != HSA_STATUS_INFO_BREAK) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ + exit(1); \ + } + +namespace rocprofiler { +namespace util { +static const size_t MEM_PAGE_BYTES = 0x1000; +static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; + +// Encapsulates information about a Hsa Agent such as its +// handle, name, max queue size, max wavefront size, etc. +struct AgentInfo { + // Handle of Agent + hsa_agent_t dev_id; + + // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 + uint32_t dev_type; + + // APU flag + bool is_apu; + + // Agent system index + uint32_t dev_index; + + // GFXIP name + char gfxip[64]; + + // Name of Agent whose length is less than 64 + char name[64]; + + // Max size of Wavefront size + uint32_t max_wave_size; + + // Max size of Queue buffer + uint32_t max_queue_size; + + // Hsail profile supported by agent + hsa_profile_t profile; + + // CPU/GPU/kern-arg memory pools + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t gpu_pool; + hsa_amd_memory_pool_t kern_arg_pool; + + // The number of compute unit available in the agent. + uint32_t cu_num; + + // Maximum number of waves possible in a Compute Unit. + uint32_t waves_per_cu; + + // Number of SIMD's per compute unit CU + uint32_t simds_per_cu; + + // Number of Shader Engines (SE) in Gpu + uint32_t se_num; + + // Number of Shader Arrays Per Shader Engines in Gpu + uint32_t shader_arrays_per_se; +}; + +class HsaRsrcFactory { + public: + typedef std::recursive_mutex mutex_t; + + static HsaRsrcFactory* Create(bool initialize_hsa = true) { + std::lock_guard lck(mutex_); + if (instance_ == NULL) { + instance_ = new HsaRsrcFactory(initialize_hsa); + } + return instance_; + } + + static HsaRsrcFactory& Instance() { + if (instance_ == NULL) instance_ = Create(false); + hsa_status_t status = (instance_ != NULL) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); + return *instance_; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_) delete instance_; + instance_ = NULL; + } + + // Return system agent info + const AgentInfo* GetAgentInfo(const hsa_agent_t agent); + + // Get the count of Hsa Gpu Agents available on the platform + // @return uint32_t Number of Gpu agents on platform + uint32_t GetCountOfGpuAgents(); + + // Get the count of Hsa Cpu Agents available on the platform + // @return uint32_t Number of Cpu agents on platform + uint32_t GetCountOfCpuAgents(); + + // Get the AgentInfo handle of a Gpu device + // @param idx Gpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Get the AgentInfo handle of a Cpu device + // @param idx Cpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Create a Queue object and return its handle. The queue object is expected + // to support user requested number of Aql dispatch packets. + // @param agent_info Gpu Agent on which to create a queue object + // @param num_Pkts Number of packets to be held by queue + // @param queue Output parameter updated with handle of queue object + // @return bool true if successful, false otherwise + bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + + // Create a Signal object and return its handle. 
+ // @param value Initial value of signal object + // @param signal Output parameter updated with handle of signal object + // @return bool true if successful, false otherwise + bool CreateSignal(uint32_t value, hsa_signal_t* signal); + + // Allocate local GPU memory + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory tp pass kernel parameters + // Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); + + // Allocate system memory accessible from both CPU and GPU + // Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory for command buffer. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. 
+ uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + + // Copy data from GPU to host memory + bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); + bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); + + // Memory free method + static bool FreeMemory(void* ptr); + + // Loads an Assembled Brig file and Finalizes it into Device Isa + // @param agent_info Gpu device for which to finalize + // @param brig_path File path of the Assembled Brig file + // @param kernel_name Name of the kernel to finalize + // @param code_desc Handle of finalized Code Descriptor that could + // be used to submit for execution + // @return true if successful, false otherwise + bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); + + // Print the various fields of Hsa Gpu Agents + bool PrintGpuAgents(const std::string& header); + + // Submit AQL packet to given queue + static uint64_t Submit(hsa_queue_t* queue, const void* packet); + static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + + // Return AqlProfile API table + typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; + const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } + + // Return Loader API table + const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + + private: + // System agents iterating callback + static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); + + // Callback function to find and bind kernarg region of an agent + static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data); + + // Load AQL profile HSA extension library directly + static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api); + + // Constructor of the class. 
Will initialize the Hsa Runtime and + // query the system topology to get the list of Cpu and Gpu devices + explicit HsaRsrcFactory(bool initialize_hsa); + + // Destructor of the class + ~HsaRsrcFactory(); + + // Add an instance of AgentInfo representing a Hsa Gpu agent + const AgentInfo* AddAgentInfo(const hsa_agent_t agent); + + // To mmap command buffer memory + static const bool CMD_MEMORY_MMAP = false; + + // HSA was initialized + const bool initialize_hsa_; + + static HsaRsrcFactory* instance_; + static mutex_t mutex_; + + // Used to maintain a list of Hsa Gpu Agent Info + std::vector gpu_list_; + std::vector gpu_agents_; + + // Used to maintain a list of Hsa Cpu Agent Info + std::vector cpu_list_; + std::vector cpu_agents_; + + // System agents map + std::map agent_map_; + + // AqlProfile API table + aqlprofile_pfn_t aqlprofile_api_; + + // Loader API table + hsa_ven_amd_loader_1_00_pfn_t loader_api_; +}; + +} // namespace util +} // namespace rocprofiler + +#endif // SRC_UTIL_HSA_RSRC_FACTORY_H_ diff --git a/src/util/logger.h b/src/util/logger.h new file mode 100644 index 00000000..97477899 --- /dev/null +++ b/src/util/logger.h @@ -0,0 +1,191 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_UTIL_LOGGER_H_ +#define SRC_UTIL_LOGGER_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace rocprofiler { +namespace util { + +class Logger { + public: + typedef std::recursive_mutex mutex_t; + + template Logger& operator<<(const T& m) { + std::ostringstream oss; + oss << m; + if (!streaming_) + Log(oss.str()); + else + Put(oss.str()); + streaming_ = true; + return *this; + } + + typedef void (*manip_t)(); + Logger& operator<<(manip_t f) { + f(); + return *this; + } + + static void begm() { Instance().ResetStreaming(true); } + static void endl() { Instance().ResetStreaming(false); } + + static const std::string& LastMessage() { + Logger& logger = Instance(); + std::lock_guard lck(mutex_); + return logger.message_[GetTid()]; + } + + static Logger* Create() { + std::lock_guard lck(mutex_); + if (instance_ == NULL) instance_ = new Logger(); + return instance_; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_ != NULL) delete instance_; + instance_ = NULL; + } + + static Logger& Instance() { + Create(); + return *instance_; + } + + private: + static uint32_t GetPid() { return syscall(__NR_getpid); } + static uint32_t GetTid() { return syscall(__NR_gettid); } + + Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { + const char* path = 
getenv("ROCPROFILER_LOG"); + if (path != NULL) { + file_ = fopen("/tmp/rocprofiler_log.txt", "a"); + } + ResetStreaming(false); + } + + ~Logger() { + if (file_ != NULL) { + if (dirty_) Put("\n"); + fclose(file_); + } + } + + void ResetStreaming(const bool messaging) { + std::lock_guard lck(mutex_); + if (messaging) { + message_[GetTid()] = ""; + } else if (streaming_) { + Put("\n"); + dirty_ = false; + } + messaging_ = messaging; + streaming_ = messaging; + } + + void Put(const std::string& m) { + std::lock_guard lck(mutex_); + if (messaging_) { + message_[GetTid()] += m; + } + if (file_ != NULL) { + dirty_ = true; + flock(fileno(file_), LOCK_EX); + fprintf(file_, "%s", m.c_str()); + fflush(file_); + flock(fileno(file_), LOCK_UN); + } + } + + void Log(const std::string& m) { + const time_t rawtime = time(NULL); + tm tm_info; + localtime_r(&rawtime, &tm_info); + char tm_str[26]; + strftime(tm_str, 26, "%Y-%m-%d %H:%M:%S", &tm_info); + std::ostringstream oss; + oss << "<" << tm_str << std::dec << " pid" << GetPid() << " tid" << GetTid() << "> " << m; + Put(oss.str()); + } + + FILE* file_; + bool dirty_; + bool streaming_; + bool messaging_; + + static mutex_t mutex_; + static Logger* instance_; + std::map message_; +}; + +} // namespace util +} // namespace rocprofiler + +#define ERR_LOGGING(stream) \ + { \ + rocprofiler::util::Logger::Instance() << "error: " << rocprofiler::util::Logger::begm \ + << stream << rocprofiler::util::Logger::endl; \ + } + +#define INFO_LOGGING(stream) \ + { \ + rocprofiler::util::Logger::Instance() << "info: " << rocprofiler::util::Logger::begm << stream \ + << rocprofiler::util::Logger::endl; \ + } + +#define WARN_LOGGING(stream) \ + { \ + std::cerr << "ROCProfiler: " << stream << std::endl; \ + rocprofiler::util::Logger::Instance() << "warning: " << rocprofiler::util::Logger::begm << stream \ + << rocprofiler::util::Logger::endl; \ + } + +#ifdef DEBUG +#define DBG_LOGGING(stream) \ + { \ + rocprofiler::util::Logger::Instance() << 
rocprofiler::util::Logger::begm << "debug: \"" \ + << stream << "\"" < < < < \ + " in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \ + << rocprofiler::util::Logger::endl; \ + } +#endif + +#endif // SRC_UTIL_LOGGER_H_ diff --git a/src/xml/expr.h b/src/xml/expr.h new file mode 100644 index 00000000..731e25e4 --- /dev/null +++ b/src/xml/expr.h @@ -0,0 +1,446 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef _SRC_XML_EXPR_H +#define _SRC_XML_EXPR_H + +#include +#include +#include +#include +#include +#include + +namespace xml { +class exception_t : public std::exception { + public: + explicit exception_t(const std::string& msg) : str_(msg) {} + const char* what() const throw() { return str_.c_str(); } + + protected: + const std::string str_; +}; + +class div_zero_exception_t : public exception_t { + public: + explicit div_zero_exception_t(const std::string& msg) : exception_t("Divide by zero exception " + msg) {} +}; + +typedef uint64_t args_t; +static const args_t ARGS_MAX = UINT64_MAX; +typedef std::map args_map_t; +class Expr; + +template class any_cache_t { + public: + virtual ~any_cache_t() {} + virtual bool Lookup(const std::string& name, T& result) const = 0; +}; + +typedef any_cache_t expr_cache_t; +typedef any_cache_t args_cache_t; + +class bin_expr_t { + public: + static const bin_expr_t* CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2, + const char op); + static const bin_expr_t* CreateArg(Expr* obj, const std::string str); + + bin_expr_t() : arg1_(NULL), arg2_(NULL) {} + bin_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : arg1_(arg1), arg2_(arg2) {} + virtual ~bin_expr_t() { + if (arg1_) delete arg1_; + if (arg2_) delete arg2_; + } + + virtual args_t Eval(const args_cache_t& args) const = 0; + virtual std::string Symbol() const = 0; + + std::string String() const { + std::string str; + if (arg1_) { + str = "(" + arg1_->String() + " " + Symbol() + " " + arg2_->String() + ")"; + } else + str = Symbol(); + return str; + } + + protected: + const bin_expr_t* arg1_; + const bin_expr_t* arg2_; +}; + +class Expr { + public: + explicit Expr(const std::string& expr, const expr_cache_t* cache) + : expr_(expr), pos_(0), sub_count_(0), cache_(cache), is_sub_expr_(false) + { + sub_vec_ = new std::vector; + var_vec_ = new std::vector; + tree_ = ParseExpr(); + } 
+ + explicit Expr(const std::string& expr, const Expr* obj) + : expr_(expr), + pos_(0), + sub_count_(0), + cache_(obj->cache_), + sub_vec_(obj->sub_vec_), + var_vec_(obj->var_vec_), + is_sub_expr_(true) + { + sub_vec_->push_back(this); + tree_ = ParseExpr(); + if (!SubCheck()) throw exception_t("expr '" + expr_ + "', bad parenthesis count"); + } + + ~Expr() { + if (!is_sub_expr_) { + delete cache_; + for (auto it : *sub_vec_) delete it; + delete sub_vec_; + delete var_vec_; + delete tree_; + } + } + + std::string GetStr() const { return expr_; } + const expr_cache_t* GetCache() const { return cache_; } + const bin_expr_t* GetTree() const { return tree_; } + + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + try { + result = tree_->Eval(args); + } catch (const div_zero_exception_t& e) { + if (div_zero_exc_on) std::cout << "Expr::Eval() exc(" << e.what() << ") : " << String() << std::endl; + } catch (const exception_t& e) { + throw e; + } + return result; + } + + std::string Lookup(const std::string& str) const { + std::string result; + if (cache_ && !(cache_->Lookup(str, result))) + throw exception_t("expr '" + expr_ + "', lookup '" + str + "' failed"); + return result; + } + + void AddVar(const std::string& str) { + bool found = false; + for (std::string s : *var_vec_) + if (s == str) found = true; + if (!found) var_vec_->push_back(str); + } + + const std::vector& GetVars() const { return *var_vec_; } + + std::string String() const { return tree_->String(); } + + private: + const bin_expr_t* ParseExpr() { + const bin_expr_t* expr = ParseArg(); + while (!IsEnd()) { + const char op = Symb(); + const bin_expr_t* second_arg = NULL; + if (IsSymb(')')) { + Next(); + SubClose(); + break; + } + if (IsSymb('*') || IsSymb('/')) { + Next(); + second_arg = ParseArg(); + expr = bin_expr_t::CreateExpr(expr, second_arg, op); + } else if (IsSymb('+') || IsSymb('-')) { + Next(); + second_arg = ParseExpr(); + expr = bin_expr_t::CreateExpr(expr, second_arg, op); + 
break; + } else { + throw exception_t("expr '" + expr_ + "', bad operator '" + op + "'"); + } + } + return expr; + } + + const bin_expr_t* ParseArg() { + const bin_expr_t* arg = NULL; + if (IsSymb('(')) { + Next(); + SubOpen(); + arg = ParseExpr(); + } else { + const unsigned pos = FindOp(); + const std::string str = CutTill(pos); + arg = bin_expr_t::CreateArg(this, str); + if (arg == NULL) throw exception_t("expr '" + expr_ + "', bad argument '" + str + "'"); + } + return arg; + } + + char Symb() const { return Symb(pos_); } + char Symb(const unsigned ind) const { return expr_[ind]; } + bool IsEnd() const { return (pos_ >= expr_.length()); } + bool IsSymb(const char c) const { return IsSymb(pos_, c); } + bool IsSymb(const unsigned ind, const char c) const { return (expr_[ind] == c); } + void Next() { ++pos_; } + void SubOpen() { ++sub_count_; } + void SubClose() { --sub_count_; } + bool SubCheck() const { return (sub_count_ == 0); } + unsigned FindOp() const { + unsigned i = pos_; + unsigned open_n = 0; + while (i < expr_.length()) { + switch (Symb(i)) { + case '*': + case '/': + case '+': + case '-': + goto end; + case '(': + ++open_n; + break; + case ')': + if (open_n != 0) i += 1; + goto end; + } + ++i; + } + end: + return i; + } + std::string CutTill(const unsigned pos) { + const std::string str = (pos > pos_) ? 
expr_.substr(pos_, pos - pos_) : ""; + pos_ = pos; + return str; + } + + static const bool div_zero_exc_on = false; + + const std::string expr_; + unsigned pos_; + unsigned sub_count_; + const bin_expr_t* tree_; + const expr_cache_t* const cache_; + std::vector* sub_vec_; + std::vector* var_vec_; + const bool is_sub_expr_; +}; + +class add_expr_t : public bin_expr_t { + public: + add_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) + arg2_->Eval(args)); } + std::string Symbol() const { return "+"; } +}; +class sub_expr_t : public bin_expr_t { + public: + sub_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) - arg2_->Eval(args)); } + std::string Symbol() const { return "-"; } +}; +class mul_expr_t : public bin_expr_t { + public: + mul_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { return (arg1_->Eval(args) * arg2_->Eval(args)); } + std::string Symbol() const { return "*"; } +}; +class div_expr_t : public bin_expr_t { + public: + div_expr_t(const bin_expr_t* arg1, const bin_expr_t* arg2) : bin_expr_t(arg1, arg2) {} + args_t Eval(const args_cache_t& args) const { + const args_t denominator = arg2_->Eval(args); + if (denominator == 0) throw div_zero_exception_t("div_expr_t::Eval()"); + return (arg1_->Eval(args) / denominator); + } + std::string Symbol() const { return "/"; } +}; +class const_expr_t : public bin_expr_t { + public: + const_expr_t(const args_t value) : value_(value) {} + args_t Eval(const args_cache_t&) const { return value_; } + std::string Symbol() const { + std::ostringstream os; + os << value_; + return os.str(); + } + + private: + const args_t value_; +}; +class var_expr_t : public bin_expr_t { + public: + var_expr_t(const std::string name) : name_(name) {} 
+ args_t Eval(const args_cache_t& args) const { + args_t result = 0; + if (!args.Lookup(name_, result)) throw exception_t("expr arg lookup '" + name_ + "' failed"); + return result; + } + std::string Symbol() const { return name_; } + + private: + const std::string name_; +}; + +class fun_expr_t : public bin_expr_t { + public: + typedef std::vector vvect_t; + fun_expr_t(const std::string& fname, const std::string& vname, const uint32_t& vnum) : fname_(fname) { + for (uint32_t i = 0; i < vnum; ++i) { + std::ostringstream var_full_name; + var_full_name << vname << "[" << i << "]"; + vvect.push_back(var_expr_t(var_full_name.str())); + } + } + const vvect_t& GetVars() const { return vvect; } + std::string Symbol() const { + const std::string var = vvect[0].Symbol(); + const std::string vname = var.substr(0, var.length() - 3); + std::ostringstream oss; + std::string str("("); + str.back() = ')'; + oss << fname_ << "(" << vname << "," << vvect.size() << ")"; + return oss.str(); + } + + private: + const std::string fname_; + vvect_t vvect; +}; +class sum_expr_t : public fun_expr_t { + public: + sum_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("sum", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) result += var.Eval(args); + return result; + } +}; +class avr_expr_t : public fun_expr_t { + public: + avr_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("avr", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) result += var.Eval(args); + return result / GetVars().size(); + } +}; +class min_expr_t : public fun_expr_t { + public: + min_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("min", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = ARGS_MAX; + for (const auto& var : GetVars()) { + args_t val = var.Eval(args); + result = (val < result) ? 
val : result; + } + return result; + } +}; +class max_expr_t : public fun_expr_t { + public: + max_expr_t(const std::string& vname, const uint32_t& vnum) : fun_expr_t("max", vname, vnum) {} + args_t Eval(const args_cache_t& args) const { + args_t result = 0; + for (const auto& var : GetVars()) { + args_t val = var.Eval(args); + result = (val > result) ? val : result; + } + return result; + } +}; + +inline const bin_expr_t* bin_expr_t::CreateExpr(const bin_expr_t* arg1, const bin_expr_t* arg2, + const char op) { + const bin_expr_t* expr = NULL; + switch (op) { + case '+': + expr = new add_expr_t(arg1, arg2); + break; + case '-': + expr = new sub_expr_t(arg1, arg2); + break; + case '*': + expr = new mul_expr_t(arg1, arg2); + break; + case '/': + expr = new div_expr_t(arg1, arg2); + break; + } + return expr; +} + +inline const bin_expr_t* bin_expr_t::CreateArg(Expr* obj, const std::string str) { + const bin_expr_t* arg = NULL; + + const unsigned i = strspn(str.c_str(), "1234567890"); + if (i == str.length()) { + const unsigned value = atoi(str.c_str()); + arg = new const_expr_t(value); + } + + if (arg == NULL) { + const std::size_t pos = str.find('('); + if (pos != std::string::npos) { + char* fname = NULL; + char* vname = NULL; + int vnum = 0; + int ret = sscanf(str.c_str(), "%m[a-zA-Z_](%m[0-9a-zA-Z_],%d)", &fname, &vname, &vnum); + if (ret == 3) { + const std::string fun_name(fname); + const fun_expr_t* farg = NULL; + if (fun_name == "sum") { + farg = new sum_expr_t(vname, vnum); + } else if (fun_name == "avr") { + farg = new avr_expr_t(vname, vnum); + } else if (fun_name == "min") { + farg = new min_expr_t(vname, vnum); + } else if (fun_name == "max") { + farg = new max_expr_t(vname, vnum); + } + if (farg) for (const auto& var : farg->GetVars()) obj->AddVar(var.Symbol()); + arg = farg; + } + free(fname); + free(vname); + } + } + + if (arg == NULL) { + const std::string sub_expr = obj->Lookup(str); + if (sub_expr.empty()) { + arg = new var_expr_t(str); + 
obj->AddVar(str); + } else { + const Expr* expr = new Expr(sub_expr, obj); + arg = expr->GetTree(); + } + } + + return arg; +} + +} // namespace xml + +#endif // _SRC_XML_EXPR_H diff --git a/src/xml/xml.h b/src/xml/xml.h new file mode 100644 index 00000000..933cd2b6 --- /dev/null +++ b/src/xml/xml.h @@ -0,0 +1,457 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef SRC_XML_XML_H_ +#define SRC_XML_XML_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace xml { + +class Xml { + public: + typedef std::vector token_t; + + struct level_t; + typedef std::vector nodes_t; + typedef std::map opts_t; + struct level_t { + std::string tag; + nodes_t nodes; + opts_t opts; + }; + typedef std::vector nodes_vec_t; + typedef std::map map_t; + + enum { DECL_STATE, BODY_STATE }; + + static Xml* Create(const std::string& file_name, const Xml* obj = NULL) { + Xml* xml = new Xml(file_name, obj); + if (xml != NULL) { + if (xml->Init() == false) { + delete xml; + xml = NULL; + } else { + const std::size_t pos = file_name.rfind('/'); + const std::string path = (pos != std::string::npos) ? file_name.substr(0, pos + 1) : ""; + + xml->PreProcess(); + nodes_t incl_nodes; + for (auto* node : xml->GetNodes("top.include")) { + if (node->opts.find("touch") == node->opts.end()) { + node->opts["touch"] = ""; + incl_nodes.push_back(node); + } + } + for (auto* incl : incl_nodes) { + const std::string& incl_name = path + incl->opts["file"]; + Xml* ixml = Create(incl_name, xml); + if (ixml == NULL) { + delete xml; + xml = NULL; + break; + } else { + delete ixml; + } + } + if (xml) { + xml->Process(); + } + } + } + + return xml; + } + + static void Destroy(Xml* xml) { delete xml; } + + std::string GetName() { return file_name_; } + + void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) { + const std::size_t pos = full_tag.rfind('.'); + const std::size_t pos1 = (pos == std::string::npos) ? 
0 : pos + 1; + const std::string level_tag = full_tag.substr(pos1); + level_t* level = new level_t; + (*map_)[full_tag].push_back(level); + level->tag = level_tag; + level->opts["name"] = name; + level->opts["expr"] = expr; + } + + void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) { + std::ostringstream oss; + oss << val; + AddExpr(full_tag, name, oss.str()); + } + + nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; } + + template F ForEach(const F& f_i) { + F f = f_i; + if (map_) { + for (auto& entry : *map_) { + for (auto node : entry.second) { + if (f.fun(entry.first, node) == false) break; + } + } + } + return f; + } + + template F ForEach(const F& f_i) const { + F f = f_i; + if (map_) { + for (auto& entry : *map_) { + for (auto node : entry.second) { + if (f.fun(entry.first, node) == false) break; + } + } + } + return f; + } + + struct print_func { + bool fun(const std::string& global_tag, level_t* node) { + for (auto& opt : node->opts) { + std::cout << global_tag << "." 
<< opt.first << " = " << opt.second << std::endl; + } + return true; + } + }; + + void Print() const { + std::cout << "XML file '" << file_name_ << "':" << std::endl; + ForEach(print_func()); + } + + private: + Xml(const std::string& file_name, const Xml* obj) + : file_name_(file_name), + file_line_(0), + data_size_(0), + index_(0), + state_(BODY_STATE), + comment_(false), + included_(false), + level_(NULL), + map_(NULL) { + if (obj != NULL) { + map_ = obj->map_; + level_ = obj->level_; + included_ = true; + } + } + + struct delete_func { + bool fun(const std::string&, level_t* node) { + delete node; + return true; + } + }; + + ~Xml() { + if (included_ == false) { + ForEach(delete_func()); + delete map_; + } + } + + bool Init() { + fd_ = open(file_name_.c_str(), O_RDONLY); + if (fd_ == -1) { + // perror((std::string("open XML file ") + file_name_).c_str()); + return false; + } + + if (map_ == NULL) { + map_ = new map_t; + if (map_ == NULL) return false; + AddLevel("top"); + } + + return true; + } + + void PreProcess() { + uint32_t ind = 0; + char buf[kBufSize]; + bool error = false; + + while (1) { + const uint32_t pos = lseek(fd_, 0, SEEK_CUR); + uint32_t size = read(fd_, buf, kBufSize); + if (size <= 0) break; + buf[size - 1] = '\0'; + + if (strncmp(buf, "#include \"", 10) == 0) { + for (ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) {} + if (ind == size) { + fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize); + error = true; + break; + } + buf[ind] = '\0'; + size = ind; + lseek(fd_, pos + ind + 1, SEEK_SET); + + for (ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) {} + if (ind == size) { + error = true; + break; + } + buf[ind] = '\0'; + + AddLevel("include"); + AddOption("file", &buf[10]); + UpLevel(); + } + } + + if (error) { + fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf); + exit(1); + } + + lseek(fd_, 0, SEEK_SET); + } + + void Process() { + token_t remainder; + + while (1) { + token_t token = (remainder.size()) ? 
remainder : NextToken(); + remainder.clear(); + + // token_t token1 = token; + // token1.push_back('\0'); + // std::cout << "> " << &token1[0] << std::endl; + + // End of file + if (token.size() == 0) break; + + switch (state_) { + case BODY_STATE: + if (token[0] == '<') { + bool node_begin = true; + unsigned ind = 1; + if (token[1] == '/') { + node_begin = false; + ++ind; + } + + unsigned i = ind; + while (i < token.size()) { + if (token[i] == '>') break; + ++i; + } + for (unsigned j = i + 1; j < token.size(); ++j) remainder.push_back(token[j]); + + if (i == token.size()) { + if (node_begin) + state_ = DECL_STATE; + else + BadFormat(token); + token.push_back('\0'); + } else { + token[i] = '\0'; + } + + const char* tag = &token[ind]; + if (node_begin) { + AddLevel(tag); + } else { + if (strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0) { + token.back() = '>'; + BadFormat(token); + } + UpLevel(); + } + } else { + BadFormat(token); + } + break; + case DECL_STATE: + if (token[0] == '>') { + state_ = BODY_STATE; + for (unsigned j = 1; j < token.size(); ++j) remainder.push_back(token[j]); + continue; + } else { + token.push_back('\0'); + unsigned j = 0; + for (j = 0; j < token.size(); ++j) + if (token[j] == '=') break; + if (j == token.size()) BadFormat(token); + token[j] = '\0'; + const char* key = &token[0]; + const char* value = &token[j + 1]; + AddOption(key, value); + } + break; + default: + std::cout << "XML parser error: wrong state: " << state_ << std::endl; + exit(1); + } + } + } + + bool SpaceCheck() const { + bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == '\t')); + return cond; + } + + bool LineEndCheck() { + bool found = false; + if (buffer_[index_] == '\n') { + buffer_[index_] = ' '; + ++file_line_; + found = true; + comment_ = false; + } else if (comment_ || (buffer_[index_] == '#')) { + found = true; + comment_ = true; + } + return found; + } + + token_t NextToken() { + token_t token; + bool in_string = false; + bool special_symb = 
false; + + while (1) { + if (data_size_ == 0) { + data_size_ = read(fd_, buffer_, kBufSize); + if (data_size_ <= 0) break; + } + + if (token.empty()) { + while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) { + ++index_; + } + } + while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) { + const char symb = buffer_[index_]; + bool skip_symb = false; + + switch (symb) { + case '\\': + if (special_symb) { + special_symb = false; + } else { + special_symb = true; + skip_symb = true; + } + break; + case '"': + if (special_symb) { + special_symb = false; + } else { + in_string = !in_string; + if (!in_string) { + buffer_[index_] = ' '; + --index_; + } + skip_symb = true; + } + break; + } + + if (!skip_symb) token.push_back(symb); + ++index_; + } + + if (index_ == data_size_) { + index_ = 0; + data_size_ = 0; + } else { + if (special_symb || in_string) BadFormat(token); + break; + } + } + + return token; + } + + void BadFormat(token_t token) { + token.push_back('\0'); + std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" + << &token[0] << "'" << std::endl; + exit(1); + } + + void AddLevel(const std::string& tag) { + level_t* level = new level_t; + level->tag = tag; + if (level_) { + level_->nodes.push_back(level); + stack_.push_back(level_); + } + level_ = level; + + std::string global_tag; + for (level_t* level : stack_) { + global_tag += level->tag + "."; + } + global_tag += tag; + (*map_)[global_tag].push_back(level_); + } + + void UpLevel() { + level_ = stack_.back(); + stack_.pop_back(); + } + + std::string CurrentLevel() const { return level_->tag; } + + void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } + + const std::string file_name_; + unsigned file_line_; + int fd_; + + static const size_t kBufSize = 256; + char buffer_[kBufSize]; + + unsigned data_size_; + unsigned index_; + unsigned state_; + bool comment_; + std::vector stack_; + bool 
included_; + level_t* level_; + map_t* map_; +}; + +} // namespace xml + +#endif // SRC_XML_XML_H_ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 00000000..278bc5c4 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,62 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################

cmake_minimum_required ( VERSION 3.5.0 )
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE )

set ( EXE_NAME "ctrl" )

## Standalone build: define the test directory, the project and the environment
if ( NOT DEFINED TEST_DIR )
  set ( TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR} )
  project ( ${EXE_NAME} )
  ## Set build environment
  include ( env )
endif ()

## Util sources
file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" )

## Test control sources
set ( CTRL_SRC
  ${TEST_DIR}/app/test.cpp
  ${TEST_DIR}/ctrl/test_hsa.cpp
)

## Test kernel sources
set ( TEST_NAME simple_convolution )
set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp )
## Copy the prebuilt code objects next to the test binary at configure time.
## file(COPY) replaces 'sh -xc cp': no shell is needed and a failed copy stops
## the configuration instead of being silently ignored.
file ( GLOB KERN_HSACO "${TEST_DIR}/${TEST_NAME}/*.hsaco" )
if ( KERN_HSACO )
  file ( COPY ${KERN_HSACO} DESTINATION ${PROJECT_BINARY_DIR} )
endif ()

## Building test executable
add_executable ( ${EXE_NAME} ${KERN_SRC} ${CTRL_SRC} ${UTIL_SRC} )
target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} )
target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt atomic )
## Copy the run script and the tool XML inputs, create the results directory
file ( COPY "${TEST_DIR}/run.sh" DESTINATION ${PROJECT_BINARY_DIR} )
file ( GLOB TOOL_XML "${TEST_DIR}/tool/*.xml" )
if ( TOOL_XML )
  file ( COPY ${TOOL_XML} DESTINATION ${PROJECT_BINARY_DIR} )
endif ()
file ( MAKE_DIRECTORY "${PROJECT_BINARY_DIR}/RESULTS" )

## Build test library
set ( TEST_LIB "tool" )
set ( TEST_LIB_SRC ${TEST_DIR}/tool/tool.cpp ${UTIL_SRC} )
add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} )
target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} )
target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt atomic )
diff --git a/test/app/test.cpp b/test/app/test.cpp new file mode 100644 index 00000000..9e694833 --- /dev/null +++ b/test/app/test.cpp @@ -0,0 +1,40 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "simple_convolution/simple_convolution.h" + +

// Test driver.
// Runs the test kernel ROCP_KITER times, with ROCP_DITER dispatch iterations
// per run (both default to 1 when the variable is not set).
// Returns 0 if the HSA runtime was set up and every run passed, 1 otherwise;
// previously the results were ignored and the exit code was always 0.
int main(int argc, char** argv) {
  const char* kiter_s = getenv("ROCP_KITER");
  const char* diter_s = getenv("ROCP_DITER");
  const int kiter = (kiter_s != NULL) ? static_cast<int>(atol(kiter_s)) : 1;
  const int diter = (diter_s != NULL) ? static_cast<int>(atol(diter_s)) : 1;

  if (TestHsa::HsaInstantiate() == NULL) return 1;

  bool passed = true;
  for (int i = 0; i < kiter; ++i) {
    // NOTE(review): the template arguments were lost in the extracted text;
    // restored from the headers included above - confirm against upstream.
    if (!RunKernel<SimpleConvolution, TestAql>(argc, argv, diter)) passed = false;
  }

  TestHsa::HsaShutdown();
  return passed ? 0 : 1;
}
diff --git a/test/ctrl/run_kernel.h b/test/ctrl/run_kernel.h new file mode 100644 index 00000000..b122664b --- /dev/null +++ b/test/ctrl/run_kernel.h @@ -0,0 +1,83 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_RUN_KERNEL_H_ +#define TEST_CTRL_RUN_KERNEL_H_ + +#include "ctrl/test_hsa.h" +#include "util/test_assert.h" + +template bool RunKernel(int argc, char* argv[], int count = 1) { + bool ret_val = false; + + // Create test kernel object + Kernel test_kernel; + TestAql* test_aql = new TestHsa(&test_kernel); + test_aql = new Test(test_aql); + TEST_ASSERT(test_aql != NULL); + if (test_aql == NULL) return 1; + + // Initialization of Hsa Runtime + ret_val = test_aql->Initialize(argc, argv); + if (ret_val == false) { + std::cerr << "Error in the test initialization" << std::endl; + // TEST_ASSERT(ret_val); + return false; + } + + // Setup Hsa resources needed for execution + ret_val = test_aql->Setup(); + if (ret_val == false) { + std::cerr << "Error in creating hsa resources" << std::endl; + TEST_ASSERT(ret_val); + return false; + } + + // Kernel dspatch iterations + for (int i = 0; i < count; ++i) { + // Run test kernel + ret_val = test_aql->Run(); + if (ret_val == false) { + std::cerr << "Error in running the test kernel" << std::endl; + TEST_ASSERT(ret_val); + return false; + } + + // Verify the results of the execution + ret_val = test_aql->VerifyResults(); + if (ret_val) { + std::clog << "Test : Passed" << std::endl; + } else { + std::clog << "Test : Failed" << std::endl; + } + } + + // Print time taken by sample + test_aql->PrintTime(); + + test_aql->Cleanup(); + delete test_aql; + + return ret_val; +} + +#endif // TEST_CTRL_RUN_KERNEL_H_ diff --git a/test/ctrl/test_aql.h b/test/ctrl/test_aql.h new file mode 100644 index 00000000..d77363ee --- /dev/null +++ b/test/ctrl/test_aql.h @@ -0,0 +1,77 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_AQL_H_ +#define TEST_CTRL_TEST_AQL_H_ + +#include +#include + +#include "util/hsa_rsrc_factory.h" + +

// Test AQL interface
// A TestAql object may wrap another TestAql object. Every virtual method
// forwards to the wrapped object when there is one; otherwise it returns a
// neutral value (a null pointer or 'true').
class TestAql {
 public:
  explicit TestAql(TestAql* t = 0) : test_(t) {}
  // The wrapped test is owned by the wrapper
  virtual ~TestAql() { delete test_; }

  // Not copyable: a copy would delete the wrapped test a second time
  TestAql(const TestAql&) = delete;
  TestAql& operator=(const TestAql&) = delete;

  TestAql* Test() { return test_; }
  virtual const AgentInfo* GetAgentInfo() { return (test_) ? test_->GetAgentInfo() : 0; }
  virtual hsa_queue_t* GetQueue() { return (test_) ? test_->GetQueue() : 0; }
  virtual HsaRsrcFactory* GetRsrcFactory() { return (test_) ? test_->GetRsrcFactory() : 0; }

  // Initialize application environment including setting
  // up of various configuration parameters based on
  // command line arguments
  // @return bool true on success and false on failure
  virtual bool Initialize(int argc, char** argv) {
    return (test_) ? test_->Initialize(argc, argv) : true;
  }

  // Setup application parameters for execution
  // @return bool true on success and false on failure
  virtual bool Setup() { return (test_) ? test_->Setup() : true; }

  // Run the kernel
  // @return bool true on success and false on failure
  virtual bool Run() { return (test_) ? test_->Run() : true; }

  // Verify results
  // @return bool true on success and false on failure
  virtual bool VerifyResults() { return (test_) ? test_->VerifyResults() : true; }

  // Print to console the time taken to execute kernel
  virtual void PrintTime() {
    if (test_) test_->PrintTime();
  }

  // Release resources e.g. memory allocations
  // @return bool true on success and false on failure
  virtual bool Cleanup() { return (test_) ? test_->Cleanup() : true; }

 private:
  TestAql* const test_;
};

#endif  // TEST_CTRL_TEST_AQL_H_
diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp new file mode 100644 index 00000000..87861821 --- /dev/null +++ b/test/ctrl/test_hsa.cpp @@ -0,0 +1,283 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "ctrl/test_hsa.h" + +#include + +#include "util/test_assert.h" +#include "util/helper_funcs.h" +#include "util/hsa_rsrc_factory.h" + +HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL; +const AgentInfo* TestHsa::agent_info_ = NULL; +hsa_queue_t* TestHsa::hsa_queue_ = NULL; +uint32_t TestHsa::agent_id_ = 0; + +HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) { + // Instantiate an instance of Hsa Resources Factory + if (hsa_rsrc_ == NULL) { + agent_id_ = agent_ind; + + hsa_rsrc_ = HsaRsrcFactory::Create(); + + // Print properties of the agents + hsa_rsrc_->PrintGpuAgents("> GPU agents"); + + // Create an instance of Gpu agent + if (!hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_)) { + agent_info_ = NULL; + std::cerr << "> error: agent[" << agent_ind << "] is not found" << std::endl; + return NULL; + } + std::clog << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl; + + // Create an instance of Aql Queue + if (hsa_queue_ == NULL) { + uint32_t num_pkts = 128; + if (hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) { + hsa_queue_ = NULL; + TEST_ASSERT(false); + } + } + } + return hsa_rsrc_; +} + +void TestHsa::HsaShutdown() { + if (hsa_queue_ != NULL) { + hsa_queue_destroy(hsa_queue_); + hsa_queue_ = NULL; + } + if (hsa_rsrc_) hsa_rsrc_->Destroy(); +} + +bool TestHsa::Initialize(int arg_cnt, char** arg_list) { + std::clog << "TestHsa::Initialize :" << std::endl; + + // Instantiate a Timer object + setup_timer_idx_ = hsa_timer_.CreateTimer(); + dispatch_timer_idx_ = hsa_timer_.CreateTimer(); + + if (HsaInstantiate(agent_id_) == NULL) { + TEST_ASSERT(false); + 
return false; + } + + // Obtain handle of signal + hsa_rsrc_->CreateSignal(1, &hsa_signal_); + + // Obtain the code object file name + std::string agentName(agent_info_->name); + if (agentName.compare(0, 4, "gfx8") == 0) { + brig_path_obj_.append("gfx8"); + } else if (agentName.compare(0, 4, "gfx9") == 0) { + brig_path_obj_.append("gfx9"); + } else { + TEST_ASSERT(false); + return false; + } + brig_path_obj_.append("_" + name_ + ".hsaco"); + + return true; +} + +bool TestHsa::Setup() { + std::clog << "TestHsa::setup :" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(setup_timer_idx_); + + // Load and Finalize Kernel Code Descriptor + const char* brig_path = brig_path_obj_.c_str(); + bool suc = hsa_rsrc_->LoadAndFinalize(agent_info_, brig_path, name_.c_str(), &hsa_exec_, + &kernel_code_desc_); + if (suc == false) { + std::cerr << "Error in loading and finalizing Kernel" << std::endl; + return false; + } + + mem_map_t& mem_map = test_->GetMemMap(); + for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { + mem_descr_t& des = it->second; + switch (des.id) { + case TestKernel::LOCAL_DES_ID: + des.ptr = hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size); + break; + case TestKernel::KERNARG_DES_ID: { + // Check the kernel args size + const size_t kernarg_size = des.size; + size_t size_info = 0; + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &size_info); + const bool kernarg_missmatch = (kernarg_size > size_info); + if (kernarg_missmatch) { + std::cout << "kernarg_size = " << kernarg_size << ", size_info = " << size_info + << std::flush << std::endl; + TEST_ASSERT(!kernarg_missmatch); + break; + } + // ALlocate kernarg memory + des.size = size_info; + des.ptr = hsa_rsrc_->AllocateKernArgMemory(agent_info_, size_info); + if (des.ptr) memset(des.ptr, 0, size_info); + break; + } + case TestKernel::SYS_DES_ID: + des.ptr = hsa_rsrc_->AllocateSysMemory(agent_info_, des.size); + if 
(des.ptr) memset(des.ptr, 0, des.size); + break; + case TestKernel::NULL_DES_ID: + des.ptr = NULL; + break; + default: + break; + } + TEST_ASSERT(des.ptr != NULL); + if (des.ptr == NULL) return false; + } + test_->Init(); + + // Stop the timer object + hsa_timer_.StopTimer(setup_timer_idx_); + setup_time_taken_ = hsa_timer_.ReadTimer(setup_timer_idx_); + total_time_taken_ = setup_time_taken_; + + return true; +} + +bool TestHsa::Run() { + std::clog << "TestHsa::run :" << std::endl; + + const uint32_t work_group_size = 64; + const uint32_t work_grid_size = test_->GetGridSize(); + uint32_t group_segment_size = 0; + uint32_t private_segment_size = 0; + uint64_t code_handle = 0; + + // Retrieve the amount of group memory needed + hsa_executable_symbol_get_info( + kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_segment_size); + + // Retrieve the amount of private memory needed + hsa_executable_symbol_get_info(kernel_code_desc_, + HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, + &private_segment_size); + + + // Retrieve handle of the code block + hsa_executable_symbol_get_info(kernel_code_desc_, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, + &code_handle); + + // Initialize the dispatch packet. 
+ hsa_kernel_dispatch_packet_t aql; + memset(&aql, 0, sizeof(aql)); + // Set the packet's type, barrier bit, acquire and release fences + aql.header = HSA_PACKET_TYPE_KERNEL_DISPATCH; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE; + aql.header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE; + // Populate Aql packet with default values + aql.setup = 1; + aql.grid_size_x = work_grid_size; + aql.grid_size_y = 1; + aql.grid_size_z = 1; + aql.workgroup_size_x = work_group_size; + aql.workgroup_size_y = 1; + aql.workgroup_size_z = 1; + // Bind the kernel code descriptor and arguments + aql.kernel_object = code_handle; + aql.kernarg_address = test_->GetKernargPtr(); + aql.group_segment_size = group_segment_size; + aql.private_segment_size = private_segment_size; + // Initialize Aql packet with handle of signal + hsa_signal_store_relaxed(hsa_signal_, 1); + aql.completion_signal = hsa_signal_; + + std::clog << "> Executing kernel: \"" << name_ << "\"" << std::endl; + + // Start the timer object + hsa_timer_.StartTimer(dispatch_timer_idx_); + + // Submit AQL packet to the queue + const uint64_t que_idx = hsa_rsrc_->Submit(hsa_queue_, &aql); + + std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl; + + // Wait on the dispatch signal until the kernel is finished. 
+ // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling + if (hsa_signal_wait_scacquire(hsa_signal_, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) { + TEST_ASSERT("signal_wait failed"); + } + + std::clog << "> DONE, que_idx=" << que_idx << std::endl; + + // Stop the timer object + hsa_timer_.StopTimer(dispatch_timer_idx_); + dispatch_time_taken_ = hsa_timer_.ReadTimer(dispatch_timer_idx_); + total_time_taken_ += dispatch_time_taken_; + + return true; +} + +bool TestHsa::VerifyResults() { + bool cmp = false; + void* output = NULL; + const uint32_t size = test_->GetOutputSize(); + bool suc = false; + + // Copy local kernel output buffers from local memory into host memory + if (test_->IsOutputLocal()) { + output = hsa_rsrc_->AllocateSysMemory(agent_info_, size); + suc = hsa_rsrc_->Memcpy(agent_info_, output, test_->GetOutputPtr(), size); + if (!suc) std::clog << "> VerifyResults: Memcpy failed" << std::endl << std::flush; + } else { + output = test_->GetOutputPtr(); + suc = true; + } + + if ((output != NULL) && suc) { + // Print the test output + test_->PrintOutput(output); + // Compare the results and see if they match + cmp = (memcmp(output, test_->GetRefOut(), size) == 0); + } + + if (test_->IsOutputLocal() && (output != NULL)) hsa_rsrc_->FreeMemory(output); + + return cmp; +} + +void TestHsa::PrintTime() { + std::clog << "Time taken for Setup by " << this->name_ << " : " << this->setup_time_taken_ + << std::endl; + std::clog << "Time taken for Dispatch by " << this->name_ << " : " << this->dispatch_time_taken_ + << std::endl; + std::clog << "Time taken in Total by " << this->name_ << " : " << this->total_time_taken_ + << std::endl; +} + +bool TestHsa::Cleanup() { + hsa_executable_destroy(hsa_exec_); + hsa_signal_destroy(hsa_signal_); + return true; +} diff --git a/test/ctrl/test_hsa.h b/test/ctrl/test_hsa.h new file mode 100644 index 00000000..84080e77 --- /dev/null +++ b/test/ctrl/test_hsa.h @@ -0,0 +1,124 @@ 
+/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_HSA_H_ +#define TEST_CTRL_TEST_HSA_H_ + +#include "ctrl/test_aql.h" +#include "ctrl/test_kernel.h" +#include "util/hsa_rsrc_factory.h" +#include "util/perf_timer.h" + +// Class implements HSA test +class TestHsa : public TestAql { + public: + // Instantiate HSA resources + static HsaRsrcFactory* HsaInstantiate(const uint32_t agent_ind = agent_id_); + static void HsaShutdown(); + static void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } + static uint32_t HsaAgentId() { return agent_id_; } + + // Constructor + explicit TestHsa(TestKernel* test) : test_(test), name_(test->Name()) { + total_time_taken_ = 0; + setup_time_taken_ = 0; + dispatch_time_taken_ = 0; + hsa_exec_ = {}; + } + + // Get methods for Agent Info, HAS queue, HSA Resourcse Manager + const AgentInfo* GetAgentInfo() { return agent_info_; } + hsa_queue_t* GetQueue() { return hsa_queue_; } + HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + + // Initialize application environment including setting + // up of various configuration parameters based on + // command line arguments + // @return bool true on success and false on failure + bool Initialize(int argc, char** argv); + + // Setup application parameters for exectuion + // @return bool true on success and false on failure + bool Setup(); + + // Run the BinarySearch kernel + // @return bool true on success and false on failure + bool Run(); + + // Verify against reference implementation + // @return bool true on success and false on failure + bool VerifyResults(); + + // Print to console the time taken to execute kernel + void PrintTime(); + + // Release resources e.g. 
memory allocations + // @return bool true on success and false on failure + bool Cleanup(); + + private: + typedef TestKernel::mem_descr_t mem_descr_t; + typedef TestKernel::mem_map_t mem_map_t; + typedef TestKernel::mem_it_t mem_it_t; + + // Test object + TestKernel* test_; + + // Path of Brig file + std::string brig_path_obj_; + + // Used to track time taken to run the sample + double total_time_taken_; + double setup_time_taken_; + double dispatch_time_taken_; + + // Handle of signal + hsa_signal_t hsa_signal_; + + // Handle of Kernel Code Descriptor + hsa_executable_symbol_t kernel_code_desc_; + + // Instance of timer object + uint32_t setup_timer_idx_; + uint32_t dispatch_timer_idx_; + PerfTimer hsa_timer_; + + // Instance of Hsa Resources Factory + static HsaRsrcFactory* hsa_rsrc_; + + // GPU id + static uint32_t agent_id_; + + // Handle to an Hsa Gpu Agent + static const AgentInfo* agent_info_; + + // Handle to an Hsa Queue + static hsa_queue_t* hsa_queue_; + + // Test kernel name + std::string name_; + + // Kernel executable + hsa_executable_t hsa_exec_; +}; + +#endif // TEST_CTRL_TEST_HSA_H_ diff --git a/test/ctrl/test_kernel.h b/test/ctrl/test_kernel.h new file mode 100644 index 00000000..95da162c --- /dev/null +++ b/test/ctrl/test_kernel.h @@ -0,0 +1,134 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
/*******************************************************************************/

#ifndef TEST_CTRL_TEST_KERNEL_H_
#define TEST_CTRL_TEST_KERNEL_H_

#include <stdint.h>
#include <stdlib.h>

#include <map>
#include <string>

// Class implements kernel test: it describes the buffers a kernel needs
// (the memory map) and exports the kernarg, output and reference buffers.
class TestKernel {
 public:
  // Exported buffers IDs
  enum buf_id_t { KERNARG_EXP_ID, OUTPUT_EXP_ID, REFOUT_EXP_ID };
  // Memory descriptors IDs
  enum des_id_t { NULL_DES_ID, LOCAL_DES_ID, KERNARG_DES_ID, SYS_DES_ID, REFOUT_DES_ID };

  // Memory descriptor declaration
  struct mem_descr_t {
    des_id_t id;
    void* ptr;
    uint32_t size;
  };

  // Memory map declaration: buffer id -> memory descriptor
  typedef std::map<uint32_t, mem_descr_t> mem_map_t;
  typedef mem_map_t::iterator mem_it_t;
  typedef mem_map_t::const_iterator mem_const_it_t;

  // NOTE(review): buffers allocated by SetHostDescr() are not released here;
  // presumably the derived test frees them - confirm.
  virtual ~TestKernel() {}

  // Initialize method
  virtual void Init() = 0;

  // Return kernel memory map
  mem_map_t& GetMemMap() { return mem_map_; }

  // Return NULL descriptor
  static mem_descr_t NullDescriptor() { return {NULL_DES_ID, NULL, 0}; }

  // Check if descriptor is local
  bool IsLocal(const mem_descr_t& descr) const { return (descr.id == LOCAL_DES_ID); }

  // Methods to get the kernel attributes.
  // A buffer that was never registered is reported as the NULL descriptor
  // (NULL pointer, zero size) instead of dereferencing a missing entry.
  const mem_descr_t& GetKernargDescr() const { return Exported(KERNARG_EXP_ID); }
  const mem_descr_t& GetOutputDescr() const { return Exported(OUTPUT_EXP_ID); }
  void* GetKernargPtr() const { return GetKernargDescr().ptr; }
  uint32_t GetKernargSize() const { return GetKernargDescr().size; }
  void* GetOutputPtr() const { return GetOutputDescr().ptr; }
  uint32_t GetOutputSize() const { return GetOutputDescr().size; }
  bool IsOutputLocal() const { return IsLocal(GetOutputDescr()); }
  virtual uint32_t GetGridSize() const = 0;

  // Return reference output, NULL if no reference buffer was registered
  void* GetRefOut() const { return Exported(REFOUT_EXP_ID).ptr; }

  // Print output
  virtual void PrintOutput(const void* ptr) const = 0;

  // Return name
  virtual std::string Name() const = 0;

 protected:
  // Set buffer descriptor; a KERNARG_DES_ID buffer becomes the exported kernarg buffer
  // @return bool false if 'buf_id' is already present in the memory map
  bool SetInDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) {
    const bool suc = SetMemDescr(buf_id, des_id, size);
    if (des_id == KERNARG_DES_ID) {
      test_map_[KERNARG_EXP_ID] = &mem_map_[buf_id];
    }
    return suc;
  }

  // Set results descriptor; the buffer becomes the exported output buffer.
  // The export is updated even for an already registered 'buf_id', so that
  // an input buffer can also be declared as the output.
  // @return bool false if 'buf_id' is already present in the memory map
  bool SetOutDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) {
    const bool suc = SetMemDescr(buf_id, des_id, size);
    test_map_[OUTPUT_EXP_ID] = &mem_map_[buf_id];
    return suc;
  }

  // Set host descriptor: the buffer is allocated with malloc() right away;
  // a REFOUT_DES_ID buffer becomes the exported reference output
  // @return bool false if 'buf_id' is already present or the allocation failed
  bool SetHostDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) {
    bool suc = SetMemDescr(buf_id, des_id, size);
    if (suc) {
      mem_descr_t& descr = mem_map_[buf_id];
      descr.ptr = malloc(size);
      if (des_id == REFOUT_DES_ID) {
        test_map_[REFOUT_EXP_ID] = &descr;
      }
      if (descr.ptr == NULL) suc = false;
    }
    return suc;
  }

  // Get memory descriptor, the NULL descriptor if 'buf_id' is unknown
  mem_descr_t GetDescr(const uint32_t& buf_id) const {
    mem_const_it_t it = mem_map_.find(buf_id);
    return (it != mem_map_.end()) ? it->second : NullDescriptor();
  }

 private:
  // Exported buffers map declaration: exported buffer id -> descriptor in mem_map_
  typedef std::map<uint32_t, mem_descr_t*> test_map_t;

  // Look up an exported buffer without inserting into the map
  const mem_descr_t& Exported(const buf_id_t& exp_id) const {
    static const mem_descr_t null_descr = {NULL_DES_ID, NULL, 0};
    test_map_t::const_iterator it = test_map_.find(exp_id);
    if ((it == test_map_.end()) || (it->second == NULL)) return null_descr;
    return *(it->second);
  }

  // Set memory descriptor
  // @return bool false if 'buf_id' is already present, the old descriptor is kept
  bool SetMemDescr(const uint32_t& buf_id, const des_id_t& des_id, const uint32_t& size) {
    const mem_descr_t des = {des_id, NULL, size};
    auto ret = mem_map_.insert(mem_map_t::value_type(buf_id, des));
    return ret.second;
  }

  // Kernel memory map object
  mem_map_t mem_map_;
  // Test memory map object; points into mem_map_, whose nodes never move
  test_map_t test_map_;
};

#endif  // TEST_CTRL_TEST_KERNEL_H_
// diff --git a/test/run.sh b/test/run.sh new file mode 100755 index 00000000..037b47a2 --- /dev/null +++ b/test/run.sh @@ -0,0 +1,61 @@
// ################################################################################
// # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved.
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ +#!/bin/sh + +test_bin_dflt=./test/ctrl + +# paths to ROC profiler and other libraries +export LD_LIBRARY_PATH=$PWD +# enable error messages logging to '/tmp/rocprofiler_log.txt' +export ROCPROFILER_LOG=1 + +# ROC profiler library loaded by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so +# tool library loaded by ROC profiler +export ROCP_TOOL_LIB=libtool.so +# make sure the ROCP_PROXY_QUEUE variable is not set +unset ROCP_PROXY_QUEUE +# ROC profiler metrics config file +export ROCP_METRICS=metrics.xml +# output directory for the tool library, for metrics results file 'results.txt' +export ROCP_OUTPUT_DIR=./RESULTS + +if [ !
-e $ROCP_TOOL_LIB ] ; then + export ROCP_TOOL_LIB=test/libtool.so +fi + +if [ -n "$1" ] ; then + tbin="$*" +else + tbin=$test_bin_dflt +fi + +export ROCP_KITER=100 +export ROCP_DITER=100 +export ROCP_INPUT=input.xml +eval $tbin + +#valgrind --leak-check=full $tbin +#valgrind --tool=massif $tbin +#ms_print massif.out. + +exit 0 diff --git a/test/simple_convolution/gfx8_SimpleConvolution.hsaco b/test/simple_convolution/gfx8_SimpleConvolution.hsaco new file mode 100644 index 0000000000000000000000000000000000000000..831484c2267528bc7ff55923b2294aab223b1cef GIT binary patch literal 9392 zcmeGiO>YxN^zC?e?W`RqgfvZQDT_)=RS_#Ju8B}9B_wS?1QH0JMHMmeCK&ux$Bvpx zS)A~ZAR#~qA&$LPs?QJLZU<2(bU+B(Fw}vLW0lhczf&OO3RvLA03;5% zbRv`9^WghB!I40h!g+J=9<$y2+&E$vb9QE|$}vo{ z%{-RO+Eo%N4W_e&jNP5jjpsAv5}HCW)9g#;M$5_3s%U?~&UN>h-RroX^Yf-9oRV%= zadeGWByb&aKA8ThYLP@cTox*2(xYo;v6gi@nJL>x(z%q3yEl`+n9Lk3kBrzw&j6P? zIZ^N&oZ4r3%&y_#QjsJ({(1CGp!F zZ+DqzlH>MBF`p|PD`y9L2Tq`}B}?#3zIY{Fva`rKEm(W^Z)(&1QJ2fmTg;aWuDcr6 zKrua@EZLhgy;$V)BVoj1F?tFm;z3J=lEi57_%ujn&-uu-qw7`}%BRn6fs=kV_j`%Mb zsMn#QRg`I9%u$%?033RPDPRFbUueY`uQ7eWL>vg{3pB@Tpf4bT3d&!F+w*EfcZ|m{ z1yj+xD&jz|?f?rptGbRGfl7a%L3cE`xg5az3>|K84DiEZ#b8j07;)SQEa_o}JA#|b zS`cwCVrfA=fh(~TsAo`UxINEMFQnq`Km#iRU@@|%Fx)TPSZ4g5*lY6rxcz5sm7Wsy~_}{b*G1D`Y3A z0}DQ!ylhtH9GIQW!Yt06Ig}T1FHo7khB>2cjnd1URmS-Il?LXh7_WkHY8WqJ?mL9@ zH$ri|XIRqe&_2qj0mK2rLY%-EEGd{5g>-SQVV>jgUDdTQ#SH5)i&y|SchXlv2{rU^ z5@X@>B)gUAZUVfN zW2&jistZ4wyxequ<_nA=THoSOeHgE^Jxs>5dX)7)#*^5dfy++?eBH-f7a!@}NLA2qFKir3j1bjUzlc0I=$^wiMBP=kRQ z`WvqNFE#WXF8x^zeLv9|X3gXRUh^*YHoMn{ST0|(A(ooRjZI`Bb}2b_NsLNGh?VRw zOMH~friURmoX^rL4~U^?DR~j`rLj^`^aO^32iq^~<*mireY}nHHo@Bt*!areLeIHl zT}Kahd*Qxtht`8~&yR;>MRr^)|6f2X0{K4!F%r5|V^U4loy4*lNRNDpy9EGQj}1tp znw0vo?j)9Sk}v&BjK^{Qp9aa7buDqta`PBa#e?nGqqyYvM!v_9*p!Z>1_`f$cd$>o zc)ugST#tv`p)@1sPxmSWd+ zdjhmnL21<+Yv-H!&G&utn{Q^;v%7cZ)jd0@rgagKeB>d~upHn(7Z=3a$O|uk=ptca zLT(X1F`%w9jJJ1j0j)euGiQWIX$l-+w0m5-?B^{~N1R5$emU$lvpBAoi!VpY{A>O8XhQz)Mhs 
zQYoPx=@}YHCt^EN$&;zXXm%)-Bz=7&v624t>As<4HkL_75`B@x&|oqaMRn{N@ZTV`+( zLjaf!5%f z6#mbKHjqd~zOrfC`Anw`7i4K!d`atZ`)RiLimU0#OYy0f+W!FOFDa0_hoaF~lJCv^ z0|O^w*|#4b)YrCj6u0Ly>;K4C-2XTJb^kk3(b$1ZDqXxO3lscj63Gnq#0E!V$t;gh zSl(54Wm2PQ(X?~uR4mHd+l1|b%+Se5HuiXhrOw68TRML$axylMNy$!ayEs$(Mn`&f z9eATi)pq1&gqFigS~kBZxa^Ib+L?%C-%4eUbJpyih|{^n-`d(Lj(qUnwVcWkC`X_i zfpP?%bp-HviC^Lg2`f)X=gEg;7TT-XQ&36q?NX7{k>!hO@Z6;9#sZ`z;D1SFJEhb3 zTz8Hb(;$dXGp6f7o}*Ia;N|^Xm9L>Z3%fFydMT9o8zL zjCvY{Hf}dYjra^TBl)2b|H zZ$Q5qHC&GZw~E?fQVV`+^+w&io`XqA_^J#-E3f5-?R;S;$xl@Sclf*H^BL3Zx!R(Q z=O7;V3P7V~LbGVRVy1emn)=-edOcU+^Ao-LT28Yp@LfBM{dn4Yg<<=0VH5bPtO&$C z?MgxS1ziiqQw4mC6`^$Om_BFJ@^^?_iEx@Zu!`<3d6y869UOgq1v=RKTzH30a2HKWA2a)j(%vt1&h0;&X z_|wZKr~EY`S0lf=>tAi?ZgBp>gM|xIASc|w{KS~9!(85(r7#Di_Pq(WcK*VzvAq`B z&H0JXDW%#noyQXUg$Vl~wxI1m+{466&k=eCA~iNe$mG;*GG3UW0oSlXPUZ5tut_f8 z8w2h?2-P)JtqN{D8@QeGXviJ5#|y48p5F`c8ZTn`f@|jU>j9ex+|SK0y^DplP@gqJ z&2(kR@m1G_{S}|W+)Wl%0`BuQx*Eg~<4=7JF7eevJbUMA;SJ8^h6cxLYz$wn-{^R2 zHic_f&gF(-jA`IW$am%}nBWlTuLOG>_n+mst@#P$==$d;%yr0Vo^d*-aQft_I2~e~ z@`6*oz6m%rE1a^7QxG_Yz@mEZ75eu>OLs zuevr&eIZzX0a$-su=hyqSSaLpb#>vfK)vGy*M$8wu>Pz`vHrd_XF`xI(60e|*AnXw z)*f=h7+`$70Pi37#DZxxKZO4$v)FYS~3D=H#(N%v7D z{(8b7c;;T5_~&i-F0J4-Ovxa<3eG=Y8zipa_$?T}`0L{PHmgGRHC8Z4lY)O;;FAje zyudp`tWeAxfp;tTR)O~_c$dJ_3cgF=Hx+!Jz<;XX@TnV;L2fB{ufV4jykFq63O+3G z8_Iao0)MFJza#L^6&xN8AQ{Aj_W+C2yR2f61_hrKc&~zgPvB7ne_!Bn1^=PIvkLxW zfuC3K4+VZx8Slpe&nx=31b$t?e=hwf_%8*1N5St3{Goz>D)7$~{5JxJmv2kb?**_>i0pUHF`RE;`9h< z9ZZh4#v>= z9G;P+y*&3wX$wVZFV9C(;(5cj$X=eaq?G3@;f9StmQk*Q3(r2XfA#l&^7nqIi%Is& zYlLio0&**!1My`)CLxQzqvR-8mSvF04{dP_EZIvXKNpnxmGP6^YyoxbzfG7)k+ykB z#$Vzxj#9n?4RY*K2jz8}*Ji~57T3tjaLNAV{Np*Q$eo+8Jl@PJM;2jNl>JKhAQT>D NKO^i5ih)uk{{-s);-UZm literal 0 HcmV?d00001 diff --git a/test/simple_convolution/simple_convolution.cl b/test/simple_convolution/simple_convolution.cl new file mode 100644 index 00000000..3f8115a6 --- /dev/null +++ b/test/simple_convolution/simple_convolution.cl @@ -0,0 +1,76 @@ 
+/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +/** + * SimpleConvolution is where each pixel of the output image + * is the weighted sum of the neighborhood pixels of the input image + * The neighborhood is defined by the dimensions of the mask and + * weight of each neighbor is defined by the mask itself. 
+ * @param output Output matrix after performing convolution + * @param input Input matrix on which convolution is to be performed + * @param mask mask matrix using which convolution was to be performed + * @param inputDimensions dimensions of the input matrix + * @param maskDimensions dimensions of the mask matrix + */ +__kernel void SimpleConvolution(__global uint * output, + __global uint * input, + __global float * mask, + const uint2 inputDimensions, + const uint2 maskDimensions) { + + uint tid = get_global_id(0); + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint x = tid%width; + uint y = tid/width; + + uint maskWidth = maskDimensions.x; + uint maskHeight = maskDimensions.y; + + uint vstep = (maskWidth -1)/2; + uint hstep = (maskHeight -1)/2; + + // find the left, right, top and bottom indices such that + // the indices do not go beyond image boundaires + uint left = (x < vstep) ? 0 : (x - vstep); + uint right = ((x + vstep) >= width) ? width - 1 : (x + vstep); + uint top = (y < hstep) ? 0 : (y - hstep); + uint bottom = ((y + hstep) >= height)? height - 1: (y + hstep); + + // initializing wighted sum value + float sumFX = 0; + + for(uint i = left; i <= right; ++i) { + for(uint j = top; j <= bottom; ++j) { + // performing wighted sum within the mask boundaries + uint maskIndex = (j - (y - hstep)) * maskWidth + (i - (x - vstep)); + uint index = j * width + i; + sumFX += ((float)input[index] * mask[maskIndex]); + } + } + + // To round to the nearest integer + sumFX += 0.5f; + output[tid] = (uint)sumFX; +} diff --git a/test/simple_convolution/simple_convolution.cpp b/test/simple_convolution/simple_convolution.cpp new file mode 100644 index 00000000..546f9a6a --- /dev/null +++ b/test/simple_convolution/simple_convolution.cpp @@ -0,0 +1,388 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "simple_convolution/simple_convolution.h" + +#include +#include +#include + +#include "util/helper_funcs.h" +#include "util/test_assert.h" + +const uint32_t SimpleConvolution::input_data_[]{ + 15, 201, 51, 89, 92, 34, 96, 66, 11, 225, 161, 96, 81, 211, 108, 124, 202, 244, 182, + 90, 215, 92, 98, 20, 44, 225, 55, 247, 202, 0, 45, 218, 202, 97, 51, 39, 131, 147, + 105, 143, 116, 11, 239, 198, 222, 92, 67, 169, 81, 250, 3, 40, 86, 101, 60, 131, 70, + 116, 123, 17, 117, 168, 236, 64, 10, 31, 103, 142, 179, 209, 29, 40, 220, 13, 239, 187, + 105, 50, 100, 186, 44, 104, 227, 131, 205, 32, 6, 20, 149, 130, 38, 10, 43, 18, 75, + 53, 50, 178, 195, 230, 132, 225, 14, 96, 238, 253, 27, 88, 48, 128, 18, 92, 232, 246, + 224, 182, 23, 231, 203, 172, 105, 241, 183, 148, 4, 2, 202, 55, 181, 142, 29, 57, 111, + 43, 153, 93, 41, 181, 181, 89, 54, 200, 182, 31, 190, 150, 213, 213, 126, 160, 130, 232, + 146, 57, 125, 151, 59, 71, 206, 240, 213, 236, 42, 68, 24, 195, 162, 65, 121, 87, 155, + 175, 31, 81, 207, 222, 232, 164, 180, 102, 69, 55, 79, 216, 112, 204, 112, 171, 19, 63, + 156, 233, 43, 198, 46, 67, 138, 208, 132, 4, 39, 32, 180, 71, 113, 131, 38, 90, 40, + 219, 193, 109, 18, 16, 70, 131, 220, 182, 46, 240, 245, 203, 217, 32, 146, 7, 100, 28, + 216, 233, 32, 255, 9, 213, 71, 123, 88, 110, 213, 128, 74, 150, 238, 93, 166, 52, 224, + 131, 234, 15, 115, 224, 218, 76, 1, 108, 84, 101, 137, 44, 79, 170, 44, 88, 127, 116, + 211, 216, 226, 168, 88, 45, 63, 70, 138, 230, 123, 107, 105, 101, 122, 220, 70, 84, 41, + 71, 193, 125, 173, 75, 169, 252, 245, 213, 84, 117, 73, 40, 77, 44, 209, 166, 90, 16, + 237, 229, 246, 104, 80, 95, 206, 202, 60, 20, 31, 101, 92, 225, 226, 9, 44, 140, 5, + 34, 97, 89, 151, 171, 129, 229, 216, 82, 139, 51, 99, 120, 24, 89, 225, 104, 185, 175, + 50, 246, 196, 82, 91, 32, 51, 62, 42, 96, 202, 47, 130, 44, 137, 26, 215, 10, 255, + 176, 93, 138, 227, 193, 3, 251, 27, 229, 
100, 212, 149, 151, 202, 89, 233, 38, 122, 29, + 100, 164, 125, 46, 212, 0, 90, 93, 26, 50, 103, 25, 226, 197, 164, 198, 135, 168, 194, + 162, 141, 38, 119, 34, 190, 66, 124, 167, 104, 247, 197, 204, 156, 67, 251, 112, 67, 85, + 205, 93, 135, 53, 119, 106, 251, 28, 49, 130, 196, 243, 36, 82, 26, 155, 117, 216, 221, + 241, 128, 70, 233, 70, 18, 133, 137, 14, 245, 204, 99, 195, 42, 235, 248, 161, 86, 243, + 190, 135, 118, 130, 123, 154, 213, 150, 54, 74, 111, 20, 60, 240, 90, 37, 54, 109, 171, + 191, 123, 161, 140, 222, 100, 182, 202, 93, 88, 32, 80, 23, 168, 198, 153, 36, 97, 111, + 187, 151, 185, 43, 172, 245, 27, 6, 27, 82, 115, 199, 18, 239, 104, 158, 206, 205, 85, + 152, 42, 174, 185, 123, 197, 98, 65, 95, 135, 163, 206, 66, 59, 136, 109, 231, 125, 137, + 237, 153, 219, 97, 96, 237, 81, 201, 140, 31, 150, 226, 183, 192, 144, 113, 59, 86, 212, + 125, 182, 91, 33, 132, 158, 92, 12, 12, 68, 138, 149, 50, 36, 113, 147, 133, 95, 229, + 78, 235, 4, 228, 206, 188, 165, 95, 45, 225, 181, 1, 94, 107, 93, 128, 240, 251, 220, + 252, 7, 32, 135, 156, 83, 171, 14, 230, 48, 109, 203, 126, 89, 208, 99, 39, 140, 9, + 134, 185, 234, 60, 187, 73, 167, 24, 201, 152, 20, 166, 148, 27, 199, 28, 184, 26, 199, + 198, 0, 248, 52, 204, 119, 141, 157, 218, 181, 41, 227, 59, 227, 206, 119, 159, 23, 31, + 184, 224, 183, 204, 134, 76, 231, 77, 105, 160, 103, 48, 103, 104, 41, 155, 53, 160, 41, + 210, 123, 222, 252, 95, 26, 223, 45, 146, 126, 68, 177, 54, 37, 105, 3, 171, 182, 235, + 249, 31, 139, 97, 80, 243, 202, 121, 143, 0, 26, 184, 210, 149, 151, 207, 244, 177, 174, + 34, 67, 45, 102, 245, 100, 140, 95, 104, 55, 21, 83, 49, 53, 223, 147, 134, 210, 93, + 0, 97, 93, 26, 26, 48, 175, 178, 255, 164, 99, 174, 198, 167, 220, 45, 156, 64, 185, + 252, 168, 241, 18, 252, 35, 71, 219, 182, 205, 173, 19, 206, 15, 113, 232, 42, 161, 152, + 220, 160, 60, 64, 79, 3, 231, 43, 49, 132, 108, 235, 128, 21, 220, 146, 17, 255, 218, + 236, 182, 168, 154, 201, 118, 170, 58, 94, 212, 220, 246, 177, 125, 51, 
241, 204, 55, 216, + 248, 104, 92, 100, 83, 221, 121, 48, 111, 138, 47, 73, 119, 230, 241, 17, 175, 103, 187, + 234, 198, 144, 199, 188, 65, 68, 240, 51, 17, 39, 11, 9, 143, 104, 109, 227, 70, 231, + 19, 181, 113, 66, 255, 233, 41, 241, 250, 217, 89, 182, 196, 31, 71, 139, 220, 137, 208, + 204, 188, 225, 243, 200, 234, 131, 48, 88, 102, 119, 63, 121, 44, 177, 188, 44, 154, 229, + 29, 149, 190, 118, 76, 130, 150, 147, 14, 114, 28, 222, 62, 217, 191, 50, 161, 170, 181, + 210, 2, 28, 73, 66, 149, 117, 243, 81, 162, 141, 55, 191, 35, 245, 54, 111, 120, 204, + 2, 134, 62, 31, 100, 125, 248, 36, 175, 153, 206, 101, 107, 209, 129, 181, 19, 22, 43, + 7, 104, 205, 149, 159, 140, 184, 149, 195, 39, 14, 143, 42, 148, 205, 73, 249, 74, 66, + 30, 250, 219, 237, 96, 71, 190, 225, 253, 210, 248, 40, 218, 96, 245, 111, 0, 130, 39, + 150, 69, 79, 165, 212, 122, 57, 162, 195, 51, 237, 6, 82, 231, 225, 63, 71, 41, 253, + 41, 38, 208, 33, 78, 170, 130, 68, 26, 131, 198, 66, 26, 12, 145, 191, 224, 11, 249, + 130, 207, 44, 112, 213, 126, 88, 183, 190, 160, 225, 187, 201, 8, 140, 235, 87, 55, 109, + 155, 81, 241, 98, 147, 11, 110, 37, 202, 79, 49, 195, 210, 0, 240, 66, 214, 110, 154, + 142, 44, 58, 111, 232, 4, 119, 117, 239, 207, 172, 93, 106, 254, 78, 205, 145, 89, 59, + 183, 35, 138, 232, 230, 92, 233, 214, 159, 191, 69, 58, 78, 114, 116, 189, 91, 121, 53, + 208, 104, 4, 125, 198, 111, 123, 20, 60, 13, 109, 120, 196, 145, 3, 172, 119, 95, 150, + 78, 255, 85, 147, 57, 163, 6, 174, 97, 97, 39, 151, 50, 144, 155, 175, 86, 11, 43, + 107, 71, 56, 216, 191, 253, 105, 194, 170, 225, 34, 64, 47, 34, 150, 195, 91, 58, 201, + 10, 155, 43, 49, 50, 93, 194, 206, 13, 25, 217, 56, 132, 33, 112, 92, 225, 109, 198, + 164, 23, 167, 199, 88, 215, 234, 238, 155, 69, 40, 100, 80, 196, 144, 129, 246, 237, 68, + 197, 250, 93, 159, 51, 225, 193, 163, 62, 163, 17, 4, 71, 41, 172, 15, 130, 132, 249, + 112, 31, 63, 152, 132, 143, 92, 20, 17, 83, 1, 86, 25, 252, 179, 185, 47, 149, 122, + 211, 211, 29, 229, 216, 
101, 15, 133, 117, 145, 9, 111, 1, 40, 175, 154, 173, 62, 247, + 193, 80, 75, 194, 166, 100, 191, 90, 29, 239, 239, 152, 194, 195, 182, 168, 156, 27, 183, + 33, 145, 73, 43, 0, 75, 83, 175, 229, 0, 238, 221, 194, 63, 40, 133, 230, 140, 68, + 64, 170, 51, 48, 66, 246, 243, 248, 159, 144, 20, 87, 177, 165, 160, 220, 166, 235, 48, + 86, 209, 49, 68, 174, 243, 132, 214, 120, 106, 99, 189, 170, 13, 241, 219, 80, 232, 207, + 72, 135, 95, 92, 223, 16, 2, 127, 237, 169, 107, 29, 255, 61, 79, 68, 236, 67, 200, + 194, 188, 50, 38, 121, 221, 52, 107, 184, 132, 84, 136, 204, 219, 231, 41, 186, 248, 44, + 58, 229, 213, 166, 3, 212, 227, 82, 25, 207, 150, 225, 146, 82, 20, 185, 204, 242, 237, + 55, 170, 113, 139, 50, 62, 103, 26, 103, 34, 18, 148, 93, 247, 105, 3, 251, 62, 231, + 77, 87, 182, 227, 57, 73, 54, 77, 2, 2, 63, 239, 57, 234, 97, 197, 29, 159, 44, + 55, 7, 79, 74, 155, 172, 66, 5, 175, 61, 67, 150, 139, 155, 77, 111, 212, 151, 165, + 34, 153, 167, 98, 137, 225, 77, 234, 166, 107, 138, 211, 163, 145, 34, 237, 45, 206, 47, + 50, 126, 108, 117, 21, 248, 17, 98, 103, 230, 249, 12, 9, 147, 179, 107, 29, 149, 185, + 7, 59, 37, 146, 14, 200, 35, 49, 182, 80, 0, 230, 130, 126, 83, 248, 148, 75, 9, + 247, 178, 240, 240, 190, 249, 132, 114, 101, 161, 7, 30, 169, 67, 68, 59, 82, 12, 95, + 131, 195, 176, 131, 169, 51, 2, 252, 44, 150, 72, 54, 141, 250, 38, 126, 185, 31, 3, + 44, 132, 165, 52, 163, 78, 120, 231, 138, 202, 244, 234, 77, 183, 155, 209, 97, 207, 212, + 94, 251, 107, 166, 49, 249, 161, 88, 120, 91, 120, 123, 135, 253, 33, 188, 160, 112, 52, + 136, 250, 254, 125, 229, 76, 53, 128, 30, 150, 79, 243, 244, 75, 95, 155, 125, 88, 60, + 213, 209, 152, 78, 77, 32, 75, 110, 220, 236, 222, 17, 117, 217, 15, 242, 190, 92, 39, + 63, 123, 190, 143, 111, 178, 219, 206, 78, 88, 38, 138, 46, 247, 34, 124, 69, 66, 199, + 179, 31, 179, 145, 48, 41, 106, 64, 27, 41, 157, 67, 105, 24, 1, 249, 135, 179, 212, + 86, 1, 44, 124, 140, 91, 116, 175, 215, 185, 242, 159, 108, 17, 83, 254, 66, 
124, 105, + 131, 151, 146, 32, 218, 252, 57, 219, 245, 193, 143, 201, 23, 145, 246, 148, 30, 82, 8, + 206, 41, 194, 192, 201, 47, 210, 28, 46, 20, 152, 151, 151, 48, 42, 184, 11, 38, 241, + 231, 28, 179, 119, 230, 202, 8, 220, 94, 39, 46, 103, 245, 88, 42, 181, 33, 90, 136, + 62, 136, 156, 214, 31, 52, 7, 74, 237, 19, 113, 223, 250, 141, 146, 113, 115, 92, 122, + 80, 187, 161, 126, 35, 150, 215, 78, 76, 249, 168, 212, 55, 48, 113, 14, 80, 166, 21, + 154, 147, 40, 12, 114, 35, 153, 5, 148, 12, 98, 15, 92, 29, 176, 219, 65, 71, 179, + 143, 147, 172, 56, 104, 227, 104, 218, 241, 185, 128, 7, 84, 20, 47, 96, 135, 82, 249, + 140, 231, 6, 238, 246, 99, 12, 167, 63, 77, 238, 242, 221, 130, 158, 21, 235, 129, 126, + 197, 114, 56, 69, 121, 140, 90, 169, 237, 225, 252, 231, 109, 228, 237, 91, 219, 81, 104, + 130, 144, 181, 113, 130, 147, 244, 32, 169, 223, 162, 39, 164, 21, 95, 234, 143, 236, 68, + 57, 217, 37, 53, 192, 147, 25, 174, 239, 245, 0, 87, 119, 144, 13, 232, 19, 160, 220, + 51, 73, 188, 214, 113, 96, 235, 209, 75, 122, 190, 144, 179, 151, 181, 233, 88, 73, 3, + 7, 56, 248, 7, 143, 112, 152, 156, 89, 171, 61, 53, 223, 135, 242, 181, 248, 83, 161, + 202, 158, 28, 136, 46, 208, 32, 228, 186, 121, 45, 189, 128, 102, 182, 136, 246, 38, 32, + 147, 127, 204, 208, 181, 171, 87, 167, 97, 80, 250, 2, 26, 153, 31, 163, 200, 239, 195, + 172, 169, 60, 218, 103, 188, 65, 30, 69, 55, 68, 102, 202, 196, 50, 154, 121, 221, 242, + 33, 63, 67, 28, 66, 93, 181, 97, 0, 126, 81, 196, 43, 251, 0, 5, 98, 189, 70, + 128, 3, 126, 197, 105, 72, 137, 155, 227, 3, 121, 214, 36, 184, 25, 65, 250, 118, 247, + 91, 119, 117, 173, 60, 160, 168, 60, 166, 10, 250, 237, 139, 253, 107, 80, 102, 180, 217, + 2, 151, 221, 123, 109, 1, 52, 134, 66, 46, 253, 57, 138, 117, 175, 55, 178, 79, 223, + 239, 245, 234, 233, 226, 117, 231, 78, 198, 78, 2, 159, 80, 154, 124, 204, 7, 126, 0, + 142, 193, 47, 140, 251, 185, 2, 170, 241, 180, 249, 208, 163, 239, 186, 141, 210, 48, 116, + 32, 246, 195, 34, 150, 19, 188, 
19, 224, 196, 146, 224, 83, 83, 15, 224, 78, 201, 226, + 249, 186, 151, 243, 139, 58, 226, 70, 199, 181, 118, 60, 213, 109, 255, 248, 3, 19, 181, + 23, 243, 122, 169, 212, 205, 252, 228, 173, 75, 173, 144, 68, 104, 39, 55, 243, 98, 26, + 57, 41, 207, 175, 102, 165, 29, 102, 158, 32, 121, 83, 56, 109, 205, 225, 66, 155, 222, + 38, 73, 42, 212, 218, 110, 60, 1, 166, 48, 99, 193, 105, 141, 145, 25, 244, 54, 54, + 90, 213, 87, 212, 40, 143, 66, 246, 112, 132, 146, 79, 171, 220, 121, 128, 182, 232, 189, + 184, 143, 237, 27, 80, 86, 169, 226, 112, 158, 25, 166, 248, 238, 253, 204, 23, 141, 15, + 13, 254, 147, 160, 77, 63, 124, 199, 191, 50, 175, 124, 234, 62, 105, 6, 143, 192, 176, + 113, 48, 78, 139, 215, 71, 121, 213, 20, 144, 98, 35, 158, 96, 183, 62, 174, 246, 187, + 117, 182, 237, 37, 50, 216, 99, 156, 223, 243, 93, 143, 101, 142, 222, 240, 101, 37, 106, + 58, 57, 250, 157, 93, 153, 254, 20, 216, 172, 10, 147, 34, 192, 129, 71, 243, 90, 171, + 144, 57, 159, 238, 201, 4, 124, 167, 244, 225, 205, 95, 28, 7, 89, 185, 100, 243, 184, + 121, 203, 100, 131, 95, 135, 68, 224, 207, 56, 58, 122, 201, 115, 25, 183, 61, 30, 51, + 229, 18, 21, 178, 113, 49, 186, 203, 235, 31, 191, 163, 152, 138, 8, 28, 233, 143, 97, + 202, 95, 153, 4, 217, 98, 120, 243, 26, 182, 17, 77, 155, 36, 99, 78, 150, 149, 8, + 98, 128, 39, 33, 36, 192, 172, 45, 220, 149, 189, 61, 96, 28, 215, 100, 246, 58, 221, + 233, 84, 147, 251, 162, 47, 31, 5, 125, 181, 154, 134, 23, 27, 174, 57, 64, 110, 229, + 109, 75, 123, 43, 136, 219, 71, 95, 64, 61, 154, 29, 39, 238, 177, 34, 145, 225, 65, + 150, 94, 247, 49, 229, 15, 77, 147, 72, 141, 2, 45, 251, 77, 169, 38, 213, 132, 110, + 53, 196, 172, 207, 226, 212, 190, 148, 246, 79, 117, 56, 230, 212, 48, 23, 185, 63, 100, + 76, 136, 242, 78, 181, 237, 156, 95, 20, 113, 227, 131, 167, 168, 47, 119, 139, 3, 53, + 31, 250, 133, 149, 50, 107, 105, 99, 130, 34, 162, 231, 111, 42, 217, 190, 224, 199, 90, + 63, 220, 204, 35, 95, 115, 203, 143, 234, 86, 147, 32, 118, 141, 165, 
11, 192, 16, 117, + 35, 147, 152, 198, 123, 7, 240, 84, 198, 209, 28, 33, 17, 248, 237, 52, 88, 97, 255, + 231, 76, 86, 122, 109, 204, 8, 18, 216, 201, 35, 77, 237, 183, 229, 179, 50, 237, 164, + 135, 179, 118, 164, 213, 135, 157, 195, 187, 245, 36, 187, 220, 113, 18, 87, 222, 222, 96, + 241, 183, 42, 21, 4, 23, 205, 233, 203, 0, 214, 112, 136, 138, 230, 44, 95, 110, 201, + 34, 41, 191, 71, 229, 155, 185, 247, 243, 151, 214, 84, 137, 141, 126, 159, 146, 149, 108, + 124, 97, 109, 82, 209, 245, 221, 183, 34, 60, 37, 236, 95, 79, 171, 167, 53, 71, 96, + 45, 58, 248, 3, 142, 129, 145, 12, 33, 36, 162, 142, 160, 3, 251, 243, 213, 240, 208, + 141, 19, 13, 178, 255, 109, 2, 170, 20, 55, 241, 116, 101, 44, 108, 105, 186, 238, 251, + 199, 15, 31, 106, 157, 191, 110, 152, 178, 67, 137, 131, 208, 156, 144, 131, 155, 253, 134, + 70, 18, 190, 55, 134, 35, 99, 243, 140, 30, 225, 135, 230, 240, 166, 81, 142, 102, 191, + 39, 25, 3, 177, 156, 211, 77, 45, 87, 233, 43, 221, 48, 61, 155, 103, 195, 191, 203, + 182, 75, 233, 152, 211, 208, 136, 121, 33, 23, 224, 224, 62, 249, 227, 239, 149, 183, 61, + 195, 15, 39, 238, 236, 87, 43, 136, 191, 239, 71, 138, 166, 147, 116, 62, 102, 68, 199, + 224, 101, 223, 193, 70, 29, 186, 42, 13, 80, 225, 75, 19, 241, 115, 1, 221, 202, 45, + 102, 137, 29, 174, 20, 195, 66, 136, 2, 168, 205, 201, 137, 50, 168, 74, 121, 198, 4, + 163, 212, 85, 133, 31, 105, 118, 146, 106, 84, 93, 152, 187, 231, 181, 105, 251, 121, 171, + 132, 123, 84, 81, 69, 221, 132, 238, 40, 253, 181, 45, 161, 137, 130, 39, 169, 235, 158, + 59, 86, 242, 153, 239, 173, 128, 165, 23, 123, 30, 195, 0, 154, 23, 81, 224, 245, 214, + 206, 30, 212, 131, 75, 117, 12, 206, 157, 181, 186, 59, 241, 17, 45, 138, 0, 219, 11, + 165, 243, 135, 196, 182, 135, 95, 205, 217, 63, 195, 175, 14, 225, 131, 145, 45, 249, 158, + 251, 150, 84, 182, 209, 70, 199, 255, 209, 199, 219, 220, 109, 206, 99, 50, 132, 234, 146, + 82, 195, 209, 22, 114, 223, 247, 246, 113, 37, 239, 16, 33, 134, 100, 215, 88, 170, 158, 
+ 87, 123, 102, 50, 88, 211, 1, 187, 6, 134, 165, 152, 216, 105, 106, 239, 220, 74, 231, + 210, 187, 12, 194, 204, 45, 72, 49, 4, 160, 219, 162, 248, 87, 8, 43, 176, 220, 44, + 107, 227, 178, 17, 124, 139, 122, 230, 122, 87, 48, 97, 42, 236, 110, 236, 185, 155, 53, + 234, 159, 214, 198, 66, 206, 30, 75, 249, 206, 40, 38, 57, 11, 217, 74, 136, 100, 197, + 110, 223, 29, 159, 65, 71, 140, 175, 51, 69, 74, 105, 48, 234, 63, 246, 45, 13, 20, + 121, 7, 226, 161, 46, 28, 173, 7, 103, 53, 108, 45, 164, 76, 74, 68, 141, 145, 208, + 61, 197, 22, 136, 46, 70, 115, 110, 60, 161, 124, 81, 26, 132, 51, 188, 178, 79, 106, + 186, 183, 160, 39, 228, 68, 115, 46, 136, 1, 192, 89, 62, 133, 112, 198, 180, 182, 58, + 34, 243, 219, 158, 69, 245, 34, 120, 178, 213, 200, 28, 143, 128, 188, 182, 100, 1, 41, + 146, 137, 43, 82, 227, 105, 216, 83, 48, 140, 10, 106, 175, 254, 70, 77, 67, 59, 112, + 188, 237, 69, 133, 10, 212, 5, 198, 138, 105, 199, 180, 252, 81, 223, 79, 53, 73, 39, + 137, 121, 180, 148, 228, 99, 146, 42, 177, 214, 102, 33, 147, 84, 102, 25, 94, 59, 31, + 37, 197, 137, 237, 122, 133, 63, 90, 213, 116, 163, 253, 253, 29, 177, 145, 2, 21, 36, + 45, 198, 251, 147, 231, 143, 232, 78, 168, 71, 137, 199, 108, 79, 80, 90, 201, 214, 153, + 35, 172, 13, 199, 169, 11, 228, 91, 157, 231, 112, 193, 20, 54, 189, 167, 30, 77, 144, + 108, 245, 215, 246, 189, 68, 69, 14, 158, 14, 228, 55, 50, 145, 69, 249, 58, 80, 222, + 149, 237, 198, 5, 175, 218, 60, 109, 130, 91, 186, 18, 200, 175, 234, 190, 109, 46, 3, + 123, 204, 18, 96, 4, 68, 241, 73, 62, 44, 154, 29, 193, 136, 227, 199, 55, 189, 4, + 164, 64, 95, 95, 82, 39, 15, 60, 230, 124, 107, 233, 248, 55, 251, 89, 60, 63, 75, + 134, 126, 119, 32, 156, 57, 168, 127, 0, 224, 61, 5, 133, 125, 100, 228, 208, 140, 243, + 12, 114, 111, 119, 92, 104, 175, 87, 193, 236, 151, 13, 114, 21, 132, 146, 177, 189, 59, + 49, 190, 27, 110, 195, 160, 236, 40, 132, 188, 181, 120, 201, 40, 232, 65, 132, 80, 241, + 220, 18, 221, 115, 31, 79, 137, 164, 226, 58, 98, 
29, 108, 32, 57, 219, 228, 218, 199, + 13, 95, 132, 195, 215, 77, 235, 191, 143, 112, 16, 128, 76, 35, 93, 191, 66, 173, 73, + 231, 143, 132, 73, 173, 240, 106, 231, 203, 78, 193, 147, 92, 33, 23, 31, 248, 100, 11, + 184, 243, 123, 201, 115, 200, 236, 209, 135, 47, 126, 209, 22, 14, 85, 95, 188, 69, 202, + 163, 17, 24, 101, 164, 117, 134, 187, 148, 127, 31, 159, 55, 19, 27, 1, 135, 227, 237, + 89, 107, 28, 216, 60, 51, 230, 145, 147, 163, 215, 93, 70, 232, 118, 172, 140, 235, 50, + 71, 128, 177, 103, 32, 233, 123, 60, 234, 2, 31, 216, 91, 139, 244, 52, 200, 40, 26, + 90, 188, 189, 49, 25, 4, 25, 144, 176, 166, 124, 227, 237, 252, 148, 85, 29, 125, 208, + 89, 104, 210, 121, 64, 46, 4, 53, 99, 204, 93, 125, 38, 25, 59, 88, 51, 64, 113, + 195, 241, 23, 64, 212, 5, 60, 104, 90, 90, 230, 42, 179, 78, 253, 44, 143, 44, 49, + 196, 143, 254, 34, 13, 36, 60, 73, 125, 112, 137, 239, 52, 122, 7, 116, 79, 12, 177, + 183, 103, 11, 158, 146, 190, 237, 143, 235, 124, 188, 28, 65, 76, 26, 100, 89, 63, 160, + 163, 188, 17, 44, 172, 69, 167, 179, 185, 246, 191, 107, 174, 38, 118, 76, 184, 53, 58, + 72, 32, 182, 5, 61, 248, 81, 88, 92, 170, 152, 253, 77, 84, 14, 122, 1, 83, 34, + 180, 13, 25, 115, 120, 199, 154, 238, 20, 83, 36, 79, 155, 68, 5, 160, 130, 254, 242, + 218, 90, 156, 114, 87, 234, 199, 101, 101, 200, 185, 135, 124, 198, 160, 240, 62, 104, 138, + 45, 125, 222, 81, 204, 122, 150, 210, 26, 24, 208, 12, 242, 42, 169, 101, 130, 148, 44, + 232, 249, 245, 161, 128, 113, 103, 33, 98, 166, 137, 236, 212, 7, 202, 38, 211, 69, 188, + 165, 95, 212, 118, 108, 199, 161, 22, 45, 35, 170, 90, 11, 163, 79, 173, 36, 193, 20, + 69, 35, 187, 207, 16, 144, 214, 219, 182, 170, 32, 114, 79, 128, 71, 198, 237, 15, 103, + 4, 60, 139, 175, 150, 151, 82, 230, 68, 119, 168, 89, 188, 204, 20, 140, 220, 165, 98, + 184, 91, 12, 217, 205, 92, 90, 20, 35, 71, 36, 138, 76, 96, 22, 251, 247, 173, 78, + 222, 241, 197, 134, 75, 130, 83, 96, 14, 47, 5, 113, 232, 96, 126, 193, 45, 218, 28, + 66, 253, 99, 103, 
136, 176, 200, 158, 171, 191, 76, 249, 158, 62, 190, 37, 137, 65, 120, + 233, 80, 168, 238, 193, 145, 79, 63, 82, 125, 26, 111, 191, 24, 210, 39, 161, 131, 239, + 64, 46, 175, 140, 39, 77, 202, 230, 115, 84, 40, 235, 62, 120, 148, 45, 57, 37, 124, + 121, 120, 249, 148, 231, 185, 172, 186, 224, 77, 61, 207, 141, 107, 126, 26, 147, 204, 229, + 121, 63, 58, 161, 43, 120, 25, 191, 165, 83, 228, 34, 205, 92, 27, 97, 67, 213, 13, + 253, 182, 91, 59, 133, 233, 166, 4, 4, 57, 209, 233, 179, 16, 35, 85, 59, 155, 111, + 250, 65, 194, 223, 99, 144, 59, 127, 241, 127, 85, 255, 125, 11, 90, 184, 145, 68, 95, + 150, 72, 153, 103, 49, 76, 120, 85, 161, 179, 241, 16, 174, 51, 211, 142, 150, 99, 201, + 22, 85, 73, 108, 84, 199, 120, 175, 128, 9, 243, 223, 160, 59, 120, 8, 109, 197, 128, + 194, 103, 52, 180, 119, 227, 231, 75, 113, 126, 175, 59, 148, 4, 132, 1, 89, 75, 121, + 8, 204, 131, 251, 171, 36, 55, 36, 44, 165, 233, 172, 103, 80, 224, 28, 200, 195, 3, + 20, 53, 129, 195, 112, 22, 200, 244, 23, 34, 64, 145, 42, 12, 20, 38, 184, 56, 94, + 220, 101, 3, 198, 17, 107, 22, 242, 135, 222, 182, 138, 243, 235, 11, 182, 91, 34, 127, + 80, 58, 161, 145, 203, 204, 158, 224, 242, 86, 24, 81, 51, 126, 84, 249, 143, 191, 15, + 130, 70, 238, 57, 209, 225, 36, 221, 152, 128, 255, 24, 208, 57, 186, 97, 4, 134, 255, + 229, 121, 86, 254, 202, 137, 124, 31, 130, 12, 222, 146, 142, 37, 129, 199, 247, 98, 236, + 212, 251, 108, 211, 20, 60, 13, 206, 158, 18, 84}; + +SimpleConvolution::SimpleConvolution() { + width_ = 64; + height_ = 64; + mask_width_ = 3; + mask_height_ = mask_width_; + randomize_seed_ = 0; + + if (!IsPowerOf2(width_)) { + width_ = RoundToPowerOf2(width_); + } + + if (!IsPowerOf2(height_)) { + height_ = RoundToPowerOf2(height_); + } + + if (!(mask_width_ % 2)) { + mask_width_++; + } + + if (!(mask_height_ % 2)) { + mask_height_++; + } + + if (width_ * height_ < 256) { + width_ = 64; + height_ = 64; + } + + const uint32_t input_size_bytes = width_ * height_ * sizeof(uint32_t); + 
const uint32_t mask_size_bytes = mask_width_ * mask_height_ * sizeof(float); + + SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, sizeof(kernel_args_t)); + SetInDescr(INPUT_BUF_ID, SYS_DES_ID, input_size_bytes); + SetInDescr(MASK_BUF_ID, SYS_DES_ID, mask_size_bytes); + SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, input_size_bytes); + SetHostDescr(REFOUT_BUF_ID, REFOUT_DES_ID, input_size_bytes); + + if (!randomize_seed_) TEST_ASSERT(sizeof(input_data_) <= input_size_bytes); +} + +void SimpleConvolution::Init() { + std::clog << "SimpleConvolution::init :" << std::endl; + + mem_descr_t kernarg_des = GetDescr(KERNARG_BUF_ID); + mem_descr_t input_des = GetDescr(INPUT_BUF_ID); + mem_descr_t mask_des = GetDescr(MASK_BUF_ID); + mem_descr_t output_des = GetDescr(LOCAL_BUF_ID); +#if 0 + printf("kernarg_des %p 0x%x\n", kernarg_des.ptr, kernarg_des.size); + printf("input_des %p 0x%x\n", input_des.ptr, input_des.size); + printf("mask_des %p 0x%x\n", mask_des.ptr, mask_des.size); + printf("output_des %p 0x%x\n", output_des.ptr, output_des.size); +#endif + uint32_t* input = reinterpret_cast(input_des.ptr); + uint32_t* output_local = reinterpret_cast(output_des.ptr); + float* mask = reinterpret_cast(mask_des.ptr); + kernel_args_t* kernel_args = reinterpret_cast(kernarg_des.ptr); + + if (randomize_seed_) { + // random initialisation of input + FillRandom(input, width_, height_, 0, 255, randomize_seed_); + } else { + // initialization with preset values + memcpy(input, input_data_, width_ * height_ * sizeof(uint32_t)); + } + + // Fill a blurr filter or some other filter of your choice + const float val = 1.0f / (mask_width_ * 2.0f - 1.0f); + for (uint32_t i = 0; i < (mask_width_ * mask_height_); i++) { + mask[i] = 0; + } + for (uint32_t i = 0; i < mask_width_; i++) { + uint32_t y = mask_height_ / 2; + mask[y * mask_width_ + i] = val; + } + for (uint32_t i = 0; i < mask_height_; i++) { + uint32_t x = mask_width_ / 2; + mask[i * mask_width_ + x] = val; + } + + // Print the INPUT array. 
+ std::clog << std::dec; + PrintArray("> Input[0]", input, width_, 1); + PrintArray("> Mask", mask, mask_width_, mask_height_); + + // Fill the kernel args + kernel_args->arg1 = output_local; + kernel_args->arg2 = input; + kernel_args->arg3 = mask; + kernel_args->arg4 = width_; + kernel_args->arg41 = height_; + kernel_args->arg5 = mask_width_; + kernel_args->arg51 = mask_height_; + + // Calculate the reference output + ReferenceImplementation(reinterpret_cast(GetRefOut()), input, mask, width_, height_, + mask_width_, mask_height_); +} + +void SimpleConvolution::PrintOutput(const void* ptr) const { + PrintArray("> Output[0]", reinterpret_cast(ptr), width_, 1); +} + +bool SimpleConvolution::ReferenceImplementation(uint32_t* output, const uint32_t* input, + const float* mask, const uint32_t width, + const uint32_t height, const uint32_t mask_width, + const uint32_t mask_height) { + const uint32_t vstep = (mask_width - 1) / 2; + const uint32_t hstep = (mask_height - 1) / 2; + + // for each pixel in the input + for (uint32_t x = 0; x < width; x++) { + for (uint32_t y = 0; y < height; y++) { + // find the left, right, top and bottom indices such that + // the indices do not go beyond image boundaires + const uint32_t left = (x < vstep) ? 0 : (x - vstep); + const uint32_t right = ((x + vstep) >= width) ? width - 1 : (x + vstep); + const uint32_t top = (y < hstep) ? 0 : (y - hstep); + const uint32_t bottom = ((y + hstep) >= height) ? 
height - 1 : (y + hstep); + + // initializing wighted sum value + float sum_fx = 0; + for (uint32_t i = left; i <= right; ++i) { + for (uint32_t j = top; j <= bottom; ++j) { + // performing wighted sum within the mask boundaries + uint32_t mask_idx = (j - (y - hstep)) * mask_width + (i - (x - vstep)); + uint32_t index = j * width + i; + + // to round to the nearest integer + sum_fx += ((float)input[index] * mask[mask_idx]); + } + } + sum_fx += 0.5f; + output[y * width + x] = uint32_t(sum_fx); + } + } + + return true; +} diff --git a/test/simple_convolution/simple_convolution.h b/test/simple_convolution/simple_convolution.h new file mode 100644 index 00000000..550d1320 --- /dev/null +++ b/test/simple_convolution/simple_convolution.h @@ -0,0 +1,94 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
// SimpleConvolution test kernel: holds the problem dimensions, describes the
// kernel buffers and provides a CPU reference implementation of the convolution.
class SimpleConvolution : public TestKernel {
 public:
  // Kernel buffers IDs, passed to the Set*Descr/GetDescr helpers
  enum { INPUT_BUF_ID, LOCAL_BUF_ID, MASK_BUF_ID, KERNARG_BUF_ID, REFOUT_BUF_ID };

  // Constructor: sets the problem dimensions and the buffer descriptors
  SimpleConvolution();

  // Initialize method: fills the input and mask buffers, the kernel arguments
  // and computes the reference output
  void Init();

  // Return compute grid size, one work-item per input element
  uint32_t GetGridSize() const { return width_ * height_; }

  // Print the first row (width_ elements) of the output pointed to by 'ptr'
  void PrintOutput(const void* ptr) const;

  // Return name
  std::string Name() const { return std::string("SimpleConvolution"); }

 private:
  // Local kernel arguments declaration, filled by Init()
  struct kernel_args_t {
    void* arg1;      // output buffer
    void* arg2;      // input buffer
    void* arg3;      // mask buffer
    uint32_t arg4;   // input width
    uint32_t arg41;  // input height
    uint32_t arg5;   // mask width
    uint32_t arg51;  // mask height
  };

  // Reference CPU implementation of Simple Convolution
  // @param output Output matrix after performing convolution
  // @param input  Input matrix on which convolution is to be performed
  // @param mask   mask matrix using which convolution is to be performed
  // @param width  width of the input matrix
  // @param height height of the input matrix
  // @param maskWidth  width of the mask matrix
  // @param maskHeight height of the mask matrix
  // @return bool the implementation always returns true
  bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask,
                               const uint32_t width, const uint32_t height,
                               const uint32_t maskWidth, const uint32_t maskHeight);

  // Width of the Input array
  uint32_t width_;

  // Height of the Input array
  uint32_t height_;

  // Mask width
  uint32_t mask_width_;

  // Mask height
  uint32_t mask_height_;

  // Seed for random input data; zero selects the preset input_data_ values
  unsigned randomize_seed_;

  // Preset input data
  static const uint32_t input_data_[];
};
TEST_SIMPLE_CONVOLUTION_SIMPLE_CONVOLUTION_H_ diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml new file mode 100644 index 00000000..899ca85e --- /dev/null +++ b/test/tool/gfx_metrics.xml @@ -0,0 +1,69 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/tool/input.xml b/test/tool/input.xml new file mode 100644 index 00000000..f4ecd178 --- /dev/null +++ b/test/tool/input.xml @@ -0,0 +1,14 @@ +# Filter by dispatches range, GPU index and kernel names + + +# List of metrics + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml new file mode 100644 index 00000000..a346eee9 --- /dev/null +++ b/test/tool/metrics.xml @@ -0,0 +1,205 @@ +#include "gfx_metrics.xml" + + + # average for 16 instances + + + + # sum for 16 instances + + + + + + + + + # FETCH_SIZE, kilobytes + # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + # WRITE_SIZE, kilobytes + # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + + + + # average for 16 instances + + + + # sum for 16 instances + + + + + + + + + + + # FETCH_SIZE, kilobytes + # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + # WRITE_SIZE, kilobytes + # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + + + + # GPUBusy, percentage + # The percentage of time GPU was busy. + + + # Wavefronts Total wavefronts., + + + # VALUInsts The average number of vector ALU instructions executed per work-item (affected by flow control). + + + # SALUInsts The average number of scalar ALU instructions executed per work-item (affected by flow control). 
+ + + # VFetchInsts The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. + + + # SFetchInsts The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). + + + # VWriteInsts The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. + + + # FlatVMemInsts The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. + + + # LDSInsts The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. + + + # FlatLDSInsts The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). + + + # GDSInsts The average number of GDS read or GDS write instructions executed per work item (affected by flow control). + + + # VALUUtilization The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). + + + # VALUBusy The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + + + # SALUBusy The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). + + + # FetchSize The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. + + + # WriteSize The total kilobytes written to the video memory. 
This is measured with all extra fetches and any cache or memory effects taken into account. + + + # L2CacheHit The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). + + + # MemUnitBusy The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). + + + # MemUnitStalled The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). + + + # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). + + + # The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + + + # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). + + + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp new file mode 100644 index 00000000..0eb79940 --- /dev/null +++ b/test/tool/tool.cpp @@ -0,0 +1,1048 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
// Context stored entry type.
// One entry is kept per profiling context in 'context_array'.
struct context_entry_t {
  // NOTE(review): presumably non-zero once the entry is fully set up - confirm against the callers
  uint32_t valid;
  // Key of this entry in 'context_array'; set by alloc_context_entry() and
  // used by dealloc_context_entry() to erase the entry
  uint32_t index;
  // NOTE(review): presumably the agent the dispatch was submitted to - confirm
  hsa_agent_t agent;
  // NOTE(review): presumably the profiling group of the context - confirm
  rocprofiler_group_t group;
  // Features array and its length
  // NOTE(review): ownership of the array is not visible here - confirm
  rocprofiler_feature_t* features;
  unsigned feature_count;
  // NOTE(review): presumably a copy of the dispatch callback data - confirm
  rocprofiler_callback_data_t data;
  // NOTE(review): presumably the file the results of this context are written to - confirm
  FILE* file_handle;
};
// Extract the bare kernel name from a kernel signature.
//
// Scanning from the end of 'name':
//   1. trailing balanced "(...)" and "<...>" groups are skipped
//      (argument and template lists);
//   2. whitespace in front of those groups is skipped;
//   3. the name is the run of characters back to the previous ' ' or ':'
//      (return type or namespace/class qualifier) or to the string start.
// E.g. "void ns::foo<int>(int, float)" gives "foo".
// An empty string is returned for an empty input or when a trailing group
// is not balanced.
std::string filtr_kernel_name(const std::string name) {
  auto rit = name.rbegin();
  const auto rend = name.rend();

  // Skip the trailing bracket groups, tracking the nesting depth
  uint32_t depth = 0;
  char open_token = 0;
  char close_token = 0;
  while (rit != rend) {
    if (depth == 0) {
      if (*rit == ')') {
        open_token = ')';
        close_token = '(';
      } else if (*rit == '>') {
        open_token = '>';
        close_token = '<';
      } else {
        // Not a bracket group, the name ends here
        break;
      }
      depth = 1;
    } else if (*rit == open_token) {
      ++depth;
    } else if (*rit == close_token) {
      --depth;
    }
    ++rit;
  }

  // The end check has to come before the dereference: 'rit' equals 'rend' for
  // an empty or unbalanced name, and dereferencing 'rend' is undefined.
  // The original compared against ' ' twice; the second test was presumably
  // meant to be a tab.
  while ((rit != rend) && ((*rit == ' ') || (*rit == '\t'))) ++rit;
  const auto name_end = rit;

  // Walk back to the separator in front of the name; a plain name without
  // any separator runs up to 'rend'
  while ((rit != rend) && (*rit != ' ') && (*rit != ':')) ++rit;

  const auto pos = static_cast<std::string::size_type>(rend - rit);
  const auto length = static_cast<std::string::size_type>(rit - name_end);
  return name.substr(pos, length);
}
trace data to file +void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, const uint32_t& size) { + if (result_prefix != NULL) { + // Open SQTT file + std::ostringstream oss; + oss << result_prefix << "/thread_trace_" << label << "_se" << chunk << ".out"; + FILE* file = fopen(oss.str().c_str(), "w"); + if (file == NULL) { + std::ostringstream errmsg; + errmsg << "fopen error, file '" << oss.str().c_str() << "'"; + perror(errmsg.str().c_str()); + abort(); + } + + // Write the buffer in terms of shorts (16 bits) + const unsigned short* ptr = reinterpret_cast(data); + for (uint32_t i = 0; i < (size / sizeof(short)); ++i) { + fprintf(file, "%04x\n", ptr[i]); + } + + // Close SQTT file + fclose(file); + } +} + +struct trace_data_arg_t { + FILE* file; + const char* label; + hsa_agent_t agent; +}; + +// Trace data callback for getting trace data from GPU local mamory +hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { + hsa_status_t status = HSA_STATUS_SUCCESS; + trace_data_arg_t* arg = reinterpret_cast(data); + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { + const void* data_ptr = info_data->sqtt_data.ptr; + const uint32_t data_size = info_data->sqtt_data.size; + fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); + + if (is_sqtt_local) { + HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); + const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); + const uint32_t mem_size = data_size; + void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); + if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { + fatal("SQTT data memcopy to host failed"); + } + dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); + HsaRsrcFactory::FreeMemory(buffer); + } else { + dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); + } + } else + status = HSA_STATUS_ERROR; + return 
status; +} + +// Align to specified alignment +unsigned align_size(unsigned size, unsigned alignment) { + return ((size + alignment - 1) & ~(alignment - 1)); +} + +// Output profiling results for input features +void output_results(const context_entry_t* entry, const char* label) { + FILE* file = entry->file_handle; + const rocprofiler_feature_t* features = entry->features; + const unsigned feature_count = entry->feature_count; + rocprofiler_t* context = entry->group.context; + + for (unsigned i = 0; i < feature_count; ++i) { + const rocprofiler_feature_t* p = &features[i]; + fprintf(file, " %s ", p->name); + switch (p->data.kind) { + // Output metrics results + case ROCPROFILER_DATA_KIND_INT64: + fprintf(file, "(%lu)\n", p->data.result_int64); + break; + // Output trace results + case ROCPROFILER_DATA_KIND_BYTES: { + if (p->data.result_bytes.copy) { + uint64_t size = 0; + + const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); + const char* end = reinterpret_cast(ptr + p->data.result_bytes.size); + for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { + const uint32_t chunk_size = *reinterpret_cast(ptr); + const char* chunk_data = ptr + sizeof(uint32_t); + if (chunk_data >= end) fatal("SQTT data is out of the result buffer size"); + + dump_sqtt_trace(label, i, chunk_data, chunk_size); + const uint32_t off = align_size(chunk_size, sizeof(uint32_t)); + ptr = chunk_data + off; + if (chunk_data >= end) fatal("SQTT data ptr is out of the result buffer size"); + size += chunk_size; + } + fprintf(file, "size(%lu)\n", size); + HsaRsrcFactory::FreeMemory(p->data.result_bytes.ptr); + const_cast(p)->data.result_bytes.size = 0; + } else { + fprintf(file, "(\n"); + trace_data_arg_t trace_data_arg{file, label, entry->agent}; + hsa_status_t status = rocprofiler_iterate_trace_data(context, trace_data_cb, reinterpret_cast(&trace_data_arg)); + check_status(status); + fprintf(file, " )\n"); + } + break; + } + default: + fprintf(stderr, "RPL-tool: undefined 
data kind(%u)\n", p->data.kind); + abort(); + } + } +} + +// Output group intermeadate profiling results, created internally for complex metrics +void output_group(const context_entry_t* entry, const char* label) { + const rocprofiler_group_t* group = &(entry->group); + context_entry_t group_entry = *entry; + for (unsigned i = 0; i < group->feature_count; ++i) { + if (group->features[i]->data.kind == ROCPROFILER_DATA_KIND_INT64) { + group_entry.features = group->features[i]; + group_entry.feature_count = 1; + output_results(&group_entry, label); + } + } +} + +// Dump stored context profiling output data +bool dump_context(context_entry_t* entry) { + hsa_status_t status = HSA_STATUS_ERROR; + + if (entry->valid == 0) return true; + + const rocprofiler_dispatch_record_t* record = entry->data.record; + if (record) { + if (record->complete == 0) { + return false; + } + } + + ++context_collected; + + const uint32_t index = entry->index; + FILE* file_handle = entry->file_handle; + const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); + + fprintf(file_handle, "dispatch[%u], queue_index(%lu), kernel_name(\"%s\")", + index, + entry->data.queue_index, + nik_name.c_str()); + if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(file_handle, "\n"); + fflush(file_handle); + + if (record) { + delete record; + entry->data.record = NULL; + } + + rocprofiler_group_t& group = entry->group; + if (group.context != NULL) { + status = rocprofiler_group_get_data(&group); + check_status(status); + if (verbose == 1) output_group(entry, "group0-data"); + + status = rocprofiler_get_metrics(group.context); + check_status(status); + std::ostringstream oss; + oss << index << "__" << filtr_kernel_name(entry->data.kernel_name); + output_results(entry, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); + free(const_cast(entry->data.kernel_name)); + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + rocprofiler_close(group.context); + } + + entry->valid = 0; + return true; +} + +// Dump and clean a given context entry +static inline bool dump_context_entry(context_entry_t* entry) { + const bool ret = dump_context(entry); + if (ret) dealloc_context_entry(entry); + return ret; +} + +// Dump waiting entries +static inline void dump_wait_list() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + auto it = wait_list->begin(); + auto end = wait_list->end(); + while (it != end) { + auto cur = it++; + if (dump_context_entry(*cur)) { + wait_list->erase(cur); + } + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } +} + +// Dump all stored contexts profiling output data +void dump_context_array() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + if (context_array) { + if (!wait_list->empty()) 
dump_wait_list(); + + auto it = context_array->begin(); + auto end = context_array->end(); + while (it != end) { + auto cur = it++; + dump_context(&(cur->second)); + } + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } +} + +// Profiling completion handler +bool handler(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + if (!wait_list->empty()) dump_wait_list(); + + if (!dump_context_entry(entry)) { + wait_list->push_back(entry); + } + + if (trace_on) { + fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); + fflush(stdout); + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + +bool check_filter(const rocprofiler_callback_data_t* callback_data, const callbacks_data_t* tool_data) { + bool found = true; + + std::vector* range_ptr = tool_data->range; + if (found && range_ptr) { + found = false; + std::vector& range = *range_ptr; + if (range.size() == 1) { + if (context_count >= range[0]) found = true; + } else if (range.size() == 2) { + if ((context_count >= range[0]) && (context_count < range[1])) found = true; + } + } + std::vector* gpu_index = tool_data->gpu_index; + if (found && gpu_index) { + found = false; + for (uint32_t i : *gpu_index) { + if (i == callback_data->agent_index) { + found = true; + } + } + } + std::vector* kernel_string = tool_data->kernel_string; + if (found && kernel_string) { + found = false; + for (const std::string& s : *kernel_string) { + if (std::string(callback_data->kernel_name).find(s) != std::string::npos) { + found = true; + } + } + } + + return found; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + // Passed tool data + 
callbacks_data_t* tool_data = reinterpret_cast(user_data); + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Checking dispatch condition + if (tool_data->filter_on == 1) { + if (check_filter(callback_data, tool_data) == false) { + next_context_count(); + return HSA_STATUS_SUCCESS; + } + } + // Profiling context + rocprofiler_t* context = NULL; + // Context entry + context_entry_t* entry = alloc_context_entry(); + // context properties + rocprofiler_properties_t properties{}; + properties.handler = (result_prefix != NULL) ? handler : NULL; + properties.handler_arg = (void*)entry; + + rocprofiler_feature_t* features = tool_data->features; + unsigned feature_count = tool_data->feature_count; + + if (tool_data->set != NULL) { + uint32_t set_offset = 0; + uint32_t next_offset = 0; + const auto entry_index = entry->index; + if (entry_index < (tool_data->set->size() - 1)) { + set_offset = (*(tool_data->set))[entry_index]; + next_offset = (*(tool_data->set))[entry_index + 1]; + } else { + set_offset = tool_data->set->back(); + next_offset = feature_count; + } + features += set_offset; + feature_count = next_offset - set_offset; + } + + if (tool_data->feature_count > 0) { + // Open profiling context + status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Check that we have only one profiling group + uint32_t group_count = 0; + status = rocprofiler_group_count(context, &group_count); + check_status(status); + assert(group_count == 1); + // Get group[0] + const uint32_t group_index = 0; + status = rocprofiler_get_group(context, group_index, group); + check_status(status); + } + + // Fill profiling context entry + entry->agent = callback_data->agent; + entry->group = *group; + entry->features = features; + entry->feature_count = feature_count; + entry->data = *callback_data; + entry->data.kernel_name = strdup(callback_data->kernel_name); + 
entry->file_handle = tool_data->file_handle; + entry->valid = 1; + + if (trace_on) { + fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); + fflush(stdout); + } + + return status; +} + +hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { + if (result_file_opened == false) printf("\nROCProfiler results:\n"); + dump_context_array(); + return HSA_STATUS_SUCCESS; +} + +static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg) { + const char symb = *reinterpret_cast(arg); + if (((symb == 'b') && (info.metric.expr == NULL)) || + ((symb == 'd') && (info.metric.expr != NULL))) + { + printf("\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); + if (info.metric.expr != NULL) printf(" %s = %s\n", info.metric.name, info.metric.expr); + } + return HSA_STATUS_SUCCESS; +} + +std::string normalize_token(const std::string token, bool not_empty, std::string label) { + const std::string space_chars_set = " \t"; + const size_t first_pos = token.find_first_not_of(space_chars_set); + size_t norm_len = 0; + std::string error_str = "none"; + if (first_pos != std::string::npos) { + const size_t last_pos = token.find_last_not_of(space_chars_set); + if (last_pos == std::string::npos) error_str = "token string error: \"" + token + "\""; + else { + const size_t end_pos = last_pos + 1; + if (end_pos <= first_pos) error_str = "token string error: \"" + token + "\""; + else norm_len = end_pos - first_pos; + } + } + if (((first_pos != std::string::npos) && (norm_len == 0)) || + ((first_pos == std::string::npos) && not_empty)) { + fatal(label + ": " + error_str); + } + return (norm_len != 0) ? 
token.substr(first_pos, norm_len) : std::string(""); +} + +int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { + int parse_iter = 0; + auto nodes = xml->GetNodes(tag); + auto rit = nodes.rbegin(); + auto rend = nodes.rend(); + while (rit != rend) { + auto& opts = (*rit)->opts; + if (opts.find(field) != opts.end()) break; + ++rit; + } + if (rit != rend) { + const std::string array_string = (*rit)->opts[field]; + if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str()); + size_t pos1 = 0; + const size_t string_len = array_string.length(); + while (pos1 < string_len) { + const size_t pos2 = array_string.find(delim, pos1); + const bool found = (pos2 != std::string::npos); + const size_t token_len = (pos2 != std::string::npos) ? pos2 - pos1 : string_len - pos1; + const std::string token = array_string.substr(pos1, token_len); + const std::string norm_str = normalize_token(token, found, "Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); + if (norm_str.length() != 0) vec->push_back(norm_str); + if (!found) break; + pos1 = pos2 + 1; + ++parse_iter; + } + } + + return parse_iter; +} + +int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { + std::vector str_vec; + const int parse_iter = get_xml_array(xml, tag, field, delim, &str_vec, label); + for (const std::string& str : str_vec) vec->push_back(atoi(str.c_str())); + return parse_iter; +} + +static inline void check_env_var(const char* var_name, uint32_t& val) { + const char* str = getenv(var_name); + if (str != NULL ) val = atol(str); +} +static inline void check_env_var(const char* var_name, uint64_t& val) { + const char* str = getenv(var_name); + if (str != NULL ) val = atoll(str); +} + +// Tool constructor +extern "C" PUBLIC_API void 
OnLoadToolProp(rocprofiler_settings_t* settings) +{ + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + if (is_loaded) return; + is_loaded = true; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + // Loading configuration rcfile + std::string rcpath = std::string("./") + rcfile_name; + xml::Xml* rcfile = xml::Xml::Create(rcpath); + const char* home_dir = getenv("HOME"); + if (rcfile == NULL && home_dir != NULL) { + rcpath = std::string(home_dir) + "/" + rcfile_name; + rcfile = xml::Xml::Create(rcpath); + } + const char* pkg_dir = getenv("ROCP_PACKAGE_DIR"); + if (rcfile == NULL && pkg_dir != NULL) { + rcpath = std::string(pkg_dir) + "/" + rcfile_name; + rcfile = xml::Xml::Create(rcpath); + } + if (rcfile != NULL) { + // Getting defaults + printf("ROCProfiler: rc-file '%s'\n", rcpath.c_str()); + auto defaults_list = rcfile->GetNodes("top.defaults"); + for (auto* entry : defaults_list) { + const auto& opts = entry->opts; + auto it = opts.find("basenames"); + if (it != opts.end()) { to_truncate_names = (it->second == "on") ? 1 : 0; } + it = opts.find("timestamp"); + if (it != opts.end()) { settings->timestamp_on = (it->second == "on") ? 
1 : 0; } + it = opts.find("ctx-limit"); + if (it != opts.end()) { CTX_OUTSTANDING_MAX = atol(it->second.c_str()); } + it = opts.find("heartbeat"); + if (it != opts.end()) { CTX_OUTSTANDING_MON = atol(it->second.c_str()); } + it = opts.find("sqtt-size"); + if (it != opts.end()) { + std::string str = normalize_token(it->second, true, "option sqtt-size"); + uint32_t multiplier = 1; + switch (str.back()) { + case 'K': multiplier = 1024; break; + case 'M': multiplier = 1024 * 1024; break; + } + if (multiplier != 1) str = str.substr(0, str.length() - 1); + settings->sqtt_size = strtoull(str.c_str(), NULL, 0) * multiplier; + } + it = opts.find("sqtt-local"); + if (it != opts.end()) { settings->sqtt_local = (it->second == "on"); } + } + } + // Enable verbose mode + check_env_var("ROCP_VERBOSE_MODE", verbose); + // Enable kernel names truncating + check_env_var("ROCP_TRUNCATE_NAMES", to_truncate_names); + // Set outstanding dispatches parameter + check_env_var("ROCP_OUTSTANDING_MAX", CTX_OUTSTANDING_MAX); + check_env_var("ROCP_OUTSTANDING_MON", CTX_OUTSTANDING_MON); + // Enable timestamping + check_env_var("ROCP_TIMESTAMP_ON", settings->timestamp_on); + // Set data timeout + check_env_var("ROCP_DATA_TIMEOUT", settings->timeout); + // Set SQTT size + check_env_var("ROCP_SQTT_SIZE", settings->sqtt_size); + // Set SQTT local buffer + check_env_var("ROCP_SQTT_LOCAL", settings->sqtt_local); + + is_sqtt_local = settings->sqtt_local; + + // Printing out info + char* info_symb = getenv("ROCP_INFO"); + if (info_symb != NULL) { + if (*info_symb != 'b' && *info_symb != 'd') { + fprintf(stderr, "ROCProfiler: bad info symbol '%c', ROCP_INFO env", *info_symb); + } else { + if (*info_symb == 'b') printf("Basic HW counters:\n"); + else printf("Derived metrics:\n"); + rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); + } + exit(1); + } + + // Set output file + result_prefix = getenv("ROCP_OUTPUT_DIR"); + if (result_prefix != NULL) { + DIR* dir = 
opendir(result_prefix); + if (dir == NULL) { + std::ostringstream errmsg; + errmsg << "ROCProfiler: Cannot open output directory '" << result_prefix << "'"; + perror(errmsg.str().c_str()); + abort(); + } + std::ostringstream oss; + oss << result_prefix << "/results.txt"; + result_file_handle = fopen(oss.str().c_str(), "w"); + if (result_file_handle == NULL) { + std::ostringstream errmsg; + errmsg << "ROCProfiler: fopen error, file '" << oss.str().c_str() << "'"; + perror(errmsg.str().c_str()); + abort(); + } + } else result_file_handle = stdout; + + result_file_opened = (result_prefix != NULL) && (result_file_handle != NULL); + + // Getting input + const char* xml_name = getenv("ROCP_INPUT"); + if (xml_name == NULL) fatal("ROCProfiler: input is not specified, ROCP_INPUT env"); + printf("ROCProfiler: input from \"%s\"\n", xml_name); + xml::Xml* xml = xml::Xml::Create(xml_name); + if (xml == NULL) { + fprintf(stderr, "ROCProfiler: Input file not found '%s'\n", xml_name); + abort(); + } + + // Getting metrics + std::vector metrics_vec; + get_xml_array(xml, "top.metric", "name", ",", &metrics_vec); + + // Metrics set + metrics_set = new std::vector; + get_xml_array(xml, "top.metric", "set", ",", metrics_set, " "); + if (metrics_set->size() != 0) { + uint32_t accum = 0; + metrics_set->insert(metrics_set->begin(), 0); + for (auto it = metrics_set->begin(); it != metrics_set->end(); ++it) { + accum += *it; + *it = accum; + } + } + + // Getting GPU indexes + gpu_index_vec = new std::vector; + get_xml_array(xml, "top.metric", "gpu_index", ",", gpu_index_vec, " "); + + // Getting kernel names + kernel_string_vec = new std::vector; + get_xml_array(xml, "top.metric", "kernel", ",", kernel_string_vec, " "); + + // Getting profiling range + range_vec = new std::vector; + const int range_parse_iter = get_xml_array(xml, "top.metric", "range", ":", range_vec, " "); + if ((range_vec->size() > 2) || (range_parse_iter > 1)) + { + fatal("Bad range format, input file " + 
xml->GetName()); + } + if ((range_vec->size() == 1) && (range_parse_iter == 0)) { + range_vec->push_back(*(range_vec->begin()) + 1); + } + + // Getting traces + auto traces_list = xml->GetNodes("top.trace"); + + const unsigned feature_count = metrics_vec.size() + traces_list.size(); + rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; + memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); + + printf(" %d metrics\n", (int)metrics_vec.size()); + for (unsigned i = 0; i < metrics_vec.size(); ++i) { + const std::string& name = metrics_vec[i]; + printf("%s%s", (i == 0) ? " " : ", ", name.c_str()); + features[i] = {}; + features[i].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[i].name = strdup(name.c_str()); + } + if (metrics_vec.size()) printf("\n"); + + printf(" %d traces\n", (int)traces_list.size()); + unsigned index = metrics_vec.size(); + for (auto* entry : traces_list) { + auto params_list = xml->GetNodes("top.trace.parameters"); + if (params_list.size() > 1) { + fatal("ROCProfiler: Single input 'parameters' section is supported"); + } + std::string name = ""; + bool to_copy_data = false; + for (const auto& opt : entry->opts) { + if (opt.first == "name") name = opt.second; + else if (opt.first == "copy") to_copy_data = (opt.second == "true"); + else fatal("ROCProfiler: Bad trace property '" + opt.first + "'"); + } + if (name == "") fatal("ROCProfiler: Bad trace properties, name is not specified"); + + std::map parameters_dict; + parameters_dict["TARGET_CU"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; + parameters_dict["VM_ID_MASK"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK; + parameters_dict["MASK"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; + parameters_dict["TOKEN_MASK"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; + parameters_dict["TOKEN_MASK2"] = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; + parameters_dict["SE_MASK"] = + 
HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK; + + printf(" %s (", name.c_str()); + features[index] = {}; + features[index].kind = ROCPROFILER_FEATURE_KIND_TRACE; + features[index].name = strdup(name.c_str()); + features[index].data.result_bytes.copy = to_copy_data; + + for (auto* params : params_list) { + const unsigned parameter_count = params->opts.size(); + rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; + unsigned p_index = 0; + for (auto& v : params->opts) { + const std::string parameter_name = v.first; + if (parameters_dict.find(parameter_name) == parameters_dict.end()) { + fprintf(stderr, "ROCProfiler: unknown trace parameter '%s'\n", parameter_name.c_str()); + abort(); + } + const uint32_t value = strtol(v.second.c_str(), NULL, 0); + printf("\n %s = 0x%x", parameter_name.c_str(), value); + parameters[p_index] = {}; + parameters[p_index].parameter_name = parameters_dict[parameter_name]; + parameters[p_index].value = value; + ++p_index; + } + + features[index].parameters = parameters; + features[index].parameter_count = parameter_count; + } + if (params_list.empty() == false) printf("\n "); + printf(")\n"); + fflush(stdout); + ++index; + } + fflush(stdout); + + // Context array aloocation + context_array = new context_array_t; + wait_list = new wait_list_t; + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{0}; + callbacks_ptrs.dispatch = dispatch_callback; + callbacks_ptrs.destroy = destroy_callback; + + callbacks_data = new callbacks_data_t{}; + callbacks_data->features = features; + callbacks_data->feature_count = feature_count; + callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; + callbacks_data->group_index = 0; + callbacks_data->file_handle = result_file_handle; + callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; + callbacks_data->kernel_string = (kernel_string_vec->empty()) ? 
NULL : kernel_string_vec; + callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; + callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || + (callbacks_data->kernel_string != NULL) || + (callbacks_data->range != NULL) + ? 1 : 0; + + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); + + xml::Xml::Destroy(xml); + + if (CTX_OUTSTANDING_MON != 0) { + pthread_t thread; + pthread_attr_t attr; + int err = pthread_attr_init(&attr); + if (err) { errno = err; perror("pthread_attr_init"); abort(); } + err = pthread_create(&thread, &attr, monitor_thr_fun, NULL); + } +} + +// Tool destructor +extern "C" PUBLIC_API void OnUnloadTool() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + if (!is_loaded) return; + is_loaded = false; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + // Unregister dispatch callback + rocprofiler_remove_queue_callbacks(); + + // Dump stored profiling output data + printf("\nROCPRofiler: %u contexts collected", context_collected); + if (result_file_opened) printf(", output directory %s", result_prefix); + printf("\n"); fflush(stdout); + dump_context_array(); + if (wait_list) { + if (!wait_list->empty()) { + printf("\nWaiting for pending kernels ..."); fflush(stdout); + while (wait_list->size() != 0) { + usleep(1000); + dump_wait_list(); + } + printf(".done\n"); fflush(stdout); + } + } + if (result_file_opened) fclose(result_file_handle); + + // Cleanup + if (callbacks_data != NULL) { + delete[] callbacks_data->features; + delete callbacks_data; + callbacks_data = NULL; + } + delete metrics_set; + metrics_set = NULL; + delete gpu_index_vec; + gpu_index_vec = NULL; + delete kernel_string_vec; + kernel_string_vec = NULL; + delete range_vec; + range_vec = NULL; + delete context_array; + context_array = NULL; + delete wait_list; + wait_list = NULL; +} + +extern "C" DESTRUCTOR_API void destructor() { + if (is_loaded == true) 
OnUnloadTool(); +} diff --git a/test/util/helper_funcs.h b/test/util/helper_funcs.h new file mode 100644 index 00000000..c76854ba --- /dev/null +++ b/test/util/helper_funcs.h @@ -0,0 +1,86 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_UTIL_HELPER_FUNCS_H_ +#define TEST_UTIL_HELPER_FUNCS_H_ + +#include +#include +#include +#include +#include + +static inline void Error(std::string error_msg) { + std::cerr << "Error: " << error_msg << std::endl; +} + +template +void PrintArray(const std::string header, const T* data, const int width, const int height) { + std::clog << header << " :\n"; + for (int i = 0; i < height; i++) { + std::clog << "> "; + for (int j = 0; j < width; j++) { + std::clog << data[i * width + j] << " "; + } + std::clog << "\n"; + } +} + +template +bool FillRandom(T* array_ptr, const int width, const int height, const T range_min, + const T range_max, unsigned int seed = 123) { + if (!array_ptr) { + Error("Cannot fill array. NULL pointer."); + return false; + } + + if (!seed) seed = (unsigned int)time(NULL); + + srand(seed); + double range = double(range_max - range_min) + 1.0; + + /* random initialisation of input */ + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + int index = i * width + j; + array_ptr[index] = range_min + T(range * rand() / (RAND_MAX + 1.0)); + } + + return true; +} + +template T RoundToPowerOf2(T val) { + int bytes = sizeof(T); + + val--; + for (int i = 0; i < bytes; i++) val |= val >> (1 << i); + val++; + + return val; +} + +template bool IsPowerOf2(T val) { + long long long_val = val; + return (((long_val & (-long_val)) - long_val == 0) && (long_val != 0)); +} + +#endif // TEST_UTIL_HELPER_FUNCS_H_ diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp new file mode 100644 index 00000000..5116a3a8 --- /dev/null +++ b/test/util/hsa_rsrc_factory.cpp @@ -0,0 +1,556 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "util/hsa_rsrc_factory.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// Callback function to get available in the system agents +hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { + hsa_status_t status = HSA_STATUS_ERROR; + HsaRsrcFactory* hsa_rsrc = reinterpret_cast(data); + const AgentInfo* agent_info = hsa_rsrc->AddAgentInfo(agent); + if (agent_info != NULL) status = HSA_STATUS_SUCCESS; + return status; +} + +// This function checks to see if the provided +// pool has the HSA_AMD_SEGMENT_GLOBAL property. If the kern_arg flag is true, +// the function adds an additional requirement that the pool have the +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT property. 
If kern_arg is false, +// pools must NOT have this property. +// Upon finding a pool that meets these conditions, HSA_STATUS_INFO_BREAK is +// returned. HSA_STATUS_SUCCESS is returned if no errors were encountered, but +// no pool was found meeting the requirements. If an error is encountered, we +// return that error. +static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool kern_arg) { + hsa_status_t err; + hsa_amd_segment_t segment; + uint32_t flag; + + if (nullptr == data) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + if (HSA_AMD_SEGMENT_GLOBAL != segment) { + return HSA_STATUS_SUCCESS; + } + + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + CHECK_STATUS("hsa_amd_memory_pool_get_info", err); + + uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; + + if ((karg_st == 0 && kern_arg) || (karg_st != 0 && !kern_arg)) { + return HSA_STATUS_SUCCESS; + } + + *(reinterpret_cast(data)) = pool; + return HSA_STATUS_INFO_BREAK; +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that is NOT +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindStandardPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, false); +} + +// This is the call-back function for hsa_amd_agent_iterate_memory_pools() that +// finds a pool with the properties of HSA_AMD_SEGMENT_GLOBAL and that IS +// HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT +hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { + return FindGlobalPool(pool, data, true); +} + +// Constructor of the class +HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { + hsa_status_t status; + // Initialize the Hsa 
Runtime + if (initialize_hsa_) { + status = hsa_init(); + CHECK_STATUS("Error in hsa_init", status); + } + // Discover the set of Gpu devices available on the platform + status = hsa_iterate_agents(GetHsaAgentsCallback, this); + CHECK_STATUS("Error Calling hsa_iterate_agents", status); + + // Get AqlProfile API table + aqlprofile_api_ = {0}; +#ifdef ROCP_LD_AQLPROFILE + status = LoadAqlProfileLib(&aqlprofile_api_); +#else + status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_); +#endif + CHECK_STATUS("aqlprofile API table load failed", status); + + // Get Loader API table + loader_api_ = {0}; + status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); + CHECK_STATUS("loader API table query failed", status); +} + +// Destructor of the class +HsaRsrcFactory::~HsaRsrcFactory() { + for (auto p : cpu_list_) delete p; + for (auto p : gpu_list_) delete p; + if (initialize_hsa_) { + hsa_status_t status = hsa_shut_down(); + CHECK_STATUS("Error in hsa_shut_down", status); + } +} + +hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { + void* handle = dlopen(kAqlProfileLib, RTLD_NOW); + if (handle == NULL) { + fprintf(stderr, "Loading '%s' failed, %s\n", kAqlProfileLib, dlerror()); + return HSA_STATUS_ERROR; + } + dlerror(); /* Clear any existing error */ + + api->hsa_ven_amd_aqlprofile_error_string = + (decltype(::hsa_ven_amd_aqlprofile_error_string)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_error_string"); + api->hsa_ven_amd_aqlprofile_validate_event = + (decltype(::hsa_ven_amd_aqlprofile_validate_event)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_validate_event"); + api->hsa_ven_amd_aqlprofile_start = + (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); + api->hsa_ven_amd_aqlprofile_stop = + (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); + api->hsa_ven_amd_aqlprofile_read = + 
(decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); + api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = + (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + api->hsa_ven_amd_aqlprofile_get_info = (decltype(::hsa_ven_amd_aqlprofile_get_info)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_get_info"); + api->hsa_ven_amd_aqlprofile_iterate_data = + (decltype(::hsa_ven_amd_aqlprofile_iterate_data)*)dlsym( + handle, "hsa_ven_amd_aqlprofile_iterate_data"); + + return HSA_STATUS_SUCCESS; +} + +// Add system agent info +const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { + // Determine if device is a Gpu agent + hsa_status_t status; + AgentInfo* agent_info = NULL; + + hsa_device_type_t type; + status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + CHECK_STATUS("Error Calling hsa_agent_get_info", status); + + if (type == HSA_DEVICE_TYPE_CPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_CPU; + agent_info->dev_index = cpu_list_.size(); + + status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status); + status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status); + agent_info->gpu_pool = {}; + + cpu_list_.push_back(agent_info); + cpu_agents_.push_back(agent); + } + + if (type == HSA_DEVICE_TYPE_GPU) { + agent_info = new AgentInfo{}; + agent_info->dev_id = agent; + agent_info->dev_type = HSA_DEVICE_TYPE_GPU; + hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + strncpy(agent_info->gfxip, agent_info->name, 4); + agent_info->gfxip[4] = '\0'; + hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + hsa_agent_get_info(agent, 
HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? true : false; + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), + &agent_info->cu_num); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), + &agent_info->waves_per_cu); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), + &agent_info->simds_per_cu); + hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), + &agent_info->se_num); + hsa_agent_get_info(agent, + static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), + &agent_info->shader_arrays_per_se); + + agent_info->cpu_pool = {}; + agent_info->kern_arg_pool = {}; + status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); + + // Set GPU index + agent_info->dev_index = gpu_list_.size(); + gpu_list_.push_back(agent_info); + gpu_agents_.push_back(agent); + } + + if (agent_info) agent_map_[agent.handle] = agent_info; + + return agent_info; +} + +// Return systen agent info +const AgentInfo* HsaRsrcFactory::GetAgentInfo(const hsa_agent_t agent) { + const AgentInfo* agent_info = NULL; + auto it = agent_map_.find(agent.handle); + if (it != agent_map_.end()) { + agent_info = it->second; + } + return agent_info; +} + +// Get the count of Hsa Gpu Agents available on the platform +// +// @return uint32_t Number of Gpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfGpuAgents() { return uint32_t(gpu_list_.size()); } + +// Get the count of Hsa Cpu Agents available on the platform +// +// @return uint32_t Number of Cpu agents on platform +// +uint32_t HsaRsrcFactory::GetCountOfCpuAgents() { return uint32_t(cpu_list_.size()); } + +// Get the AgentInfo handle of a Gpu device +// +// @param idx Gpu Agent 
at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(gpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = gpu_list_[idx]; + + return true; +} + +// Get the AgentInfo handle of a Cpu device +// +// @param idx Cpu Agent at specified index +// +// @param agent_info Output parameter updated with AgentInfo +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) { + // Determine if request is valid + uint32_t size = uint32_t(cpu_list_.size()); + if (idx >= size) { + return false; + } + + // Copy AgentInfo from specified index + *agent_info = cpu_list_[idx]; + return true; +} + +// Create a Queue object and return its handle. The queue object is expected +// to support user requested number of Aql dispatch packets. +// +// @param agent_info Gpu Agent on which to create a queue object +// +// @param num_Pkts Number of packets to be held by queue +// +// @param queue Output parameter updated with handle of queue object +// +// @return bool true if successful, false otherwise +// +bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, + hsa_queue_t** queue) { + hsa_status_t status; + status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + UINT32_MAX, UINT32_MAX, queue); + return (status == HSA_STATUS_SUCCESS); +} + +// Create a Signal object and return its handle. 
+// @param value Initial value of signal object +// @param signal Output parameter updated with handle of signal object +// @return bool true if successful, false otherwise +bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* signal) { + hsa_status_t status; + status = hsa_signal_create(value, 0, NULL, signal); + return (status == HSA_STATUS_SUCCESS); +} + +// Allocate memory for use by a kernel of specified size in specified +// agent's memory region. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate memory to pass kernel parameters. +// Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + if (!cpu_agents_.empty()) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, reinterpret_cast(&buffer)); + // Both the CPU and GPU can access the kernel arguments + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? 
buffer : NULL; + return ptr; +} + +// Allocate system memory accessible by both CPU and GPU +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + uint8_t* buffer = NULL; + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + if (!cpu_agents_.empty()) { + status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, reinterpret_cast(&buffer)); + // Both the CPU and GPU can access the memory + if (status == HSA_STATUS_SUCCESS) { + hsa_agent_t ag_list[1] = {agent_info->dev_id}; + status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + } + } + uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; + return ptr; +} + +// Allocate memory for command buffer. +// @param agent_info Agent from whose memory region to allocate +// @param size Size of memory in terms of bytes +// @return uint8_t* Pointer to buffer, null if allocation fails. +uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t size) { + size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; + uint8_t* ptr = (agent_info->is_apu && CMD_MEMORY_MMAP) + ? 
reinterpret_cast( + mmap(NULL, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_SHARED | MAP_ANONYMOUS, 0, 0)) + : AllocateSysMemory(agent_info, size); + return ptr; +} + +// Copy data from GPU to host memory +bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { + hsa_status_t status = HSA_STATUS_ERROR; + if (!cpu_agents_.empty()) { + hsa_signal_t s = {}; + status = hsa_signal_create(1, 0, NULL, &s); + if (status == HSA_STATUS_SUCCESS) { + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + if (status == HSA_STATUS_SUCCESS) { + if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, + HSA_WAIT_STATE_BLOCKED) != 0) { + status = HSA_STATUS_ERROR; + } + } + status = hsa_signal_destroy(s); + } + } + return (status == HSA_STATUS_SUCCESS); +} +bool HsaRsrcFactory::Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size) { + return Memcpy(agent_info->dev_id, dst, src, size); +} + +// Memory free method +bool HsaRsrcFactory::FreeMemory(void* ptr) { + const hsa_status_t status = hsa_memory_free(ptr); + CHECK_STATUS("hsa_memory_free", status); + return (status == HSA_STATUS_SUCCESS); +} + +// Loads an Assembled Brig file and Finalizes it into Device Isa +// @param agent_info Gpu device for which to finalize +// @param brig_path File path of the Assembled Brig file +// @param kernel_name Name of the kernel to finalize +// @param code_desc Handle of finalized Code Descriptor that could +// be used to submit for execution +// @return bool true if successful, false otherwise +bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, + const char* kernel_name, hsa_executable_t* executable, + hsa_executable_symbol_t* code_desc) { + hsa_status_t status = HSA_STATUS_ERROR; + + // Build the code object filename + std::string filename(brig_path); + std::clog << "Code object filename: " << filename << std::endl; + + // Open the file 
containing code object + hsa_file_t file_handle = open(filename.c_str(), O_RDONLY); + if (file_handle == -1) { + std::cerr << "Error: failed to load '" << filename << "'" << std::endl; + assert(false); + return false; + } + + // Create code object reader + hsa_code_object_reader_t code_obj_rdr = {0}; + status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + if (status != HSA_STATUS_SUCCESS) { + std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; + return false; + } + + // Create executable. + status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + NULL, executable); + CHECK_STATUS("Error in creating executable object", status); + + // Load code object. + status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, + NULL, NULL); + CHECK_STATUS("Error in loading executable object", status); + + // Freeze executable. + status = hsa_executable_freeze(*executable, ""); + CHECK_STATUS("Error in freezing executable object", status); + + // Get symbol handle. 
+ hsa_executable_symbol_t kernelSymbol; + status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, + &kernelSymbol); + CHECK_STATUS("Error in looking up kernel symbol", status); + + // Update output parameter + *code_desc = kernelSymbol; + return true; +} + +// Print the various fields of Hsa Gpu Agents +bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::clog << header << " :" << std::endl; + + const AgentInfo* agent_info; + int size = uint32_t(gpu_list_.size()); + for (int idx = 0; idx < size; idx++) { + agent_info = gpu_list_[idx]; + + std::clog << "> agent[" << idx << "] :" << std::endl; + std::clog << ">> Name : " << agent_info->name << std::endl; + std::clog << ">> APU : " << agent_info->is_apu << std::endl; + std::clog << ">> HSAIL profile : " << agent_info->profile << std::endl; + std::clog << ">> Max Wave Size : " << agent_info->max_wave_size << std::endl; + std::clog << ">> Max Queue Size : " << agent_info->max_queue_size << std::endl; + std::clog << ">> CU number : " << agent_info->cu_num << std::endl; + std::clog << ">> Waves per CU : " << agent_info->waves_per_cu << std::endl; + std::clog << ">> SIMDs per CU : " << agent_info->simds_per_cu << std::endl; + std::clog << ">> SE number : " << agent_info->se_num << std::endl; + std::clog << ">> Shader Arrays per SE : " << agent_info->shader_arrays_per_se << std::endl; + } + return true; +} + +uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { + const uint32_t slot_size_b = 0x40; + + // adevance command queue + const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); + hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { + sched_yield(); + } + + uint32_t slot_idx = (uint32_t)(write_idx % queue->size); + uint32_t* queue_slot = reinterpret_cast((uintptr_t)(queue->base_address) + (slot_idx * slot_size_b)); + const uint32_t* slot_data = 
reinterpret_cast(packet); + + // Copy buffered commands into the queue slot. + // Overwrite the AQL invalid header (first dword) last. + // This prevents the slot from being read until it's fully written. + memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t)); + std::atomic* header_atomic_ptr = + reinterpret_cast*>(&queue_slot[0]); + header_atomic_ptr->store(slot_data[0], std::memory_order_release); + + // ringdoor bell + hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); + + return write_idx; +} + +uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) { + const uint32_t slot_size_b = 0x40; + if ((size_bytes & (slot_size_b - 1)) != 0) { + fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); + abort(); + } + + const char* begin = reinterpret_cast(packet); + const char* end = begin + size_bytes; + uint64_t write_idx = 0; + for (const char* ptr = begin; ptr < end; ptr += slot_size_b) { + write_idx = Submit(queue, ptr); + } + + return write_idx; +} + +HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; +HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h new file mode 100644 index 00000000..e7dcc559 --- /dev/null +++ b/test/util/hsa_rsrc_factory.h @@ -0,0 +1,284 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_UTIL_HSA_RSRC_FACTORY_H_ +#define TEST_UTIL_HSA_RSRC_FACTORY_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define HSA_ARGUMENT_ALIGN_BYTES 16 +#define HSA_QUEUE_ALIGN_BYTES 64 +#define HSA_PACKET_ALIGN_BYTES 64 + +#define CHECK_STATUS(msg, status) \ + if (status != HSA_STATUS_SUCCESS) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? emsg : ""); \ + exit(1); \ + } + +#define CHECK_ITER_STATUS(msg, status) \ + if (status != HSA_STATUS_INFO_BREAK) { \ + const char* emsg = 0; \ + hsa_status_string(status, &emsg); \ + printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ + exit(1); \ + } + +static const size_t MEM_PAGE_BYTES = 0x1000; +static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; +typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; + +// Encapsulates information about a Hsa Agent such as its +// handle, name, max queue size, max wavefront size, etc. +struct AgentInfo { + // Handle of Agent + hsa_agent_t dev_id; + + // Agent type - Cpu = 0, Gpu = 1 or Dsp = 2 + uint32_t dev_type; + + // APU flag + bool is_apu; + + // Agent system index + uint32_t dev_index; + + // GFXIP name + char gfxip[64]; + + // Name of Agent whose length is less than 64 + char name[64]; + + // Max size of Wavefront size + uint32_t max_wave_size; + + // Max size of Queue buffer + uint32_t max_queue_size; + + // Hsail profile supported by agent + hsa_profile_t profile; + + // CPU/GPU/kern-arg memory pools + hsa_amd_memory_pool_t cpu_pool; + hsa_amd_memory_pool_t gpu_pool; + hsa_amd_memory_pool_t kern_arg_pool; + + // The number of compute unit available in the agent. + uint32_t cu_num; + + // Maximum number of waves possible in a Compute Unit. + uint32_t waves_per_cu; + + // Number of SIMD's per compute unit CU + uint32_t simds_per_cu; + + // Number of Shader Engines (SE) in Gpu + uint32_t se_num; + + // Number of Shader Arrays Per Shader Engines in Gpu + uint32_t shader_arrays_per_se; +}; + +class HsaRsrcFactory { + public: + typedef std::recursive_mutex mutex_t; + + static HsaRsrcFactory* Create(bool initialize_hsa = true) { + std::lock_guard lck(mutex_); + if (instance_ == NULL) { + instance_ = new HsaRsrcFactory(initialize_hsa); + } + return instance_; + } + + static HsaRsrcFactory& Instance() { + if (instance_ == NULL) instance_ = Create(false); + hsa_status_t status = (instance_ != NULL) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); + return *instance_; + } + + static void Destroy() { + std::lock_guard lck(mutex_); + if (instance_) delete instance_; + instance_ = NULL; + } + + // Return system agent info + const AgentInfo* GetAgentInfo(const hsa_agent_t agent); + + // Get the count of Hsa Gpu Agents available on the platform + // @return uint32_t Number of Gpu agents on platform + uint32_t GetCountOfGpuAgents(); + + // Get the count of Hsa Cpu Agents available on the platform + // @return uint32_t Number of Cpu agents on platform + uint32_t GetCountOfCpuAgents(); + + // Get the AgentInfo handle of a Gpu device + // @param idx Gpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetGpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Get the AgentInfo handle of a Cpu device + // @param idx Cpu Agent at specified index + // @param agent_info Output parameter updated with AgentInfo + // @return bool true if successful, false otherwise + bool GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info); + + // Create a Queue object and return its handle. The queue object is expected + // to support user requested number of Aql dispatch packets. + // @param agent_info Gpu Agent on which to create a queue object + // @param num_Pkts Number of packets to be held by queue + // @param queue Output parameter updated with handle of queue object + // @return bool true if successful, false otherwise + bool CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue); + + // Create a Signal object and return its handle. 
+ // @param value Initial value of signal object + // @param signal Output parameter updated with handle of signal object + // @return bool true if successful, false otherwise + bool CreateSignal(uint32_t value, hsa_signal_t* signal); + + // Allocate local GPU memory + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateLocalMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory tp pass kernel parameters + // Memory is alocated accessible for all CPU agents and for GPU given by AgentInfo parameter. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateKernArgMemory(const AgentInfo* agent_info, size_t size); + + // Allocate system memory accessible from both CPU and GPU + // Memory is alocated accessible to all CPU agents and AgentInfo parameter is ignored. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. + uint8_t* AllocateSysMemory(const AgentInfo* agent_info, size_t size); + + // Allocate memory for command buffer. + // @param agent_info Agent from whose memory region to allocate + // @param size Size of memory in terms of bytes + // @return uint8_t* Pointer to buffer, null if allocation fails. 
+ uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + + // Copy data from GPU to host memory + bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); + bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); + + // Memory free method + static bool FreeMemory(void* ptr); + + // Loads an Assembled Brig file and Finalizes it into Device Isa + // @param agent_info Gpu device for which to finalize + // @param brig_path File path of the Assembled Brig file + // @param kernel_name Name of the kernel to finalize + // @param code_desc Handle of finalized Code Descriptor that could + // be used to submit for execution + // @return true if successful, false otherwise + bool LoadAndFinalize(const AgentInfo* agent_info, const char* brig_path, const char* kernel_name, + hsa_executable_t* hsa_exec, hsa_executable_symbol_t* code_desc); + + // Print the various fields of Hsa Gpu Agents + bool PrintGpuAgents(const std::string& header); + + // Submit AQL packet to given queue + static uint64_t Submit(hsa_queue_t* queue, const void* packet); + static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + + // Return AqlProfile API table + typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; + const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } + + // Return Loader API table + const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + + private: + // System agents iterating callback + static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); + + // Callback function to find and bind kernarg region of an agent + static hsa_status_t FindMemRegionsCallback(hsa_region_t region, void* data); + + // Load AQL profile HSA extension library directly + static hsa_status_t LoadAqlProfileLib(aqlprofile_pfn_t* api); + + // Constructor of the class. 
Will initialize the Hsa Runtime and + // query the system topology to get the list of Cpu and Gpu devices + explicit HsaRsrcFactory(bool initialize_hsa); + + // Destructor of the class + ~HsaRsrcFactory(); + + // Add an instance of AgentInfo representing a Hsa Gpu agent + const AgentInfo* AddAgentInfo(const hsa_agent_t agent); + + // To mmap command buffer memory + static const bool CMD_MEMORY_MMAP = false; + + // HSA was initialized + const bool initialize_hsa_; + + static HsaRsrcFactory* instance_; + static mutex_t mutex_; + + // Used to maintain a list of Hsa Gpu Agent Info + std::vector gpu_list_; + std::vector gpu_agents_; + + // Used to maintain a list of Hsa Cpu Agent Info + std::vector cpu_list_; + std::vector cpu_agents_; + + // System agents map + std::map agent_map_; + + // AqlProfile API table + aqlprofile_pfn_t aqlprofile_api_; + + // Loader API table + hsa_ven_amd_loader_1_00_pfn_t loader_api_; +}; + + +#endif // TEST_UTIL_HSA_RSRC_FACTORY_H_ diff --git a/test/util/perf_timer.cpp b/test/util/perf_timer.cpp new file mode 100644 index 00000000..85c490b6 --- /dev/null +++ b/test/util/perf_timer.cpp @@ -0,0 +1,179 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include "util/perf_timer.h" + +PerfTimer::PerfTimer() { freq_in_100mhz_ = MeasureTSCFreqHz(); } + +PerfTimer::~PerfTimer() { + while (!timers_.empty()) { + Timer* temp = timers_.back(); + timers_.pop_back(); + delete temp; + } +} + +// New cretaed timer instantance index will be returned +int PerfTimer::CreateTimer() { + Timer* newTimer = new Timer; + newTimer->start = 0; + newTimer->clocks = 0; + +#ifdef _WIN32 + QueryPerformanceFrequency((LARGE_INTEGER*)&newTimer->freq); +#else + newTimer->freq = (long long)1.0E3; +#endif + + /* Push back the address of new Timer instance created */ + timers_.push_back(newTimer); + return (int)(timers_.size() - 1); +} + +int PerfTimer::StartTimer(int index) { + if (index >= (int)timers_.size()) { + Error("Cannot reset timer. 
Invalid handle."); + return FAILURE; + } + +#ifdef _WIN32 +// General Windows timing method +#ifndef _AMD + long long tmpStart; + QueryPerformanceCounter((LARGE_INTEGER*)&(tmpStart)); + timers_[index]->start = (double)tmpStart; +#else +// AMD Windows timing method +#endif +#else +// General Linux timing method +#ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + timers_[index]->start = s.tv_sec * 1.0E3 + ((double)(s.tv_usec / 1.0E3)); +#else + // AMD timing method + unsigned int unused; + timers_[index]->start = __rdtscp(&unused); +#endif +#endif + + return SUCCESS; +} + + +int PerfTimer::StopTimer(int index) { + double n = 0; + if (index >= (int)timers_.size()) { + Error("Cannot reset timer. Invalid handle."); + return FAILURE; + } +#ifdef _WIN32 +#ifndef _AMD + long long n1; + QueryPerformanceCounter((LARGE_INTEGER*)&(n1)); + n = (double)n1; +#else +// AMD Window Timing +#endif + +#else +// General Linux timing method +#ifndef _AMD + struct timeval s; + gettimeofday(&s, 0); + n = s.tv_sec * 1.0E3 + (double)(s.tv_usec / 1.0E3); +#else + // AMD Linux timing + unsigned int unused; + n = __rdtscp(&unused); +#endif +#endif + + n -= timers_[index]->start; + timers_[index]->start = 0; + +#ifndef _AMD + timers_[index]->clocks += n; +#else + // timers_[index]->clocks += 10 * n / freq_in_100mhz_; // unit is ns + timers_[index]->clocks += 1.0E-6 * 10 * n / freq_in_100mhz_; // convert to ms +#endif + + return SUCCESS; +} + +void PerfTimer::Error(std::string str) { std::cout << str << std::endl; } + + +double PerfTimer::ReadTimer(int index) { + if (index >= (int)timers_.size()) { + Error("Cannot read timer. 
Invalid handle."); + return FAILURE; + } + + double reading = double(timers_[index]->clocks); + + reading = double(reading / timers_[index]->freq); + + return reading; +} + + +uint64_t PerfTimer::CoarseTimestampUs() { +#ifdef _WIN32 + uint64_t freqHz, ticks; + QueryPerformanceFrequency((LARGE_INTEGER*)&freqHz); + QueryPerformanceCounter((LARGE_INTEGER*)&ticks); + + // Scale numerator and divisor until (ticks * 1000000) fits in uint64_t. + while (ticks > (1ULL << 44)) { + ticks /= 16; + freqHz /= 16; + } + + return (ticks * 1000000) / freqHz; +#else + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return uint64_t(ts.tv_sec) * 1000000 + ts.tv_nsec / 1000; +#endif +} + +uint64_t PerfTimer::MeasureTSCFreqHz() { + // Make a coarse interval measurement of TSC ticks for 1 gigacycles. + unsigned int unused; + uint64_t tscTicksEnd; + + uint64_t coarseBeginUs = CoarseTimestampUs(); + uint64_t tscTicksBegin = __rdtscp(&unused); + do { + tscTicksEnd = __rdtscp(&unused); + } while (tscTicksEnd - tscTicksBegin < 1000000000); + + uint64_t coarseEndUs = CoarseTimestampUs(); + + // Compute the TSC frequency and round to nearest 100MHz. + uint64_t coarseIntervalNs = (coarseEndUs - coarseBeginUs) * 1000; + uint64_t tscIntervalTicks = tscTicksEnd - tscTicksBegin; + return (tscIntervalTicks * 10 + (coarseIntervalNs / 2)) / coarseIntervalNs; +} diff --git a/test/util/perf_timer.h b/test/util/perf_timer.h new file mode 100644 index 00000000..bfd55324 --- /dev/null +++ b/test/util/perf_timer.h @@ -0,0 +1,83 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_UTIL_PERF_TIMER_H_ +#define TEST_UTIL_PERF_TIMER_H_ + +// Will use AMD timer or general Linux timer based on compilation flag +// Need to consider platform is Windows or Linux + +#include +#include +#include + +#if defined(_MSC_VER) +#include +#include +#include +#else +#if defined(__GNUC__) +#include +#include +#endif // __GNUC__ +#endif // _MSC_VER + +#include +#include +#include + +class PerfTimer { + public: + enum { SUCCESS = 0, FAILURE = 1 }; + + PerfTimer(); + ~PerfTimer(); + + // General Linux timing method + int CreateTimer(); + int StartTimer(int index); + int StopTimer(int index); + + // retrieve time + double ReadTimer(int index); + // write into a file + double WriteTimer(int index); + + private: + struct Timer { + std::string name; /* name of time object */ + long long freq; /* frequency */ + double clocks; /* number of ticks at end */ + double start; /* start point ticks */ + }; + + std::vector timers_; /* vector to Timer objects */ + double freq_in_100mhz_; + + // AMD timing method + uint64_t CoarseTimestampUs(); + uint64_t MeasureTSCFreqHz(); + + void Error(std::string str); +}; + +#endif // TEST_UTIL_PERF_TIMER_H_ diff --git a/test/util/test_assert.h b/test/util/test_assert.h new file mode 100644 index 00000000..ee183810 --- /dev/null +++ b/test/util/test_assert.h @@ -0,0 +1,46 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_CTRL_TEST_ASSERT_H_ +#define TEST_CTRL_TEST_ASSERT_H_ + +#define TEST_ASSERT(cond) \ + { \ + if (!(cond)) { \ + std::cerr << "Assert failed(" << #cond << ") at " << __FILE__ << ", line " << __LINE__ \ + << std::endl; \ + exit(-1); \ + } \ + } + +#define TEST_STATUS(cond) \ + { \ + if (!(cond)) { \ + std::cerr << "Test error at " << __FILE__ << ", line " << __LINE__ << std::endl; \ + const char* message; \ + rocprofiler_error_string(&message); \ + std::cerr << "ERROR: " << message << std::endl; \ + exit(-1); \ + } \ + } + +#endif // TEST_CTRL_TEST_ASSERT_H_ diff --git a/test/util/xml.h b/test/util/xml.h new file mode 100644 index 00000000..eb2f5074 --- /dev/null +++ b/test/util/xml.h @@ -0,0 +1,457 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. 
All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef TEST_UTIL_XML_H_ +#define TEST_UTIL_XML_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace xml { + +class Xml { + public: + typedef std::vector token_t; + + struct level_t; + typedef std::vector nodes_t; + typedef std::map opts_t; + struct level_t { + std::string tag; + nodes_t nodes; + opts_t opts; + }; + typedef std::vector nodes_vec_t; + typedef std::map map_t; + + enum { DECL_STATE, BODY_STATE }; + + static Xml* Create(const std::string& file_name, const Xml* obj = NULL) { + Xml* xml = new Xml(file_name, obj); + if (xml != NULL) { + if (xml->Init() == false) { + delete xml; + xml = NULL; + } else { + const std::size_t pos = file_name.rfind('/'); + const std::string path = (pos != std::string::npos) ? 
file_name.substr(0, pos + 1) : ""; + + xml->PreProcess(); + nodes_t incl_nodes; + for (auto* node : xml->GetNodes("top.include")) { + if (node->opts.find("touch") == node->opts.end()) { + node->opts["touch"] = ""; + incl_nodes.push_back(node); + } + } + for (auto* incl : incl_nodes) { + const std::string& incl_name = path + incl->opts["file"]; + Xml* ixml = Create(incl_name, xml); + if (ixml == NULL) { + delete xml; + xml = NULL; + break; + } else { + delete ixml; + } + } + if (xml) { + xml->Process(); + } + } + } + + return xml; + } + + static void Destroy(Xml* xml) { delete xml; } + + std::string GetName() { return file_name_; } + + void AddExpr(const std::string& full_tag, const std::string& name, const std::string& expr) { + const std::size_t pos = full_tag.rfind('.'); + const std::size_t pos1 = (pos == std::string::npos) ? 0 : pos + 1; + const std::string level_tag = full_tag.substr(pos1); + level_t* level = new level_t; + (*map_)[full_tag].push_back(level); + level->tag = level_tag; + level->opts["name"] = name; + level->opts["expr"] = expr; + } + + void AddConst(const std::string& full_tag, const std::string& name, const uint64_t& val) { + std::ostringstream oss; + oss << val; + AddExpr(full_tag, name, oss.str()); + } + + nodes_t GetNodes(const std::string& global_tag) { return (*map_)[global_tag]; } + + template F ForEach(const F& f_i) { + F f = f_i; + if (map_) { + for (auto& entry : *map_) { + for (auto node : entry.second) { + if (f.fun(entry.first, node) == false) break; + } + } + } + return f; + } + + template F ForEach(const F& f_i) const { + F f = f_i; + if (map_) { + for (auto& entry : *map_) { + for (auto node : entry.second) { + if (f.fun(entry.first, node) == false) break; + } + } + } + return f; + } + + struct print_func { + bool fun(const std::string& global_tag, level_t* node) { + for (auto& opt : node->opts) { + std::cout << global_tag << "." 
<< opt.first << " = " << opt.second << std::endl; + } + return true; + } + }; + + void Print() const { + std::cout << "XML file '" << file_name_ << "':" << std::endl; + ForEach(print_func()); + } + + private: + Xml(const std::string& file_name, const Xml* obj) + : file_name_(file_name), + file_line_(0), + data_size_(0), + index_(0), + state_(BODY_STATE), + comment_(false), + included_(false), + level_(NULL), + map_(NULL) { + if (obj != NULL) { + map_ = obj->map_; + level_ = obj->level_; + included_ = true; + } + } + + struct delete_func { + bool fun(const std::string&, level_t* node) { + delete node; + return true; + } + }; + + ~Xml() { + if (included_ == false) { + ForEach(delete_func()); + delete map_; + } + } + + bool Init() { + fd_ = open(file_name_.c_str(), O_RDONLY); + if (fd_ == -1) { + // perror((std::string("open XML file ") + file_name_).c_str()); + return false; + } + + if (map_ == NULL) { + map_ = new map_t; + if (map_ == NULL) return false; + AddLevel("top"); + } + + return true; + } + + void PreProcess() { + uint32_t ind = 0; + char buf[kBufSize]; + bool error = false; + + while (1) { + const uint32_t pos = lseek(fd_, 0, SEEK_CUR); + uint32_t size = read(fd_, buf, kBufSize); + if (size <= 0) break; + buf[size - 1] = '\0'; + + if (strncmp(buf, "#include \"", 10) == 0) { + for (ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) {} + if (ind == size) { + fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize); + error = true; + break; + } + buf[ind] = '\0'; + size = ind; + lseek(fd_, pos + ind + 1, SEEK_SET); + + for (ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) {} + if (ind == size) { + error = true; + break; + } + buf[ind] = '\0'; + + AddLevel("include"); + AddOption("file", &buf[10]); + UpLevel(); + } + } + + if (error) { + fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf); + exit(1); + } + + lseek(fd_, 0, SEEK_SET); + } + + void Process() { + token_t remainder; + + while (1) { + token_t token = (remainder.size()) ? 
remainder : NextToken(); + remainder.clear(); + + // token_t token1 = token; + // token1.push_back('\0'); + // std::cout << "> " << &token1[0] << std::endl; + + // End of file + if (token.size() == 0) break; + + switch (state_) { + case BODY_STATE: + if (token[0] == '<') { + bool node_begin = true; + unsigned ind = 1; + if (token[1] == '/') { + node_begin = false; + ++ind; + } + + unsigned i = ind; + while (i < token.size()) { + if (token[i] == '>') break; + ++i; + } + for (unsigned j = i + 1; j < token.size(); ++j) remainder.push_back(token[j]); + + if (i == token.size()) { + if (node_begin) + state_ = DECL_STATE; + else + BadFormat(token); + token.push_back('\0'); + } else { + token[i] = '\0'; + } + + const char* tag = &token[ind]; + if (node_begin) { + AddLevel(tag); + } else { + if (strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0) { + token.back() = '>'; + BadFormat(token); + } + UpLevel(); + } + } else { + BadFormat(token); + } + break; + case DECL_STATE: + if (token[0] == '>') { + state_ = BODY_STATE; + for (unsigned j = 1; j < token.size(); ++j) remainder.push_back(token[j]); + continue; + } else { + token.push_back('\0'); + unsigned j = 0; + for (j = 0; j < token.size(); ++j) + if (token[j] == '=') break; + if (j == token.size()) BadFormat(token); + token[j] = '\0'; + const char* key = &token[0]; + const char* value = &token[j + 1]; + AddOption(key, value); + } + break; + default: + std::cout << "XML parser error: wrong state: " << state_ << std::endl; + exit(1); + } + } + } + + bool SpaceCheck() const { + bool cond = ((buffer_[index_] == ' ') || (buffer_[index_] == '\t')); + return cond; + } + + bool LineEndCheck() { + bool found = false; + if (buffer_[index_] == '\n') { + buffer_[index_] = ' '; + ++file_line_; + found = true; + comment_ = false; + } else if (comment_ || (buffer_[index_] == '#')) { + found = true; + comment_ = true; + } + return found; + } + + token_t NextToken() { + token_t token; + bool in_string = false; + bool special_symb = 
false; + + while (1) { + if (data_size_ == 0) { + data_size_ = read(fd_, buffer_, kBufSize); + if (data_size_ <= 0) break; + } + + if (token.empty()) { + while ((index_ < data_size_) && (SpaceCheck() || LineEndCheck())) { + ++index_; + } + } + while ((index_ < data_size_) && (in_string || !(SpaceCheck() || LineEndCheck()))) { + const char symb = buffer_[index_]; + bool skip_symb = false; + + switch (symb) { + case '\\': + if (special_symb) { + special_symb = false; + } else { + special_symb = true; + skip_symb = true; + } + break; + case '"': + if (special_symb) { + special_symb = false; + } else { + in_string = !in_string; + if (!in_string) { + buffer_[index_] = ' '; + --index_; + } + skip_symb = true; + } + break; + } + + if (!skip_symb) token.push_back(symb); + ++index_; + } + + if (index_ == data_size_) { + index_ = 0; + data_size_ = 0; + } else { + if (special_symb || in_string) BadFormat(token); + break; + } + } + + return token; + } + + void BadFormat(token_t token) { + token.push_back('\0'); + std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" + << &token[0] << "'" << std::endl; + exit(1); + } + + void AddLevel(const std::string& tag) { + level_t* level = new level_t; + level->tag = tag; + if (level_) { + level_->nodes.push_back(level); + stack_.push_back(level_); + } + level_ = level; + + std::string global_tag; + for (level_t* level : stack_) { + global_tag += level->tag + "."; + } + global_tag += tag; + (*map_)[global_tag].push_back(level_); + } + + void UpLevel() { + level_ = stack_.back(); + stack_.pop_back(); + } + + std::string CurrentLevel() const { return level_->tag; } + + void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } + + const std::string file_name_; + unsigned file_line_; + int fd_; + + static const size_t kBufSize = 256; + char buffer_[kBufSize]; + + unsigned data_size_; + unsigned index_; + unsigned state_; + bool comment_; + std::vector stack_; + bool 
included_; + level_t* level_; + map_t* map_; +}; + +} // namespace xml + +#endif // TEST_UTIL_XML_H_ From 12f8613b5c7436bc5d9c2a6f4633f89701f549ab Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 17 Jul 2018 15:06:05 -0500 Subject: [PATCH 002/168] update --- LICENSE | 2 -- 1 file changed, 2 deletions(-) diff --git a/LICENSE b/LICENSE index fe4ce68b..9e78331e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,3 @@ -/****************************************************************************** Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy @@ -18,4 +17,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*******************************************************************************/ From edde6c11ac0bee2ced5c18446b7823bbb089665a Mon Sep 17 00:00:00 2001 From: Gregory Stoner Date: Sat, 18 Aug 2018 11:40:44 -0500 Subject: [PATCH 003/168] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 5492d17d..3ac63141 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. +HW specific low-level performance analysis interface for profiling of GPU compute applications. 
The profiling includes HW performance counters with complex performance metrics and HW traces + The library source tree: - doc - Documentation - inc/rocprofiler.h - Library public API From 05e63d2529c4d2b8133d74427d7820c62df19079 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sun, 19 Aug 2018 21:42:46 -0500 Subject: [PATCH 004/168] update, version 3.0 --- CMakeLists.txt | 24 ++- bin/rpl_run.sh | 352 +++++++++++++++++++++++++++++++++ bin/tblextr.py | 121 ++++++++++++ bin/txt2xml.sh | 94 +++++++++ cmake_modules/env.cmake | 13 +- cmake_modules/utils.cmake | 6 +- inc/rocprofiler.h | 24 ++- src/CMakeLists.txt | 6 +- src/core/context.h | 110 +++++------ src/core/group_set.h | 244 +++++++++++++++++++++++ src/core/intercept_queue.cpp | 3 +- src/core/intercept_queue.h | 178 ++++++++++++----- src/core/profile.h | 4 + src/core/rocprofiler.cpp | 85 +++++++- src/core/tracker.h | 166 +++++++++------- src/core/types.h | 1 + src/util/exception.h | 16 +- src/util/hsa_rsrc_factory.cpp | 110 +++++++---- src/util/hsa_rsrc_factory.h | 126 +++++++++--- src/util/logger.h | 66 +++++-- test/CMakeLists.txt | 12 +- test/run.sh | 13 +- test/tool/input1.xml | 5 + test/tool/tool.cpp | 187 +++++++++--------- test/util/hsa_rsrc_factory.cpp | 106 ++++++---- test/util/hsa_rsrc_factory.h | 126 +++++++++--- 26 files changed, 1729 insertions(+), 469 deletions(-) create mode 100755 bin/rpl_run.sh create mode 100755 bin/tblextr.py create mode 100755 bin/txt2xml.sh create mode 100644 src/core/group_set.h create mode 100644 test/tool/input1.xml diff --git a/CMakeLists.txt b/CMakeLists.txt index 6249e098..92bc348f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
-# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -20,7 +20,7 @@ # THE SOFTWARE. ################################################################################ -cmake_minimum_required ( VERSION 3.5.0 ) +cmake_minimum_required ( VERSION 2.8.12 ) ## Verbose output. 
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) @@ -76,6 +76,22 @@ add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) ## Install information install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${ROCPROFILER_NAME}/lib ) install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${ROCPROFILER_NAME}/include ) +# rpl_run.sh tblextr.py txt2xml.sh +install ( FILES + ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh + ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh + ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py + DESTINATION ${ROCPROFILER_NAME}/bin + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) +# gfx_metrics.xml metrics.xml +install ( FILES + ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml + ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/gfx_metrics.xml + DESTINATION ${ROCPROFILER_NAME}/lib ) +# libtool.so +install ( FILES ${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION ${ROCPROFILER_NAME}/tool ) +install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION ${ROCPROFILER_NAME}/tool + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) ## Packaging directives set ( CPACK_PACKAGE_NAME "${ROCPROFILER_NAME}-dev" ) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh new file mode 100755 index 00000000..64185761 --- /dev/null +++ b/bin/rpl_run.sh @@ -0,0 +1,352 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + +#!/bin/sh +time_stamp=`date +%y%m%d_%H%M%S` +BIN_DIR=`dirname $0` +BIN_DIR=`cd $BIN_DIR; pwd` +RUN_DIR=`pwd` +TMP_DIR="/tmp" +DATA_PATH=$TMP_DIR +DATA_DIR="rpl_data_${time_stamp}_$$" + +PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` +BIN_DIR=$PKG_DIR/bin + +# PATH to custom HSA and OpenCL runtimes +HSA_PATH=$PKG_DIR/lib/hsa + +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +export PATH=.:$PATH + +# enable error logging +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 +export HSA_VEN_AMD_AQLPROFILE_LOG=1 +export ROCPROFILER_LOG=1 +unset ROCPROFILER_SESS + +# ROC Profiler environment +# Loading of ROC Profiler by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so +# Loading of the test tool by ROC Profiler +export ROCP_TOOL_LIB=libtool.so +# Enabling interception of HSA dispatches by ROC Profiler +export ROCP_HSA_INTERCEPT=1 +# Disabling internal ROC Profiler proxy queue (simple version supported for testing purposes) +unset ROCP_PROXY_QUEUE +# ROC Profiler metrics definition +export ROCP_METRICS=$PKG_DIR/lib/metrics.xml +# Disable AQL-profile read API +export AQLPROFILE_READ_API=0 +# ROC Profiler package path +export ROCP_PACKAGE_DIR=$PKG_DIR + +# error handling +fatal() { + echo "$0: Error: $1" + echo "" + usage +} + +error() { + echo "$0: Error: $1" + echo "" + exit 1 +} + +# usage method +usage() { + bin_name=`basename $0` + echo "ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package." 
+ echo "Full path: $BIN_DIR/$bin_name" + echo "Metrics definition: $PKG_DIR/lib/metrics.xml" + echo "" + echo "Usage:" + echo " rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] " + echo "" + echo "Options:" + echo " -h - this help" + echo " --verbose - verbose mode, dumping all base counters used in the input metrics" + echo " --list-basic - to print the list of basic HW counters" + echo " --list-derived - to print the list of derived metrics with formulas" + echo "" + echo " -i <.txt|.xml file> - input file" + echo " Input file .txt format, automatically rerun application for every pmc/sqtt line:" + echo "" + echo " # Perf counters group 1" + echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" + echo " # Perf counters group 2" + echo " pmc : WriteSize L2CacheHit" + echo " # Filter by dispatches range, GPU index and kernel names" + echo " # supported range formats: \"3:9\", \"3:\", \"3\"" + echo " range: 1 : 4" + echo " gpu: 0 1 2 3" + echo " kernel: simple Pass1 simpleConvolutionPass2" + echo "" + echo " Input file .xml format, for single profiling run:" + echo "" + echo " # Metrics list definition, also the form \":\" can be used" + echo " # All defined metrics can be found in the 'metrics.xml'" + echo " # There are basic metrics for raw HW counters and high-level metrics for derived counters" + echo " " + echo "" + echo " # Filter by dispatches range, GPU index and kernel names" + echo " " + echo "" + echo " -o - output CSV file [.csv]" + echo " -d - directory where profiler store profiling data including thread treaces [/tmp]" + echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." + echo " -t - to change the temporary directory [/tmp]" + echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." 
+ echo "" + echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" + echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" + echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" + echo " --heartbeat - to print progress heartbeats [0 - disabled]" + echo " --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000]" + echo " Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively." + echo " --sqtt-local - to allocate SQTT buffer in local GPU memory [on]" + echo "" + echo "Configuration file:" + echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" + echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." + echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'." + echo " An example of 'rpl_rc.xml':" + echo " " + echo "" + exit 1 +} + +# profiling run method +OUTPUT_LIST="" +run() { + export ROCP_INPUT="$1" + OUTPUT_DIR="$2" + shift + shift + APP_CMD=$* + + if [ "$OUTPUT_DIR" = "-" ] ; then + input_tag=`echo $ROCP_INPUT | sed "s/\.xml//"` + export ROCP_OUTPUT_DIR=${input_tag}_results_${time_stamp} + elif [ "$OUTPUT_DIR" = "--" ] ; then + unset ROCP_OUTPUT_DIR + else + export ROCP_OUTPUT_DIR=$OUTPUT_DIR + fi + echo "RPL: result dir '$ROCP_OUTPUT_DIR'" + + if [ ! 
-e "$ROCP_INPUT" ] ; then + error "Input file '$ROCP_INPUT' not found" + fi + + if [ -n "$ROCP_OUTPUT_DIR" ] ; then + if [ "$OUTPUT_DIR" = "-" ] ; then + if [ -e "$ROCP_OUTPUT_DIR" ] ; then + error "generated dir '$ROCP_OUTPUT_DIR' exists" + fi + fi + mkdir -p "$ROCP_OUTPUT_DIR" + fi + + if [ -n "$ROCP_OUTPUT_DIR" ] ; then + OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" + eval "$APP_CMD 2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" + else + eval "$APP_CMD" + fi +} + +# main +echo "RPL: on '$time_stamp' from '$PKG_DIR' in '$RUN_DIR'" +# Parsing arguments +if [ -z "$1" ] ; then + usage +fi + +INPUT_FILE="" +OUTPUT_DIR="-" +output="" +csv_output="" + +ARG_IN="" +while [ 1 ] ; do + ARG_IN=$1 + ARG_VAL=1 + if [ "$1" = "-h" ] ; then + usage + elif [ "$1" = "-i" ] ; then + INPUT_FILE="$2" + elif [ "$1" = "-o" ] ; then + output="$2" + elif [ "$1" = "-d" ] ; then + OUTPUT_DIR="$2" + DATA_PATH=$OUTPUT_DIR + elif [ "$1" = "-t" ] ; then + TMP_DIR="$2" + if [ "$OUTPUT_DIR" = "-" ] ; then + DATA_PATH=$TMP_DIR + fi + elif [ "$1" = "--list-basic" ] ; then + export ROCP_INFO=b + eval "$PKG_DIR/tool/ctrl" + exit 1 + elif [ "$1" = "--list-derived" ] ; then + export ROCP_INFO=d + eval "$PKG_DIR/tool/ctrl" + exit 1 + elif [ "$1" = "--basenames" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_TRUNCATE_NAMES=1 + else + export ROCP_TRUNCATE_NAMES=0 + fi + elif [ "$1" = "--timestamp" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_TIMESTAMP_ON=1 + else + export ROCP_TIMESTAMP_ON=0 + fi + elif [ "$1" = "--ctx-limit" ] ; then + export ROCP_OUTSTANDING_MAX="$2" + elif [ "$1" = "--heartbeat" ] ; then + export ROCP_OUTSTANDING_MON="$2" + elif [ "$1" = "--sqtt-size" ] ; then + size_m=`echo "$2" | sed -n "s/^\(.*\)M$/\1/p"` + size_k=`echo "$2" | sed -n "s/^\(.*\)K$/\1/p"` + if [ -n "$size_m" ] ; then size_b=$((size_m*1024*1024)) + elif [ -n "$size_k" ] ; then size_b=$((size_k*1024)) + else size_b=$2 + fi + export ROCP_SQTT_SIZE=$size_b + elif [ "$1" = "--sqtt-local" ] ; then + if [ "$2" = 
"on" ] ; then + export ROCP_SQTT_LOCAL=1 + else + export ROCP_SQTT_LOCAL=0 + fi + elif [ "$1" = "--verbose" ] ; then + ARG_VAL=0 + export ROCP_VERBOSE_MODE=1 + else + break + fi + shift + if [ "$ARG_VAL" = 1 ] ; then shift; fi +done + +ARG_CK=`echo $ARG_IN | sed "s/^-.*$/-/"` +if [ "$ARG_CK" = "-" ] ; then + fatal "Wrong option '$ARG_IN'" +fi + +if [ -z "$INPUT_FILE" ] ; then + fatal "Need input file" +fi + +input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` +input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` +if [ -z "${input_base}" -o -z "${input_type}" ] ; then + fatal "Bad input file '$INPUT_FILE'" +fi +input_base=`basename $input_base` + +if [ "$OUTPUT_DIR" = "--" ] ; then + fatal "Bad output dir '$OUTPUT_DIR'" +fi + +if [ -n "$output" ] ; then + if [ "$output" = "--" ] ; then + OUTPUT_DIR="--" + else + csv_output=$output + fi +else + csv_output=$RUN_DIR/${input_base}.csv +fi + +APP_CMD=$* + +echo "RPL: profiling '$APP_CMD'" +echo "RPL: input file '$INPUT_FILE'" + +input_list="" +RES_DIR="" +if [ "$input_type" = "xml" ] ; then + input_list=$INPUT_FILE +elif [ "$input_type" = "txt" ] ; then + OUTPUT_DIR="-" + RES_DIR=$DATA_PATH/$DATA_DIR + if [ -e $RES_DIR ] ; then + error "Rundir '$RES_DIR' exists" + fi + mkdir -p $RES_DIR + echo "RPL: output dir '$RES_DIR'" + $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR + input_list=`/bin/ls $RES_DIR/input*.xml` + export ROCPROFILER_SESS=$RES_DIR +else + fatal "Bad input file type '$INPUT_FILE'" +fi + +if [ -n "$csv_output" ] ; then + rm -f $csv_output +fi + +for name in $input_list; do + run $name $OUTPUT_DIR $APP_CMD + if [ -n "$ROCPROFILER_SESS" -a -e "$ROCPROFILER_SESS/error" ] ; then + echo "Error found, profiling aborted." + csv_output="" + break + fi +done + +if [ -n "$csv_output" ] ; then + python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + if [ "$?" 
-eq 0 ] ; then + echo "RPL: '$csv_output' is generated" + fi +fi + +if [ "$DATA_PATH" = "$TMP_DIR" ] ; then + if [ -e "$RES_DIR" ] ; then + rm -rf $RES_DIR + fi +fi + +exit 0 diff --git a/bin/tblextr.py b/bin/tblextr.py new file mode 100755 index 00000000..630417ce --- /dev/null +++ b/bin/tblextr.py @@ -0,0 +1,121 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
################################################################################

#!/usr/bin/python
import os, sys, re

# Parsing results in the format:
#dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503):
#  GRBM_GUI_ACTIVE (74332)
#  SQ_WAVES (4096)
#  SQ_INSTS_VMEM_RD (36864)

# global vars
# Ordered list of CSV columns; counter names are appended as they are first seen.
var_list = ['Index', 'KernelName', 'DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs']
# Maps a dispatch index (kept as the string captured from the input) to a
# dictionary of column name -> value string.
var_table = {}
#############################################################

def fatal(msg):
  """Write '<script name>: <msg>' to stderr and terminate with exit code 1."""
  sys.stderr.write(sys.argv[0] + ": " + msg + "\n");
  sys.exit(1)
#############################################################

# parse results method
def parse_res(infile):
  """Parse one results file and merge its records into var_table / var_list.

  A line matching the dispatch header pattern selects (and, if new, creates)
  the current dispatch entry and stores its four timestamps when present.
  A line of the form '<name> (<number>)' stores a counter value into the
  current dispatch entry and registers <name> as a column.

  Terminates through fatal() if the file does not exist or if a counter
  line is met before any dispatch header.
  """
  if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found")

  # Raw strings keep the regex escapes (\[, \d, \() out of Python's own
  # string-escape processing.
  beg_pattern = re.compile(r'^dispatch\[(\d*)\], queue_index\(\d*\), kernel_name\("([^"]*)"\)')
  ts_pattern = re.compile(r', time\((\d*),(\d*),(\d*),(\d*)\)')
  var_pattern = re.compile(r'^\s*([^\s]*)\s+\((\d*)\)')

  # No dispatch header seen yet; None is never a key of var_table.
  dispatch_number = None
  with open(infile, 'r') as inp:
    for line in inp:
      # Strip only the line terminator. Slicing off the last character
      # unconditionally would eat the closing ')' of a final line that has
      # no trailing newline and the counter on it would be lost.
      record = line.rstrip('\r\n')

      m = var_pattern.match(record)
      if m:
        var = m.group(1)
        if dispatch_number not in var_table:
          fatal("Error: counter '" + var + "' found outside of a dispatch record")
        var_table[dispatch_number][var] = m.group(2)
        if var not in var_list: var_list.append(var)

      m = beg_pattern.match(record)
      if m:
        dispatch_number = m.group(1)
        if dispatch_number not in var_table:
          var_table[dispatch_number] = {
            'Index': dispatch_number,
            'KernelName': "\"" + m.group(2) + "\""
          }
        # NOTE(review): the original nesting of the timestamp parsing was lost
        # with the file's indentation; it is applied to every dispatch header
        # line here - confirm against the upstream script.
        m = ts_pattern.search(record)
        if m:
          var_table[dispatch_number]['DispatchNs'] = m.group(1)
          var_table[dispatch_number]['BeginNs'] = m.group(2)
          var_table[dispatch_number]['EndNs'] = m.group(3)
          var_table[dispatch_number]['CompleteNs'] = m.group(4)
+############################################################# + +# print results table method +def print_tbl(outfile): + global var_list + if len(var_table) == 0: return 1 + + out = open(outfile, 'w') + + keys = var_table.keys() + keys.sort(key=int) + + entry = var_table[keys[0]] + list1 = [] + for var in var_list: + if var in entry: + list1.append(var) + var_list = list1 + + for var in var_list: out.write(var + ',') + out.write("\n") + + for ind in keys: + entry = var_table[ind] + dispatch_number = entry['Index'] + if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") + for var in var_list: out.write(entry[var] + ',') + out.write("\n") + + out.close() + return 0 +############################################################# + +# main +if (len(sys.argv) < 3): fatal("Usage: " + sys.argv[0] + " ") + +outfile = sys.argv[1] +infiles = sys.argv[2:] +for f in infiles : + parse_res(f) +ret = print_tbl(outfile) +sys.exit(ret) +############################################################# diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh new file mode 100755 index 00000000..9881160d --- /dev/null +++ b/bin/txt2xml.sh @@ -0,0 +1,94 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +################################################################################ + +#!/bin/bash +timestamp=`date +%y%m%d_%H%M%S` + +if [ $# = 0 ] ; then + echo "Usage: $0 [output dir]" + exit -1 +fi + +input=$1 +outdir=$2 +if [ -z "$outdir" ] ; then + outdir="." +fi + +range="" +kernel="" +gpu_index="" + +parse() { + scan="$1" + index=0 + while read -r line ; do + line=`echo $line | sed "s/\s*#.*$//"` + if [ -z "$line" ] ; then + continue + fi + + feature=`echo $line | sed -n "s/^\s*\([a-z]*\)\s*:.*$/\1/p"` + line=`echo $line | sed "s/^[^:]*:\s*//"` + line=`echo "$line" | sed -e "s/\s*=\s*/=/g" -e "s/\s*:\s*/:/g" -e "s/,\{1,\}/ /g" -e "s/\s\{1,\}/ /g" -e "s/\s*$//"` + + if [ "$scan" = 0 ] ; then + line=`echo "$line" | sed -e "s/ /,/g"` + if [ "$feature" == "range" ] ; then + range=$line + fi + if [ "$feature" == "kernel" ] ; then + kernel=$line + fi + if [ "$feature" == "gpu" ] ; then + gpu_index=$line + fi + else + output=$outdir/input${index}.xml + header="# $timestamp '$output' generated with '$0 $*'" + + if [ "$feature" == "pmc" ] ; then + line=`echo "$line" | sed -e "s/ /,/g"` + cat >> $output < + +EOF + fi + + if [ "$feature" == "sqtt" ] ; then + cat >> $output < + +EOF + fi + fi + + index=$((index + 1)) + done < $input +} + +parse 0 +parse 1 + +exit 0 diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index ca7c4804..a71acb66 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -1,16 +1,16 @@ 
################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -48,6 +48,8 @@ set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) +set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-result" ) +#set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=int-in-bool-context" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) @@ -64,6 +66,11 @@ if ( DEFINED ENV{CMAKE_DEBUG_TRACE} ) add_definitions ( -DDEBUG_TRACE=1 ) endif() +## Enable AQL-profile new API +if ( NOT DEFINED ENV{CMAKE_CURR_API} ) + add_definitions ( -DAQLPROF_NEW_API=1 ) +endif() + ## Enable direct loading of AQL-profile HSA extension if ( DEFINED ENV{CMAKE_LD_AQLPROFILE} ) add_definitions ( -DROCP_LD_AQLPROFILE=1 ) diff --git a/cmake_modules/utils.cmake b/cmake_modules/utils.cmake index 15865820..f95a7833 100644 --- 
a/cmake_modules/utils.cmake +++ b/cmake_modules/utils.cmake @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index e7a5a1e0..17106687 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -45,8 +45,8 @@ THE SOFTWARE. 
#include #include -#define ROCPROFILER_VERSION_MAJOR 1 -#define ROCPROFILER_VERSION_MINOR 1 +#define ROCPROFILER_VERSION_MAJOR 3 +#define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus extern "C" { @@ -178,7 +178,7 @@ hsa_status_t rocprofiler_open(hsa_agent_t agent, // GPU han uint32_t mode, // profiling mode mask rocprofiler_properties_t* properties); // profiling properties -// Add feature to e features set +// Add feature to a features set hsa_status_t rocprofiler_add_feature(const rocprofiler_feature_t* feature, // [in] rocprofiler_feature_set_t* features_set); // [in/out] profiling features set @@ -204,10 +204,10 @@ hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling contex // Dispatch record typedef struct { - uint64_t dispatch; // dispatch timestamp - uint64_t begin; // begin timestamp - uint64_t end; // end timestamp - uint64_t complete; // completion signal timestamp + uint64_t dispatch; // dispatch timestamp, ns + uint64_t begin; // kernel begin timestamp, ns + uint64_t end; // kernel end timestamp, ns + uint64_t complete; // completion signal timestamp, ns } rocprofiler_dispatch_record_t; // Profiling callback data @@ -326,8 +326,11 @@ typedef struct { union { struct { const char* name; // metric name + uint32_t instances; // instances number const char* expr; // metric expression, NULL for basic counters const char* description; // metric description + const char* block_name; // block name + uint32_t block_counters; // number of block counters } metric; struct { const char* name; // trace name @@ -357,6 +360,13 @@ hsa_status_t rocprofiler_query_info( hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback void *data); // [in/out] data passed to callback +// Creates a profiled queue. 
All dispatches on this queue will be profiled +hsa_status_t rocprofiler_queue_create_profiled( + hsa_agent_t agent_handle,uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue); + #ifdef __cplusplus } // extern "C" block #endif // __cplusplus diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45bc2719..9a398411 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE diff --git a/src/core/context.h b/src/core/context.h index 966acaef..f7ad792d 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -27,10 +27,14 @@ THE SOFTWARE. 
#include #include +#include // usleep +#include +#include #include #include #include +#include "core/group_set.h" #include "core/metrics.h" #include "core/profile.h" #include "core/queue.h" @@ -47,26 +51,6 @@ inline unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); } -// Block descriptor -struct block_des_t { - uint32_t id; - uint32_t index; -}; - -// block_des_t less-then functor -struct lt_block_des { - bool operator()(const block_des_t& a1, const block_des_t& a2) const { - return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index)); - } -}; - -// Block status -struct block_status_t { - uint32_t max_counters; - uint32_t counter_index; - uint32_t group_index; -}; - // Metrics arguments template class MetricArgs : public xml::args_cache_t { public: @@ -94,6 +78,9 @@ template class MetricArgs : public xml::args_cache_t { // Profiling group class Group { public: + typedef uint32_t refs_t; + typedef std::atomic atomic_refs_t; + Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), sqtt_profile_(agent_info), @@ -143,10 +130,10 @@ class Group { Context* GetContext() { return context_; } uint32_t GetIndex() const { return index_; } - void ResetRefs() { refs_ = n_profiles_; } - uint32_t DecrRefs() { - return (refs_ > 0) ? 
--refs_ : 0; - } + atomic_refs_t* AtomicRefsCount() { return reinterpret_cast(&refs_); } + void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); } + void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); } + uint32_t FetchDecrRefsCount() { return AtomicRefsCount()->fetch_sub(1, std::memory_order_acq_rel); } private: PmcProfile pmc_profile_; @@ -156,7 +143,7 @@ class Group { pkt_vector_t stop_vector_; pkt_vector_t read_vector_; uint32_t n_profiles_; - uint32_t refs_; + refs_t refs_; Context* const context_; const uint32_t index_; }; @@ -164,7 +151,6 @@ class Group { // Profiling context class Context { public: - typedef std::mutex mutex_t; typedef std::map info_map_t; Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, @@ -177,14 +163,21 @@ class Context { handler_(handler), handler_arg_(handler_arg) { + if (info_count == 0) return; + metrics_ = MetricsDict::Create(agent_info); if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); - Initialize(info, info_count); + if (Initialize(info, info_count) == false) { + fprintf(stdout, "\nInput metrics out of HW limit. 
Proposed metrics group set:\n"); fflush(stdout); + MetricsGroupSet(agent_info, info, info_count).Print(stdout); + fprintf(stdout, "\n"); fflush(stdout); + EXC_RAISING(HSA_STATUS_ERROR, "Metrics list exceeds HW limits"); + } Finalize(); if (handler != NULL) { for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { - set_[group_index].ResetRefs(); + set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Handler for stop packet completion @@ -207,7 +200,7 @@ class Context { } // Initialize rocprofiler context - void Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { + bool Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { // Register input features to not duplicate by features referencing for (unsigned i = 0; i < info_count; ++i) { rocprofiler_feature_t* info = &info_array[i]; @@ -270,9 +263,12 @@ class Context { block_status.max_counters = block_counters; } if (block_status.counter_index >= block_status.max_counters) { + return false; + block_status.counter_index = 0; block_status.group_index += 1; } + block_status.counter_index += 1; if (block_status.group_index >= set_.size()) { set_.push_back(Group(agent_info_, this, block_status.group_index)); } @@ -285,6 +281,8 @@ class Context { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } } + + return true; } void Finalize() { @@ -294,11 +292,11 @@ class Context { } } - void Reset(const uint32_t& group_index) { set_[group_index].ResetRefs(); } + void Reset(const uint32_t& group_index) { set_[group_index].ResetRefsCount(); } uint32_t GetGroupCount() const { return set_.size(); } - rocprofiler_group_t GetGroupInfo(Group* g) { + inline rocprofiler_group_t GetGroupInfo(Group* g) { rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); rocprofiler_group_t group = {}; group.index = g->GetIndex(); @@ -307,8 +305,14 @@ class Context { 
group.feature_count = info_vector.size(); return group; } - rocprofiler_group_t GetGroupInfo(const uint32_t& index) { - return GetGroupInfo(&set_[index]); + inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + rocprofiler_group_t group = {}; + if (set_.empty()) { + group.context = reinterpret_cast(this); + } else { + group = GetGroupInfo(&set_[index]); + } + return group; } const pkt_vector_t& StartPackets(const uint32_t& group_index) const { @@ -355,14 +359,7 @@ class Context { const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Wait for stop packet to complete - const uint64_t timeout = timeout_; - bool complete = false; - while (!complete) { - const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, timeout, - HSA_WAIT_STATE_BLOCKED); - complete = (signal_value < 1); - if (!complete) WARN_LOGGING("timeout"); - } + hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1); for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; const hsa_status_t status = @@ -398,8 +395,19 @@ class Context { } } - static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } - static uint64_t GetTimeout() { return timeout_; } + static bool Handler(hsa_signal_value_t value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + auto r = group->FetchDecrRefsCount(); + if (r == 1) { + const rocprofiler_group_t group_info = context->GetGroupInfo(group); + context->handler_(group_info, context->handler_arg_); + } + return false; + } + + Group* GetGroup(const uint32_t& index) { return &set_[index]; } + rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } private: // Getting profling packets @@ -412,18 +420,6 @@ class Context { 
return vec; } - static bool Handler(hsa_signal_value_t value, void* arg) { - Group* group = reinterpret_cast(arg); - Context* context = group->GetContext(); - context->mutex_.lock(); - uint32_t r = group->DecrRefs(); - context->mutex_.unlock(); - if (r == 0) { - return context->handler_(context->GetGroupInfo(group), context->handler_arg_); - } - return false; - } - static hsa_status_t DataCallback(hsa_ven_amd_aqlprofile_info_type_t ainfo_type, hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -513,9 +509,6 @@ class Context { return info; } - // Profiling data waiting timeout - static uint64_t timeout_; - // GPU handel const hsa_agent_t agent_; const util::AgentInfo* agent_info_; @@ -538,7 +531,6 @@ class Context { // Context completion handler rocprofiler_handler_t handler_; void* handler_arg_; - mutex_t mutex_; }; } // namespace rocprofiler diff --git a/src/core/group_set.h b/src/core/group_set.h new file mode 100644 index 00000000..b255079b --- /dev/null +++ b/src/core/group_set.h @@ -0,0 +1,244 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_GROUP_SET_H_ +#define SRC_CORE_GROUP_SET_H_ + +#include +#include +#include + +#include "core/metrics.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" + +namespace rocprofiler { + +// Block descriptor +struct block_des_t { + uint32_t id; + uint32_t index; +}; + +// block_des_t less-then functor +struct lt_block_des { + bool operator()(const block_des_t& a1, const block_des_t& a2) const { + return (a1.id < a2.id) || ((a1.id == a2.id) && (a1.index < a2.index)); + } +}; + +// Block status +struct block_status_t { + uint32_t max_counters; + uint32_t counter_index; + uint32_t group_index; +}; + +// Metrics set class +class MetricsGroup { + public: + // Info map type + typedef std::map info_map_t; + // Blocks map type + typedef std::map blocks_map_t; + + MetricsGroup(const util::AgentInfo* agent_info) : + agent_info_(agent_info) + { + metrics_ = MetricsDict::Create(agent_info); + if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + } + + void Print(FILE* file) const { + for (const Metric* metric : metrics_vec_) { + fprintf(file, " %s", metric->GetName().c_str()); fflush(stdout); + } + fprintf(file, "\n"); fflush(stdout); + } + + static const Metric* GetMetric(const MetricsDict* metrics, const std::string& name) { + // Metric object + const Metric* metric = metrics->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' 
is not found"); + return metric; + } + + static const Metric* GetMetric(const MetricsDict* metrics, const rocprofiler_feature_t* info) { + // Metrics name + const char* name = info->name; + if (name == NULL) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); + const Metric* metric = GetMetric(metrics, name); +#if 0 + std::cout << " " << name << (metric->GetExpr() ? " = " + metric->GetExpr()->String() : " counter") << std::endl; +#endif + return metric; + } + + // Add metric + bool AddMetric(const rocprofiler_feature_t* info) { + return AddMetric(GetMetric(metrics_, info)); + } + + bool AddMetric(const Metric* metric) { + // Blocks utilization delta + blocks_map_t blocks_delta; + + // Process metrics counters + const counters_vec_t& counters_vec = metric->GetCounters(); + if (counters_vec.empty()) + EXC_RAISING(HSA_STATUS_ERROR, "bad metric '" << metric->GetName() << "' is empty"); + + for (const counter_t* counter : counters_vec) { + const event_t* event = &(counter->event); + + // For metrics expressions checking that there is no the same counter in the input metrics + // and also that the counter wasn't registered already by another input metric expression + if (info_map_.find(counter->name) != info_map_.end()) continue; + + const block_des_t block_des = {event->block_name, event->block_index}; + auto ret = blocks_map_.insert({block_des, {}}); + block_status_t& block_status = ret.first->second; + if (ret.second == true) { + profile_t query = {}; + query.agent = agent_info_->dev_id; + query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + query.events = event; + + uint32_t block_counters; + hsa_status_t status = util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); + block_status.max_counters = block_counters; + } + + ret = blocks_delta.insert({block_des, 
block_status}); + block_status_t& delta_status = ret.first->second; + delta_status.counter_index += 1; + if (delta_status.counter_index > delta_status.max_counters) return false; + } + + // Register metric + metrics_vec_.push_back(metric); + info_map_[metric->GetName()] = metric; + for (const counter_t* counter : counters_vec) { + if (info_map_.find(counter->name) == info_map_.end()) info_map_[counter->name] = NewCounterInfo(counter->name); + } + for (const auto& entry : blocks_delta) { + blocks_map_[entry.first] = entry.second; + } + + return true; + } + + private: + const Metric* NewCounterInfo(const std::string& name) const { + return GetMetric(metrics_, name); + } + + // Agent info + const util::AgentInfo* const agent_info_; + // Metrics dictionary + const MetricsDict* metrics_; + // Info map + info_map_t info_map_; + // Blocks map + blocks_map_t blocks_map_; + // Metrics vector + std::vector metrics_vec_; +}; + +// Metrics groups class +class MetricsGroupSet { + public: + MetricsGroupSet(const util::AgentInfo* agent_info, const rocprofiler_feature_t* info_array, const uint32_t info_count) : + agent_info_(agent_info) + { + metrics_ = MetricsDict::Create(agent_info); + if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + Initialize(info_array, info_count); + } + + ~MetricsGroupSet() { + for (auto* group : groups_) delete group; + } + + uint32_t GetSize() const { return groups_.size(); } + + void Print(FILE* file) const { + uint32_t idx = 0; + for (const auto* group : groups_) { + ++idx; + fprintf(stdout, " group%u:", idx); fflush(stdout); + group->Print(file); + } + } + + private: + void Initialize(const rocprofiler_feature_t* info_array, const uint32_t info_count) { + std::multimap > input_metrics; + for (unsigned i = 0; i < info_count; ++i) { + const rocprofiler_feature_t* info = &info_array[i]; + if (info->kind != ROCPROFILER_FEATURE_KIND_METRIC) continue; + const Metric* metric = MetricsGroup::GetMetric(metrics_, info); + const 
uint32_t counters_num = metric->GetCounters().size(); + input_metrics.insert({counters_num, metric}); + + if (MetricsGroup(agent_info_).AddMetric(metric) == false) { + AQL_EXC_RAISING(HSA_STATUS_ERROR, "Metric '" << metric->GetName() << "' doesn't fit in one group"); + } + } +#if 0 + for (const auto& entry : input_metrics) { + printf("%u %s\n", entry.first, entry.second->GetName().c_str()); + } +#endif + auto end = input_metrics.end(); + while (!input_metrics.empty()) { + MetricsGroup* group = NextGroup(); + auto it = input_metrics.begin(); + do { + auto curr = it++; + const Metric* metric = curr->second; + if (group->AddMetric(metric) == true) { + input_metrics.erase(curr); + } + } while (it != end); + } + } + + MetricsGroup* NextGroup() { + groups_.push_back(new MetricsGroup(agent_info_)); + return groups_.back(); + } + + // Agent info + const util::AgentInfo* const agent_info_; + // Metrics dictionary + const MetricsDict* metrics_; + // Metrics group vector + std::vector groups_; +}; + +} // namespace rocprofiler + +#endif // SRC_CORE_GROUP_SET_H_ diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index a2a289aa..7703c662 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -34,7 +34,8 @@ InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL; void* InterceptQueue::callback_data_ = NULL; InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; const char* InterceptQueue::kernel_none_ = ""; -uint64_t InterceptQueue::timeout_ = UINT64_MAX; Tracker* InterceptQueue::tracker_ = NULL; bool InterceptQueue::tracker_on_ = false; +bool InterceptQueue::in_constr_call_ = false; + } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index c5376bb9..627a718a 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -48,51 +48,68 @@ class InterceptQueue { typedef std::recursive_mutex mutex_t; typedef std::map obj_map_t; typedef hsa_status_t 
(*queue_callback_t)(hsa_queue_t*, void* data); + typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t *queue, void *arg); static void HsaIntercept(HsaApiTable* table); - static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + static hsa_status_t InterceptQueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), void* data, uint32_t private_segment_size, - uint32_t group_segment_size, hsa_queue_t** queue) { - hsa_status_t status = HSA_STATUS_ERROR; + uint32_t group_segment_size, hsa_queue_t** queue, + const bool& tracker_on) { std::lock_guard lck(mutex_); + hsa_status_t status = HSA_STATUS_ERROR; + + if (in_constr_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()"); + in_constr_call_ = true; - ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, callback, data, private_segment_size, + ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size, group_segment_size, queue, &status); - if (status != HSA_STATUS_SUCCESS) abort(); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "ProxyQueue::Create()"); - if (tracker_on_ && (tracker_ == NULL)) { - tracker_ = new Tracker(timeout_); + if (tracker_on || tracker_on_) { + if (tracker_ == NULL) tracker_ = new Tracker; status = hsa_amd_profiling_set_profiler_enabled(*queue, true); - if (status != HSA_STATUS_SUCCESS) abort(); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()"); } if (!obj_map_) obj_map_ = new obj_map_t; InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy); (*obj_map_)[(uint64_t)(*queue)] = obj; status = proxy->SetInterceptCB(OnSubmitCB, obj); + obj->queue_event_callback_ = callback; + in_constr_call_ = false; return status; } + static hsa_status_t QueueCreate(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t 
status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue) { + return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, false); + } + + static hsa_status_t QueueCreateTracked(hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, + void* data), + void* data, uint32_t private_segment_size, + uint32_t group_segment_size, hsa_queue_t** queue) { + return InterceptQueueCreate(agent, size, type, callback, data, private_segment_size, group_segment_size, queue, true); + } + static hsa_status_t QueueDestroy(hsa_queue_t* queue) { std::lock_guard lck(mutex_); hsa_status_t status = HSA_STATUS_ERROR; - if (destroy_callback_ != NULL) { - status = destroy_callback_(queue, callback_data_); - if (status != HSA_STATUS_SUCCESS) return status; - } + if (destroy_callback_ != NULL) { + status = destroy_callback_(queue, callback_data_); + } - obj_map_t::iterator it = obj_map_->find((uint64_t)queue); - if (it != obj_map_->end()) { - const InterceptQueue* obj = it->second; - assert(queue == obj->queue_); - delete obj; - obj_map_->erase(it); - status = HSA_STATUS_SUCCESS; + if (status == HSA_STATUS_SUCCESS) { + status = DelObj(queue); } return status; @@ -104,47 +121,73 @@ class InterceptQueue { InterceptQueue* obj = reinterpret_cast(data); Queue* proxy = obj->proxy_; + // Travers input packets for (uint64_t j = 0; j < count; ++j) { - bool to_submit = true; const packet_t* packet = &packets_arr[j]; + bool to_submit = true; + // Checking for dispatch packet type if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (dispatch_callback_ != NULL)) { - rocprofiler_group_t group = {}; const hsa_kernel_dispatch_packet_t* dispatch_packet = reinterpret_cast(packet); - const char* kernel_name = GetKernelName(dispatch_packet); - const rocprofiler_dispatch_record_t* record = NULL; + + // 
Adding kernel timing tracker + Tracker::entry_t* tracker_entry = NULL; if (tracker_ != NULL) { - const auto* entry = tracker_->Add(obj->agent_info_->dev_id, dispatch_packet->completion_signal); - const_cast(dispatch_packet)->completion_signal = entry->signal; - record = entry->record; + tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal); + const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; } + + // Prepareing dispatch callback data + const char* kernel_name = GetKernelName(dispatch_packet); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, obj->queue_, user_que_idx, dispatch_packet, kernel_name, - record}; + (tracker_entry) ? tracker_entry->record : NULL}; + + // Calling dispatch callback + rocprofiler_group_t group = {}; hsa_status_t status = dispatch_callback_(&data, callback_data_, &group); free(const_cast(kernel_name)); - if ((status == HSA_STATUS_SUCCESS) && (group.context != NULL)) { + // Injecting profiling start/stop packets + if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { + if (tracker_entry != NULL) tracker_->Delete(tracker_entry); + } else { Context* context = reinterpret_cast(group.context); - const pkt_vector_t& start_vector = context->StartPackets(group.index); - const pkt_vector_t& stop_vector = context->StopPackets(group.index); - - pkt_vector_t packets = start_vector; - packets.insert(packets.end(), *packet); - packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); - if (writer != NULL) { - writer(&packets[0], packets.size()); + + if (group.feature_count != 0) { + if (tracker_entry != NULL) { + Group* context_group = context->GetGroup(group.index); + context_group->IncrRefsCount(); + tracker_->Enable(tracker_entry, Context::Handler, reinterpret_cast(context_group)); + } + + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& stop_vector = 
context->StopPackets(group.index); + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; } else { - proxy->Submit(&packets[0], packets.size()); + if (tracker_entry != NULL) { + void* context_handler_arg = NULL; + rocprofiler_handler_t context_handler_fun = context->GetHandler(&context_handler_arg); + tracker_->Enable(tracker_entry, context_handler_fun, context_handler_arg); + rocprofiler_close(context); + } } - to_submit = false; } } + // Submitting the original packets if profiling was not enabled if (to_submit) { if (writer != NULL) { writer(packet, 1); @@ -152,8 +195,6 @@ class InterceptQueue { proxy->Submit(packet, 1); } } - - packet += 1; } } @@ -164,22 +205,19 @@ class InterceptQueue { destroy_callback_ = destroy_callback; } - static void SetTimeout(uint64_t timeout) { timeout_ = timeout; } static void TrackerOn(bool on) { tracker_on_ = on; } static bool IsTrackerOn() { return tracker_on_; } private: - InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy) : - queue_(queue), - proxy_(proxy) - { - agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) { + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported"); + InterceptQueue* obj = GetObj(queue); + if (obj->queue_event_callback_) obj->queue_event_callback_(status, obj->queue_, arg); } - ~InterceptQueue() { ProxyQueue::Destroy(proxy_); } - static packet_word_t GetHeaderType(const packet_t* packet) { + static hsa_packet_type_t GetHeaderType(const packet_t* packet) { const packet_word_t* header = reinterpret_cast(packet); - return (*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask; + return 
static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); } static const char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) { @@ -209,6 +247,45 @@ class InterceptQueue { return funcname; } + // method to get an intercept queue object + static InterceptQueue* GetObj(const hsa_queue_t* queue) { + std::lock_guard lck(mutex_); + InterceptQueue* obj = NULL; + obj_map_t::const_iterator it = obj_map_->find((uint64_t)queue); + if (it != obj_map_->end()) { + obj = it->second; + assert(queue == obj->queue_); + } + return obj; + } + + // method to delete an intercept queue object + static hsa_status_t DelObj(const hsa_queue_t* queue) { + std::lock_guard lck(mutex_); + hsa_status_t status = HSA_STATUS_ERROR; + obj_map_t::const_iterator it = obj_map_->find((uint64_t)queue); + if (it != obj_map_->end()) { + const InterceptQueue* obj = it->second; + assert(queue == obj->queue_); + delete obj; + obj_map_->erase(it); + status = HSA_STATUS_SUCCESS;; + } + return status; + } + + InterceptQueue(const hsa_agent_t& agent, hsa_queue_t* const queue, ProxyQueue* proxy) : + queue_(queue), + proxy_(proxy) + { + agent_info_ = util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + queue_event_callback_ = NULL; + } + + ~InterceptQueue() { + ProxyQueue::Destroy(proxy_); + } + static mutex_t mutex_; static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; static rocprofiler_callback_t dispatch_callback_; @@ -216,13 +293,14 @@ class InterceptQueue { static void* callback_data_; static obj_map_t* obj_map_; static const char* kernel_none_; - static uint64_t timeout_; static Tracker* tracker_; static bool tracker_on_; + static bool in_constr_call_; hsa_queue_t* const queue_; ProxyQueue* const proxy_; const util::AgentInfo* agent_info_; + queue_event_callback_t queue_event_callback_; }; } // namespace rocprofiler diff --git a/src/core/profile.h b/src/core/profile.h index 43d30a21..6d91192b 100644 --- a/src/core/profile.h +++ 
b/src/core/profile.h @@ -140,10 +140,14 @@ class Profile { if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start"); status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop"); +#ifdef AQLPROF_NEW_API hsa_status_t rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); #if 0 // Read API returns error if disabled if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); #endif +#else + hsa_status_t rd_status = HSA_STATUS_ERROR; +#endif // Set completion signal hsa_signal_t dummy_signal{}; diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index a96fadba..e8901387 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -152,7 +152,7 @@ bool LoadTool() { settings.intercept_mode = (intercept_mode) ? 1 : 0; settings.sqtt_size = SqttProfile::GetSize(); settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; - settings.timeout = Context::GetTimeout(); + settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; if (handler) handler(); @@ -161,8 +161,7 @@ bool LoadTool() { intercept_mode = (settings.intercept_mode != 0); SqttProfile::SetSize(settings.sqtt_size); SqttProfile::SetLocal(settings.sqtt_local != 0); - Context::SetTimeout(settings.timeout); - InterceptQueue::SetTimeout(settings.timeout); + util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); } @@ -188,8 +187,8 @@ CONSTRUCTOR_API void constructor() { } DESTRUCTOR_API void destructor() { - util::HsaRsrcFactory::Destroy(); rocprofiler::MetricsDict::Destroy(); + util::HsaRsrcFactory::Destroy(); util::Logger::Destroy(); } @@ -211,10 +210,8 @@ hsa_status_t GetExcStatus(const std::exception& e) { } rocprofiler_properties_t rocprofiler_properties; -uint64_t Context::timeout_ = UINT64_MAX; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; -Tracker::mutex_t Tracker::mutex_; util::Logger::mutex_t util::Logger::mutex_; util::Logger* util::Logger::instance_ = NULL; } @@ -230,11 +227,36 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa rocprofiler::SaveHsaApi(table); rocprofiler::ProxyQueue::InitFactory(); bool intercept_mode = false; + + // Checking environment to enable intercept mode const char* intercept_env = getenv("ROCP_HSA_INTERCEPT"); if (intercept_env != NULL) { - if (strncmp(intercept_env, "1", 1) == 0) intercept_mode = true; + switch (atoi(intercept_env)) { + // Intercepting disabled + case 0: + intercept_mode = false; + rocprofiler::InterceptQueue::TrackerOn(false); + break; + // Intercepting enabled without timestamping + case 1: + intercept_mode = true; + rocprofiler::InterceptQueue::TrackerOn(false); + break; + // Intercepting enabled with timestamping + case 2: + intercept_mode = true; + rocprofiler::InterceptQueue::TrackerOn(true); + break; + default: + ERR_LOGGING("Bad ROCP_HSA_INTERCEPT env var value (" << intercept_env << ")"); + return false; + } } - 
if (rocprofiler::LoadTool()) intercept_mode = true; + + // Loading a tool lib and setting of intercept mode + const bool intercept_mode_on = rocprofiler::LoadTool(); + if (intercept_mode_on) intercept_mode = true; + // HSA intercepting if (intercept_mode) { rocprofiler::ProxyQueue::HsaIntercept(table); @@ -479,6 +501,43 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_info( info.metric.name = strdup(name.c_str()); info.metric.description = strdup(descr.c_str()); info.metric.expr = expr.empty() ? NULL : strdup(expr.c_str()); + + if (expr.empty()) { + // Getting the block name + const std::string block_name = node->opts["block"]; + + // Querying profile + rocprofiler::profile_t profile = {}; + profile.agent = agent_info->dev_id; + profile.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + + // Query block id info + hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; + hsa_status_t status = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_ID, &query); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(HSA_STATUS_ERROR, "get block id info: '" << block_name << "'"); + + // Metric object + const std::string metric_name = (query.instance_count > 1) ? 
name + "[0]" : name; + const rocprofiler::Metric* metric = dict->Get(metric_name); + if (metric == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metric '" << name << "' is not found"); + + // Process metrics counters + const rocprofiler::counters_vec_t& counters_vec = metric->GetCounters(); + if (counters_vec.size() != 1) EXC_RAISING(HSA_STATUS_ERROR, "error: '" << metric->GetName() << "' is not basic"); + + // Query block counters number + uint32_t block_counters; + profile.events = &(counters_vec[0]->event); + status = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) continue; + + info.metric.instances = query.instance_count; + info.metric.block_name = block_name.c_str(); + info.metric.block_counters = block_counters; + } + status = callback(info, data); if (status != HSA_STATUS_SUCCESS) break; } @@ -519,4 +578,14 @@ PUBLIC_API hsa_status_t rocprofiler_query_info( API_METHOD_SUFFIX } +// Creates a profiled queue. All dispatches on this queue will be profiled +PUBLIC_API hsa_status_t rocprofiler_queue_create_profiled( + hsa_agent_t agent, uint32_t size, hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), + void* data, uint32_t private_segment_size, uint32_t group_segment_size, + hsa_queue_t** queue) +{ + return rocprofiler::InterceptQueue::QueueCreateTracked(agent, size, type, callback, data, private_segment_size, group_segment_size, queue); +} + } // extern "C" diff --git a/src/core/tracker.h b/src/core/tracker.h index eae0c112..acbf5cf6 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -28,9 +28,11 @@ THE SOFTWARE. 
#include #include +#include #include #include +#include "util/hsa_rsrc_factory.h" #include "inc/rocprofiler.h" #include "util/exception.h" #include "util/logger.h" @@ -39,12 +41,13 @@ namespace rocprofiler { class Tracker { public: - typedef uint64_t timestamp_t; - typedef long double freq_t; typedef std::mutex mutex_t; + typedef util::HsaRsrcFactory::timestamp_t timestamp_t; typedef rocprofiler_dispatch_record_t record_t; struct entry_t; typedef std::list sig_list_t; + typedef sig_list_t::iterator sig_list_it_t; + struct entry_t { Tracker* tracker; sig_list_t::iterator it; @@ -52,71 +55,59 @@ class Tracker { hsa_signal_t orig; hsa_signal_t signal; record_t* record; + std::atomic handler; + void* arg; + bool context_active; }; - Tracker(uint64_t timeout = UINT64_MAX) : timeout_(timeout), outstanding(0) { - timestamp_t timestamp_hz = 0; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ×tamp_hz); - if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)"); - timestamp_factor_ = (freq_t)1000000000 / (freq_t)timestamp_hz; - } + Tracker() : + outstanding_(0), + hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) + {} + ~Tracker() { - mutex_.lock(); - for (entry_t* entry : sig_list_) { - assert(entry != NULL); - while (1) { - const hsa_signal_value_t signal_value = hsa_signal_wait_scacquire( - entry->signal, - HSA_SIGNAL_CONDITION_LT, - 1, - timeout_, - HSA_WAIT_STATE_BLOCKED); - if (signal_value < 1) break; - else WARN_LOGGING("tracker timeout"); - } - Del(entry); + auto it = sig_list_.begin(); + auto end = sig_list_.end(); + while (it != end) { + auto cur = it++; + hsa_rsrc_->SignalWait((*cur)->signal); + Erase(cur); } - mutex_.unlock(); } // Add tracker entry - entry_t* Add(const hsa_agent_t& agent, const hsa_signal_t& orig) { + entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) { hsa_status_t status = HSA_STATUS_ERROR; + + // Creating a new tracker entry entry_t* entry = 
new entry_t{}; assert(entry); entry->tracker = this; - mutex_.lock(); - entry->it = sig_list_.insert(sig_list_.begin(), entry); - mutex_.unlock(); - entry->agent = agent; entry->orig = orig; - status = hsa_signal_create(1, 0, NULL, &(entry->signal)); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + // Creating a record with the dispatch timestamps record_t* record = new record_t{}; assert(record); + record->dispatch = hsa_rsrc_->TimestampNs(); entry->record = record; - status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &record->dispatch); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); - hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + // Creating a proxy signal + status = hsa_signal_create(1, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); - if (trace_on_) { - mutex_.lock(); - entry->tracker->outstanding++; - fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); - fflush(stdout); - mutex_.unlock(); - } + // Adding antry to the list + mutex_.lock(); + entry->it = sig_list_.insert(sig_list_.begin(), entry); + mutex_.unlock(); return entry; } - private: // Delete tracker entry - void Del(entry_t* entry) { + void Delete(entry_t* entry) { hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); @@ -124,31 +115,53 @@ class Tracker { delete entry; } - // Handler for packet completion - static bool Handler(hsa_signal_value_t value, void* arg) { - entry_t* entry = reinterpret_cast(arg); + // Enable tracker entry + void Enable(entry_t* entry, void* handler, void* arg) { + // Set entry handler and release the entry + 
entry->arg = arg; + entry->handler.store(handler, std::memory_order_release); + + // Debug trace + if (trace_on_) { + auto outstanding = outstanding_.fetch_add(1); + fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); + fflush(stdout); + } + } + + void Enable(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { + entry->context_active = true; + Enable(entry, reinterpret_cast(handler), arg); + } + void Enable(entry_t* entry, rocprofiler_handler_t handler, void* arg) { + Enable(entry, reinterpret_cast(handler), arg); + } + + private: + // Delete an entry by iterator + void Erase(const sig_list_it_t& it) { Delete(*it); } + + // Entry completion + void Complete(entry_t* entry) { record_t* record = entry->record; + // Debug trace if (trace_on_) { - mutex_.lock(); - entry->tracker->outstanding--; - fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, entry->tracker->outstanding); + auto outstanding = outstanding_.fetch_sub(1); + fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); fflush(stdout); - mutex_.unlock(); } - timestamp_t complete_timestamp = 0; + // Query begin/end and complete timestamps hsa_amd_profiling_dispatch_time_t dispatch_time{}; - - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &complete_timestamp); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)"); - status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); + hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); - record->complete = entry->tracker->timestamp2ns(complete_timestamp); - record->begin = entry->tracker->timestamp2ns(dispatch_time.start); - record->end = 
entry->tracker->timestamp2ns(dispatch_time.end); + record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); + record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); + record->complete = hsa_rsrc_->TimestampNs(); + // Original intercepted signal completion hsa_signal_t orig = entry->orig; if (orig.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig.handle); @@ -159,26 +172,41 @@ class Tracker { const hsa_signal_value_t value = hsa_signal_load_relaxed(orig); hsa_signal_store_screlease(orig, value - 1); } - entry->tracker->Del(entry); - - return false; } - inline timestamp_t timestamp2ns(const timestamp_t& timestamp) const { - const freq_t timestamp_ns = (freq_t)timestamp * timestamp_factor_; - return (timestamp_t)timestamp_ns; + // Handler for packet completion + static bool Handler(hsa_signal_value_t, void* arg) { + // Acquire entry + entry_t* entry = reinterpret_cast(arg); + volatile std::atomic* ptr = &entry->handler; + while (ptr->load(std::memory_order_acquire) == NULL) sched_yield(); + + // Complete entry + entry->tracker->Complete(entry); + + // Call entry handler + void* handler = static_cast(entry->handler); + if (entry->context_active) { + reinterpret_cast(handler)(0, entry->arg); + } else { + rocprofiler_group_t group{}; + reinterpret_cast(handler)(group, entry->arg); + } + + // Delete tracker entry + entry->tracker->Delete(entry); + + return false; } - // Timestamp frequency factor - freq_t timestamp_factor_; - // Timeout for wait on destruction - timestamp_t timeout_; // Tracked signals list sig_list_t sig_list_; // Inter-thread synchronization - static mutex_t mutex_; + mutex_t mutex_; // Outstanding dispatches - uint64_t outstanding; + std::atomic outstanding_; + // HSA resources factory + util::HsaRsrcFactory* hsa_rsrc_; // Enable tracing static const bool trace_on_ = false; }; diff --git a/src/core/types.h b/src/core/types.h index fd8bae33..c58d6cf2 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -32,6 +32,7 @@ typedef 
hsa_ven_amd_aqlprofile_parameter_t parameter_t; typedef hsa_ven_amd_aqlprofile_profile_t profile_t; typedef hsa_ext_amd_aql_pm4_packet_t packet_t; typedef uint32_t packet_word_t; +typedef uint64_t timestamp_t; } // namespace rocprofiler #endif // SRC_CORE_TYPES_H_ diff --git a/src/util/exception.h b/src/util/exception.h index 8af5f980..730028c2 100644 --- a/src/util/exception.h +++ b/src/util/exception.h @@ -30,27 +30,27 @@ THE SOFTWARE. #include #define EXC_ABORT(error, stream) \ - { \ + do { \ std::ostringstream oss; \ oss << __FUNCTION__ << "(), " << stream; \ - std::cout << oss.str() << std::endl; \ + std::cout << "error(" << error << ") \"" << oss.str() << "\"" << std::endl; \ abort(); \ - } + } while (0) #define EXC_RAISING(error, stream) \ - { \ + do { \ std::ostringstream oss; \ oss << __FUNCTION__ << "(), " << stream; \ throw rocprofiler::util::exception(error, oss.str()); \ - } + } while (0) #define AQL_EXC_RAISING(error, stream) \ - { \ + do { \ const char* error_string = NULL; \ - const rocprofiler::pfn_t* api = util::HsaRsrcFactory::Instance().AqlProfileApi(); \ + const rocprofiler::pfn_t* api = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi(); \ api->hsa_ven_amd_aqlprofile_error_string(&error_string); \ EXC_RAISING(error, stream << ", " << error_string); \ - } + } while (0) namespace rocprofiler { namespace util { diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index ff749d15..3c50d27d 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -1,24 +1,26 @@ -/****************************************************************************** -Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ #include "util/hsa_rsrc_factory.h" @@ -42,6 +44,9 @@ THE SOFTWARE. #include #include +#include "util/exception.h" +#include "util/logger.h" + namespace rocprofiler { namespace util { @@ -108,14 +113,21 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { // Constructor of the class HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; + + cpu_pool_ = NULL; + kern_arg_pool_ = NULL; + // Initialize the Hsa Runtime if (initialize_hsa_) { status = hsa_init(); CHECK_STATUS("Error in hsa_init", status); } + // Discover the set of Gpu devices available on the platform status = hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); + if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); + if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); // Get AqlProfile API table aqlprofile_api_ = {0}; @@ -130,10 +142,19 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize loader_api_ = {0}; status = 
hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); CHECK_STATUS("loader API table query failed", status); + + // Instantiate HSA timer + timer_ = new HsaTimer; + CHECK_STATUS("HSA timer allocation failed", + (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + + // System timeout + timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { + delete timer_; for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { @@ -160,8 +181,10 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); +#ifdef AQLPROF_NEW_API api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); +#endif api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); @@ -191,9 +214,9 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_index = cpu_list_.size(); status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status); + if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status); + if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; cpu_list_.push_back(agent_info); @@ -355,7 +378,7 @@ uint8_t* 
HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; @@ -375,7 +398,7 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; @@ -399,22 +422,38 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s return ptr; } +// Wait signal +void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { + while (1) { + const hsa_signal_value_t signal_value = + hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + if (signal_value == 0) { + break; + } else { + if (signal_value == 1) WARN_LOGGING("signal waiting..."); + else EXC_RAISING(HSA_STATUS_ERROR, "hsa_signal_wait_scacquire (" << signal_value << ")"); + } + } +} + +// Wait signal with signal value restore +void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + SignalWait(signal); + hsa_signal_store_relaxed(const_cast(signal), signal_value); +} + // Copy data from GPU to host memory bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { hsa_status_t 
status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; status = hsa_signal_create(1, 0, NULL, &s); - if (status == HSA_STATUS_SUCCESS) { - status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); - if (status == HSA_STATUS_SUCCESS) { - if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) { - status = HSA_STATUS_ERROR; - } - } - status = hsa_signal_destroy(s); - } + CHECK_STATUS("hsa_signal_create()", status); + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + CHECK_STATUS("hsa_amd_memory_async_copy()", status); + SignalWait(s); + status = hsa_signal_destroy(s); + CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); } @@ -557,6 +596,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; +HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index b00ee8ed..c76046d2 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -1,24 +1,26 @@ -/****************************************************************************** -Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ #ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_ #define SRC_UTIL_HSA_RSRC_FACTORY_H_ @@ -43,21 +45,23 @@ THE SOFTWARE. #define HSA_QUEUE_ALIGN_BYTES 64 #define HSA_PACKET_ALIGN_BYTES 64 -#define CHECK_STATUS(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ +#define CHECK_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_SUCCESS) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ printf("%s: %s\n", msg, emsg ? emsg : ""); \ - exit(1); \ - } + abort(); \ + } \ +} while (0) -#define CHECK_ITER_STATUS(msg, status) \ - if (status != HSA_STATUS_INFO_BREAK) { \ +#define CHECK_ITER_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_INFO_BREAK) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ printf("%s: %s\n", msg, emsg ? 
emsg : ""); \ - exit(1); \ - } + abort(); \ + } \ +} while (0) namespace rocprofiler { namespace util { @@ -116,9 +120,42 @@ struct AgentInfo { uint32_t shader_arrays_per_se; }; +// HSA timer class +// Provides current HSA timestampa and system-clock/ns conversion API +class HsaTimer { + public: + typedef uint64_t timestamp_t; + static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; + typedef long double freq_t; + + HsaTimer() { + timestamp_t sysclock_hz = 0; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); + sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; + } + + // Methids for system-clock/ns conversion + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + + // Return timestamp in 'ns' + timestamp_t timestamp_ns() const { + timestamp_t sysclock; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); + return sysclock_to_ns(sysclock); + } + + private: + // Timestamp frequency factor + freq_t sysclock_factor_; +}; + class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; + typedef HsaTimer::timestamp_t timestamp_t; static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); @@ -204,6 +241,12 @@ class HsaRsrcFactory { // @return uint8_t* Pointer to buffer, null if allocation fails. 
uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + // Wait signal + void SignalWait(const hsa_signal_t& signal) const; + + // Wait signal with signal value restore + void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + // Copy data from GPU to host memory bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); @@ -235,6 +278,19 @@ class HsaRsrcFactory { // Return Loader API table const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + // Methods for system-clock/ns conversion and timestamp in 'ns' + timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); } + timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); } + timestamp_t TimestampNs() const { return timer_->timestamp_ns(); } + + timestamp_t GetSysTimeout() const { return timeout_; } + static timestamp_t GetTimeoutNs() { return timeout_ns_; } + static void SetTimeoutNs(const timestamp_t& time) { + std::lock_guard lck(mutex_); + timeout_ns_ = time; + if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -280,6 +336,18 @@ class HsaRsrcFactory { // Loader API table hsa_ven_amd_loader_1_00_pfn_t loader_api_; + + // System timeout, ns + static timestamp_t timeout_ns_; + // System timeout, sysclock + timestamp_t timeout_; + + // HSA timer + HsaTimer* timer_; + + // CPU/kern-arg memory pools + hsa_amd_memory_pool_t *cpu_pool_; + hsa_amd_memory_pool_t *kern_arg_pool_; }; } // namespace util diff --git a/src/util/logger.h b/src/util/logger.h index 97477899..d37f6567 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -66,6 +66,7 @@ class Logger { static void begm() { 
Instance().ResetStreaming(true); } static void endl() { Instance().ResetStreaming(false); } + static void errm() { Instance().SetError(); } static const std::string& LastMessage() { Logger& logger = Instance(); @@ -94,19 +95,27 @@ class Logger { static uint32_t GetPid() { return syscall(__NR_getpid); } static uint32_t GetTid() { return syscall(__NR_gettid); } - Logger() : file_(NULL), dirty_(false), streaming_(false), messaging_(false) { - const char* path = getenv("ROCPROFILER_LOG"); - if (path != NULL) { - file_ = fopen("/tmp/rocprofiler_log.txt", "a"); + Logger() : file_(NULL), session_file_(NULL), dirty_(false), streaming_(false), messaging_(false), error_(false) { + const char* var = getenv("ROCPROFILER_LOG"); + if (var != NULL) file_ = fopen("/tmp/rocprofiler_log.txt", "a"); + + var = getenv("ROCPROFILER_SESS"); + if (var != NULL) { + std::string dir = var; + if (dir.back() != '/') dir.push_back('/'); + std::string name = dir + "log.txt"; + session_file_ = fopen(name.c_str(), "a"); + if (session_file_ != NULL) session_dir_ = dir; + else std::cerr << "ROCProfiler: cannot create session log '" << name << "'" << std::endl << std::flush; } + ResetStreaming(false); } ~Logger() { - if (file_ != NULL) { - if (dirty_) Put("\n"); - fclose(file_); - } + if (dirty_) Put("\n"); + if (file_ != NULL) fclose(file_); + if (session_file_ != NULL) fclose(session_file_); } void ResetStreaming(const bool messaging) { @@ -129,8 +138,15 @@ class Logger { if (file_ != NULL) { dirty_ = true; flock(fileno(file_), LOCK_EX); + fprintf(file_, "%s", m.c_str()); fflush(file_); + + if (session_file_ != NULL) { + fprintf(session_file_, "%s", m.c_str()); + fflush(session_file_); + } + flock(fileno(file_), LOCK_UN); } } @@ -146,10 +162,23 @@ class Logger { Put(oss.str()); } + void SetError() { + std::lock_guard lck(mutex_); + if (error_ == false) { + error_ = true; + if (session_dir_.empty() == false) { + auto x = fopen(std::string(session_dir_ + "error").c_str(), "w"); (void)x; + } + } + } + 
FILE* file_; + FILE* session_file_; bool dirty_; bool streaming_; bool messaging_; + bool error_; + std::string session_dir_; static mutex_t mutex_; static Logger* instance_; @@ -160,32 +189,33 @@ class Logger { } // namespace rocprofiler #define ERR_LOGGING(stream) \ - { \ - rocprofiler::util::Logger::Instance() << "error: " << rocprofiler::util::Logger::begm \ + do { \ + rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::errm \ + << "error: " << rocprofiler::util::Logger::begm \ << stream << rocprofiler::util::Logger::endl; \ - } + } while(0) #define INFO_LOGGING(stream) \ - { \ + do { \ rocprofiler::util::Logger::Instance() << "info: " << rocprofiler::util::Logger::begm << stream \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #define WARN_LOGGING(stream) \ - { \ - std::cerr << "ROCProfiler: " << stream << std::endl; \ + do { \ + std::cerr << "ROCProfiler: " << stream << std::endl; \ rocprofiler::util::Logger::Instance() << "warning: " << rocprofiler::util::Logger::begm << stream \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #ifdef DEBUG #define DBG_LOGGING(stream) \ - { \ + do { \ rocprofiler::util::Logger::Instance() << rocprofiler::util::Logger::begm << "debug: \"" \ << stream << "\"" < < < < \ " in " << __FUNCTION__ << " at " << __FILE__ << " line " << __LINE__ \ << rocprofiler::util::Logger::endl; \ - } + } while(0) #endif #endif // SRC_UTIL_LOGGER_H_ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 278bc5c4..2f35639d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
-# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -20,7 +20,7 @@ # THE SOFTWARE. ################################################################################ -cmake_minimum_required ( VERSION 3.5.0 ) +cmake_minimum_required ( VERSION 2.8.12 ) set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) set ( EXE_NAME "ctrl" ) @@ -49,7 +49,7 @@ execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${TEST_NAME}/*.hsaco ${PROJECT_ ## Building test executable add_executable ( ${EXE_NAME} ${KERN_SRC} ${CTRL_SRC} ${UTIL_SRC} ) target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt atomic ) +target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/tool/*.xml ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "mkdir -p ${PROJECT_BINARY_DIR}/RESULTS" ) @@ -59,4 +59,4 @@ set ( TEST_LIB "tool" ) set ( TEST_LIB_SRC ${TEST_DIR}/tool/tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} 
) target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt atomic ) +target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) diff --git a/test/run.sh b/test/run.sh index 037b47a2..3ac292e6 100755 --- a/test/run.sh +++ b/test/run.sh @@ -1,16 +1,16 @@ ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -36,6 +36,8 @@ export ROCP_TOOL_LIB=libtool.so unset ROCP_PROXY_QUEUE # ROC profiler metrics config file export ROCP_METRICS=metrics.xml +# ROC profiler kernels timing +export ROCP_TIMESTAMP_ON=1 # output directory for the tool library, for metrics results file 'results.txt' export ROCP_OUTPUT_DIR=./RESULTS @@ -54,6 +56,11 @@ export ROCP_DITER=100 export ROCP_INPUT=input.xml eval $tbin +#export ROCP_KITER=1 +#export ROCP_DITER=4 +#export ROCP_INPUT=input1.xml +#eval $tbin + #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. 
diff --git a/test/tool/input1.xml b/test/tool/input1.xml new file mode 100644 index 00000000..254c83dc --- /dev/null +++ b/test/tool/input1.xml @@ -0,0 +1,5 @@ +# List of metrics + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 0eb79940..65385078 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -37,11 +37,14 @@ THE SOFTWARE. #include #include +#include +#include #include #include #include #include #include +#include #include #include "inc/rocprofiler.h" @@ -68,7 +71,8 @@ struct callbacks_data_t { // Context stored entry type struct context_entry_t { - uint32_t valid; + bool valid; + bool active; uint32_t index; hsa_agent_t agent; rocprofiler_group_t group; @@ -93,8 +97,6 @@ callbacks_data_t* callbacks_data = NULL; // Stored contexts array typedef std::map context_array_t; context_array_t* context_array = NULL; -typedef std::list wait_list_t; -wait_list_t* wait_list = NULL; // Contexts collected count volatile uint32_t context_count = 0; volatile uint32_t context_collected = 0; @@ -169,9 +171,9 @@ std::string filtr_kernel_name(const std::string name) { } ++rit; } - while (((*rit == ' ') || (*rit == ' ')) && (rit != rend)) rit++; + while (rit != rend) if ((*rit == ' ') || (*rit == ' ')) rit++; else break; auto rbeg = rit; - while ((*rit != ' ') && (*rit != ':') && (rit != rend)) rit++; + while (rit != rend) if ((*rit != ' ') && (*rit != ':')) rit++; else break; const uint32_t pos = rend - rit; const uint32_t length = rit - rbeg; return name.substr(pos, length); @@ -382,11 +384,12 @@ void output_group(const context_entry_t* entry, const char* label) { } } -// Dump stored context profiling output data -bool dump_context(context_entry_t* entry) { +// Dump stored context entry +bool dump_context_entry(context_entry_t* entry) { hsa_status_t status = HSA_STATUS_ERROR; - if (entry->valid == 0) return true; + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); const 
rocprofiler_dispatch_record_t* record = entry->data.record; if (record) { @@ -436,65 +439,48 @@ bool dump_context(context_entry_t* entry) { rocprofiler_close(group.context); } - entry->valid = 0; return true; } -// Dump and clean a given context entry -static inline bool dump_context_entry(context_entry_t* entry) { - const bool ret = dump_context(entry); - if (ret) dealloc_context_entry(entry); - return ret; -} - -// Dump waiting entries -static inline void dump_wait_list() { - if (pthread_mutex_lock(&mutex) != 0) { - perror("pthread_mutex_lock"); - abort(); - } - - auto it = wait_list->begin(); - auto end = wait_list->end(); - while (it != end) { - auto cur = it++; - if (dump_context_entry(*cur)) { - wait_list->erase(cur); +// Wait for and dump all stored contexts for a given queue if not NULL +void dump_context_array(hsa_queue_t* queue) { + bool done = false; + while (done == false) { + done = true; + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); } - } - if (pthread_mutex_unlock(&mutex) != 0) { - perror("pthread_mutex_unlock"); - abort(); - } -} - -// Dump all stored contexts profiling output data -void dump_context_array() { - if (pthread_mutex_lock(&mutex) != 0) { - perror("pthread_mutex_lock"); - abort(); - } - - if (context_array) { - if (!wait_list->empty()) dump_wait_list(); + if (context_array) { + auto it = context_array->begin(); + auto end = context_array->end(); + while (it != end) { + auto cur = it++; + context_entry_t* entry = &(cur->second); + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + if ((queue == NULL) || (entry->data.queue == queue)) { + if (entry->active == true) { + if (dump_context_entry(&(cur->second)) == false) done = false; + else entry->active = false; + } + } + } - auto it = context_array->begin(); - auto end = context_array->end(); - while (it != end) { - auto cur = it++; - dump_context(&(cur->second)); + if 
(pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + if (done == false) sched_yield(); } } - - if (pthread_mutex_unlock(&mutex) != 0) { - perror("pthread_mutex_unlock"); - abort(); - } } // Profiling completion handler -bool handler(rocprofiler_group_t group, void* arg) { +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { context_entry_t* entry = reinterpret_cast(arg); if (pthread_mutex_lock(&mutex) != 0) { @@ -502,11 +488,15 @@ bool handler(rocprofiler_group_t group, void* arg) { abort(); } - if (!wait_list->empty()) dump_wait_list(); - - if (!dump_context_entry(entry)) { - wait_list->push_back(entry); + bool ret = true; + if (entry->active == true) { + ret = dump_context_entry(entry); + if (ret == false) { + fprintf(stderr, "tool error: context is not complete\n"); + abort(); + } } + if (ret) dealloc_context_entry(entry); if (trace_on) { fprintf(stdout, "tool::handler: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); @@ -577,7 +567,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, context_entry_t* entry = alloc_context_entry(); // context properties rocprofiler_properties_t properties{}; - properties.handler = (result_prefix != NULL) ? handler : NULL; + properties.handler = (result_prefix != NULL) ? 
context_handler : NULL; properties.handler_arg = (void*)entry; rocprofiler_feature_t* features = tool_data->features; @@ -598,22 +588,20 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, feature_count = next_offset - set_offset; } - if (tool_data->feature_count > 0) { - // Open profiling context - status = rocprofiler_open(callback_data->agent, features, feature_count, - &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); - check_status(status); - - // Check that we have only one profiling group - uint32_t group_count = 0; - status = rocprofiler_group_count(context, &group_count); - check_status(status); - assert(group_count == 1); - // Get group[0] - const uint32_t group_index = 0; - status = rocprofiler_get_group(context, group_index, group); - check_status(status); - } + // Open profiling context + status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Check that we have only one profiling group + uint32_t group_count = 0; + status = rocprofiler_group_count(context, &group_count); + check_status(status); + assert(group_count == 1); + // Get group[0] + const uint32_t group_index = 0; + status = rocprofiler_get_group(context, group_index, group); + check_status(status); // Fill profiling context entry entry->agent = callback_data->agent; @@ -623,7 +611,8 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, entry->data = *callback_data; entry->data.kernel_name = strdup(callback_data->kernel_name); entry->file_handle = tool_data->file_handle; - entry->valid = 1; + entry->active = true; + reinterpret_cast*>(&entry->valid)->store(true); if (trace_on) { fprintf(stdout, "tool::dispatch: context_array %d tid %u\n", (int)(context_array->size()), GetTid()); @@ -635,7 +624,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, hsa_status_t 
destroy_callback(hsa_queue_t* queue, void*) { if (result_file_opened == false) printf("\nROCProfiler results:\n"); - dump_context_array(); + dump_context_array(queue); return HSA_STATUS_SUCCESS; } @@ -644,8 +633,16 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg if (((symb == 'b') && (info.metric.expr == NULL)) || ((symb == 'd') && (info.metric.expr != NULL))) { - printf("\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); - if (info.metric.expr != NULL) printf(" %s = %s\n", info.metric.name, info.metric.expr); + if (info.metric.expr != NULL) { + fprintf(stdout, "\n gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); + fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); + } else { + fprintf(stdout, "\n gpu-agent%d : %s", info.agent_index, info.metric.name); + if (info.metric.instances > 1) fprintf(stdout, "[0-%u]", info.metric.instances - 1); + fprintf(stdout, " : %s\n", info.metric.description); + fprintf(stdout, " block %s has %u counters\n", info.metric.block_name, info.metric.block_counters); + } + fflush(stdout); } return HSA_STATUS_SUCCESS; } @@ -800,7 +797,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } else { if (*info_symb == 'b') printf("Basic HW counters:\n"); else printf("Derived metrics:\n"); - rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); + hsa_status_t status = rocprofiler_iterate_info(NULL, ROCPROFILER_INFO_KIND_METRIC, info_callback, info_symb); + check_status(status); } exit(1); } @@ -917,8 +915,10 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; parameters_dict["TOKEN_MASK2"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; +#ifdef AQLPROF_NEW_API parameters_dict["SE_MASK"] = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK; +#endif printf(" %s (", name.c_str()); 
features[index] = {}; @@ -956,7 +956,6 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Context array aloocation context_array = new context_array_t; - wait_list = new wait_list_t; // Adding dispatch observer rocprofiler_queue_callbacks_t callbacks_ptrs{0}; @@ -1007,21 +1006,13 @@ extern "C" PUBLIC_API void OnUnloadTool() { rocprofiler_remove_queue_callbacks(); // Dump stored profiling output data - printf("\nROCPRofiler: %u contexts collected", context_collected); - if (result_file_opened) printf(", output directory %s", result_prefix); - printf("\n"); fflush(stdout); - dump_context_array(); - if (wait_list) { - if (!wait_list->empty()) { - printf("\nWaiting for pending kernels ..."); fflush(stdout); - while (wait_list->size() != 0) { - usleep(1000); - dump_wait_list(); - } - printf(".done\n"); fflush(stdout); - } + printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); + dump_context_array(NULL); + if (result_file_opened) { + fclose(result_file_handle); + printf(", output directory %s", result_prefix); } - if (result_file_opened) fclose(result_file_handle); + printf("\n"); fflush(stdout); // Cleanup if (callbacks_data != NULL) { @@ -1039,8 +1030,6 @@ extern "C" PUBLIC_API void OnUnloadTool() { range_vec = NULL; delete context_array; context_array = NULL; - delete wait_list; - wait_list = NULL; } extern "C" DESTRUCTOR_API void destructor() { diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 5116a3a8..5404608b 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -1,24 +1,26 @@ -/****************************************************************************** -Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ #include "util/hsa_rsrc_factory.h" @@ -105,14 +107,21 @@ hsa_status_t FindKernArgPool(hsa_amd_memory_pool_t pool, void* data) { // Constructor of the class HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize_hsa) { hsa_status_t status; + + cpu_pool_ = NULL; + kern_arg_pool_ = NULL; + // Initialize the Hsa Runtime if (initialize_hsa_) { status = hsa_init(); CHECK_STATUS("Error in hsa_init", status); } + // Discover the set of Gpu devices available on the platform status = hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); + if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); + if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); // Get AqlProfile API table aqlprofile_api_ = {0}; @@ -127,10 +136,19 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize loader_api_ = {0}; status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); CHECK_STATUS("loader API table query failed", status); + + // Instantiate HSA timer + timer_ = 
new HsaTimer; + CHECK_STATUS("HSA timer allocation failed", + (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + + // System timeout + timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } // Destructor of the class HsaRsrcFactory::~HsaRsrcFactory() { + delete timer_; for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { @@ -157,8 +175,10 @@ hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { (decltype(::hsa_ven_amd_aqlprofile_start)*)dlsym(handle, "hsa_ven_amd_aqlprofile_start"); api->hsa_ven_amd_aqlprofile_stop = (decltype(::hsa_ven_amd_aqlprofile_stop)*)dlsym(handle, "hsa_ven_amd_aqlprofile_stop"); +#ifdef AQLPROF_NEW_API api->hsa_ven_amd_aqlprofile_read = (decltype(::hsa_ven_amd_aqlprofile_read)*)dlsym(handle, "hsa_ven_amd_aqlprofile_read"); +#endif api->hsa_ven_amd_aqlprofile_legacy_get_pm4 = (decltype(::hsa_ven_amd_aqlprofile_legacy_get_pm4)*)dlsym( handle, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); @@ -188,9 +208,9 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_index = cpu_list_.size(); status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(cpu pool)", status); + if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); - CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(kern arg pool)", status); + if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; cpu_list_.push_back(agent_info); @@ -352,7 +372,7 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - 
status = hsa_amd_memory_pool_allocate(cpu_list_[0]->kern_arg_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; @@ -372,7 +392,7 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(cpu_list_[0]->cpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; @@ -396,22 +416,37 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s return ptr; } +// Wait signal +void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { + while (1) { + const hsa_signal_value_t signal_value = + hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + if (signal_value == 0) { + break; + } else { + CHECK_STATUS("hsa_signal_wait_scacquire()", HSA_STATUS_ERROR); + } + } +} + +// Wait signal with signal value restore +void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + SignalWait(signal); + hsa_signal_store_relaxed(const_cast(signal), signal_value); +} + // Copy data from GPU to host memory bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size) { hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; status = hsa_signal_create(1, 0, NULL, &s); - if (status == HSA_STATUS_SUCCESS) { - status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); - if (status == 
HSA_STATUS_SUCCESS) { - if (hsa_signal_wait_scacquire(s, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, - HSA_WAIT_STATE_BLOCKED) != 0) { - status = HSA_STATUS_ERROR; - } - } - status = hsa_signal_destroy(s); - } + CHECK_STATUS("hsa_signal_create()", status); + status = hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + CHECK_STATUS("hsa_amd_memory_async_copy()", status); + SignalWait(s); + status = hsa_signal_destroy(s); + CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); } @@ -554,3 +589,4 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; +HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index e7dcc559..c9466f89 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -1,24 +1,26 @@ -/****************************************************************************** -Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*******************************************************************************/ +/********************************************************************** +Copyright ©2013 Advanced Micro Devices, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: + +<95> Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +<95> Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ #ifndef TEST_UTIL_HSA_RSRC_FACTORY_H_ #define TEST_UTIL_HSA_RSRC_FACTORY_H_ @@ -43,21 +45,23 @@ THE SOFTWARE. 
#define HSA_QUEUE_ALIGN_BYTES 64 #define HSA_PACKET_ALIGN_BYTES 64 -#define CHECK_STATUS(msg, status) \ - if (status != HSA_STATUS_SUCCESS) { \ +#define CHECK_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_SUCCESS) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ printf("%s: %s\n", msg, emsg ? emsg : ""); \ - exit(1); \ - } + abort(); \ + } \ +} while (0) -#define CHECK_ITER_STATUS(msg, status) \ - if (status != HSA_STATUS_INFO_BREAK) { \ +#define CHECK_ITER_STATUS(msg, status) do { \ + if ((status) != HSA_STATUS_INFO_BREAK) { \ const char* emsg = 0; \ hsa_status_string(status, &emsg); \ printf("%s: %s\n", msg, emsg ? emsg : ""); \ - exit(1); \ - } + abort(); \ + } \ +} while (0) static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; @@ -114,9 +118,42 @@ struct AgentInfo { uint32_t shader_arrays_per_se; }; +// HSA timer class +// Provides current HSA timestampa and system-clock/ns conversion API +class HsaTimer { + public: + typedef uint64_t timestamp_t; + static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; + typedef long double freq_t; + + HsaTimer() { + timestamp_t sysclock_hz = 0; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); + sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; + } + + // Methids for system-clock/ns conversion + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + + // Return timestamp in 'ns' + timestamp_t timestamp_ns() const { + timestamp_t sysclock; + hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); + return sysclock_to_ns(sysclock); + } + + 
private: + // Timestamp frequency factor + freq_t sysclock_factor_; +}; + class HsaRsrcFactory { public: typedef std::recursive_mutex mutex_t; + typedef HsaTimer::timestamp_t timestamp_t; static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); @@ -202,6 +239,12 @@ class HsaRsrcFactory { // @return uint8_t* Pointer to buffer, null if allocation fails. uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); + // Wait signal + void SignalWait(const hsa_signal_t& signal) const; + + // Wait signal with signal value restore + void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; + // Copy data from GPU to host memory bool Memcpy(const hsa_agent_t& agent, void* dst, const void* src, size_t size); bool Memcpy(const AgentInfo* agent_info, void* dst, const void* src, size_t size); @@ -233,6 +276,19 @@ class HsaRsrcFactory { // Return Loader API table const hsa_ven_amd_loader_1_00_pfn_t* LoaderApi() const { return &loader_api_; } + // Methods for system-clock/ns conversion and timestamp in 'ns' + timestamp_t SysclockToNs(const timestamp_t& sysclock) const { return timer_->sysclock_to_ns(sysclock); } + timestamp_t NsToSysclock(const timestamp_t& time) const { return timer_->ns_to_sysclock(time); } + timestamp_t TimestampNs() const { return timer_->timestamp_ns(); } + + timestamp_t GetSysTimeout() const { return timeout_; } + static timestamp_t GetTimeoutNs() { return timeout_ns_; } + static void SetTimeoutNs(const timestamp_t& time) { + std::lock_guard lck(mutex_); + timeout_ns_ = time; + if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -278,6 +334,18 @@ class HsaRsrcFactory { // Loader API table hsa_ven_amd_loader_1_00_pfn_t loader_api_; + + // System timeout, ns + static timestamp_t timeout_ns_; + // System timeout, sysclock 
+ timestamp_t timeout_; + + // HSA timer + HsaTimer* timer_; + + // CPU/kern-arg memory pools + hsa_amd_memory_pool_t *cpu_pool_; + hsa_amd_memory_pool_t *kern_arg_pool_; }; From f344120fcce98120a9b488c8dc60273b74e33980 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 20 Aug 2018 00:48:43 -0500 Subject: [PATCH 005/168] metrics descriptions --- test/tool/gfx_metrics.xml | 100 +++++++++++++++++++------------------- test/tool/metrics.xml | 5 +- 2 files changed, 52 insertions(+), 53 deletions(-) diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index 899ca85e..9e4f24fc 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -1,31 +1,31 @@ - - + + - - - - - - - - - - + + + + + + + + + + - - - - - + + + + + - - - + + + - - - + + + @@ -33,37 +33,37 @@ - - + + - - - - - - - - - - + + + + + + + + + + - - - - - + + + + + - - - + + + - - - - - - - + + + + + + + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index a346eee9..6ee5c1d6 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -47,15 +47,14 @@ - # GPUBusy, percentage - # The percentage of time GPU was busy. + # GPUBusy The percentage of time GPU was busy. - # Wavefronts Total wavefronts., + # Wavefronts Total wavefronts. 
Date: Mon, 20 Aug 2018 02:53:16 -0500 Subject: [PATCH 006/168] install prefix --- cmake_modules/env.cmake | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index a71acb66..47d404a2 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -118,15 +118,20 @@ get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) -set ( API_PATH ${HSA_RUNTIME_INC_PATH} ) +## Install directory +if ( DEFINED ENV{CMAKE_INSTALL_DIR} ) + set ( CMAKE_INSTALL_PREFIX $ENV{CMAKE_INSTALL_DIR} ) +else () + set ( CMAKE_INSTALL_PREFIX "/opt/rocm" ) +endif () ## Basic Tool Chain Information -message ( "----------------NBIT: ${NBIT}" ) -message ( "-----------BuildType: ${CMAKE_BUILD_TYPE}" ) +message ( "----------------NBit: ${NBIT}" ) +message ( "----------Build-Type: ${CMAKE_BUILD_TYPE}" ) message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" ) message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" ) message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) -message ( "------------API-path: ${API_PATH}" ) -message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) +message ( "-----------CXX-Flags: ${CMAKE_CXX_FLAGS}" ) message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) +message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) From d1e594d468621ca702d460d17cdcc1a485e76487 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:17:12 -0500 Subject: [PATCH 007/168] Update README.md --- README.md | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ac63141..18be5e38 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,9 @@ ROC profiler library. 
Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces The library source tree: - - doc - Documentation + - bin + - rpl_run.sh - profiling utility + - doc - Documentation - inc/rocprofiler.h - Library public API - src - Library sources - core - Library API sources @@ -30,6 +32,9 @@ The library source tree: cd build cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. make + + For ROCM under 1.9 need: + export CMAKE_CURR_API=1 ``` ## To run the test: @@ -59,3 +64,73 @@ The library source tree: ``` export ROCPROFILER_TRACE=1 ``` + +## Profiling utility usage: + + rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc/sqtt line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + # Perf counters group 2 + pmc : WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 + + Input file .xml format, for single profiling run: + + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + + + # Filter by dispatches range, GPU index and kernel names + + + -o - output CSV file [.csv] + -d - directory where profiler store profiling data including 
thread traces [/tmp] + The data directory is removed automatically if the directory matches the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. + + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel dispatches timestamps, dispatch/begin/end/complete [off] + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000] + Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively. + --sqtt-local - to allocate SQTT buffer in local GPU memory [on] + +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: + First the configuration file is looked for in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'. 
+ An example of 'rpl_rc.xml': + + From 0921b34f0a53a20d7f0e77a8f0c5d404618c9f49 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:18:23 -0500 Subject: [PATCH 008/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 18be5e38..a0f9f440 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ The library source tree: ``` ## Profiling utility usage: - +``` rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] Options: @@ -133,4 +133,4 @@ Configuration file: sqtt-size=0x20M sqtt-local=on > - +``` From 1cf808b55c3d339392ea7fc9b8b70b9cdceb165a Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:27:33 -0500 Subject: [PATCH 009/168] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a0f9f440..ae1d889e 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ HW specific low-level performance analysis interface for profiling of GPU comput The library source tree: - bin - - rpl_run.sh - profiling utility + - rpl_run.sh - Profiling tool run script - doc - Documentation - inc/rocprofiler.h - Library public API - src - Library sources @@ -14,6 +14,9 @@ The library source tree: - util - Library utils sources - xml - XML parser - test - Library test suite + - tool - Profiling tool + - tool.cpp - tool sources + - metrics.xml - metrics config file - ctrl - Test controll - util - Test utils - simple_convolution - Simple convolution test kernel @@ -31,7 +34,6 @@ The library source tree: mkdir build cd build cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. 
- make For ROCM under 1.9 need: export CMAKE_CURR_API=1 From aae99a8ba70297f2f8c0724dd882b542664fc060 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:28:47 -0500 Subject: [PATCH 010/168] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ae1d889e..40b5e4f5 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,7 @@ The library source tree: mkdir build cd build cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. + make For ROCM under 1.9 need: export CMAKE_CURR_API=1 From d763aa8aaffe84ab23653b97e1a7fb17af8654d4 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 22 Aug 2018 11:34:31 -0500 Subject: [PATCH 011/168] install dir --- cmake_modules/env.cmake | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 47d404a2..37223b6a 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -118,13 +118,6 @@ get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) -## Install directory -if ( DEFINED ENV{CMAKE_INSTALL_DIR} ) - set ( CMAKE_INSTALL_PREFIX $ENV{CMAKE_INSTALL_DIR} ) -else () - set ( CMAKE_INSTALL_PREFIX "/opt/rocm" ) -endif () - ## Basic Tool Chain Information message ( "----------------NBit: ${NBIT}" ) message ( "----------Build-Type: ${CMAKE_BUILD_TYPE}" ) From 4ca55841853dea4fd3abca54a106fba204bbc0c3 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 22 Aug 2018 11:35:39 -0500 Subject: [PATCH 012/168] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 40b5e4f5..20be662f 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,13 @@ The library source tree: ## To build with the current installed ROCM: ``` + - To build 
and install to /opt/rocm/rocprofiler cd .../rocprofiler mkdir build cd build - cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa .. + cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa -DCMAKE_INSTALL_PREFIX=/opt/rocm .. make + make install For ROCM under 1.9 need: export CMAKE_CURR_API=1 From bdb98b64187969865fbf045bac5ed7a0c41b9af7 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 22 Aug 2018 12:02:47 -0500 Subject: [PATCH 013/168] rocm include dir search fix --- cmake_modules/env.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 37223b6a..68209386 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -107,9 +107,9 @@ elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) endif () ## Find hsa-runtime headers/lib -find_file ( HSA_RUNTIME_INC "hsa.h" ) -if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) - find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) +find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) +if ( "${HSA_RUNTIME_INC}" STREQUAL "" ) + find_file ( HSA_RUNTIME_INC "hsa.h" ) endif() find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" ) get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) From 3039a5162921530a080d7048bc50cfcfc3a09710 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sun, 26 Aug 2018 11:25:33 -0500 Subject: [PATCH 014/168] enable timestamps only --- CMakeLists.txt | 18 +++++++++++------- cmake_modules/env.cmake | 7 +++---- src/core/context.h | 7 ++++++- src/core/intercept_queue.h | 1 - test/tool/tool.cpp | 38 ++++++++++++++++++++++++++++---------- 5 files changed, 48 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 92bc348f..c8d473d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,23 +74,28 @@ endif () add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) ## Install information -install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${ROCPROFILER_NAME}/lib ) -install ( 
FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${ROCPROFILER_NAME}/include ) +#if ( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) +#message ( "CMAKE default prefix: ${CMAKE_INSTALL_PREFIX}" ) +#endif () +set ( CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/${ROCPROFILER_NAME}" ) +message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) +install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION lib ) +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION include ) # rpl_run.sh tblextr.py txt2xml.sh install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py - DESTINATION ${ROCPROFILER_NAME}/bin + DESTINATION bin PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) # gfx_metrics.xml metrics.xml install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/gfx_metrics.xml - DESTINATION ${ROCPROFILER_NAME}/lib ) + DESTINATION lib ) # libtool.so -install ( FILES ${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION ${ROCPROFILER_NAME}/tool ) -install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION ${ROCPROFILER_NAME}/tool +install ( FILES ${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION tool ) +install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION tool PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) ## Packaging directives @@ -105,7 +110,6 @@ set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) ## Debian package specific variables set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) -set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/HSA-RocProfiler" ) set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) ## RPM package specific variables diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 
68209386..6bf6ed45 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -107,9 +107,9 @@ elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) endif () ## Find hsa-runtime headers/lib -find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) -if ( "${HSA_RUNTIME_INC}" STREQUAL "" ) - find_file ( HSA_RUNTIME_INC "hsa.h" ) +find_file ( HSA_RUNTIME_INC "hsa.h" ) +if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) + find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) endif() find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" ) get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) @@ -127,4 +127,3 @@ message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) message ( "-----------CXX-Flags: ${CMAKE_CXX_FLAGS}" ) message ( "---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) -message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) diff --git a/src/core/context.h b/src/core/context.h index f7ad792d..6eb391a8 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -160,13 +160,18 @@ class Context { queue_(queue), hsa_rsrc_(&util::HsaRsrcFactory::Instance()), api_(hsa_rsrc_->AqlProfileApi()), + metrics_(NULL), handler_(handler), handler_arg_(handler_arg) { - if (info_count == 0) return; + if (info_count == 0) { + set_.push_back(Group(agent_info_, this, 0)); + return; + } metrics_ = MetricsDict::Create(agent_info); if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + if (Initialize(info, info_count) == false) { fprintf(stdout, "\nInput metrics out of HW limit. 
Proposed metrics group set:\n"); fflush(stdout); MetricsGroupSet(agent_info, info, info_count).Print(stdout); diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 627a718a..c99e51dc 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -181,7 +181,6 @@ class InterceptQueue { void* context_handler_arg = NULL; rocprofiler_handler_t context_handler_fun = context->GetHandler(&context_handler_arg); tracker_->Enable(tracker_entry, context_handler_fun, context_handler_arg); - rocprofiler_close(context); } } } diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 65385078..373f1f7b 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -144,6 +144,13 @@ void check_status(hsa_status_t status) { } } +// Print profiling results output break if terminal output is enabled +void results_output_break() { + const bool is_terminal_output = (result_file_opened == false); + if (is_terminal_output) printf("\nROCprofiler results:\n"); +} + +// Filtering kernel name std::string filtr_kernel_name(const std::string name) { auto rit = name.rbegin(); auto rend = name.rend(); @@ -179,6 +186,7 @@ std::string filtr_kernel_name(const std::string name) { return name.substr(pos, length); } +// Inflight submits monitoring thread void* monitor_thr_fun(void*) { while (context_array != NULL) { sleep(CTX_OUTSTANDING_MON); @@ -198,6 +206,7 @@ void* monitor_thr_fun(void*) { return NULL; } +// Increment profiling context counter value uint32_t next_context_count() { if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); @@ -423,12 +432,14 @@ bool dump_context_entry(context_entry_t* entry) { rocprofiler_group_t& group = entry->group; if (group.context != NULL) { - status = rocprofiler_group_get_data(&group); - check_status(status); - if (verbose == 1) output_group(entry, "group0-data"); + if (entry->feature_count > 0) { + status = rocprofiler_group_get_data(&group); + check_status(status); + if (verbose == 1) output_group(entry, 
"group0-data"); - status = rocprofiler_get_metrics(group.context); - check_status(status); + status = rocprofiler_get_metrics(group.context); + check_status(status); + } std::ostringstream oss; oss << index << "__" << filtr_kernel_name(entry->data.kernel_name); output_results(entry, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); @@ -623,7 +634,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, } hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { - if (result_file_opened == false) printf("\nROCProfiler results:\n"); + results_output_break(); dump_context_array(queue); return HSA_STATUS_SUCCESS; } @@ -1006,13 +1017,20 @@ extern "C" PUBLIC_API void OnUnloadTool() { rocprofiler_remove_queue_callbacks(); // Dump stored profiling output data - printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); - dump_context_array(NULL); + fflush(stdout); if (result_file_opened) { + printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); + dump_context_array(NULL); fclose(result_file_handle); - printf(", output directory %s", result_prefix); + printf(", output directory %s\n", result_prefix); + } else { + if (context_collected != context_count) { + results_output_break(); + dump_context_array(NULL); + } + printf("\nROCPRofiler: %u contexts collected\n", context_collected); } - printf("\n"); fflush(stdout); + fflush(stdout); // Cleanup if (callbacks_data != NULL) { From ce893db802dd840b98591950eaddabeb625eaa31 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 17 Sep 2018 12:57:30 -0500 Subject: [PATCH 015/168] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 20be662f..8af4d76e 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ The library source tree: cd .../rocprofiler mkdir build cd build - cmake -DCMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa 
-DCMAKE_INSTALL_PREFIX=/opt/rocm .. + export CMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. make make install From 894eba54bec3ed74338d340ce5526b16393525a3 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 20 Sep 2018 16:16:29 -0500 Subject: [PATCH 016/168] Update README.md --- README.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/README.md b/README.md index 8af4d76e..41a5934e 100644 --- a/README.md +++ b/README.md @@ -43,18 +43,6 @@ The library source tree: export CMAKE_CURR_API=1 ``` -## To run the test: -``` - cd .../rocprofiler/build - export LD_LIBRARY_PATH=.: # paths to ROC profiler and oher libraries - export HSA_TOOLS_LIB=librocprofiler64.so # ROC profiler library loaded by HSA runtime - export ROCP_TOOL_LIB=test/libtool.so # tool library loaded by ROC profiler - export ROCP_METRICS=metrics.xml # ROC profiler metrics config file - export ROCP_INPUT=input.xml # input file for the tool library - export ROCP_OUTPUT_DIR=./ # output directory for the tool library, for metrics results file 'results.txt' - -``` - ## Internal 'simple_convolution' test run script: ``` cd .../rocprofiler/build From 8591164e8caf654ae0acead6224ba44ad3b608c9 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 4 Oct 2018 15:23:39 -0500 Subject: [PATCH 017/168] fix reading of the input counters line without end-of-line symboll --- bin/rpl_run.sh | 3 ++- bin/tblextr.py | 5 +++-- bin/txt2xml.sh | 5 +++-- test/run.sh | 3 ++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 64185761..cfa38832 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -1,3 +1,5 @@ +#!/bin/sh + ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -20,7 +22,6 @@ # THE SOFTWARE. 
################################################################################ -#!/bin/sh time_stamp=`date +%y%m%d_%H%M%S` BIN_DIR=`dirname $0` BIN_DIR=`cd $BIN_DIR; pwd` diff --git a/bin/tblextr.py b/bin/tblextr.py index 630417ce..6a0f8eb2 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -1,3 +1,5 @@ +#!/usr/bin/python + ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -20,7 +22,6 @@ # THE SOFTWARE. ################################################################################ -#!/usr/bin/python import os, sys, re # Parsing results in the format: @@ -114,7 +115,7 @@ def print_tbl(outfile): outfile = sys.argv[1] infiles = sys.argv[2:] -for f in infiles : +for f in infiles: parse_res(f) ret = print_tbl(outfile) sys.exit(ret) diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh index 9881160d..66da77db 100755 --- a/bin/txt2xml.sh +++ b/bin/txt2xml.sh @@ -1,3 +1,5 @@ +#!/bin/bash + ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -20,7 +22,6 @@ # THE SOFTWARE. ################################################################################ -#!/bin/bash timestamp=`date +%y%m%d_%H%M%S` if [ $# = 0 ] ; then @@ -41,7 +42,7 @@ gpu_index="" parse() { scan="$1" index=0 - while read -r line ; do + while read -r line || [[ -n "$line" ]] ; do line=`echo $line | sed "s/\s*#.*$//"` if [ -z "$line" ] ; then continue diff --git a/test/run.sh b/test/run.sh index 3ac292e6..a189d18a 100755 --- a/test/run.sh +++ b/test/run.sh @@ -1,3 +1,5 @@ +#!/bin/sh + ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -19,7 +21,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
################################################################################ -#!/bin/sh test_bin_dflt=./test/ctrl From a37902057d86b4a010f0811a1c358fdeaa20c767 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 12 Oct 2018 15:30:14 -0500 Subject: [PATCH 018/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 41a5934e..07ee7b63 100644 --- a/README.md +++ b/README.md @@ -73,9 +73,9 @@ Options: Input file .txt format, automatically rerun application for every pmc/sqtt line: # Perf counters group 1 - pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts FetchSize # Perf counters group 2 - pmc : WriteSize L2CacheHit + pmc : VALUUtilization,WriteSize L2CacheHit # Filter by dispatches range, GPU index and kernel names # supported range formats: "3:9", "3:", "3" range: 1 : 4 From 0fe0ffdcd1e0f70e9e71d38a5be89e1e1d825042 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 12 Oct 2018 15:34:38 -0500 Subject: [PATCH 019/168] Update README.md --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index 07ee7b63..6f02bca0 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,6 @@ The library source tree: cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. 
make make install - - For ROCM under 1.9 need: - export CMAKE_CURR_API=1 ``` ## Internal 'simple_convolution' test run script: From 5ac3c576bfa39794700629e7d8f1165cbdf6f7a5 Mon Sep 17 00:00:00 2001 From: Rene van Oostrum Date: Mon, 15 Oct 2018 10:46:00 -0500 Subject: [PATCH 020/168] Change sorting so that it not only works with Python 2.7, but with 3.x too --- script/tblextr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/script/tblextr.py b/script/tblextr.py index 9a314db4..f6a37dc0 100755 --- a/script/tblextr.py +++ b/script/tblextr.py @@ -84,8 +84,7 @@ def print_tbl(outfile): out = open(outfile, 'w') - keys = var_table.keys() - keys.sort(key=int) + keys = sorted(var_table.keys(), key=int) entry = var_table[keys[0]] list1 = [] From df0ce581560f39faee91f8082effd62babfcc172 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 16 Nov 2018 19:26:53 -0600 Subject: [PATCH 021/168] tracker handlers ordering --- bin/rpl_run.sh | 52 ++--- inc/rocprofiler.h | 19 +- src/core/hsa_queue.h | 9 - src/core/intercept_queue.cpp | 4 +- src/core/intercept_queue.h | 47 ++++- src/core/metrics.h | 2 +- src/core/rocprofiler.cpp | 114 ++++++++++- src/core/tracker.h | 53 ++++-- src/core/types.h | 2 +- src/util/hsa_rsrc_factory.cpp | 9 +- src/util/hsa_rsrc_factory.h | 15 +- test/CMakeLists.txt | 34 +++- test/app/intercept_test.cpp | 231 +++++++++++++++++++++++ test/app/standalone_test.cpp | 163 ++++++++++++++++ test/app/test.cpp | 48 ++++- test/ctrl/run_kernel.h | 13 +- test/ctrl/test_hsa.cpp | 63 ++++--- test/ctrl/test_hsa.h | 20 +- test/dummy_kernel/dummy_kernel.cl | 28 +++ test/dummy_kernel/dummy_kernel.h | 71 +++++++ test/dummy_kernel/gfx8_DummyKernel.hsaco | Bin 0 -> 10952 bytes test/dummy_kernel/gfx9_DummyKernel.hsaco | Bin 0 -> 10952 bytes test/run.sh | 44 +++-- test/tool/metrics.xml | 70 +++---- test/tool/tool.cpp | 7 +- test/util/hsa_rsrc_factory.cpp | 9 +- test/util/hsa_rsrc_factory.h | 15 +- 27 files changed, 939 insertions(+), 203 deletions(-) create mode 
100644 test/app/intercept_test.cpp create mode 100644 test/app/standalone_test.cpp create mode 100644 test/dummy_kernel/dummy_kernel.cl create mode 100644 test/dummy_kernel/dummy_kernel.h create mode 100755 test/dummy_kernel/gfx8_DummyKernel.hsaco create mode 100755 test/dummy_kernel/gfx9_DummyKernel.hsaco diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index cfa38832..043c0007 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -82,7 +82,7 @@ usage() { echo "Metrics definition: $PKG_DIR/lib/metrics.xml" echo "" echo "Usage:" - echo " rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] " + echo " $bin_name [-h] [--list-basic] [--list-derived] [-i ] [-o ] " echo "" echo "Options:" echo " -h - this help" @@ -91,7 +91,7 @@ usage() { echo " --list-derived - to print the list of derived metrics with formulas" echo "" echo " -i <.txt|.xml file> - input file" - echo " Input file .txt format, automatically rerun application for every pmc/sqtt line:" + echo " Input file .txt format, automatically rerun application for every pmc line:" echo "" echo " # Perf counters group 1" echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" @@ -131,22 +131,17 @@ usage() { echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" echo " --heartbeat - to print progress heartbeats [0 - disabled]" - echo " --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000]" - echo " Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively." - echo " --sqtt-local - to allocate SQTT buffer in local GPU memory [on]" echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:${HOME}:" echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." - echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'." + echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'." echo " An example of 'rpl_rc.xml':" echo " " echo "" exit 1 @@ -246,20 +241,6 @@ while [ 1 ] ; do export ROCP_OUTSTANDING_MAX="$2" elif [ "$1" = "--heartbeat" ] ; then export ROCP_OUTSTANDING_MON="$2" - elif [ "$1" = "--sqtt-size" ] ; then - size_m=`echo "$2" | sed -n "s/^\(.*\)M$/\1/p"` - size_k=`echo "$2" | sed -n "s/^\(.*\)K$/\1/p"` - if [ -n "$size_m" ] ; then size_b=$((size_m*1024*1024)) - elif [ -n "$size_k" ] ; then size_b=$((size_k*1024)) - else size_b=$2 - fi - export ROCP_SQTT_SIZE=$size_b - elif [ "$1" = "--sqtt-local" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_SQTT_LOCAL=1 - else - export ROCP_SQTT_LOCAL=0 - fi elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 @@ -276,15 +257,16 @@ if [ "$ARG_CK" = "-" ] ; then fi if [ -z "$INPUT_FILE" ] ; then - fatal "Need input file" -fi - -input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` -input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` -if [ -z "${input_base}" -o -z "${input_type}" ] ; then - fatal "Bad input file '$INPUT_FILE'" + input_base="results" + input_type="none" +else + input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` + input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` + if [ -z "${input_base}" -o -z "${input_type}" ] ; then + fatal "Bad input file '$INPUT_FILE'" + fi + input_base=`basename $input_base` fi -input_base=`basename $input_base` if [ "$OUTPUT_DIR" = "--" ] ; then fatal "Bad output dir '$OUTPUT_DIR'" @@ -309,7 +291,7 @@ input_list="" RES_DIR="" if [ "$input_type" = "xml" ] ; then input_list=$INPUT_FILE -elif [ "$input_type" = "txt" ] ; then 
+elif [ "$input_type" = "txt" -o "$input_type" = "none" ] ; then OUTPUT_DIR="-" RES_DIR=$DATA_PATH/$DATA_DIR if [ -e $RES_DIR ] ; then @@ -317,7 +299,11 @@ elif [ "$input_type" = "txt" ] ; then fi mkdir -p $RES_DIR echo "RPL: output dir '$RES_DIR'" - $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR + if [ "$input_type" = "txt" ] ; then + $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR + else + echo "" > $RES_DIR/input.xml + fi input_list=`/bin/ls $RES_DIR/input*.xml` export ROCPROFILER_SESS=$RES_DIR else @@ -341,6 +327,8 @@ if [ -n "$csv_output" ] ; then python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST if [ "$?" -eq 0 ] ; then echo "RPL: '$csv_output' is generated" + else + echo "Data extracting error: $OUTPUT_LIST'" fi fi diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 17106687..4448128f 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -45,7 +45,7 @@ THE SOFTWARE. #include #include -#define ROCPROFILER_VERSION_MAJOR 3 +#define ROCPROFILER_VERSION_MAJOR 5 #define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus @@ -216,21 +216,24 @@ typedef struct { uint32_t agent_index; // GPU index const hsa_queue_t* queue; // HSA queue uint64_t queue_index; // Index in the queue + uint32_t queue_id; // Queue id const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet const char* kernel_name; // Kernel name + uint64_t kernel_object; // Kernel object pointer + int64_t thread_id; // Thread id const rocprofiler_dispatch_record_t* record; // Dispatch record } rocprofiler_callback_data_t; // Profiling callback type typedef hsa_status_t (*rocprofiler_callback_t)( - const rocprofiler_callback_data_t* callback_data, // [in] callback data union, data depends on - // the callback API id + const rocprofiler_callback_data_t* callback_data, // [in] callback data void* user_data, // [in/out] user data passed to the callback - rocprofiler_group_t* group); // [out] profiling group + rocprofiler_group_t* group); // [out] returned profiling group // Queue callbacks typedef struct { 
rocprofiler_callback_t dispatch; // dispatch callback + hsa_status_t (*create)(hsa_queue_t* queue, void* data); // create callback hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // destroy callback } rocprofiler_queue_callbacks_t; @@ -309,6 +312,8 @@ typedef enum { ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metric features count, int32 ROCPROFILER_INFO_KIND_TRACE = 2, // trace info ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // trace features count, int32 + ROCPROFILER_INFO_KIND_TRACE_PARAMETER = 4, // trace parameter info + ROCPROFILER_INFO_KIND_TRACE_PARAMETER_COUNT = 5 // trace parameter count, int32 } rocprofiler_info_kind_t; // Profiling info query @@ -337,6 +342,12 @@ typedef struct { const char* description; // trace description uint32_t parameter_count; // supported by the trace number parameters } trace; + struct { + uint32_t code; // parameter code + const char* trace_name; // trace name + const char* parameter_name; // parameter name + const char* description; // trace parameter description + } trace_parameter; }; } rocprofiler_info_data_t; diff --git a/src/core/hsa_queue.h b/src/core/hsa_queue.h index 620f6224..12ef97bb 100644 --- a/src/core/hsa_queue.h +++ b/src/core/hsa_queue.h @@ -32,15 +32,6 @@ namespace rocprofiler { class HsaQueue : public Queue { public: - typedef void (HsaQueue::*submit_fptr_t)(const packet_t* packet); - enum { - LEGACY_SLOT_SIZE_W = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_word_t), - LEGACY_SLOT_SIZE_P = HSA_VEN_AMD_AQLPROFILE_LEGACY_PM4_PACKET_SIZE / sizeof(packet_t) - }; - struct slot_pm4_t { - packet_word_t words[LEGACY_SLOT_SIZE_W]; - }; - HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : queue_(queue) {} void Submit(const packet_t* packet) { diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index 7703c662..91028f73 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -30,12 +30,14 @@ void InterceptQueue::HsaIntercept(HsaApiTable* 
table) { InterceptQueue::mutex_t InterceptQueue::mutex_; rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL; +InterceptQueue::queue_callback_t InterceptQueue::create_callback_ = NULL; InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL; void* InterceptQueue::callback_data_ = NULL; InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; const char* InterceptQueue::kernel_none_ = ""; Tracker* InterceptQueue::tracker_ = NULL; bool InterceptQueue::tracker_on_ = false; -bool InterceptQueue::in_constr_call_ = false; +bool InterceptQueue::in_create_call_ = false; +InterceptQueue::queue_id_t InterceptQueue::current_queue_id = 0; } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index c99e51dc..1f31b0d9 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -26,6 +26,7 @@ THE SOFTWARE. #include #include #include +#include #include #include @@ -49,6 +50,7 @@ class InterceptQueue { typedef std::map obj_map_t; typedef hsa_status_t (*queue_callback_t)(hsa_queue_t*, void* data); typedef void (*queue_event_callback_t)(hsa_status_t status, hsa_queue_t *queue, void *arg); + typedef uint32_t queue_id_t; static void HsaIntercept(HsaApiTable* table); @@ -61,8 +63,8 @@ class InterceptQueue { std::lock_guard lck(mutex_); hsa_status_t status = HSA_STATUS_ERROR; - if (in_constr_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()"); - in_constr_call_ = true; + if (in_create_call_) EXC_ABORT(status, "recursive InterceptQueueCreate()"); + in_create_call_ = true; ProxyQueue* proxy = ProxyQueue::Create(agent, size, type, queue_event_callback, data, private_segment_size, group_segment_size, queue, &status); @@ -79,8 +81,14 @@ class InterceptQueue { (*obj_map_)[(uint64_t)(*queue)] = obj; status = proxy->SetInterceptCB(OnSubmitCB, obj); obj->queue_event_callback_ = callback; + obj->queue_id = current_queue_id; + ++current_queue_id; - in_constr_call_ = false; + if (create_callback_ != 
NULL) { + status = create_callback_(*queue, callback_data_); + } + + in_create_call_ = false; return status; } @@ -139,13 +147,17 @@ class InterceptQueue { } // Prepareing dispatch callback data - const char* kernel_name = GetKernelName(dispatch_packet); + uint64_t kernel_symbol = GetKernelSymbol(dispatch_packet); + const char* kernel_name = GetKernelName(kernel_symbol); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, obj->queue_, user_que_idx, + obj->queue_id, dispatch_packet, kernel_name, + kernel_symbol, + syscall(__NR_gettid), (tracker_entry) ? tracker_entry->record : NULL}; // Calling dispatch callback @@ -154,7 +166,10 @@ class InterceptQueue { free(const_cast(kernel_name)); // Injecting profiling start/stop packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { - if (tracker_entry != NULL) tracker_->Delete(tracker_entry); + if (tracker_entry != NULL) { + const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; + tracker_->Delete(tracker_entry); + } } else { Context* context = reinterpret_cast(group.context); @@ -197,10 +212,15 @@ class InterceptQueue { } } - static void SetCallbacks(rocprofiler_callback_t dispatch_callback, queue_callback_t destroy_callback, void* data) { + static void SetCallbacks(rocprofiler_callback_t dispatch_callback, + queue_callback_t create_callback, + queue_callback_t destroy_callback, + void* data) + { std::lock_guard lck(mutex_); callback_data_ = data; dispatch_callback_ = dispatch_callback; + create_callback_ = create_callback; destroy_callback_ = destroy_callback; } @@ -219,7 +239,7 @@ class InterceptQueue { return static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); } - static const char* GetKernelName(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + static uint64_t GetKernelSymbol(const hsa_kernel_dispatch_packet_t* dispatch_packet) { const amd_kernel_code_t* kernel_code = NULL; hsa_status_t status = 
util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( @@ -228,8 +248,12 @@ class InterceptQueue { if (HSA_STATUS_SUCCESS != status) { kernel_code = reinterpret_cast(dispatch_packet->kernel_object); } - amd_runtime_loader_debug_info_t* dbg_info = reinterpret_cast( - kernel_code->runtime_loader_kernel_symbol); + return kernel_code->runtime_loader_kernel_symbol; + } + + static const char* GetKernelName(const uint64_t kernel_symbol) { + amd_runtime_loader_debug_info_t* dbg_info = + reinterpret_cast(kernel_symbol); const char* kernel_name = (dbg_info != NULL) ? dbg_info->kernel_name : NULL; // Kernel name is mangled name @@ -288,18 +312,21 @@ class InterceptQueue { static mutex_t mutex_; static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1; static rocprofiler_callback_t dispatch_callback_; + static queue_callback_t create_callback_; static queue_callback_t destroy_callback_; static void* callback_data_; static obj_map_t* obj_map_; static const char* kernel_none_; static Tracker* tracker_; static bool tracker_on_; - static bool in_constr_call_; + static bool in_create_call_; + static queue_id_t current_queue_id; hsa_queue_t* const queue_; ProxyQueue* const proxy_; const util::AgentInfo* agent_info_; queue_event_callback_t queue_event_callback_; + queue_id_t queue_id; }; } // namespace rocprofiler diff --git a/src/core/metrics.h b/src/core/metrics.h index 8f05a3e7..46806dcf 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -195,7 +195,7 @@ class MetricsDict { } static hsa_ven_amd_aqlprofile_id_query_t Translate(const util::AgentInfo* agent_info, const std::string& block_name) { - hsa_ven_amd_aqlprofile_profile_t profile; + hsa_ven_amd_aqlprofile_profile_t profile{}; profile.agent = agent_info->dev_id; hsa_ven_amd_aqlprofile_id_query_t query = {block_name.c_str(), 0, 0}; hsa_status_t status = diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index e8901387..6042e59e 100644 --- 
a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -56,6 +56,16 @@ THE SOFTWARE. // Internal library methods // namespace rocprofiler { +hsa_status_t CreateQueuePro( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue); + decltype(hsa_queue_create)* hsa_queue_create_fn; decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; @@ -115,6 +125,11 @@ void RestoreHsaApi() { table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; } +void StandaloneIntercept() { + ::HsaApiTable* table = kHsaApiTable; + table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro; +} + typedef void (*tool_handler_t)(); typedef void (*tool_handler_prop_t)(rocprofiler_settings_t*); void * tool_handle = NULL; @@ -195,9 +210,7 @@ DESTRUCTOR_API void destructor() { const MetricsDict* GetMetrics(const hsa_agent_t& agent) { rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); - if (agent_info == NULL) { - EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); - } + if (agent_info == NULL) EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); const MetricsDict* metrics = MetricsDict::Create(agent_info); if (metrics == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); return metrics; @@ -209,6 +222,94 @@ hsa_status_t GetExcStatus(const std::exception& e) { : HSA_STATUS_ERROR; } + +inline size_t CreateEnableCmd(const hsa_agent_t& agent, packet_t* command, const size_t& slot_count) { + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); + const size_t 
packet_count = (is_legacy) ? Profile::LEGACY_SLOT_SIZE_PKT : 1; + + if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count"); + + // AQLprofile object + hsa_ven_amd_aqlprofile_profile_t profile{}; + profile.agent = agent_info->dev_id; + // Query for cmd buffer size + hsa_status_t status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, NULL); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).size exc"); + if (profile.command_buffer.size == 0) EXC_RAISING(status, "get_info(ENABLE_CMD).size == 0"); + // Allocate cmd buffer + const size_t aligment_mask = 0x100 - 1; + profile.command_buffer.ptr = + hsa_rsrc->AllocateSysMemory(agent_info, profile.command_buffer.size); + if ((reinterpret_cast(profile.command_buffer.ptr) & aligment_mask) != 0) { + EXC_RAISING(status, "profile.command_buffer.ptr bad alignment"); + } + + // Generating cmd packet + if (is_legacy) { + packet_t packet{}; + + // Query for cmd buffer data + status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, &packet); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).data exc"); + + // Check for legacy GFXIP + status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_legacy_get_pm4(&packet, command); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + } else { + // Query for cmd buffer data + status = hsa_rsrc->AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_ENABLE_CMD, command); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "get_info(ENABLE_CMD).data exc"); + } + + // Return cmd packet data size + return (packet_count * sizeof(packet_t)); +} + +hsa_status_t CreateQueuePro( + hsa_agent_t agent, + uint32_t size, + hsa_queue_type32_t type, + void (*callback)(hsa_status_t status, hsa_queue_t 
*source, void *data), + void *data, + uint32_t private_segment_size, + uint32_t group_segment_size, + hsa_queue_t **queue) +{ + static packet_t enable_cmd_packet[Profile::LEGACY_SLOT_SIZE_PKT]; + static size_t enable_cmd_size = 0; + static std::mutex enable_cmd_mutex; + + // Create HSA queue + hsa_status_t status = hsa_queue_create_fn( + agent, + size, + type, + callback, + data, + private_segment_size, + group_segment_size, + queue); + if (status != HSA_STATUS_SUCCESS) return status; + + // Create 'Enable' cmd packet + if (enable_cmd_size == 0) { + std::lock_guard lck(enable_cmd_mutex); + if (enable_cmd_size == 0) { + enable_cmd_size = CreateEnableCmd(agent, enable_cmd_packet, Profile::LEGACY_SLOT_SIZE_PKT); + } + } + + // Enable counters for the queue + rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, enable_cmd_packet, enable_cmd_size); + + return HSA_STATUS_SUCCESS; +} + rocprofiler_properties_t rocprofiler_properties; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; @@ -261,7 +362,10 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa if (intercept_mode) { rocprofiler::ProxyQueue::HsaIntercept(table); rocprofiler::InterceptQueue::HsaIntercept(table); + } else { + rocprofiler::StandaloneIntercept(); } + return true; } @@ -419,14 +523,14 @@ PUBLIC_API hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* handle) { // Set/remove queue callbacks PUBLIC_API hsa_status_t rocprofiler_set_queue_callbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { API_METHOD_PREFIX - rocprofiler::InterceptQueue::SetCallbacks(callbacks.dispatch, callbacks.destroy, data); + rocprofiler::InterceptQueue::SetCallbacks(callbacks.dispatch, callbacks.create, callbacks.destroy, data); API_METHOD_SUFFIX } // Remove queue callbacks PUBLIC_API hsa_status_t rocprofiler_remove_queue_callbacks() { API_METHOD_PREFIX - rocprofiler::InterceptQueue::SetCallbacks(NULL, NULL, NULL); + 
rocprofiler::InterceptQueue::SetCallbacks(NULL, NULL, NULL, NULL); API_METHOD_SUFFIX } diff --git a/src/core/tracker.h b/src/core/tracker.h index acbf5cf6..ab7f3b5d 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -49,6 +49,7 @@ class Tracker { typedef sig_list_t::iterator sig_list_it_t; struct entry_t { + std::atomic valid; Tracker* tracker; sig_list_t::iterator it; hsa_agent_t agent; @@ -100,7 +101,7 @@ class Tracker { // Adding antry to the list mutex_.lock(); - entry->it = sig_list_.insert(sig_list_.begin(), entry); + entry->it = sig_list_.insert(sig_list_.end(), entry); mutex_.unlock(); return entry; @@ -142,7 +143,7 @@ class Tracker { void Erase(const sig_list_it_t& it) { Delete(*it); } // Entry completion - void Complete(entry_t* entry) { + inline void Complete(entry_t* entry) { record_t* record = entry->record; // Debug trace @@ -160,6 +161,7 @@ class Tracker { record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); record->complete = hsa_rsrc_->TimestampNs(); + entry->valid.store(true, std::memory_order_release); // Original intercepted signal completion hsa_signal_t orig = entry->orig; @@ -174,6 +176,19 @@ class Tracker { } } + inline static void HandleEntry(entry_t* entry) { + // Call entry handler + void* handler = static_cast(entry->handler); + if (entry->context_active) { + reinterpret_cast(handler)(0, entry->arg); + } else { + rocprofiler_group_t group{}; + reinterpret_cast(handler)(group, entry->arg); + } + // Delete tracker entry + entry->tracker->Delete(entry); + } + // Handler for packet completion static bool Handler(hsa_signal_value_t, void* arg) { // Acquire entry @@ -182,20 +197,31 @@ class Tracker { while (ptr->load(std::memory_order_acquire) == NULL) sched_yield(); // Complete entry - entry->tracker->Complete(entry); + Tracker* tracker = entry->tracker; + tracker->Complete(entry); - // Call entry handler - void* handler = static_cast(entry->handler); - if 
(entry->context_active) { - reinterpret_cast(handler)(0, entry->arg); + if (ordering_enabled_ == false) { + HandleEntry(entry); } else { - rocprofiler_group_t group{}; - reinterpret_cast(handler)(group, entry->arg); + // Acquire last entry + entry_t* back = tracker->sig_list_.back(); + volatile std::atomic* ptr = &back->handler; + while (ptr->load(std::memory_order_acquire) == NULL) sched_yield(); + + tracker->handler_mutex_.lock(); + sig_list_it_t it = tracker->sig_list_.begin(); + sig_list_it_t end = back->it; + while (it != end) { + entry = *(it++); + if (entry->valid.load(std::memory_order_acquire)) { + HandleEntry(entry); + } else { + break; + } + } + tracker->handler_mutex_.unlock(); } - // Delete tracker entry - entry->tracker->Delete(entry); - return false; } @@ -203,10 +229,13 @@ class Tracker { sig_list_t sig_list_; // Inter-thread synchronization mutex_t mutex_; + mutex_t handler_mutex_; // Outstanding dispatches std::atomic outstanding_; // HSA resources factory util::HsaRsrcFactory* hsa_rsrc_; + // Handling ordering enabled + static const bool ordering_enabled_ = true; // Enable tracing static const bool trace_on_ = false; }; diff --git a/src/core/types.h b/src/core/types.h index c58d6cf2..ef8600f0 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -26,7 +26,7 @@ THE SOFTWARE. 
#include namespace rocprofiler { -typedef hsa_ven_amd_aqlprofile_1_00_pfn_t pfn_t; +typedef hsa_ven_amd_aqlprofile_pfn_t pfn_t; typedef hsa_ven_amd_aqlprofile_event_t event_t; typedef hsa_ven_amd_aqlprofile_parameter_t parameter_t; typedef hsa_ven_amd_aqlprofile_profile_t profile_t; diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 3c50d27d..2d64bae0 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -134,13 +134,13 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else - status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_); + status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; - status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); + status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer @@ -527,6 +527,7 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Print the various fields of Hsa Gpu Agents bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::cout << std::flush; std::clog << header << " :" << std::endl; const AgentInfo* agent_info; @@ -550,7 +551,7 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { - const uint32_t slot_size_b = 0x40; + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); @@ -578,7 +579,7 @@ uint64_t 
HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) { - const uint32_t slot_size_b = 0x40; + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; if ((size_bytes & (slot_size_b - 1)) != 0) { fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); abort(); diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index c76046d2..9997a81c 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -123,7 +123,7 @@ struct AgentInfo { // HSA timer class // Provides current HSA timestampa and system-clock/ns conversion API class HsaTimer { - public: + public: typedef uint64_t timestamp_t; static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; @@ -136,8 +136,12 @@ class HsaTimer { } // Methids for system-clock/ns conversion - timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } - timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { + return timestamp_t((freq_t)sysclock * sysclock_factor_); + } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { + return timestamp_t((freq_t)time / sysclock_factor_); + } // Return timestamp in 'ns' timestamp_t timestamp_ns() const { @@ -147,13 +151,14 @@ class HsaTimer { return sysclock_to_ns(sysclock); } - private: + private: // Timestamp frequency factor freq_t sysclock_factor_; }; class HsaRsrcFactory { public: + static const size_t CMD_SLOT_SIZE_B = 0x40; typedef std::recursive_mutex mutex_t; typedef HsaTimer::timestamp_t timestamp_t; @@ -272,7 +277,7 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); // Return AqlProfile API table - typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; + 
typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } // Return Loader API table diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2f35639d..c7d86ccf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -35,19 +35,47 @@ endif () ## Util sources file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" ) +## Standalone test sources +set ( STEXE_NAME "standalone_test" ) +set ( STTST_SRC + ${TEST_DIR}/app/standalone_test.cpp + ${TEST_DIR}/ctrl/test_hsa.cpp +) + +## Intercept test sources +set ( INEXE_NAME "intercept_test" ) +set ( INTST_SRC + ${TEST_DIR}/app/intercept_test.cpp + ${TEST_DIR}/ctrl/test_hsa.cpp +) + ## Test control sources set ( CTRL_SRC ${TEST_DIR}/app/test.cpp ${TEST_DIR}/ctrl/test_hsa.cpp ) -## Test kernel sources +## Dummy kernel +set ( DUMMY_NAME dummy_kernel ) +execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${DUMMY_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) + +## Test kernel set ( TEST_NAME simple_convolution ) set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${TEST_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) -## Building test executable -add_executable ( ${EXE_NAME} ${KERN_SRC} ${CTRL_SRC} ${UTIL_SRC} ) +## Building standalone test executable +add_executable ( ${STEXE_NAME} ${STTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${STEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${STEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) + +## Building intercept test executable +add_library ( ${INEXE_NAME} SHARED ${INTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${INEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries( ${INEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) + +## Building ctrl test executable +add_executable ( 
${EXE_NAME} ${CTRL_SRC} ${UTIL_SRC} ${KERN_SRC} ) target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) diff --git a/test/app/intercept_test.cpp b/test/app/intercept_test.cpp new file mode 100644 index 00000000..87e00d64 --- /dev/null +++ b/test/app/intercept_test.cpp @@ -0,0 +1,231 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include + +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +#define PUBLIC_API __attribute__((visibility("default"))) +#define CONSTRUCTOR_API __attribute__((constructor)) +#define DESTRUCTOR_API __attribute__((destructor)) + +// Dispatch callbacks and context handlers synchronization +pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; +// Tool is unloaded +volatile bool is_loaded = false; + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// Context stored entry type +struct context_entry_t { + bool valid; + hsa_agent_t agent; + rocprofiler_group_t group; + rocprofiler_callback_data_t data; +}; + +// Dump stored context entry +void dump_context_entry(context_entry_t* entry) { + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + + const std::string kernel_name = entry->data.kernel_name; + const rocprofiler_dispatch_record_t* record = entry->data.record; + + fflush(stdout); + fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\") tid(%ld) queue-id(%u) gpu-id(%u) ", + entry->data.kernel_object, + kernel_name.c_str(), + entry->data.thread_id, + entry->data.queue_id, + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index); + if (record) fprintf(stdout, "time(%lu,%lu,%lu,%lu)", + record->dispatch, + 
record->begin, + record->end, + record->complete); + fprintf(stdout, "\n"); + fflush(stdout); + + rocprofiler_group_t& group = entry->group; + if (group.context == NULL) { + fprintf(stderr, "tool error: context is NULL\n"); + abort(); + } + + rocprofiler_close(group.context); +} + +// Profiling completion handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(entry); + delete entry; + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* /*user_data*/, + rocprofiler_group_t* group) { + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Profiling context + rocprofiler_t* context = NULL; + + // Context entry + context_entry_t* entry = new context_entry_t(); + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = (void*)entry; + + // Open profiling context + status = rocprofiler_open(callback_data->agent, NULL, 0, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->agent = callback_data->agent; + entry->group = *group; + entry->data = *callback_data; + entry->data.kernel_name = strdup(callback_data->kernel_name); + reinterpret_cast*>(&entry->valid)->store(true); + + return HSA_STATUS_SUCCESS; +} + +void initialize() { + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) { + 
fprintf(stderr, "GetGpuAgentInfo failed\n"); + abort(); + } + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{}; + callbacks_ptrs.dispatch = dispatch_callback; + rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL); +} + +void cleanup() { + // Unregister dispatch callback + rocprofiler_remove_queue_callbacks(); + + // Dump stored profiling output data + fflush(stdout); +} + +// Tool constructor +extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) +{ + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + if (is_loaded) return; + is_loaded = true; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + // Enable timestamping + settings->timestamp_on = true; + + // Initialize profiling + initialize(); +} + +// Tool destructor +extern "C" PUBLIC_API void OnUnloadTool() { + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + if (!is_loaded) return; + is_loaded = false; + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + // Final resources cleanup + cleanup(); +} + +extern "C" CONSTRUCTOR_API void constructor() { + printf("INTT constructor\n"); fflush(stdout); +} + +extern "C" DESTRUCTOR_API void destructor() { + if (is_loaded == true) OnUnloadTool(); +} diff --git a/test/app/standalone_test.cpp b/test/app/standalone_test.cpp new file mode 100644 index 00000000..f6fc965e --- /dev/null +++ b/test/app/standalone_test.cpp @@ -0,0 +1,163 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +void print_features(rocprofiler_feature_t* feature, uint32_t feature_count) { + for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) { + std::cout << (p - feature) << ": " << p->name; + switch (p->data.kind) { + case ROCPROFILER_DATA_KIND_INT64: + std::cout << std::dec << " result64 (" << p->data.result_int64 << ")" << std::endl; + break; + case ROCPROFILER_DATA_KIND_BYTES: { + const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); + uint64_t size = 0; + for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { + size = *reinterpret_cast(ptr); + const char* data = ptr + sizeof(size); + std::cout << std::endl; + std::cout << std::hex << " data (" << (void*)data << ")" << std::endl; + std::cout << std::dec << " size (" << size << ")" << std::endl; + ptr = data + size; + } + break; + } + default: + std::cout << "result kind (" << p->data.kind << ")" << std::endl; + TEST_ASSERT(false); + } + } +} + +void read_features(uint32_t n, rocprofiler_t* context, rocprofiler_feature_t* feature, const unsigned feature_count) { + std::cout << "read features" << std::endl; + hsa_status_t status = rocprofiler_read(context, n); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + std::cout << "read issue" << std::endl; + status = rocprofiler_get_data(context, n); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + status = rocprofiler_get_metrics(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + print_features(feature, feature_count); +} + +int main() { + bool ret_val = false; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + // Profiling context + rocprofiler_t* context = 
NULL; + // Profiling properties + rocprofiler_properties_t properties; + + // Profiling feature objects + const unsigned feature_count = 9; + rocprofiler_feature_t feature[feature_count]; + // PMC events + memset(feature, 0, sizeof(feature)); + feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[0].name = "GRBM_COUNT"; + feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[1].name = "GRBM_GUI_ACTIVE"; + feature[2].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[2].name = "GPUBusy"; + feature[3].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[3].name = "SQ_WAVES"; + feature[4].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[4].name = "SQ_INSTS_VALU"; + feature[5].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[5].name = "VALUInsts"; + feature[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[6].name = "TCC_HIT_sum"; + feature[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[7].name = "TCC_MISS_sum"; + feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[8].name = "WRITE_SIZE"; + + // Instantiate HSA resources + HsaRsrcFactory::Create(); + + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) abort(); + + // Creating the queues pool + const unsigned queue_count = 16; + hsa_queue_t* queue[queue_count]; + for (unsigned queue_ind = 0; queue_ind < queue_count; ++queue_ind) { + if (HsaRsrcFactory::Instance().CreateQueue(agent_info, 128, &queue[queue_ind]) == false) abort(); + } + hsa_queue_t* prof_queue = queue[0]; + + // Creating profiling context + properties = {}; + properties.queue = prof_queue; + status = rocprofiler_open(agent_info->dev_id, feature, feature_count, &context, + ROCPROFILER_MODE_STANDALONE, &properties); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + // Test initialization + TestHsa::HsaInstantiate(); + + // Dispatching profiled kernel n-times to collect all counter groups data + const unsigned group_n = 0; + status = 
rocprofiler_start(context, group_n); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + std::cout << "start" << std::endl; + + for (unsigned ind = 0; ind < 3; ++ind) { +#if 1 + const unsigned queue_ind = ind % queue_count; + hsa_queue_t* prof_queue = queue[queue_ind]; + //ret_val = RunKernel(0, NULL, NULL, prof_queue); + ret_val = RunKernel(0, NULL, NULL, prof_queue); + std::cout << "run kernel, queue " << queue_ind << std::endl; +#else + sleep(3); +#endif + read_features(group_n, context, feature, feature_count); + } + + // Stop counters + status = rocprofiler_stop(context, group_n); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + std::cout << "stop" << std::endl; + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + status = rocprofiler_close(context); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + return (ret_val) ? 0 : 1; +} diff --git a/test/app/test.cpp b/test/app/test.cpp index 9e694833..796ba1eb 100644 --- a/test/app/test.cpp +++ b/test/app/test.cpp @@ -21,20 +21,66 @@ THE SOFTWARE. 
*******************************************************************************/ #include +#include #include #include +#include #include "ctrl/run_kernel.h" #include "ctrl/test_aql.h" +#include "dummy_kernel/dummy_kernel.h" #include "simple_convolution/simple_convolution.h" +void thread_fun(const int kiter, const int diter, const uint32_t agents_number) { + const AgentInfo* agent_info[agents_number]; + hsa_queue_t* queue[agents_number]; + HsaRsrcFactory* rsrc = &HsaRsrcFactory::Instance(); + + for (uint32_t n = 0; n < agents_number; ++n) { + uint32_t agent_id = n % rsrc->GetCountOfGpuAgents(); + if (rsrc->GetGpuAgentInfo(agent_id, &agent_info[n]) == false) { + fprintf(stderr, "AgentInfo failed\n"); + abort(); + } + if (rsrc->CreateQueue(agent_info[n], 128, &queue[n]) == false) { + fprintf(stderr, "CreateQueue failed\n"); + abort(); + } + } + + for (int i = 0; i < kiter; ++i) { + for (uint32_t n = 0; n < agents_number; ++n) { + // RunKernel(0, NULL, agent_info[n], queue[n], diter); + RunKernel(0, NULL, agent_info[n], queue[n], diter); + } + } + + for (uint32_t n = 0; n < agents_number; ++n) { + hsa_queue_destroy(queue[n]); + } +} + int main(int argc, char** argv) { const char* kiter_s = getenv("ROCP_KITER"); const char* diter_s = getenv("ROCP_DITER"); + const char* agents_s = getenv("ROCP_AGENTS"); + const char* thrs_s = getenv("ROCP_THRS"); + const int kiter = (kiter_s != NULL) ? atol(kiter_s) : 1; const int diter = (diter_s != NULL) ? atol(diter_s) : 1; + const uint32_t agents_number = (agents_s != NULL) ? (uint32_t)atol(agents_s) : 1; + const int thrs = (thrs_s != NULL) ? 
atol(thrs_s) : 1; + TestHsa::HsaInstantiate(); - for (int i = 0; i < kiter; ++i) RunKernel(argc, argv, diter); + + std::thread t[thrs]; + for (int n = 0; n < thrs; ++n) { + t[n] = std::thread(thread_fun, kiter, diter, agents_number); + } + for (int n = 0; n < thrs; ++n) { + t[n].join(); + } + TestHsa::HsaShutdown(); return 0; } diff --git a/test/ctrl/run_kernel.h b/test/ctrl/run_kernel.h index b122664b..846e0b68 100644 --- a/test/ctrl/run_kernel.h +++ b/test/ctrl/run_kernel.h @@ -26,13 +26,20 @@ THE SOFTWARE. #include "ctrl/test_hsa.h" #include "util/test_assert.h" -template bool RunKernel(int argc, char* argv[], int count = 1) { +template bool RunKernel(int argc = 0, char* argv[] = NULL, const AgentInfo* agent_info = NULL, hsa_queue_t* queue = NULL, int count = 1) { bool ret_val = false; + if (getenv("ROC_TEST_TRACE") == NULL) std::clog.rdbuf(NULL); + + // Create test kernel object Kernel test_kernel; - TestAql* test_aql = new TestHsa(&test_kernel); - test_aql = new Test(test_aql); + + TestHsa* test_hsa = new TestHsa(&test_kernel); + test_hsa->SetAgentInfo(agent_info); + test_hsa->SetQueue(queue); + + TestAql* test_aql = new Test(test_hsa); TEST_ASSERT(test_aql != NULL); if (test_aql == NULL) return 1; diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp index 87861821..d006d19c 100644 --- a/test/ctrl/test_hsa.cpp +++ b/test/ctrl/test_hsa.cpp @@ -29,60 +29,54 @@ THE SOFTWARE. 
#include "util/hsa_rsrc_factory.h" HsaRsrcFactory* TestHsa::hsa_rsrc_ = NULL; -const AgentInfo* TestHsa::agent_info_ = NULL; -hsa_queue_t* TestHsa::hsa_queue_ = NULL; -uint32_t TestHsa::agent_id_ = 0; -HsaRsrcFactory* TestHsa::HsaInstantiate(const uint32_t agent_ind) { +HsaRsrcFactory* TestHsa::HsaInstantiate() { // Instantiate an instance of Hsa Resources Factory if (hsa_rsrc_ == NULL) { - agent_id_ = agent_ind; - hsa_rsrc_ = HsaRsrcFactory::Create(); - // Print properties of the agents hsa_rsrc_->PrintGpuAgents("> GPU agents"); - - // Create an instance of Gpu agent - if (!hsa_rsrc_->GetGpuAgentInfo(agent_ind, &agent_info_)) { - agent_info_ = NULL; - std::cerr << "> error: agent[" << agent_ind << "] is not found" << std::endl; - return NULL; - } - std::clog << "> Using agent[" << agent_ind << "] : " << agent_info_->name << std::endl; - - // Create an instance of Aql Queue - if (hsa_queue_ == NULL) { - uint32_t num_pkts = 128; - if (hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) { - hsa_queue_ = NULL; - TEST_ASSERT(false); - } - } } return hsa_rsrc_; } void TestHsa::HsaShutdown() { - if (hsa_queue_ != NULL) { - hsa_queue_destroy(hsa_queue_); - hsa_queue_ = NULL; - } if (hsa_rsrc_) hsa_rsrc_->Destroy(); } -bool TestHsa::Initialize(int arg_cnt, char** arg_list) { +bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) { std::clog << "TestHsa::Initialize :" << std::endl; // Instantiate a Timer object setup_timer_idx_ = hsa_timer_.CreateTimer(); dispatch_timer_idx_ = hsa_timer_.CreateTimer(); - if (HsaInstantiate(agent_id_) == NULL) { + if (hsa_rsrc_ == NULL) { TEST_ASSERT(false); return false; } + // Create an instance of Gpu agent + if (agent_info_ == NULL) { + const uint32_t agent_id = 0; + if (!hsa_rsrc_->GetGpuAgentInfo(agent_id, &agent_info_)) { + agent_info_ = NULL; + std::cerr << "> error: agent[" << agent_id << "] is not found" << std::endl; + return NULL; + } + } + std::clog << "> Using agent[" << agent_info_->dev_index << "] : 
" << agent_info_->name << std::endl; + + // Create an instance of Aql Queue + if (hsa_queue_ == NULL) { + const uint32_t num_pkts = 128; + if (hsa_rsrc_->CreateQueue(agent_info_, num_pkts, &hsa_queue_) == false) { + hsa_queue_ = NULL; + TEST_ASSERT(false); + } + my_queue_ = true; + } + // Obtain handle of signal hsa_rsrc_->CreateSignal(1, &hsa_signal_); @@ -119,6 +113,8 @@ bool TestHsa::Setup() { mem_map_t& mem_map = test_->GetMemMap(); for (mem_it_t it = mem_map.begin(); it != mem_map.end(); ++it) { mem_descr_t& des = it->second; + if (des.size == 0) continue; + switch (des.id) { case TestKernel::LOCAL_DES_ID: des.ptr = hsa_rsrc_->AllocateLocalMemory(agent_info_, des.size); @@ -220,7 +216,7 @@ bool TestHsa::Run() { // Submit AQL packet to the queue const uint64_t que_idx = hsa_rsrc_->Submit(hsa_queue_, &aql); - std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl; + std::clog << "> Waiting on kernel dispatch signal, que_idx=" << que_idx << std::endl << std::flush; // Wait on the dispatch signal until the kernel is finished. // Update wait condition to HSA_WAIT_STATE_ACTIVE for Polling @@ -245,6 +241,8 @@ bool TestHsa::VerifyResults() { const uint32_t size = test_->GetOutputSize(); bool suc = false; + if (size == 0) return true; + // Copy local kernel output buffers from local memory into host memory if (test_->IsOutputLocal()) { output = hsa_rsrc_->AllocateSysMemory(agent_info_, size); @@ -279,5 +277,8 @@ void TestHsa::PrintTime() { bool TestHsa::Cleanup() { hsa_executable_destroy(hsa_exec_); hsa_signal_destroy(hsa_signal_); + if (my_queue_) hsa_queue_destroy(hsa_queue_); + hsa_queue_ = NULL; + agent_info_ = NULL; return true; } diff --git a/test/ctrl/test_hsa.h b/test/ctrl/test_hsa.h index 84080e77..b5df8b69 100644 --- a/test/ctrl/test_hsa.h +++ b/test/ctrl/test_hsa.h @@ -32,23 +32,27 @@ THE SOFTWARE. 
class TestHsa : public TestAql { public: // Instantiate HSA resources - static HsaRsrcFactory* HsaInstantiate(const uint32_t agent_ind = agent_id_); + static HsaRsrcFactory* HsaInstantiate(); static void HsaShutdown(); - static void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } - static uint32_t HsaAgentId() { return agent_id_; } // Constructor explicit TestHsa(TestKernel* test) : test_(test), name_(test->Name()) { total_time_taken_ = 0; setup_time_taken_ = 0; dispatch_time_taken_ = 0; + agent_info_ = NULL; + hsa_queue_ = NULL; + my_queue_ = false; hsa_exec_ = {}; } // Get methods for Agent Info, HAS queue, HSA Resourcse Manager + HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + hsa_agent_t HsaAgent() { return agent_info_->dev_id; } const AgentInfo* GetAgentInfo() { return agent_info_; } + void SetAgentInfo(const AgentInfo* agent_info) { agent_info_ = agent_info; } hsa_queue_t* GetQueue() { return hsa_queue_; } - HsaRsrcFactory* GetRsrcFactory() { return hsa_rsrc_; } + void SetQueue(hsa_queue_t* queue) { hsa_queue_ = queue; } // Initialize application environment including setting // up of various configuration parameters based on @@ -105,14 +109,12 @@ class TestHsa : public TestAql { // Instance of Hsa Resources Factory static HsaRsrcFactory* hsa_rsrc_; - // GPU id - static uint32_t agent_id_; - // Handle to an Hsa Gpu Agent - static const AgentInfo* agent_info_; + const AgentInfo* agent_info_; // Handle to an Hsa Queue - static hsa_queue_t* hsa_queue_; + hsa_queue_t* hsa_queue_; + bool my_queue_; // Test kernel name std::string name_; diff --git a/test/dummy_kernel/dummy_kernel.cl b/test/dummy_kernel/dummy_kernel.cl new file mode 100644 index 00000000..4ab159c8 --- /dev/null +++ b/test/dummy_kernel/dummy_kernel.cl @@ -0,0 +1,28 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +/** + dummy kernel + */ +__kernel void DummyKernel() { + uint tid = get_global_id(0); +} diff --git a/test/dummy_kernel/dummy_kernel.h b/test/dummy_kernel/dummy_kernel.h new file mode 100644 index 00000000..1b8ce430 --- /dev/null +++ b/test/dummy_kernel/dummy_kernel.h @@ -0,0 +1,71 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#ifndef TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ +#define TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ + +#include +#include + +#include "ctrl/test_kernel.h" + +// Class implements DummyKernel kernel parameters +class DummyKernel : public TestKernel { + public: + // Kernel buffers IDs + enum { KERNARG_BUF_ID, LOCAL_BUF_ID }; + + // Constructor + DummyKernel() : + width_(64), + height_(64) + { + SetInDescr(KERNARG_BUF_ID, KERNARG_DES_ID, 0); + SetOutDescr(LOCAL_BUF_ID, LOCAL_DES_ID, 0); + } + + // Initialize method + void Init() {} + + // Return compute grid size + uint32_t GetGridSize() const { return width_ * height_; } + + // Print output + void PrintOutput(const void* ptr) const {} + + // Return name + std::string Name() const { return std::string("DummyKernel"); } + + private: + // Reference CPU implementation + bool ReferenceImplementation(uint32_t* output, const uint32_t* input, const float* mask, + const uint32_t width, const uint32_t height, + const uint32_t maskWidth, const uint32_t maskHeight) { return true; } + + // Width of the Input array + const uint32_t width_; + + // Height of the Input array + const uint32_t height_; +}; + +#endif // TEST_DUMMY_KERNEL_DUMMY_KERNEL_H_ diff --git a/test/dummy_kernel/gfx8_DummyKernel.hsaco b/test/dummy_kernel/gfx8_DummyKernel.hsaco new file mode 100755 index 0000000000000000000000000000000000000000..35866785c020e2fbdae760ccd0768cf680fc3f5d GIT binary patch literal 10952 zcmeI2O>7&-6~~7kp(vUbXoX@`MKRW{DyK$7NK4A0gQAsV$5s$JQ6e{~lN2&7t|iL+ zDn%Jm5Up3PAu%!$s6Y-zFVU$$4nckJp+yVIHhOS@9MZx-jX(u*aDf8NrG;Cdhe~~K z- zq(l(D!y+UC(DvJm`iICsFO^$%Fd?K3GDnQ#b}MNN3`JV02gf~u9js5aWxG=dX=Oi- zvqNe)7>Z_pJGRVCY2`RC@UJpXuJ83fRO80q%WE)>mrme6{KmQPa}x#c zV6Xd#OZ%wYqbKU=!j*HL{VZ@V@Pod9@b5z}A@UpuyX6PVZk zhS;~SuP^k5QJKz_iqC7ZysZs9tMzL?2$BBjt)7sk^=YqU3dR%=HA_k zh^{H_f=^4R(22PvRR{eDqlLE$-jEx!n9HSz7xlGJ~LN#=6UHz z@_#@7*~dKpTmOFkg2;N@=FdJtuRsMs|Dm 
zNF{eEQ#E#X*dCh?u{<1aW-b{QDkV9mra1@dbY}KsK2v?ORJlkB>}{hD9tp+caXdNP z;%30jfSUm~18xS~47eF^Gw^?(0eR2nk#|Zv-k3En<7eV7G(Gm+P&41XuKE3eO(?s2 zJkFklH6iM2cf?}jwl`u1!co0tUn~_AE5BQadc{Z4SWoNM+Ml@+xifEhqvn*h*f0Z& zwbP=uw&<{0!wlAH2Z1d`?#F(C8{rgiZozLi2*WpxK$NKzK+XqW!&?&@w>tVPj zkAi;(j5pgh-v7(y`)gpA%<#q%YBWO|OTh!EX~Wj^BSq8QucYaWt*OSEYMnixsaMfd zwKcVarY9ev3G^%l4lFh-Z_G3U8?`v(!Gau4X=jIdvG0~f60pe1hg+U zF2Xw2!ni*orsZviJwofj<7|m*jmsi*Bebo*-Y~=Unq_m09Wmi|3OjcAz;XkQU+q@- z-aP!qVY6G|&*x=-^RZoR@>y;60ex0GYO6K6Q9}^4dFqWRZMhMHwzoB``+PCjf9M5DSDi=`GKox0WDl?v^J{THtD{yVoGgd~T-e=m%OJxBWw z_8w}(9AJK|2+fb@#AYzu`vsi;-n}p{%oniFgXKmS^hq74Z%fBJr~|gSZ6m7@I1kzn zr&d{)qsK$%Loh7F<=geO2%H;1``n1Yxk1{##B+nS*}7=&Em<)-cAcQ%PB7T(Z4H99 zP|9y>t2Ookw>--YA%@rC_p0C(zhAckfz%Zd)-5mOtQB$+TIQTguEP(aBH$Bdi_eSa zALRt{A@>TNyZfKEAf8*?47eF^GvH>x&48N$Hv?`4+zdRv3^>o4`MEJZ55>|+hMu)U zz0UpeF%Q2EApW+oSwh!g?&rtBBF}9Os5Dyd%w<=Hq;-SJjnB%=*bLnQTDf}DE&nf&c^GSsdFwZMI z#k{QW?=rU(evbSC`f|) zlJ;x~ij|;P35u1VSP63X-HC)HO(>2Wj*mjUFT=gA`TSMeZX-n zuE8$ac3bAAoLA;gPICnMasGbxlcuautw#d3a{bcc^@m)0v+H$@T0{I^CGHWNOU^Hk m|0!UOb{b-T-~T2x&NeZu%6TPz4Jtd0ALu2I50wE*Q~V3w3`%VP literal 0 HcmV?d00001 diff --git a/test/dummy_kernel/gfx9_DummyKernel.hsaco b/test/dummy_kernel/gfx9_DummyKernel.hsaco new file mode 100755 index 0000000000000000000000000000000000000000..35866785c020e2fbdae760ccd0768cf680fc3f5d GIT binary patch literal 10952 zcmeI2O>7&-6~~7kp(vUbXoX@`MKRW{DyK$7NK4A0gQAsV$5s$JQ6e{~lN2&7t|iL+ zDn%Jm5Up3PAu%!$s6Y-zFVU$$4nckJp+yVIHhOS@9MZx-jX(u*aDf8NrG;Cdhe~~K z- zq(l(D!y+UC(DvJm`iICsFO^$%Fd?K3GDnQ#b}MNN3`JV02gf~u9js5aWxG=dX=Oi- zvqNe)7>Z_pJGRVCY2`RC@UJpXuJ83fRO80q%WE)>mrme6{KmQPa}x#c zV6Xd#OZ%wYqbKU=!j*HL{VZ@V@Pod9@b5z}A@UpuyX6PVZk zhS;~SuP^k5QJKz_iqC7ZysZs9tMzL?2$BBjt)7sk^=YqU3dR%=HA_k zh^{H_f=^4R(22PvRR{eDqlLE$-jEx!n9HSz7xlGJ~LN#=6UHz z@_#@7*~dKpTmOFkg2;N@=FdJtuRsMs|Dm zNF{eEQ#E#X*dCh?u{<1aW-b{QDkV9mra1@dbY}KsK2v?ORJlkB>}{hD9tp+caXdNP z;%30jfSUm~18xS~47eF^Gw^?(0eR2nk#|Zv-k3En<7eV7G(Gm+P&41XuKE3eO(?s2 
zJkFklH6iM2cf?}jwl`u1!co0tUn~_AE5BQadc{Z4SWoNM+Ml@+xifEhqvn*h*f0Z& zwbP=uw&<{0!wlAH2Z1d`?#F(C8{rgiZozLi2*WpxK$NKzK+XqW!&?&@w>tVPj zkAi;(j5pgh-v7(y`)gpA%<#q%YBWO|OTh!EX~Wj^BSq8QucYaWt*OSEYMnixsaMfd zwKcVarY9ev3G^%l4lFh-Z_G3U8?`v(!Gau4X=jIdvG0~f60pe1hg+U zF2Xw2!ni*orsZviJwofj<7|m*jmsi*Bebo*-Y~=Unq_m09Wmi|3OjcAz;XkQU+q@- z-aP!qVY6G|&*x=-^RZoR@>y;60ex0GYO6K6Q9}^4dFqWRZMhMHwzoB``+PCjf9M5DSDi=`GKox0WDl?v^J{THtD{yVoGgd~T-e=m%OJxBWw z_8w}(9AJK|2+fb@#AYzu`vsi;-n}p{%oniFgXKmS^hq74Z%fBJr~|gSZ6m7@I1kzn zr&d{)qsK$%Loh7F<=geO2%H;1``n1Yxk1{##B+nS*}7=&Em<)-cAcQ%PB7T(Z4H99 zP|9y>t2Ookw>--YA%@rC_p0C(zhAckfz%Zd)-5mOtQB$+TIQTguEP(aBH$Bdi_eSa zALRt{A@>TNyZfKEAf8*?47eF^GvH>x&48N$Hv?`4+zdRv3^>o4`MEJZ55>|+hMu)U zz0UpeF%Q2EApW+oSwh!g?&rtBBF}9Os5Dyd%w<=Hq;-SJjnB%=*bLnQTDf}DE&nf&c^GSsdFwZMI z#k{QW?=rU(evbSC`f|) zlJ;x~ij|;P35u1VSP63X-HC)HO(>2Wj*mjUFT=gA`TSMeZX-n zuE8$ac3bAAoLA;gPICnMasGbxlcuautw#d3a{bcc^@m)0v+H$@T0{I^CGHWNOU^Hk m|0!UOb{b-T-~T2x&NeZu%6TPz4Jtd0ALu2I50wE*Q~V3w3`%VP literal 0 HcmV?d00001 diff --git a/test/run.sh b/test/run.sh index a189d18a..0a0a2f72 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,45 +22,49 @@ # THE SOFTWARE. 
################################################################################ -test_bin_dflt=./test/ctrl - # paths to ROC profiler and oher libraries export LD_LIBRARY_PATH=$PWD -# enable error messages logging to '/tmp/rocprofiler_log.txt' -export ROCPROFILER_LOG=1 - # ROC profiler library loaded by HSA runtime export HSA_TOOLS_LIB=librocprofiler64.so -# tool library loaded by ROC profiler -export ROCP_TOOL_LIB=libtool.so +# enable error messages logging to '/tmp/rocprofiler_log.txt' +export ROCPROFILER_LOG=1 # ROC profiler metrics config file unset ROCP_PROXY_QUEUE # ROC profiler metrics config file export ROCP_METRICS=metrics.xml +# test trace +export ROC_TEST_TRACE=1 + +# tool library loaded by ROC profiler +export ROCP_TOOL_LIB=./test/libintercept_test.so +../bin/run_tool.sh ./test/ctrl + +unset ROCP_TOOL_LIB +eval ./test/standalone_test + +# tool library loaded by ROC profiler +export ROCP_TOOL_LIB=libtool.so # ROC profiler kernels timing export ROCP_TIMESTAMP_ON=1 # output directory for the tool library, for metrics results file 'results.txt' +# and SQTT trace files 'thread_trace.se.out' export ROCP_OUTPUT_DIR=./RESULTS if [ ! 
-e $ROCP_TOOL_LIB ] ; then export ROCP_TOOL_LIB=test/libtool.so fi -if [ -n "$1" ] ; then - tbin="$*" -else - tbin=$test_bin_dflt -fi +export ROCP_KITER=1 +export ROCP_DITER=4 +export ROCP_INPUT=input1.xml +eval ./test/ctrl -export ROCP_KITER=100 -export ROCP_DITER=100 +export ROCP_KITER=50 +export ROCP_DITER=50 +export ROCP_AGENTS=1 +export ROCP_THRS=1 export ROCP_INPUT=input.xml -eval $tbin - -#export ROCP_KITER=1 -#export ROCP_DITER=4 -#export ROCP_INPUT=input1.xml -#eval $tbin +eval ./test/ctrl #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index 6ee5c1d6..4011c131 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -1,49 +1,39 @@ #include "gfx_metrics.xml" - # average for 16 instances - - - - # sum for 16 instances - - - - - - - - - # FETCH_SIZE, kilobytes - # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - - # WRITE_SIZE, kilobytes - # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - + + + + + + + + + + + + + + - # average for 16 instances - - - - # sum for 16 instances - - - - - - - - - - - # FETCH_SIZE, kilobytes - # The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. - - # WRITE_SIZE, kilobytes - # The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. 
- + + + + + + + + + + + + + + + + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 373f1f7b..ac9b7a28 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -417,7 +417,8 @@ bool dump_context_entry(context_entry_t* entry) { index, entry->data.queue_index, nik_name.c_str()); - if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", + if (record) fprintf(file_handle, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)", + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, record->dispatch, record->begin, record->end, @@ -1019,10 +1020,10 @@ extern "C" PUBLIC_API void OnUnloadTool() { // Dump stored profiling output data fflush(stdout); if (result_file_opened) { - printf("\nROCPRofiler: %u contexts collected", context_collected); fflush(stdout); + printf("\nROCPRofiler:"); fflush(stdout); dump_context_array(NULL); fclose(result_file_handle); - printf(", output directory %s\n", result_prefix); + printf(" %u contexts collected, output directory %s\n", context_collected, result_prefix); } else { if (context_collected != context_count) { results_output_break(); diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 5404608b..0293c6c4 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -128,13 +128,13 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else - status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, 1, 0, &aqlprofile_api_); + status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; - status = hsa_system_get_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 0, &loader_api_); + status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, 
sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer @@ -520,6 +520,7 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Print the various fields of Hsa Gpu Agents bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { + std::cout << std::flush; std::clog << header << " :" << std::endl; const AgentInfo* agent_info; @@ -543,7 +544,7 @@ bool HsaRsrcFactory::PrintGpuAgents(const std::string& header) { } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { - const uint32_t slot_size_b = 0x40; + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); @@ -571,7 +572,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { } uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes) { - const uint32_t slot_size_b = 0x40; + const uint32_t slot_size_b = CMD_SLOT_SIZE_B; if ((size_bytes & (slot_size_b - 1)) != 0) { fprintf(stderr, "HsaRsrcFactory::Submit: Bad packet size %zx\n", size_bytes); abort(); diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index c9466f89..738a8e2f 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -121,7 +121,7 @@ struct AgentInfo { // HSA timer class // Provides current HSA timestampa and system-clock/ns conversion API class HsaTimer { - public: + public: typedef uint64_t timestamp_t; static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; @@ -134,8 +134,12 @@ class HsaTimer { } // Methids for system-clock/ns conversion - timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } - timestamp_t ns_to_sysclock(const timestamp_t& time) const { return timestamp_t((freq_t)time / sysclock_factor_); } + timestamp_t sysclock_to_ns(const timestamp_t& 
sysclock) const { + return timestamp_t((freq_t)sysclock * sysclock_factor_); + } + timestamp_t ns_to_sysclock(const timestamp_t& time) const { + return timestamp_t((freq_t)time / sysclock_factor_); + } // Return timestamp in 'ns' timestamp_t timestamp_ns() const { @@ -145,13 +149,14 @@ class HsaTimer { return sysclock_to_ns(sysclock); } - private: + private: // Timestamp frequency factor freq_t sysclock_factor_; }; class HsaRsrcFactory { public: + static const size_t CMD_SLOT_SIZE_B = 0x40; typedef std::recursive_mutex mutex_t; typedef HsaTimer::timestamp_t timestamp_t; @@ -270,7 +275,7 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); // Return AqlProfile API table - typedef hsa_ven_amd_aqlprofile_1_00_pfn_t aqlprofile_pfn_t; + typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } // Return Loader API table From 1047b900d4fb22540ec0a133092b800e5d5c6308 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 19 Nov 2018 19:14:18 -0600 Subject: [PATCH 022/168] pre/post package scripts install dir name fix and CPACK_PACKAGING_INSTALL_PREFIX --- CMakeLists.txt | 2 ++ DEBIAN/postinst | 2 +- DEBIAN/prerm | 2 +- RPM/rpm_post | 2 +- RPM/rpm_postun | 2 +- bin/rpl_run.sh | 11 +++++------ bin/tblextr.py | 4 ++-- test/tool/tool.cpp | 7 ++++--- 8 files changed, 17 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c8d473d7..05ee1800 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,7 @@ add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) #if ( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) #message ( "CMAKE default prefix: ${CMAKE_INSTALL_PREFIX}" ) #endif () +set ( CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} ) set ( CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/${ROCPROFILER_NAME}" ) message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY 
DESTINATION lib ) @@ -99,6 +100,7 @@ install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION tool PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) ## Packaging directives +set ( CPACK_GENERATOR "DEB" "RPM" "TGZ" ) set ( CPACK_PACKAGE_NAME "${ROCPROFILER_NAME}-dev" ) set ( CPACK_PACKAGE_VENDOR "AMD" ) set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) diff --git a/DEBIAN/postinst b/DEBIAN/postinst index 3d022884..abec93b9 100644 --- a/DEBIAN/postinst +++ b/DEBIAN/postinst @@ -3,7 +3,7 @@ set -e do_ldconfig() { - echo /opt/rocm/librocprofiler/lib > /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig + echo /opt/rocm/rocprofiler/lib > /etc/ld.so.conf.d/librocprofiler64.conf && ldconfig } case "$1" in diff --git a/DEBIAN/prerm b/DEBIAN/prerm index b3f509a9..40946383 100644 --- a/DEBIAN/prerm +++ b/DEBIAN/prerm @@ -3,7 +3,7 @@ set -e rm_ldconfig() { - rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig + rm -f /etc/ld.so.conf.d/librocprofiler64.conf && ldconfig } case "$1" in diff --git a/RPM/rpm_post b/RPM/rpm_post index 57c5c811..d0684561 100644 --- a/RPM/rpm_post +++ b/RPM/rpm_post @@ -1 +1 @@ -echo /opt/rocm/librocprofiler/lib > /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +echo /opt/rocm/rocprofiler/lib > /etc/ld.so.conf.d/librocprofiler64.conf && ldconfig diff --git a/RPM/rpm_postun b/RPM/rpm_postun index 6b3c8f28..b9c1fadb 100644 --- a/RPM/rpm_postun +++ b/RPM/rpm_postun @@ -1 +1 @@ -rm -f /etc/ld.so.conf.d/libhsa-rocprofiler64.conf && ldconfig +rm -f /etc/ld.so.conf.d/librocprofiler64.conf && ldconfig diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 043c0007..78a03fa1 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -27,7 +27,6 @@ BIN_DIR=`dirname $0` BIN_DIR=`cd $BIN_DIR; pwd` RUN_DIR=`pwd` TMP_DIR="/tmp" -DATA_PATH=$TMP_DIR DATA_DIR="rpl_data_${time_stamp}_$$" PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` @@ -195,6 +194,7 @@ if [ -z "$1" ] ; then fi INPUT_FILE="" +DATA_PATH="-" 
OUTPUT_DIR="-" output="" csv_output="" @@ -210,8 +210,7 @@ while [ 1 ] ; do elif [ "$1" = "-o" ] ; then output="$2" elif [ "$1" = "-d" ] ; then - OUTPUT_DIR="$2" - DATA_PATH=$OUTPUT_DIR + DATA_PATH=$2 elif [ "$1" = "-t" ] ; then TMP_DIR="$2" if [ "$OUTPUT_DIR" = "-" ] ; then @@ -268,8 +267,8 @@ else input_base=`basename $input_base` fi -if [ "$OUTPUT_DIR" = "--" ] ; then - fatal "Bad output dir '$OUTPUT_DIR'" +if [ "$DATA_PATH" = "-" ] ; then + DATA_PATH=$TMP_DIR fi if [ -n "$output" ] ; then @@ -290,9 +289,9 @@ echo "RPL: input file '$INPUT_FILE'" input_list="" RES_DIR="" if [ "$input_type" = "xml" ] ; then + OUTPUT_DIR=$DATA_PATH input_list=$INPUT_FILE elif [ "$input_type" = "txt" -o "$input_type" = "none" ] ; then - OUTPUT_DIR="-" RES_DIR=$DATA_PATH/$DATA_DIR if [ -e $RES_DIR ] ; then error "Rundir '$RES_DIR' exists" diff --git a/bin/tblextr.py b/bin/tblextr.py index 6a0f8eb2..87ecadd5 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -45,7 +45,7 @@ def parse_res(infile): if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found") inp = open(infile, 'r') - beg_pattern = re.compile("^dispatch\[(\d*)\], queue_index\(\d*\), kernel_name\(\"([^\"]*)\"\)") + beg_pattern = re.compile("^dispatch\[(\d*)\],.* kernel-name\(\"([^\"]*)\"\)") ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") var_pattern = re.compile("^\s*([^\s]*)\s+\((\d*)\)") @@ -55,7 +55,7 @@ def parse_res(infile): m = var_pattern.match(record) if m: - if not dispatch_number in var_table: fatal("Error: dispatch number not unique '" + str(dispatch_number) + "'") + if not dispatch_number in var_table: fatal("Error: dispatch number not found '" + str(dispatch_number) + "'") var = m.group(1) val = m.group(2) var_table[dispatch_number][m.group(1)] = m.group(2) diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index ac9b7a28..0db9e290 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -413,12 +413,13 @@ bool dump_context_entry(context_entry_t* entry) { FILE* 
file_handle = entry->file_handle; const std::string nik_name = (to_truncate_names == 0) ? entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); - fprintf(file_handle, "dispatch[%u], queue_index(%lu), kernel_name(\"%s\")", + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), kernel-name(\"%s\")", index, + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + entry->data.queue_id, entry->data.queue_index, nik_name.c_str()); - if (record) fprintf(file_handle, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)", - HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, record->begin, record->end, From 569e980a06ba858bfb9ab59db21717135a5d06f3 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 19:45:46 -0600 Subject: [PATCH 023/168] adding kernel properties --- CMakeLists.txt | 20 +- bin/dform.py | 36 +++ bin/rpl_run.sh | 81 ++++++- bin/run_tool.sh | 36 +++ bin/sqlitedb.py | 228 ++++++++++++++++++ bin/tblextr.py | 350 +++++++++++++++++++++++++-- bin/txt2xml.sh | 9 +- inc/rocprofiler.h | 8 +- script/rpl_run.sh | 377 ------------------------------ script/txt2xml.sh | 94 -------- src/core/intercept_queue.h | 16 +- src/core/rocprofiler.cpp | 21 +- src/core/tracker.h | 103 +++++--- src/util/hsa_rsrc_factory.h | 2 +- test/app/intercept_test_stand.cpp | 189 +++++++++++++++ test/run.sh | 2 + test/tool/gfx_metrics.xml | 4 +- test/tool/metrics.xml | 4 +- test/tool/tool.cpp | 172 +++++++------- 19 files changed, 1092 insertions(+), 660 deletions(-) create mode 100644 bin/dform.py create mode 100755 bin/run_tool.sh create mode 100644 bin/sqlitedb.py delete mode 100755 script/rpl_run.sh delete mode 100755 script/txt2xml.sh create mode 100644 test/app/intercept_test_stand.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 05ee1800..18bbee13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,11 +73,14 @@ endif () ## Build tests 
add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) -## Install information -#if ( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) -#message ( "CMAKE default prefix: ${CMAKE_INSTALL_PREFIX}" ) -#endif () -set ( CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} ) +## Create symlinks for packaging and install +add_custom_target ( rocprof-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/bin/rpl_run.sh rocprof-link ) +add_custom_target ( inc-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/include inc-link ) +add_custom_target ( so-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/lib/${ROCPROFILER_LIBRARY}.so so-link ) + set ( CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/${ROCPROFILER_NAME}" ) message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION lib ) @@ -87,8 +90,15 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/dform.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/sqlitedb.py DESTINATION bin PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) +install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION ../include RENAME ${ROCPROFILER_NAME} ) +install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION ../lib RENAME ${ROCPROFILER_LIBRARY}.so ) +install ( FILES ${PROJECT_BINARY_DIR}/rocprof-link DESTINATION ../bin + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE + RENAME rocprof ) # gfx_metrics.xml metrics.xml install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml diff --git a/bin/dform.py b/bin/dform.py new file mode 100644 index 00000000..5fc8d6fc --- /dev/null +++ b/bin/dform.py @@ 
-0,0 +1,36 @@ +#!/usr/bin/python +from sqlitedb import SQLiteDB + +def post_process_data(db, table_name, outfile = ''): +# db.add_data_column('A', 'DispDurNs', 'INTEGER', 'BeginNs - DispatchNs') +# db.add_data_column('A', 'ComplDurNs', 'INTEGER', 'CompleteNs - EndNs') +# db.add_data_column('A', 'TotalDurNs', 'INTEGER', 'CompleteNs - DispatchNs') +# db.add_data_column(table_name, 'TimeNs', 'INTEGER', 'BeginNs - %d' % start_ns) + db.add_data_column(table_name, 'DurationNs', 'INTEGER', 'EndNs - BeginNs') + if outfile != '': db.dump_csv(table_name, outfile) + +def gen_data_bins(db, outfile): + db.execute('create view C as select Name, Calls, TotalDurationNs, TotalDurationNs/Calls as AverageNs, TotalDurationNs*100.0/(select sum(TotalDurationNs) from %s) as Percentage from %s order by TotalDurationNs desc;' % ('B', 'B')); + db.dump_csv('C', outfile) + db.execute('DROP VIEW C') + +def gen_table_bins(db, table, outfile, name_var, dur_ns_var): + db.execute('create view B as select (%s) as Name, count(%s) as Calls, sum(%s) as TotalDurationNs from %s group by %s' % (name_var, name_var, dur_ns_var, table, name_var)) + gen_data_bins(db, outfile) + db.execute('DROP VIEW B') + +def gen_api_json_trace(db, table, start_us, outfile): + db.execute('create view B as select "Index", Name as name, pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (start_us, table)); + db.dump_json('B', table, outfile) + db.execute('DROP VIEW B') + +def gen_ops_json_trace(db, table, base_pid, start_us, outfile): + db.execute('create view B as select "Index", Name as name, ("gpu-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.dump_json('B', table, outfile) + db.execute('DROP VIEW B') + +def gen_kernel_json_trace(db, table, base_pid, start_us, outfile): + db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, (BeginNs/1000 - %d) as 
ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.dump_json('B', table, outfile) + db.execute('DROP VIEW B') +############################################################################################## diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 78a03fa1..adefad73 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -23,19 +23,27 @@ ################################################################################ time_stamp=`date +%y%m%d_%H%M%S` -BIN_DIR=`dirname $0` -BIN_DIR=`cd $BIN_DIR; pwd` +BIN_DIR=$(dirname $(realpath $0)) +PKG_DIR=$(dirname $BIN_DIR) +ROOT_DIR=$(dirname $PKG_DIR) RUN_DIR=`pwd` TMP_DIR="/tmp" DATA_DIR="rpl_data_${time_stamp}_$$" -PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` -BIN_DIR=$PKG_DIR/bin - # PATH to custom HSA and OpenCl runtimes HSA_PATH=$PKG_DIR/lib/hsa -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +# roctracer path +if [ -z "$ROCTRACER_PATH" ] ; then ROCTRACER_PATH=$ROOT_DIR/roctracer; fi + +# runtime API trace +HSA_TRACE=0 +HIP_TRACE=0 + +# Generate stats +GEN_STATS=0 + +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$ROCTRACER_PATH/lib:$ROCTRACER_PATH/tool:$HSA_PATH export PATH=.:$PATH # enable error logging @@ -128,9 +136,14 @@ usage() { echo "" echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" + echo " --ctx-wait - to wait for outstanding contexts on profiler exit [on]" echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" echo " --heartbeat - to print progress heartbeats [0 - disabled]" echo "" + echo " --stats - generating kernel executino stats" + echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing" + echo " --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing" + echo "" echo 
"Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." @@ -178,12 +191,36 @@ run() { mkdir -p "$ROCP_OUTPUT_DIR" fi + API_TRACE="" + PRELOAD_LIBS="" + if [ "$HSA_TRACE" = 1 ] ; then + API_TRACE="hsa" + fi + if [ "$HIP_TRACE" = 1 ] ; then + if [ -z "$API_TRACE" ] ; then + API_TRACE="hip"; + else + API_TRACE="all" + fi + if [ -z "$HCC_HOME" ] ; then error "env var HCC_HOME is not defined"; fi + PRELOAD_LIBS="$PRELOAD_LIBS $HCC_HOME/lib/libmcwamp_hsa.so" + fi + if [ -n "$API_TRACE" ] ; then + API_TRACE=$(echo $API_TRACE | sed 's/all//') + if [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE; fi + export HSA_TOOLS_LIB="libtracer_tool.so libroctracer64.so $HSA_TOOLS_LIB" + PRELOAD_LIBS="$PRELOAD_LIBS $HSA_TOOLS_LIB" + fi + + redirection_cmd="" if [ -n "$ROCP_OUTPUT_DIR" ] ; then OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" - eval "$APP_CMD 2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" - else - eval "$APP_CMD" + redirection_cmd="2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" fi + + #unset ROCP_OUTPUT_DIR + CMD_LINE="LD_PRELOAD='$PRELOAD_LIBS' $APP_CMD $redirection_cmd" + eval "$CMD_LINE" } # main @@ -236,10 +273,29 @@ while [ 1 ] ; do else export ROCP_TIMESTAMP_ON=0 fi + elif [ "$1" = "--ctx-wait" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_OUTSTANDING_WAIT=1 + else + export ROCP_OUTSTANDING_WAIT=0 + fi elif [ "$1" = "--ctx-limit" ] ; then export ROCP_OUTSTANDING_MAX="$2" elif [ "$1" = "--heartbeat" ] ; then export ROCP_OUTSTANDING_MON="$2" + elif [ "$1" = "--stats" ] ; then + ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 + GEN_STATS=1 + elif [ "$1" = "--hsa-trace" ] ; then + ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 + GEN_STATS=1 + HSA_TRACE=1 + elif [ "$1" = "--hip-trace" ] ; then + ARG_VAL=0 + GEN_STATS=1 + HIP_TRACE=1 elif [ "$1" = 
"--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 @@ -323,7 +379,12 @@ for name in $input_list; do done if [ -n "$csv_output" ] ; then - python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + if [ "$GEN_STATS" = "1" ] ; then + db_output=$(echo $csv_output | sed "s/\.csv/.db/") + python $BIN_DIR/tblextr.py $db_output $OUTPUT_LIST + else + python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + fi if [ "$?" -eq 0 ] ; then echo "RPL: '$csv_output' is generated" else diff --git a/bin/run_tool.sh b/bin/run_tool.sh new file mode 100755 index 00000000..5af6d1a1 --- /dev/null +++ b/bin/run_tool.sh @@ -0,0 +1,36 @@ +#!/bin/sh +BIN_DIR=`dirname $0` +BIN_DIR=`cd $BIN_DIR; pwd` +PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` +BIN_DIR=$PKG_DIR/bin + +# PATH to custom HSA libs +HSA_PATH=$PKG_DIR/lib/hsa + +if [ -z "$1" ] ; then + echo "Usage: $0 " +else +# profiler plugin library +test_app=$* + +# paths to ROC profiler and oher libraries +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +export PATH=.:$PATH + +# ROC profiler library loaded by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so.1 +# tool library loaded by ROC profiler +if [ -z $ROCP_TOOL_LIB ] ; then + export ROCP_TOOL_LIB=libintercept_test.so +fi +# enable error messages +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 +export HSA_VEN_AMD_AQLPROFILE_LOG=1 +export ROCPROFILER_LOG=1 +# to prevent internal simple proxy queue +unset ROCP_PROXY_QUEUE +# ROC profiler metrics config file +export ROCP_METRICS=$BIN_DIR/lib/metrics.xml + +LD_PRELOAD=$ROCP_TOOL_LIB $test_app +fi diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py new file mode 100644 index 00000000..295fe7a7 --- /dev/null +++ b/bin/sqlitedb.py @@ -0,0 +1,228 @@ +import csv, sqlite3, re, sys +from functools import reduce + +# SQLite Database class +class SQLiteDB: + def __init__(self, file_name): + self.connection = sqlite3.connect(file_name) + self.tables = {} + self.json_arg_list_enabled = 0 + + def __del__(self): + self.connection.close() + + # add DB 
table + def add_table(self, name, descr, extra = ()): + (field_list, field_dict) = descr + if name in self.tables: raise Exception('table is already added: "' + name + '"') + + # create DB table + table_descr = [] + for field in field_list: table_descr.append('"%s" %s' % (field, field_dict[field])) + for item in extra: table_descr.append('"%s" %s' % (item[0], item[1])) + stm = 'CREATE TABLE ' + name + ' (%s)' % ', '.join(table_descr) + cursor = self.connection.cursor() + cursor.execute(stm) + self.connection.commit() + + # register table + fields_str = ','.join(map(lambda x: '"' + x + '"', field_list)) + templ_str = ','.join('?' * len(field_list)) + stm = 'INSERT INTO ' + name + '(' + fields_str + ') VALUES(' + templ_str + ');' + self.tables[name] = stm + + return (cursor, stm); + + # add columns to table + def add_columns(self, name, columns): + cursor = self.connection.cursor() + for item in columns: + stm = 'ALTER TABLE ' + name + ' ADD COLUMN "%s" %s' % (item[0], item[1]) + cursor.execute(stm) + self.connection.commit() + + # add columns with expression + def add_data_column(self, table_name, data_label, data_type, data_expr): + cursor = self.connection.cursor() + cursor.execute('ALTER TABLE %s ADD COLUMN "%s" %s' % (table_name, data_label, data_type)) + cursor.execute('UPDATE %s SET %s = (%s);' % (table_name, data_label, data_expr)) + + # populate DB table entry + def insert_entry(self, table, val_list): + (cursor, stm) = table + cursor.execute(stm, val_list) + + # populate DB table entry + def commit_entry(self, table, val_list): + self.insert_entry(table, val_list) + self.connection.commit() + + # populate DB table data + def insert_table(self, table, reader): + for val_list in reader: + if not val_list[-1]: val_list.pop() + self.insert_entry(table, val_list) + self.connection.commit() + + # return table fields list + def _get_fields(self, table_name): + cursor = self.connection.execute('SELECT * FROM ' + table_name) + return list(map(lambda x: '"%s"' % 
(x[0]), cursor.description)) + + # return table raws list + def _get_raws(self, table_name): + cursor = self.connection.execute('SELECT * FROM ' + table_name) + return cursor.fetchall() + def _get_raws_indexed(self, table_name): + cursor = self.connection.execute('SELECT * FROM ' + table_name + ' order by "Index" asc;') + return cursor.fetchall() + def _get_raw_by_id(self, table_name, req_id): + cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (req_id,)) + raws = cursor.fetchall() + if len(raws) != 1: + raise Exception('Index is not unique, table "' + table_name + '"') + return list(raws[0]) + + # dump CSV table + def dump_csv(self, table_name, file_name): + if not re.search(r'\.csv$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + + fields = self._get_fields(table_name) + with open(file_name, mode='w') as fd: + fd.write(','.join(fields) + '\n') + for raw in self._get_raws(table_name): + fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') + + # dump JSON trace + def open_json(self, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + with open(file_name, mode='w') as fd: + fd.write('{ "traceEvents":[{}\n'); + + def close_json(self, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + with open(file_name, mode='a') as fd: + fd.write(']}\n'); + + def label_json(self, pid, label, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + with open(file_name, mode='a') as fd: + fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name"}\n' %(label, pid)); + + def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name 
+ '"' ) + with open(file_name, mode='a') as fd: + dep_id = base_id + for ind in range(len(from_tid)): + if (len(corr_id_list) != 0): corr_id = corr_id_list[ind] + else: corr_id = ind + from_ts = from_us_list[ind] - start_us + to_ts = to_us_dict[corr_id] - start_us + if from_ts > to_ts: from_ts = to_ts + fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%s,"tid":%s,"name":"dep"}\n' % (from_ts, dep_id, str(from_pid), from_tid[ind])) + fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) + dep_id += 1 + + def dump_json(self, table_name, data_name, file_name): + if not re.search(r'\.json$', file_name): + raise Exception('wrong output file type: "' + file_name + '"' ) + + sub_ptrn = re.compile(r'(^"|"$)') + name_ptrn = re.compile(r'(name|Name)') + + table_fields = self._get_fields(table_name) + table_raws = self._get_raws_indexed(table_name) + data_fields = self._get_fields(data_name) + data_raws = self._get_raws_indexed(data_name) + + with open(file_name, mode='a') as fd: + table_raws_len = len(table_raws) + for raw_index in range(table_raws_len): + if (raw_index == table_raws_len - 1) or (raw_index % 1000 == 0): + sys.stdout.write( \ + "\rdump json " + str(raw_index) + ":" + str(len(table_raws)) + " "*100 \ + ) + + vals_list = [] + values = list(table_raws[raw_index]) + for value_index in range(len(values)): + label = table_fields[value_index] + value = values[value_index] + if name_ptrn.search(label): value = sub_ptrn.sub(r'', value) + if label != '"Index"': vals_list.append('%s:"%s"' % (label, value)) + + args_list = [] + data = list(data_raws[raw_index]) + for value_index in range(len(data)): + label = data_fields[value_index] + value = data[value_index] + if name_ptrn.search(label): value = sub_ptrn.sub(r'', value) + if label != '"Index"': args_list.append('%s:"%s"' % (label, value)) + + fd.write(',{"ph":"%s",%s,\n "args":{\n %s\n }\n}\n' % ('X', ','.join(vals_list), ',\n 
'.join(args_list))) + + sys.stdout.write('\n') + + # execute query on DB + def execute(self, cmd): + cursor = self.connection.cursor() + cursor.execute(cmd) + + # commit DB + def commit(self): + self.connection.commit() + + # close DB + def close(self): + self.connection.close() + + # access DB + def get_raws(self, table_name): + cur = self.connection.cursor() + cur.execute("SELECT * FROM %s" % table_name) + return cur.fetchall() + + # return CSV descriptor + # list of fields and dictionaly for the fields types + def _get_csv_descr(self, table_name, fd): + reader = csv.DictReader(fd) + field_names = reader.fieldnames + if not field_names[-1]: field_names.pop() + field_types = {} + + for entry in reader: + fields_left = [f for f in field_names if f not in field_types.keys()] + # all fields processed + if not fields_left: break + + for field in fields_left: + data = entry[field] + # need data for the field to be processed + if len(data) == 0: continue + + if data.isdigit(): + field_types[field] = "INTEGER" + else: + field_types[field] = "TEXT" + + if len(fields_left) > 0: raise Exception('types not found for fields: ', fields_left) + return (field_names, field_types) + + # add CSV table + def add_csv_table(self, table_name, file_name, extra = ()): + with open(file_name, mode='r') as fd: + # get CSV table descriptor + descr = self._get_csv_descr(table_name, fd) + # reader to populate the table + fd.seek(0) + reader = csv.reader(fd) + reader.next() + table = self.add_table(table_name, descr, extra) + self.insert_table(table, reader) + +############################################################################################## diff --git a/bin/tblextr.py b/bin/tblextr.py index 87ecadd5..4c4cc782 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -23,6 +23,8 @@ ################################################################################ import os, sys, re +from sqlitedb import SQLiteDB +import dform # Parsing results in the format: #dispatch[0], queue_index(0), 
kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): @@ -30,8 +32,24 @@ # SQ_WAVES (4096) # SQ_INSTS_VMEM_RD (36864) +COPY_PID = 0 +OPS_PID = 1 +HSA_PID = 2 +HIP_PID = 3 +GPU_BASE_PID = 4 +max_gpu_id = -1 +START_US = 0 + +# dependencies dictionary +dep_dict = {} +kern_dep_list = [] + # global vars -var_list = ['Index', 'KernelName', 'DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs'] +table_descr = [ + ['Index', 'KernelName'], + {'Index': 'INTEGER', 'KernelName': 'TEXT'} +] +var_list = table_descr[0] var_table = {} ############################################################# @@ -42,10 +60,12 @@ def fatal(msg): # parse results method def parse_res(infile): + global max_gpu_id if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found") inp = open(infile, 'r') - beg_pattern = re.compile("^dispatch\[(\d*)\],.* kernel-name\(\"([^\"]*)\"\)") + beg_pattern = re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") + prop_pattern = re.compile("([\w-]+)\((\w+)\)"); ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") var_pattern = re.compile("^\s*([^\s]*)\s+\((\d*)\)") @@ -67,8 +87,25 @@ def parse_res(infile): if not dispatch_number in var_table: var_table[dispatch_number] = { 'Index': dispatch_number, - 'KernelName': "\"" + m.group(2) + "\"" + 'KernelName': "\"" + m.group(3) + "\"" } + + gpu_id = 0 + disp_tid = 0 + + kernel_properties = m.group(2) + for prop in kernel_properties.split(', '): + m = prop_pattern.match(prop) + if m: + var = m.group(1) + val = m.group(2) + var_table[dispatch_number][var] = val + if not var in var_list: var_list.append(var); + if var == 'gpu-id': + gpu_id = int(val) + if (gpu_id > max_gpu_id): max_gpu_id = gpu_id + if var == 'tid': disp_tid = val + else: fatal('wrong kernel property "' + prop + '" in "'+ kernel_properties + '"') m = ts_pattern.search(record) if m: var_table[dispatch_number]['DispatchNs'] = m.group(1) @@ -76,47 +113,310 @@ def 
parse_res(infile): var_table[dispatch_number]['EndNs'] = m.group(3) var_table[dispatch_number]['CompleteNs'] = m.group(4) + gpu_pid = GPU_BASE_PID + int(gpu_id) + if not gpu_pid in dep_dict: dep_dict[gpu_pid] = {} + dep_str = dep_dict[gpu_pid] + if not 'tid' in dep_str: dep_str['tid'] = [] + if not 'from' in dep_str: dep_str['from'] = [] + if not 'to' in dep_str: dep_str['to'] = {} + to_id = len(dep_str['tid']) + from_us = int(m.group(1)) / 1000 + to_us = int(m.group(2)) / 1000 + dep_str['to'][to_id] = to_us + dep_str['from'].append(from_us) + dep_str['tid'].append(disp_tid) + dep_str['pid'] = HSA_PID + kern_dep_list.append((disp_tid, m.group(1))) + inp.close() ############################################################# -# print results table method -def print_tbl(outfile): +# merge results table +def merge_table(): global var_list - if len(var_table) == 0: return 1 + keys = sorted(var_table.keys(), key=int) - out = open(outfile, 'w') + fields = set(var_table[keys[0]]) + if 'DispatchNs' in fields: + var_list.append('DispatchNs') + var_list.append('BeginNs') + var_list.append('EndNs') + var_list.append('CompleteNs') + var_list = [x for x in var_list if x in fields] +############################################################# - keys = var_table.keys() - keys.sort(key=int) +# dump CSV results +def dump_csv(file_name): + global var_list + keys = sorted(var_table.keys(), key=int) - entry = var_table[keys[0]] - list1 = [] - for var in var_list: - if var in entry: - list1.append(var) - var_list = list1 + with open(file_name, mode='w') as fd: + fd.write(','.join(var_list) + '\n'); + for ind in keys: + entry = var_table[ind] + dispatch_number = entry['Index'] + if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") + val_list = [entry[var] for var in var_list] + fd.write(','.join(val_list) + '\n'); +############################################################# + +# fill kernels DB +def fill_kernel_db(table_name, db): + 
global var_list + keys = sorted(var_table.keys(), key=int) - for var in var_list: out.write(var + ',') - out.write("\n") + for var in set(var_list).difference(set(table_descr[1])): + table_descr[1][var] = 'INTEGER' + table_descr[0] = var_list; + + table_handle = db.add_table(table_name, table_descr) for ind in keys: entry = var_table[ind] dispatch_number = entry['Index'] if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") - for var in var_list: out.write(entry[var] + ',') - out.write("\n") + val_list = [entry[var] for var in var_list] + db.insert_entry(table_handle, val_list) +############################################################# + +# fill HSA DB +hsa_table_descr = [ + ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index'], + {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} +] +def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): + file_name = indir + '/' + api_name + '_api_trace.txt' + ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') + ptrn_ac = re.compile(r'hsa_amd_memory_async_copy') + + if not os.path.isfile(file_name): return 0 + + dep_tid_list = [] + dep_from_us_list = [] + dep_id_list = [] + + global START_US + with open(file_name, mode='r') as fd: + line = fd.readline() + record = line[:-1] + m = ptrn_val.match(record) + if m: START_US = int(m.group(1)) / 1000 + START_US = 0 + + record_id = 0 + table_handle = db.add_table(table_name, hsa_table_descr) + with open(file_name, mode='r') as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + rec_vals = [] + for ind in range(1,7): + rec_vals.append(m.group(ind)) + rec_vals[2] = api_pid + rec_vals.append(record_id) + db.insert_entry(table_handle, rec_vals) + if ptrn_ac.search(rec_vals[4]) or record_id in dep_filtr: + beg_ns = int(rec_vals[0]) + end_ns = int(rec_vals[1]) + 
from_us = (beg_ns / 1000) + ((end_ns - beg_ns) / 1000) + dep_from_us_list.append(from_us) + dep_tid_list.append(int(rec_vals[3])) + dep_id_list.append(record_id) + record_id += 1 + else: fatal("hsa bad record") + + for (tid, from_ns) in dep_list: + db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) + record_id += 1 + + if not dep_pid in dep_dict: dep_dict[dep_pid] = {} + dep_dict[dep_pid]['pid'] = api_pid + dep_dict[dep_pid]['tid'] = dep_tid_list + dep_dict[dep_pid]['from'] = dep_from_us_list + if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + + return 1 +############################################################# + +# fill COPY DB +copy_table_descr = [ + ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index'], + {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} +] +def fill_copy_db(table_name, db, indir): + file_name = indir + '/' + 'async_copy_trace.txt' + ptrn_val = re.compile(r'(\d+):(\d+) (.*)$') + ptrn_id = re.compile(r'^async-copy(\d+)$') + + if not COPY_PID in dep_dict: dep_dict[COPY_PID] = {} + dep_to_us_dict = {} + + table_handle = db.add_table(table_name, copy_table_descr) + with open(file_name, mode='r') as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + rec_vals = [] + for ind in range(1,4): rec_vals.append(m.group(ind)) + rec_vals.append(COPY_PID) + rec_vals.append(0) + m = ptrn_id.match(rec_vals[2]) + if m: dep_to_us_dict[int(m.group(1))] = int(rec_vals[0]) / 1000 + else: fatal("bad async-copy entry") + rec_vals.append(m.group(1)) + db.insert_entry(table_handle, rec_vals) + else: fatal("async-copy bad record") - out.close() - return 0 + dep_dict[COPY_PID]['to'] = dep_to_us_dict ############################################################# +# fill HCC ops DB +ops_table_descr = [ + ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index'], + {'Index':'INTEGER', 'Name':'TEXT', 
'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} +] +def fill_ops_db(table_name, db, indir): + global max_gpu_id + file_name = indir + '/' + 'hcc_ops_trace.txt' + ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) (.*)$') + ptrn_id = re.compile(r'^[^:]+:(\d+)$') + + if not os.path.isfile(file_name): return {} + + filtr = {} + + record_id = 0 + table_handle = db.add_table(table_name, ops_table_descr) + with open(file_name, mode='r') as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + rec_vals = [] + for ind in range(1,6): rec_vals.append(m.group(ind)) + gpu_id = int(rec_vals[2]); + if (gpu_id > max_gpu_id): max_gpu_id = gpu_id + gpu_pid = GPU_BASE_PID + int(gpu_id) + rec_vals.append(gpu_pid) + rec_vals.append(0) + m = ptrn_id.match(rec_vals[4]) + if not m: fatal("bad hcc ops entry '" + record + "'") + corr_id = int(m.group(1)) - 1 + rec_vals.append(corr_id) + db.insert_entry(table_handle, rec_vals) + filtr[corr_id] = 1 + + if not gpu_pid in dep_dict: dep_dict[gpu_pid] = {} + dep_dict[gpu_pid]['to'][corr_id] = int(rec_vals[0]) / 1000 + dep_dict[gpu_pid]['bsp'] = OPS_PID + else: fatal("async-copy bad record") + + return filtr +############################################################# # main if (len(sys.argv) < 3): fatal("Usage: " + sys.argv[0] + " ") outfile = sys.argv[1] infiles = sys.argv[2:] -for f in infiles: - parse_res(f) -ret = print_tbl(outfile) -sys.exit(ret) +indir = re.sub(r'\/[^\/]*$', r'', infiles[0]) + +dbfile = '' +csvfile = '' + +if re.search(r'\.csv$', outfile): + csvfile = outfile +elif re.search(r'\.db$', outfile): + dbfile = outfile + csvfile = re.sub(r'\.db$', '.csv', outfile) +else: + fatal("Bad output file '" + outfile + "'") + +for f in infiles: parse_res(f) +if len(var_table) == 0: sys.exit(1) +merge_table() + +if dbfile == '': + dump_csv(csvfile) +else: + statfile = re.sub(r'\.csv$', '.stats.csv', csvfile) + jsonfile 
= re.sub(r'\.csv$', '.json', csvfile) + + with open(dbfile, mode='w') as fd: fd.truncate() + db = SQLiteDB(dbfile) + + hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) + if hsa_trace_found: + fill_copy_db('COPY', db, indir) + + ops_filtr = fill_ops_db('OPS', db, indir) + hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) + + fill_kernel_db('A', db) + + any_trace_found = hsa_trace_found | hip_trace_found + if any_trace_found: + db.open_json(jsonfile) + + if hsa_trace_found: + db.label_json(HSA_PID, "CPU HSA API", jsonfile) + db.label_json(COPY_PID, "COPY", jsonfile) + + if hip_trace_found: + db.label_json(HIP_PID, "CPU HIP API", jsonfile) + + if any_trace_found and max_gpu_id >= 0: + for ind in range(0, int(max_gpu_id) + 1): + db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) + + dform.post_process_data(db, 'A', csvfile) + dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') + if hsa_trace_found and 'BeginNs' in var_list: + dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) + + if hsa_trace_found: + statfile = re.sub(r'stats', r'hsa_stats', statfile) + dform.post_process_data(db, 'HSA') + dform.gen_table_bins(db, 'HSA', statfile, 'Name', 'DurationNs') + dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) + + dform.post_process_data(db, 'COPY') + dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) + + if hip_trace_found: + statfile = re.sub(r'stats', r'hip_stats', statfile) + dform.post_process_data(db, 'HIP') + dform.gen_table_bins(db, 'HIP', statfile, 'Name', 'DurationNs') + dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) + + dform.post_process_data(db, 'OPS') + dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) + + if any_trace_found: + for (to_pid, dep_str) in dep_dict.items(): + if 'bsp' in dep_str: + bspid = dep_str['bsp'] + base_str = dep_dict[bspid] + for v in ('pid', 'tid', 'from', 
'id'): + dep_str[v] = base_str[v] + base_str['inv'] = 1 + + dep_id = 0 + for (to_pid, dep_str) in dep_dict.items(): + if 'inv' in dep_str: continue + from_pid = dep_str['pid'] + tid_list = dep_str['tid'] + from_us_list = dep_str['from'] + to_us_dict = dep_str['to'] + corr_id_list = [] + if 'id' in dep_str: corr_id_list = dep_str['id'] + db.flow_json(dep_id, from_pid, tid_list, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) + dep_id += len(tid_list) + + if any_trace_found: + db.close_json(jsonfile); + db.close() + +sys.exit(0) ############################################################# diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh index 66da77db..27bbe8c4 100755 --- a/bin/txt2xml.sh +++ b/bin/txt2xml.sh @@ -66,11 +66,11 @@ parse() { else output=$outdir/input${index}.xml header="# $timestamp '$output' generated with '$0 $*'" + echo $header > $output if [ "$feature" == "pmc" ] ; then line=`echo "$line" | sed -e "s/ /,/g"` cat >> $output < EOF @@ -78,9 +78,14 @@ EOF if [ "$feature" == "sqtt" ] ; then cat >> $output < +EOF + fi + + if [ "$feature" == "hsa" ] ; then + cat >> $output < EOF fi fi diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 4448128f..6aeb26af 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -26,8 +26,8 @@ THE SOFTWARE. // // The goal of the implementation is to provide a HW specific low-level // performance analysis interface for profiling of GPU compute applications. -// The profiling includes HW performance counters with complex -// performance metrics and HW traces. +// The profiling includes HW performance counters with derived +// performance metrics. // // The library can be used by a tool library loaded by HSA runtime or by // higher level HW independent performance analysis API like PAPI. @@ -42,10 +42,11 @@ THE SOFTWARE. 
#define INC_ROCPROFILER_H_ #include +#include #include #include -#define ROCPROFILER_VERSION_MAJOR 5 +#define ROCPROFILER_VERSION_MAJOR 6 #define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus @@ -220,6 +221,7 @@ typedef struct { const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet const char* kernel_name; // Kernel name uint64_t kernel_object; // Kernel object pointer + const amd_kernel_code_t* kernel_code; // Kernel code pointer int64_t thread_id; // Thread id const rocprofiler_dispatch_record_t* record; // Dispatch record } rocprofiler_callback_data_t; diff --git a/script/rpl_run.sh b/script/rpl_run.sh deleted file mode 100755 index a8260e77..00000000 --- a/script/rpl_run.sh +++ /dev/null @@ -1,377 +0,0 @@ -################################################################################ -# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
-################################################################################ - -#!/bin/sh -time_stamp=`date +%y%m%d_%H%M%S` -BIN_DIR=`dirname $0` -BIN_DIR=`cd $BIN_DIR; pwd` -RUN_DIR=`pwd` -TMP_DIR="/tmp" -DATA_PATH=$TMP_DIR -DATA_DIR="rpl_data_${time_stamp}_$$" - -PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*$//"` -BIN_DIR=$PKG_DIR/bin - -# PATH to custom HSA and OpenCl runtimes -HSA_PATH=$PKG_DIR/lib/hsa - -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH -export PATH=.:$PATH - -# enable error logging -export HSA_TOOLS_REPORT_LOAD_FAILURE=1 -export HSA_VEN_AMD_AQLPROFILE_LOG=1 -export ROCPROFILER_LOG=1 - -# ROC Profiler environment -# Loading of ROC Profiler by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so -# Loading of the test tool by ROC Profiler -export ROCP_TOOL_LIB=libtool.so -# Enabling HSA dispatches intercepting by ROC PRofiler -export ROCP_HSA_INTERCEPT=1 -# Disabling internal ROC Profiler proxy queue (simple version supported for testing purposes) -unset ROCP_PROXY_QUEUE -# ROC Profiler metrics definition -export ROCP_METRICS=$PKG_DIR/lib/metrics.xml -# ROC Profiler package path -export ROCP_PACKAGE_DIR=$PKG_DIR - -# error handling -fatal() { - echo "$0: Error: $1" - echo "" - usage -} - -error() { - echo "$0: Error: $1" - echo "" - exit 1 -} - -# usage method -usage() { - bin_name=`basename $0` - echo "ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package." 
- echo "Full path: $BIN_DIR/$bin_name" - echo "Metrics definition: $PKG_DIR/lib/metrics.xml" - echo "" - echo "Usage:" - echo " rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] " - echo "" - echo "Options:" - echo " -h - this help" - echo " --verbose - verbose mode, dumping all base counters used in the input metrics" - echo " --list-basic - to print the list of basic HW counters" - echo " --list-derived - to print the list of derived metrics with formulas" - echo "" - echo " -i <.txt|.xml file> - input file" - echo " Input file .txt format, automatically rerun application for every pmc/sqtt line:" - echo "" - echo " # Perf counters group 1" - echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" - echo " # Perf counters group 2" - echo " pmc : WriteSize L2CacheHit" - echo " # SQ tread trace" - echo " sqtt : MASK = 0x0F00 TOKEN_MASK = 0x144B TOKEN_MASK2 = 0xFFFF" - echo " # Filter by dispatches range, GPU index and kernel names" - echo " # supported range formats: \"3:9\", \"3:\", \"3\"" - echo " range: 1 : 4" - echo " gpu: 0 1 2 3" - echo " kernel: simple Pass1 simpleConvolutionPass2" - echo "" - echo " Input file .xml format, for single profiling run:" - echo "" - echo " # Metrics list definition, also the form \":\" can be used" - echo " # All defined metrics can be found in the 'metrics.xml'" - echo " # There are basic metrics for raw HW counters and high-level metrics for derived counters" - echo " " - echo "" - echo " # Trace enabling and the parameters definition" - echo " " - echo " " - echo " " - echo "" - echo " # Filter by dispatches range, GPU index and kernel names" - echo " " - echo "" - echo " Supported by profiler SQTT parameters:" - echo " TARGET_CU - target Compute Unit, MASK.CU_SEL field" - echo " VM_ID_MASK - select which VM IDs to capture, MASK.VM_ID_MASK field" - echo " MASK - MASK register value" - echo " TOKEN_MASK - TOKEN_MASK register value" - echo " 
TOKEN_MASK2 - TOKEN_MASK2 register value, traced instructions mask" - echo " The parameters defaults:" - echo " TARGET_CU = 0;" - echo " VM_ID_MASK = 0;" - echo " MASK:" - echo " mask.bits.CU_SEL = param{TARGET_CU};" - echo " mask.bits.SH_SEL = 0x0;" - echo " mask.bits.SIMD_EN = 0xF;" - echo " mask.bits.SQ_STALL_EN = 0x1;" - echo " mask.bits.SPI_STALL_EN = 0x1;" - echo " mask.bits.REG_STALL_EN = 0x1;" - echo " mask.bits.VM_ID_MASK = param{VM_ID_MASK};" - echo " TOKEN_MASK:" - echo " token_mask.bits.TOKEN_MASK = 0xFFFF;" - echo " token_mask.bits.REG_MASK = 0xFF;" - echo " token_mask.bits.REG_DROP_ON_STALL = 0x1;" - echo " TOKEN_MASK2:" - echo " token_mask2.bits.INST_MASK = 0xFFFFFF7F; // INST_PC is disabled because its tracing can cause extra stalling" - echo " // and it is recommended to disable by SQTT user guide" - echo " HIWATER = 6; // which is 6/8 fraction of the tread trace fifo" - echo "" - echo " -o - output CSV file [.csv]" - echo " -d - directory where profiler store profiling data including thread treaces [/tmp]" - echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." - echo " -t - to change the temporary directory [/tmp]" - echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." - echo "" - echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" - echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" - echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" - echo " --heartbeat - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [0 - disabled]" - echo " --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000]" - echo " Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively." 
- echo " --sqtt-local - to allocate SQTT buffer in local GPU memory [on]" - echo "" - echo "Configuration file:" - echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" - echo" First the configuration file is looking in the current directory, then in your home, and then in the package directory." - echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'." - echo " An example of 'rpl_rc.xml':" - echo " " - echo "" - exit 1 -} - -# profiling run method -OUTPUT_LIST="" -run() { - export ROCP_INPUT="$1" - OUTPUT_DIR="$2" - shift - shift - APP_CMD=$* - - if [ "$OUTPUT_DIR" = "-" ] ; then - input_tag=`echo $ROCP_INPUT | sed "s/\.xml//"` - export ROCP_OUTPUT_DIR=${input_tag}_results_${time_stamp} - elif [ "$OUTPUT_DIR" = "--" ] ; then - unset ROCP_OUTPUT_DIR - else - export ROCP_OUTPUT_DIR=$OUTPUT_DIR - fi - echo "RPL: result dir '$ROCP_OUTPUT_DIR'" - - if [ ! -e "$ROCP_INPUT" ] ; then - error "Input file '$ROCP_INPUT' not found" - fi - - if [ -n "$ROCP_OUTPUT_DIR" ] ; then - if [ "$OUTPUT_DIR" = "-" ] ; then - if [ -e "$ROCP_OUTPUT_DIR" ] ; then - error "generated dir '$ROCP_OUTPUT_DIR' exists" - fi - fi - mkdir -p "$ROCP_OUTPUT_DIR" - fi - - if [ -n "$ROCP_OUTPUT_DIR" ] ; then - OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" - eval "$APP_CMD 2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" - else - eval "$APP_CMD" - fi -} - -# main -echo "RPL: on '$time_stamp' from '$PKG_DIR' at '$RUN_DIR'" -# Parsing arguments -if [ -z "$1" ] ; then - usage -fi - -INPUT_FILE="" -OUTPUT_DIR="-" -output="" -csv_output="" - -ARG_IN="" -while [ 1 ] ; do - ARG_IN=$1 - ARG_VAL=1 - if [ "$1" = "-h" ] ; then - usage - elif [ "$1" = "-i" ] ; then - INPUT_FILE="$2" - elif [ "$1" = "-o" ] ; then - output="$2" - elif [ "$1" = "-d" ] ; then - OUTPUT_DIR="$2" - DATA_PATH=$OUTPUT_DIR - elif [ "$1" = "-t" ] ; then - TMP_DIR="$2" - if [ "$OUTPUT_DIR" = "-" ] ; then - 
DATA_PATH=$TMP_DIR - fi - elif [ "$1" = "--list-basic" ] ; then - export ROCP_INFO=b - eval "$PKG_DIR/test/SimpleConvolution" - exit 1 - elif [ "$1" = "--list-derived" ] ; then - export ROCP_INFO=d - eval "$PKG_DIR/test/SimpleConvolution" - exit 1 - elif [ "$1" = "--basenames" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_TRUNCATE_NAMES=1 - else - export ROCP_TRUNCATE_NAMES=0 - fi - elif [ "$1" = "--timestamp" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_TRACKER_ON=1 - else - export ROCP_TRACKER_ON=0 - fi - elif [ "$1" = "--ctx-limit" ] ; then - export ROCP_OUTSTANDING_MAX="$2" - elif [ "$1" = "--heartbeat" ] ; then - export ROCP_OUTSTANDING_MON="$2" - elif [ "$1" = "--sqtt-size" ] ; then - size_m=`echo "$2" | sed -n "s/^\(.*\)M$/\1/p"` - size_k=`echo "$2" | sed -n "s/^\(.*\)K$/\1/p"` - if [ -n "$size_m" ] ; then size_b=$((size_m*1024*1024)) - elif [ -n "$size_k" ] ; then size_b=$((size_k*1024)) - else size_b=$2 - fi - export ROCP_SQTT_SIZE=$size_b - elif [ "$1" = "--sqtt-local" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_SQTT_LOCAL=1 - else - export ROCP_SQTT_LOCAL=0 - fi - elif [ "$1" = "--verbose" ] ; then - ARG_VAL=0 - export ROCP_VERBOSE_MODE=1 - else - break - fi - shift - if [ "$ARG_VAL" = 1 ] ; then shift; fi -done - -ARG_CK=`echo $ARG_IN | sed "s/^-.*$/-/"` -if [ "$ARG_CK" = "-" ] ; then - fatal "Wrong option '$ARG_IN'" -fi - -if [ -z "$INPUT_FILE" ] ; then - fatal "Need input file" -fi - -input_base=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\1/"` -input_type=`echo "$INPUT_FILE" | sed "s/^\(.*\)\.\([^\.]*\)$/\2/"` -if [ -z "${input_base}" -o -z "${input_type}" ] ; then - fatal "Bad input file '$INPUT_FILE'" -fi -input_base=`basename $input_base` - -if [ "$OUTPUT_DIR" = "--" ] ; then - fatal "Bad output dir '$OUTPUT_DIR'" -fi - -if [ -n "$output" ] ; then - if [ "$output" = "--" ] ; then - OUTPUT_DIR="--" - else - csv_output=$output - fi -else - csv_output=$RUN_DIR/${input_base}.csv -fi - -APP_CMD=$* - -echo "RPL: profiling 
'$APP_CMD'" -echo "RPL: input file '$INPUT_FILE'" - -input_list="" -RES_DIR="" -if [ "$input_type" = "xml" ] ; then - input_list=$INPUT_FILE -elif [ "$input_type" = "txt" ] ; then - OUTPUT_DIR="-" - RES_DIR=$DATA_PATH/$DATA_DIR - if [ -e $RES_DIR ] ; then - error "Rundir '$RES_DIR' exists" - fi - mkdir -p $RES_DIR - echo "RPL: output dir '$RES_DIR'" - $BIN_DIR/txt2xml.sh $INPUT_FILE $RES_DIR - input_list=`/bin/ls $RES_DIR/input*.xml` -else - fatal "Bad input file type '$INPUT_FILE'" -fi - -for name in $input_list; do - run $name $OUTPUT_DIR $APP_CMD -done - -if [ -n "$csv_output" ] ; then - python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST - if [ "$?" = 1 ] ; then - error "CSV generation error, profiling results '$RES_DIR'" - fi - echo "RPL: '$csv_output' is generated" -fi - -if [ "$DATA_PATH" = "$TMP_DIR" ] ; then - if [ -e "$RES_DIR" ] ; then - rm -rf $RES_DIR - fi -fi - -exit 0 diff --git a/script/txt2xml.sh b/script/txt2xml.sh deleted file mode 100755 index 57cb4be7..00000000 --- a/script/txt2xml.sh +++ /dev/null @@ -1,94 +0,0 @@ -################################################################################ -# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -################################################################################ - -#!/bin/bash -timestamp=`date +%y%m%d_%H%M%S` - -if [ $# = 0 ] ; then - echo "Usage: $0 [output dir]" - exit -1 -fi - -input=$1 -outdir=$2 -if [ -z "$outdir" ] ; then - outdir="." -fi - -range="" -kernel="" -gpu_index="" - -parse() { - scan="$1" - index=0 - while read -r line ; do - line=`echo $line | sed "s/\s*#.*$//"` - if [ -z "$line" ] ; then - continue - fi - - feature=`echo $line | sed -n "s/^\s*\([a-z]*\)\s*:.*$/\1/p"` - line=`echo $line | sed "s/^[^:]*:\s*//"` - line=`echo "$line" | sed -e "s/\s*=\s*/=/g" -e "s/\s*:\s*/:/g" -e "s/,\{1,\}/ /g" -e "s/\s\{1,\}/ /g" -e "s/\s*$//"` - - if [ "$scan" = 0 ] ; then - line=`echo "$line" | sed -e "s/ /,/g"` - if [ "$feature" == "range" ] ; then - range=$line - fi - if [ "$feature" == "kernel" ] ; then - kernel=$line - fi - if [ "$feature" == "gpu" ] ; then - gpu_index=$line - fi - else - output=$outdir/input${index}.xml - header="# $timestamp '$output' generated with '$0 $*'" - - if [ "$feature" == "pmc" ] ; then - line=`echo "$line" | sed -e "s/ /,/g"` - cat >> $output < - -EOF - fi - - if [ "$feature" == "sqtt" ] ; then - cat >> $output < - -EOF - fi - fi - - index=$((index + 1)) - done < $input -} - -parse 0 -parse 1 - -exit 0 diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 1f31b0d9..e41dcd0f 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -71,7 +71,7 @@ class InterceptQueue { if (status != 
HSA_STATUS_SUCCESS) EXC_ABORT(status, "ProxyQueue::Create()"); if (tracker_on || tracker_on_) { - if (tracker_ == NULL) tracker_ = new Tracker; + if (tracker_ == NULL) tracker_ = &Tracker::Instance(); status = hsa_amd_profiling_set_profiler_enabled(*queue, true); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()"); } @@ -110,7 +110,7 @@ class InterceptQueue { static hsa_status_t QueueDestroy(hsa_queue_t* queue) { std::lock_guard lck(mutex_); - hsa_status_t status = HSA_STATUS_ERROR; + hsa_status_t status = HSA_STATUS_SUCCESS; if (destroy_callback_ != NULL) { status = destroy_callback_(queue, callback_data_); @@ -147,7 +147,8 @@ class InterceptQueue { } // Prepareing dispatch callback data - uint64_t kernel_symbol = GetKernelSymbol(dispatch_packet); + const amd_kernel_code_t* kernel_code = GetKernelCode(dispatch_packet); + const uint64_t kernel_symbol = kernel_code->runtime_loader_kernel_symbol; const char* kernel_name = GetKernelName(kernel_symbol); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, @@ -157,6 +158,7 @@ class InterceptQueue { dispatch_packet, kernel_name, kernel_symbol, + kernel_code, syscall(__NR_gettid), (tracker_entry) ? 
tracker_entry->record : NULL}; @@ -177,7 +179,7 @@ class InterceptQueue { if (tracker_entry != NULL) { Group* context_group = context->GetGroup(group.index); context_group->IncrRefsCount(); - tracker_->Enable(tracker_entry, Context::Handler, reinterpret_cast(context_group)); + tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); } const pkt_vector_t& start_vector = context->StartPackets(group.index); @@ -195,7 +197,7 @@ class InterceptQueue { if (tracker_entry != NULL) { void* context_handler_arg = NULL; rocprofiler_handler_t context_handler_fun = context->GetHandler(&context_handler_arg); - tracker_->Enable(tracker_entry, context_handler_fun, context_handler_arg); + tracker_->EnableDispatch(tracker_entry, context_handler_fun, context_handler_arg); } } } @@ -239,7 +241,7 @@ class InterceptQueue { return static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); } - static uint64_t GetKernelSymbol(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + static const amd_kernel_code_t* GetKernelCode(const hsa_kernel_dispatch_packet_t* dispatch_packet) { const amd_kernel_code_t* kernel_code = NULL; hsa_status_t status = util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( @@ -248,7 +250,7 @@ class InterceptQueue { if (HSA_STATUS_SUCCESS != status) { kernel_code = reinterpret_cast(dispatch_packet->kernel_object); } - return kernel_code->runtime_loader_kernel_symbol; + return kernel_code; } static const char* GetKernelName(const uint64_t kernel_symbol) { diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 6042e59e..c3c4bd0c 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -136,12 +136,15 @@ void * tool_handle = NULL; // Load profiling tool library // Return true if intercepting mode is enabled -bool LoadTool() { - bool intercept_mode = false; +enum { + DISPATCH_INTERCEPT_MODE = 0x1 +}; +uint32_t LoadTool() { + uint32_t intercept_mode = 0; const char* 
tool_lib = getenv("ROCP_TOOL_LIB"); if (tool_lib) { - intercept_mode = true; + intercept_mode = DISPATCH_INTERCEPT_MODE; tool_handle = dlopen(tool_lib, RTLD_NOW); if (tool_handle == NULL) { @@ -164,7 +167,7 @@ bool LoadTool() { } rocprofiler_settings_t settings{}; - settings.intercept_mode = (intercept_mode) ? 1 : 0; + settings.intercept_mode = (intercept_mode != 0) ? 1 : 0; settings.sqtt_size = SqttProfile::GetSize(); settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); @@ -173,11 +176,11 @@ bool LoadTool() { if (handler) handler(); else if (handler_prop) handler_prop(&settings); - intercept_mode = (settings.intercept_mode != 0); SqttProfile::SetSize(settings.sqtt_size); SqttProfile::SetLocal(settings.sqtt_local != 0); util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); + if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; } return intercept_mode; @@ -313,6 +316,9 @@ hsa_status_t CreateQueuePro( rocprofiler_properties_t rocprofiler_properties; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; +Tracker* Tracker::instance_ = NULL; +Tracker::mutex_t Tracker::glob_mutex_; +Tracker::counter_t Tracker::counter_ = 0; util::Logger::mutex_t util::Logger::mutex_; util::Logger* util::Logger::instance_ = NULL; } @@ -355,8 +361,8 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa } // Loading a tool lib and setting of intercept mode - const bool intercept_mode_on = rocprofiler::LoadTool(); - if (intercept_mode_on) intercept_mode = true; + const uint32_t intercept_mode_mask = rocprofiler::LoadTool(); + if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) intercept_mode = true; // HSA intercepting if (intercept_mode) { @@ -371,6 +377,7 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa // HSA-runtime tool 
on-unload method PUBLIC_API void OnUnload() { + rocprofiler::Tracker::Destroy(); rocprofiler::UnloadTool(); rocprofiler::RestoreHsaApi(); } diff --git a/src/core/tracker.h b/src/core/tracker.h index ab7f3b5d..0cada86f 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -47,8 +47,10 @@ class Tracker { struct entry_t; typedef std::list sig_list_t; typedef sig_list_t::iterator sig_list_it_t; + typedef uint64_t counter_t; struct entry_t { + counter_t index; std::atomic valid; Tracker* tracker; sig_list_t::iterator it; @@ -58,22 +60,25 @@ class Tracker { record_t* record; std::atomic handler; void* arg; - bool context_active; + bool is_context; + bool is_memcopy; }; - Tracker() : - outstanding_(0), - hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) - {} + static Tracker* Create() { + std::lock_guard lck(glob_mutex_); + if (instance_ == NULL) instance_ = new Tracker; + return instance_; + } - ~Tracker() { - auto it = sig_list_.begin(); - auto end = sig_list_.end(); - while (it != end) { - auto cur = it++; - hsa_rsrc_->SignalWait((*cur)->signal); - Erase(cur); - } + static Tracker& Instance() { + if (instance_ == NULL) instance_ = Create(); + return *instance_; + } + + static void Destroy() { + std::lock_guard lck(glob_mutex_); + if (instance_ != NULL) delete instance_; + instance_ = NULL; } // Add tracker entry @@ -102,6 +107,7 @@ class Tracker { // Adding antry to the list mutex_.lock(); entry->it = sig_list_.insert(sig_list_.end(), entry); + entry->index = counter_++; mutex_.unlock(); return entry; @@ -130,20 +136,39 @@ class Tracker { } } - void Enable(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { - entry->context_active = true; + void EnableContext(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { + entry->is_context = true; + Enable(entry, reinterpret_cast(handler), arg); + } + void EnableDispatch(entry_t* entry, rocprofiler_handler_t handler, void* arg) { Enable(entry, reinterpret_cast(handler), arg); } - void Enable(entry_t* entry, 
rocprofiler_handler_t handler, void* arg) { + void EnableMemcopy(entry_t* entry, hsa_amd_signal_handler handler, void* arg) { + entry->is_memcopy = true; Enable(entry, reinterpret_cast(handler), arg); } private: + Tracker() : + outstanding_(0), + hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) + {} + + ~Tracker() { + auto it = sig_list_.begin(); + auto end = sig_list_.end(); + while (it != end) { + auto cur = it++; + hsa_rsrc_->SignalWait((*cur)->signal); + Erase(cur); + } + } + // Delete an entry by iterator void Erase(const sig_list_it_t& it) { Delete(*it); } // Entry completion - inline void Complete(entry_t* entry) { + inline void Complete(hsa_signal_value_t signal_value, entry_t* entry) { record_t* record = entry->record; // Debug trace @@ -154,12 +179,20 @@ class Tracker { } // Query begin/end and complete timestamps - hsa_amd_profiling_dispatch_time_t dispatch_time{}; - hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); + if (entry->is_memcopy) { + hsa_amd_profiling_async_copy_time_t async_copy_time{}; + hsa_status_t status = hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_async_copy_time"); + record->begin = hsa_rsrc_->SysclockToNs(async_copy_time.start); + record->end = hsa_rsrc_->SysclockToNs(async_copy_time.end); + } else { + hsa_amd_profiling_dispatch_time_t dispatch_time{}; + hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); + record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); + record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); + } - record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); - record->end = 
hsa_rsrc_->SysclockToNs(dispatch_time.end); record->complete = hsa_rsrc_->TimestampNs(); entry->valid.store(true, std::memory_order_release); @@ -171,16 +204,17 @@ class Tracker { orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; - const hsa_signal_value_t value = hsa_signal_load_relaxed(orig); - hsa_signal_store_screlease(orig, value - 1); + const hsa_signal_value_t new_value = hsa_signal_load_relaxed(orig) - 1; + if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); + hsa_signal_store_screlease(orig, signal_value); } } - inline static void HandleEntry(entry_t* entry) { + inline static void HandleEntry(hsa_signal_value_t signal_value, entry_t* entry) { // Call entry handler void* handler = static_cast(entry->handler); - if (entry->context_active) { - reinterpret_cast(handler)(0, entry->arg); + if (entry->is_context || entry->is_memcopy) { + reinterpret_cast(handler)(signal_value, entry->arg); } else { rocprofiler_group_t group{}; reinterpret_cast(handler)(group, entry->arg); @@ -190,7 +224,7 @@ class Tracker { } // Handler for packet completion - static bool Handler(hsa_signal_value_t, void* arg) { + static bool Handler(hsa_signal_value_t signal_value, void* arg) { // Acquire entry entry_t* entry = reinterpret_cast(arg); volatile std::atomic* ptr = &entry->handler; @@ -198,10 +232,10 @@ class Tracker { // Complete entry Tracker* tracker = entry->tracker; - tracker->Complete(entry); + tracker->Complete(signal_value, entry); if (ordering_enabled_ == false) { - HandleEntry(entry); + HandleEntry(signal_value, entry); } else { // Acquire last entry entry_t* back = tracker->sig_list_.back(); @@ -214,7 +248,7 @@ class Tracker { while (it != end) { entry = *(it++); if (entry->valid.load(std::memory_order_acquire)) { - HandleEntry(entry); + HandleEntry(signal_value, entry); } else { break; } @@ -225,6 +259,11 @@ class Tracker { return false; } + // instance + static Tracker* 
instance_; + static mutex_t glob_mutex_; + static counter_t counter_; + // Tracked signals list sig_list_t sig_list_; // Inter-thread synchronization @@ -235,7 +274,7 @@ class Tracker { // HSA resources factory util::HsaRsrcFactory* hsa_rsrc_; // Handling ordering enabled - static const bool ordering_enabled_ = true; + static const bool ordering_enabled_ = false; // Enable tracing static const bool trace_on_ = false; }; diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index 9997a81c..b3f3cf0d 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -135,7 +135,7 @@ class HsaTimer { sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; } - // Methids for system-clock/ns conversion + // Methods for system-clock/ns conversion timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } diff --git a/test/app/intercept_test_stand.cpp b/test/app/intercept_test_stand.cpp new file mode 100644 index 00000000..7e6298e7 --- /dev/null +++ b/test/app/intercept_test_stand.cpp @@ -0,0 +1,189 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +// Dispatch callbacks and context handlers synchronization +pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// Context stored entry type +struct context_entry_t { + bool valid; + hsa_agent_t agent; + rocprofiler_group_t group; + rocprofiler_callback_data_t data; +}; + +// Dump stored context entry +void dump_context_entry(context_entry_t* entry) { + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + + const std::string kernel_name = entry->data.kernel_name; + const rocprofiler_dispatch_record_t* record = entry->data.record; + + fflush(stdout); + fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\")", entry->data.kernel_object, 
kernel_name.c_str()); + if (record) fprintf(stdout, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)", + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(stdout, "\n"); + fflush(stdout); + + rocprofiler_group_t& group = entry->group; + if (group.context == NULL) { + fprintf(stderr, "tool error: context is NULL\n"); + abort(); + } + + rocprofiler_close(group.context); +} + +// Profiling completion handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(entry); + delete entry; + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* /*user_data*/, + rocprofiler_group_t* group) { + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Profiling context + rocprofiler_t* context = NULL; + + // Context entry + context_entry_t* entry = new context_entry_t(); + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = (void*)entry; + + // Open profiling context + status = rocprofiler_open(callback_data->agent, NULL, 0, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->agent = callback_data->agent; + entry->group = *group; + entry->data = *callback_data; + entry->data.kernel_name = strdup(callback_data->kernel_name); + reinterpret_cast*>(&entry->valid)->store(true); + + return 
HSA_STATUS_SUCCESS; +} + +int main() { + bool ret_val = false; + const char* kiter_s = getenv("ROCP_KITER"); + const char* diter_s = getenv("ROCP_DITER"); + const unsigned kiter = (kiter_s != NULL) ? atol(kiter_s) : 1; + const unsigned diter = (diter_s != NULL) ? atol(diter_s) : 1; + + // Instantiate HSA resources + HsaRsrcFactory::Create(); + + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) abort(); + + // Creating the queue + hsa_queue_t* queue = NULL; + if (HsaRsrcFactory::Instance().CreateQueue(agent_info, 128, &queue) == false) abort(); + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{}; + callbacks_ptrs.dispatch = dispatch_callback; + rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL); + + // Test initialization + TestHsa::SetQueue(queue); + TestHsa::HsaInstantiate(0); + + for (unsigned ind = 0; ind < kiter; ++ind) { + printf("Iteration %u:\n", ind); + ret_val = RunKernel(0, NULL, diter); + if (ret_val) ret_val = RunKernel(0, NULL, diter); + } + + TestHsa::HsaShutdown(); + + return (ret_val) ? 0 : 1; +} diff --git a/test/run.sh b/test/run.sh index 0a0a2f72..550ad5b1 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,6 +22,8 @@ # THE SOFTWARE. 
################################################################################ +# enable tools load failure reporting +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # paths to ROC profiler and oher libraries export LD_LIBRARY_PATH=$PWD # ROC profiler library loaded by HSA runtime diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index 9e4f24fc..fecfe7b9 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -29,7 +29,7 @@ - + @@ -65,5 +65,5 @@ - + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index 4011c131..0b53b72e 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -167,7 +167,7 @@ # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). @@ -177,7 +177,7 @@ expr=100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE > - # The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). + # ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). 
* range; }; +// kernel properties structure +struct kernel_properties_t { + uint32_t grid_size; + uint32_t workgroup_size; + uint32_t lds_size; + uint32_t scratch_size; + uint32_t vgpr_count; + uint32_t sgpr_count; + uint32_t fbarrier_count; + hsa_signal_t signal; +}; + // Context stored entry type struct context_entry_t { bool valid; @@ -79,6 +91,7 @@ struct context_entry_t { rocprofiler_feature_t* features; unsigned feature_count; rocprofiler_callback_data_t data; + kernel_properties_t kernel_properties; FILE* file_handle; }; @@ -100,7 +113,7 @@ context_array_t* context_array = NULL; // Contexts collected count volatile uint32_t context_count = 0; volatile uint32_t context_collected = 0; -// Profiling results output file name +// Profiling results output dir const char* result_prefix = NULL; // Global results file handle FILE* result_file_handle = NULL; @@ -116,6 +129,7 @@ std::vector* kernel_string_vec = NULL; // DIspatch number range filter std::vector* range_vec = NULL; // Otstanding dispatches parameters +static uint32_t CTX_OUTSTANDING_WAIT = 1; static uint32_t CTX_OUTSTANDING_MAX = 0; static uint32_t CTX_OUTSTANDING_MON = 0; // to truncate kernel names @@ -413,11 +427,20 @@ bool dump_context_entry(context_entry_t* entry) { FILE* file_handle = entry->file_handle; const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), kernel-name(\"%s\")", + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), tid(%lu), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", index, HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, entry->data.queue_id, entry->data.queue_index, + entry->data.thread_id, + entry->kernel_properties.grid_size, + entry->kernel_properties.workgroup_size, + entry->kernel_properties.lds_size, + entry->kernel_properties.scratch_size, + entry->kernel_properties.vgpr_count, + entry->kernel_properties.sgpr_count, + entry->kernel_properties.fbarrier_count, + entry->kernel_properties.signal.handle, nik_name.c_str()); if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, @@ -480,13 +503,13 @@ void dump_context_array(hsa_queue_t* queue) { } } } + } - if (pthread_mutex_unlock(&mutex) != 0) { - perror("pthread_mutex_unlock"); - abort(); - } - if (done == false) sched_yield(); + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); } + if (done == false) sched_yield(); } } @@ -563,6 +586,8 @@ bool check_filter(const rocprofiler_callback_data_t* callback_data, const callba hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) { // Passed tool data + const hsa_kernel_dispatch_packet_t* packet = callback_data->packet; + const amd_kernel_code_t* kernel_code = callback_data->kernel_code; callbacks_data_t* tool_data = reinterpret_cast(user_data); // HSA status hsa_status_t status = HSA_STATUS_ERROR; @@ -578,6 +603,21 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, rocprofiler_t* context = NULL; // Context entry context_entry_t* entry = alloc_context_entry(); + // kernel properties + 
kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties); + uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z; + if (grid_size > UINT32_MAX) abort(); + kernel_properties_ptr->grid_size = (uint32_t)grid_size; + uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; + if (workgroup_size > UINT32_MAX) abort(); + kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; + kernel_properties_ptr->lds_size = packet->group_segment_size; + kernel_properties_ptr->scratch_size = packet->private_segment_size; + kernel_properties_ptr->vgpr_count = kernel_code->reserved_vgpr_count; + kernel_properties_ptr->sgpr_count = kernel_code->reserved_sgpr_count; + kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; + kernel_properties_ptr->signal = packet->completion_signal; + // context properties rocprofiler_properties_t properties{}; properties.handler = (result_prefix != NULL) ? context_handler : NULL; @@ -660,7 +700,7 @@ static hsa_status_t info_callback(const rocprofiler_info_data_t info, void * arg return HSA_STATUS_SUCCESS; } -std::string normalize_token(const std::string token, bool not_empty, std::string label) { +std::string normalize_token(const std::string& token, bool not_empty, const std::string& label) { const std::string space_chars_set = " \t"; const size_t first_pos = token.find_first_not_of(space_chars_set); size_t norm_len = 0; @@ -676,23 +716,17 @@ std::string normalize_token(const std::string token, bool not_empty, std::string } if (((first_pos != std::string::npos) && (norm_len == 0)) || ((first_pos == std::string::npos) && not_empty)) { - fatal(label + ": " + error_str); + fatal("normalize_token error, " + label + ": '" + token + "'," + error_str); } return (norm_len != 0) ? 
token.substr(first_pos, norm_len) : std::string(""); } -int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { +int get_xml_array(const xml::Xml::level_t* node, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { int parse_iter = 0; - auto nodes = xml->GetNodes(tag); - auto rit = nodes.rbegin(); - auto rend = nodes.rend(); - while (rit != rend) { - auto& opts = (*rit)->opts; - if (opts.find(field) != opts.end()) break; - ++rit; - } - if (rit != rend) { - const std::string array_string = (*rit)->opts[field]; + const auto& opts = node->opts; + auto it = opts.find(field); + if (it != opts.end()) { + const std::string array_string = it->second; if (label != NULL) printf("%s%s = %s\n", label, field.c_str(), array_string.c_str()); size_t pos1 = 0; const size_t string_len = array_string.length(); @@ -701,14 +735,30 @@ int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& fiel const bool found = (pos2 != std::string::npos); const size_t token_len = (pos2 != std::string::npos) ? 
pos2 - pos1 : string_len - pos1; const std::string token = array_string.substr(pos1, token_len); - const std::string norm_str = normalize_token(token, found, "Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); + const std::string norm_str = normalize_token(token, found, "get_xml_array"); if (norm_str.length() != 0) vec->push_back(norm_str); if (!found) break; pos1 = pos2 + 1; ++parse_iter; } } + return parse_iter; +} +int get_xml_array(xml::Xml* xml, const std::string& tag, const std::string& field, const std::string& delim, std::vector* vec, const char* label = NULL) { + int parse_iter = 0; + const auto nodes = xml->GetNodes(tag); + auto rit = nodes.rbegin(); + const auto rend = nodes.rend(); + while (rit != rend) { + auto& opts = (*rit)->opts; + if (opts.find(field) != opts.end()) break; + ++rit; + } + if (rit != rend) { + parse_iter = get_xml_array(*rit, field, delim, vec, label); + //fatal("Tokens array parsing error, file '" + xml->GetName() + "', " + tag + "::" + field); + } return parse_iter; } @@ -765,6 +815,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (it != opts.end()) { to_truncate_names = (it->second == "on") ? 1 : 0; } it = opts.find("timestamp"); if (it != opts.end()) { settings->timestamp_on = (it->second == "on") ? 
1 : 0; } + it = opts.find("ctx-wait"); + if (it != opts.end()) { CTX_OUTSTANDING_WAIT = atol(it->second.c_str()); } it = opts.find("ctx-limit"); if (it != opts.end()) { CTX_OUTSTANDING_MAX = atol(it->second.c_str()); } it = opts.find("heartbeat"); @@ -789,6 +841,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Enable kernel names truncating check_env_var("ROCP_TRUNCATE_NAMES", to_truncate_names); // Set outstanding dispatches parameter + check_env_var("ROCP_OUTSTANDING_WAIT", CTX_OUTSTANDING_WAIT); check_env_var("ROCP_OUTSTANDING_MAX", CTX_OUTSTANDING_MAX); check_env_var("ROCP_OUTSTANDING_MON", CTX_OUTSTANDING_MON); // Enable timestamping @@ -884,10 +937,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) range_vec->push_back(*(range_vec->begin()) + 1); } - // Getting traces - auto traces_list = xml->GetNodes("top.trace"); - - const unsigned feature_count = metrics_vec.size() + traces_list.size(); + const unsigned feature_count = metrics_vec.size(); rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); @@ -901,71 +951,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } if (metrics_vec.size()) printf("\n"); - printf(" %d traces\n", (int)traces_list.size()); - unsigned index = metrics_vec.size(); - for (auto* entry : traces_list) { - auto params_list = xml->GetNodes("top.trace.parameters"); - if (params_list.size() > 1) { - fatal("ROCProfiler: Single input 'parameters' section is supported"); - } - std::string name = ""; - bool to_copy_data = false; - for (const auto& opt : entry->opts) { - if (opt.first == "name") name = opt.second; - else if (opt.first == "copy") to_copy_data = (opt.second == "true"); - else fatal("ROCProfiler: Bad trace property '" + opt.first + "'"); - } - if (name == "") fatal("ROCProfiler: Bad trace properties, name is not specified"); - - std::map 
parameters_dict; - parameters_dict["TARGET_CU"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; - parameters_dict["VM_ID_MASK"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK; - parameters_dict["MASK"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; - parameters_dict["TOKEN_MASK"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; - parameters_dict["TOKEN_MASK2"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2; -#ifdef AQLPROF_NEW_API - parameters_dict["SE_MASK"] = - HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK; -#endif - - printf(" %s (", name.c_str()); - features[index] = {}; - features[index].kind = ROCPROFILER_FEATURE_KIND_TRACE; - features[index].name = strdup(name.c_str()); - features[index].data.result_bytes.copy = to_copy_data; - - for (auto* params : params_list) { - const unsigned parameter_count = params->opts.size(); - rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; - unsigned p_index = 0; - for (auto& v : params->opts) { - const std::string parameter_name = v.first; - if (parameters_dict.find(parameter_name) == parameters_dict.end()) { - fprintf(stderr, "ROCProfiler: unknown trace parameter '%s'\n", parameter_name.c_str()); - abort(); - } - const uint32_t value = strtol(v.second.c_str(), NULL, 0); - printf("\n %s = 0x%x", parameter_name.c_str(), value); - parameters[p_index] = {}; - parameters[p_index].parameter_name = parameters_dict[parameter_name]; - parameters[p_index].value = value; - ++p_index; - } - - features[index].parameters = parameters; - features[index].parameter_count = parameter_count; - } - if (params_list.empty() == false) printf("\n "); - printf(")\n"); - fflush(stdout); - ++index; - } - fflush(stdout); + const uint32_t features_found = metrics_vec.size(); // Context array aloocation context_array = new context_array_t; @@ -977,7 +963,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) callbacks_data = new callbacks_data_t{}; 
callbacks_data->features = features; - callbacks_data->feature_count = feature_count; + callbacks_data->feature_count = features_found; callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; callbacks_data->group_index = 0; callbacks_data->file_handle = result_file_handle; @@ -1022,13 +1008,13 @@ extern "C" PUBLIC_API void OnUnloadTool() { fflush(stdout); if (result_file_opened) { printf("\nROCPRofiler:"); fflush(stdout); - dump_context_array(NULL); + if (CTX_OUTSTANDING_WAIT == 1) dump_context_array(NULL); fclose(result_file_handle); printf(" %u contexts collected, output directory %s\n", context_collected, result_prefix); } else { if (context_collected != context_count) { results_output_break(); - dump_context_array(NULL); + if (CTX_OUTSTANDING_WAIT == 1) dump_context_array(NULL); } printf("\nROCPRofiler: %u contexts collected\n", context_collected); } From e029fd1ca4d8c0c3dc18430dfdfdce786d49aafd Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 19:52:04 -0600 Subject: [PATCH 024/168] removing scripts, moved as bin --- script/tblextr.py | 118 ---------------------------------------------- 1 file changed, 118 deletions(-) delete mode 100755 script/tblextr.py diff --git a/script/tblextr.py b/script/tblextr.py deleted file mode 100755 index f6a37dc0..00000000 --- a/script/tblextr.py +++ /dev/null @@ -1,118 +0,0 @@ -################################################################################ -# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. 
-################################################################################ - -#!/usr/bin/python -import os, sys, re - -# Parsing results in the format: -#dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): -# GRBM_GUI_ACTIVE (74332) -# SQ_WAVES (4096) -# SQ_INSTS_VMEM_RD (36864) - -# global vars -var_list = ['Index', 'KernelName', 'DispatchNs', 'BeginNs', 'EndNs', 'CompleteNs'] -var_table = {} -############################################################# - -def fatal(msg): - sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); - sys.exit(1) -############################################################# - -# parse results method -def parse_res(infile): - if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found") - inp = open(infile, 'r') - - beg_pattern = re.compile("^dispatch\[(\d*)\], queue_index\(\d*\), kernel_name\(\"([^\"]*)\"\)") - ts_pattern = re.compile(", time\((\d*),(\d*),(\d*),(\d*)\)") - var_pattern = re.compile("^\s*([^\s]*)\s+\((\d*)\)") - - dispatch_number = 0 - for line in inp.readlines(): - record = line[:-1] - - m = var_pattern.match(record) - if m: - if not dispatch_number in var_table: fatal("Error: dispatch number not unique '" + str(dispatch_number) + "'") - var = m.group(1) - val = m.group(2) - var_table[dispatch_number][m.group(1)] = m.group(2) - if not var in var_list: var_list.append(var) - - m = beg_pattern.match(record) - if m: - dispatch_number = m.group(1) - if not dispatch_number in var_table: - var_table[dispatch_number] = { - 'Index': dispatch_number, - 'KernelName': "\"" + m.group(2) + "\"" - } - m = ts_pattern.search(record) - if m: - var_table[dispatch_number]['DispatchNs'] = m.group(1) - var_table[dispatch_number]['BeginNs'] = m.group(2) - var_table[dispatch_number]['EndNs'] = m.group(3) - var_table[dispatch_number]['CompleteNs'] = m.group(4) - - inp.close() 
-############################################################# - -# print results table method -def print_tbl(outfile): - global var_list - - out = open(outfile, 'w') - - keys = sorted(var_table.keys(), key=int) - - entry = var_table[keys[0]] - list1 = [] - for var in var_list: - if var in entry: - list1.append(var) - var_list = list1 - - for var in var_list: out.write(var + ',') - out.write("\n") - - for ind in keys: - entry = var_table[ind] - dispatch_number = entry['Index'] - if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") - for var in var_list: out.write(entry[var] + ',') - out.write("\n") - - out.close() -############################################################# - -# main -if (len(sys.argv) < 3): fatal("Usage: " + sys.argv[0] + " ") - -outfile = sys.argv[1] -infiles = sys.argv[2:] -for f in infiles : - parse_res(f) -print_tbl(outfile) -sys.exit(0) -############################################################# From 9e00d58891b0fc3f84eae5bfb455973123388b2d Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 19:58:09 -0600 Subject: [PATCH 025/168] readme: adding 'rocprof' hsa trace option --- README.md | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 6f02bca0..ed0d3709 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Options: --list-derived - to print the list of derived metrics with formulas -i <.txt|.xml file> - input file - Input file .txt format, automatically rerun application for every pmc/sqtt line: + Input file .txt format, automatically rerun application for every pmc line: # Perf counters group 1 pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts FetchSize @@ -107,21 +107,20 @@ Options: --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress 
heartbeats [0 - disabled] - --sqtt-size - to set SQTT buffer size, aggregate for all SE [0x2000000] - Can be set in KB (1024B) or MB (1048576) units, examples 20K or 20M respectively. - --sqtt-local - to allocate SQTT buffer in local GPU memory [on] + + --stats - generating kernel executino stats + --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing + --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: First the configuration file is looking in the current directory, then in your home, and then in the package directory. - Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'sqtt-size', 'sqtt-local'. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. An example of 'rpl_rc.xml': ``` From 3eb77389dfc22889688e2b661e5859055182b1c8 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 20:19:30 -0600 Subject: [PATCH 026/168] readme: generated files info --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed0d3709..1894c562 100644 --- a/README.md +++ b/README.md @@ -108,9 +108,12 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] - --stats - generating kernel executino stats + --stats - generating kernel executino stats, file .stats.csv --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing + Generated files: .hsa_stats.txt .json --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing + Generated files: .hip_stats.txt .json + Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:/home/evgeny: From f56e53a5c0e08c988fb7608e2b8668fb8ceed1cb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 4 Feb 2019 20:25:07 -0600 Subject: [PATCH 027/168] tracing info help --- README.md | 3 +-- bin/rocprof | 1 + bin/rpl_run.sh | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) create mode 120000 bin/rocprof diff --git a/README.md b/README.md index 1894c562..210e9c5e 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ The library source tree: ## Profiling utility usage: ``` - rpl_run.sh [-h] [--list-basic] [--list-derived] [-i ] [-o ] + rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] Options: -h - this help @@ -114,7 +114,6 @@ Options: --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing Generated files: .hip_stats.txt .json - Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: First the configuration file is looking in the current directory, then in your home, and then in the package directory. 
diff --git a/bin/rocprof b/bin/rocprof new file mode 120000 index 00000000..e3aaad4e --- /dev/null +++ b/bin/rocprof @@ -0,0 +1 @@ +rpl_run.sh \ No newline at end of file diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index adefad73..91fd6703 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -140,9 +140,11 @@ usage() { echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" echo " --heartbeat - to print progress heartbeats [0 - disabled]" echo "" - echo " --stats - generating kernel executino stats" + echo " --stats - generating kernel execution stats, file .stats.csv" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing" + echo " Generated files: .hsa_stats.txt .json" echo " --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing" + echo " Generated files: .hip_stats.txt .json" echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" From 83bc8e6a4c3b5d2fa207ca2fa4936ad05fe11ec6 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 4 Feb 2019 20:37:45 -0600 Subject: [PATCH 028/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 210e9c5e..60fd9a3a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ HW specific low-level performance analysis interface for profiling of GPU comput The library source tree: - bin - - rpl_run.sh - Profiling tool run script + - rocprof - Profiling tool run script - doc - Documentation - inc/rocprofiler.h - Library public API - src - Library sources @@ -34,7 +34,7 @@ The library source tree: cd .../rocprofiler mkdir build cd build - export CMAKE_PREFIX_PATH=/opt/rocm/lib:/opt/rocm/include/hsa + export CMAKE_PREFIX_PATH=/opt/rocm cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. 
make make install From 758ddf5d5d9d71cb34e8dcf4a69bdb98be90c477 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Feb 2019 16:05:21 -0600 Subject: [PATCH 029/168] Update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 60fd9a3a..34ef3f38 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,10 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces -The library source tree: +[Profiler default metrics](test/tool/metrics.xml) + +## Source tree: +``` - bin - rocprof - Profiling tool run script - doc - Documentation @@ -20,6 +23,7 @@ The library source tree: - ctrl - Test controll - util - Test utils - simple_convolution - Simple convolution test kernel +``` ## Build environment: ``` From 67b17cb95f1463de0844e1e9604c46faea2b9f21 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Feb 2019 16:07:02 -0600 Subject: [PATCH 030/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 34ef3f38..ea99af25 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. 
The profiling includes HW performance counters with complex performance metrics and HW traces -[Profiler default metrics](test/tool/metrics.xml) +[Profiler default metrics XML specification](test/tool/metrics.xml) ## Source tree: ``` From 61c17b282486f43586f7753aaf0f6580a1d4a196 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Feb 2019 16:08:10 -0600 Subject: [PATCH 031/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ea99af25..c77ab5c3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces -[Profiler default metrics XML specification](test/tool/metrics.xml) +[The link to profiler default metrics XML specification](test/tool/metrics.xml) ## Source tree: ``` From efdb465389bad87820194f1360c7f880516187ea Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Feb 2019 16:17:13 -0600 Subject: [PATCH 032/168] Update README.md --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c77ab5c3..edfc0e58 100644 --- a/README.md +++ b/README.md @@ -115,8 +115,14 @@ Options: --stats - generating kernel executino stats, file .stats.csv --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing Generated files: .hsa_stats.txt .json - --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing - Generated files: .hip_stats.txt .json + Traced API list can be set by input .txt or .xml files. 
+ Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: From 29fe11b4aacb5da603393ab614b52d4174aa2ff2 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 14:37:01 -0600 Subject: [PATCH 033/168] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index edfc0e58..1b01636e 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,8 @@ Options: --heartbeat - to print progress heartbeats [0 - disabled] --stats - generating kernel executino stats, file .stats.csv - --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing + --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing + --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing Generated files: .hsa_stats.txt .json Traced API list can be set by input .txt or .xml files. Input .txt: From baf874f85acc9dd8349057c09e4ed65ddb763b75 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 17:23:56 -0600 Subject: [PATCH 034/168] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1b01636e..6049fba6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. -HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces +HW specific low-level performance analysis interface for profiling of GPU compute applications. 
The profiling includes HW performance counters with complex performance metrics and HW traces. +There two usage modes for counters access, system wide sampling and accumulating per kernels. In per kernel usage mode the kernels execution is serialized. [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 4d1a2edabf3cb3bf9473ceadeafa62e782aa98a3 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 17:25:18 -0600 Subject: [PATCH 035/168] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6049fba6..385401b5 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ # ROC-profiler - +``` ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces. -There two usage modes for counters access, system wide sampling and accumulating per kernels. In per kernel usage mode the kernels execution is serialized. +There two usage modes for counters access, system wide sampling and per kernels accumulating. In per kernel usage mode the kernels execution is serialized. [The link to profiler default metrics XML specification](test/tool/metrics.xml) +``` ## Source tree: ``` From 1dd85febd918a96a200eb14c5380aeee64589af1 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 17:26:20 -0600 Subject: [PATCH 036/168] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 385401b5..839da33e 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,10 @@ ``` ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. 
-HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics and HW traces. -There two usage modes for counters access, system wide sampling and per kernels accumulating. In per kernel usage mode the kernels execution is serialized. +HW specific low-level performance analysis interface for profiling of GPU compute applications. The +profiling includes HW performance counters with complex performance metrics and HW traces. +There two usage modes for counters access, system wide sampling and per kernels accumulating. In per +kernel usage mode the kernels execution is serialized. [The link to profiler default metrics XML specification](test/tool/metrics.xml) ``` From e5982b9ffee7b56e7aeca295bc3f916e036dd35a Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 17:26:42 -0600 Subject: [PATCH 037/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 839da33e..bd7e8f36 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ HW specific low-level performance analysis interface for profiling of GPU comput profiling includes HW performance counters with complex performance metrics and HW traces. There two usage modes for counters access, system wide sampling and per kernels accumulating. In per kernel usage mode the kernels execution is serialized. 
- -[The link to profiler default metrics XML specification](test/tool/metrics.xml) ``` +[The link to profiler default metrics XML specification](test/tool/metrics.xml) + ## Source tree: ``` From d37c0f41c6d2dec537b48339844216dc57e8d12e Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:42:35 -0600 Subject: [PATCH 038/168] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 001bcbe1..a8ef7a2b 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -14,7 +14,7 @@ The library has C API and is based on AQLprofile AMD specific HSA extension. 1. The library provides methods to query the list of supported HW features. 2. The library provides profiling APIs to start, stop, read metrics results and tracing data. - 3. The library provides a callback API for collecting per-kernel profiling data for + 3. The library provides a intercepting API for collecting per-kernel profiling data for the kernels dispatched to HSA AQL queues. 4. The library provides mechanism to load profiling tool library plugin by env variable @@ -427,6 +427,7 @@ hsa_status_t rocprofiler_group_get_data( ``` The library provides a callback API for enabling profiling for the kernels dispatched to HSA AQL queues. The API enables per-kernel profiling data collection. +Currently implemented the option with serializing the kernels execution. ROC profiler callback type: From b8f8e410c030bed0b816c75a8be1eab0ade1a4c6 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:44:11 -0600 Subject: [PATCH 039/168] Update README.md --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index bd7e8f36..31e8458f 100644 --- a/README.md +++ b/README.md @@ -3,9 +3,7 @@ ROC profiler library. 
Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. HW specific low-level performance analysis interface for profiling of GPU compute applications. The -profiling includes HW performance counters with complex performance metrics and HW traces. -There two usage modes for counters access, system wide sampling and per kernels accumulating. In per -kernel usage mode the kernels execution is serialized. +profiling includes HW performance counters with complex performance metrics. ``` [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 3ad6f22ecbf17ab87090d7ea55ad4879cf0891fb Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:45:01 -0600 Subject: [PATCH 040/168] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 31e8458f..b884ad82 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. ``` + +## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From f1718391c5d670184040331d66a81df24cc65eb2 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:45:41 -0600 Subject: [PATCH 041/168] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index b884ad82..48b09814 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,8 @@ # ROC-profiler -``` ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. 
-``` ## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From f7278ac40938bcd60efaf5e69ebb92c2265c0ec1 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Feb 2019 20:46:05 -0600 Subject: [PATCH 042/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 48b09814..71b673c6 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ profiling includes HW performance counters with complex performance metrics. [The link to profiler default metrics XML specification](test/tool/metrics.xml) -## Source tree: +## Source tree ``` - bin - rocprof - Profiling tool run script From 0ae956103729aed2c1f86964e53516808a07a87e Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 12 Mar 2019 11:49:29 -0500 Subject: [PATCH 043/168] 2.2 update --- bin/rpl_run.sh | 12 +++++-- bin/run_tool.sh | 2 +- inc/rocprofiler.h | 1 + src/core/rocprofiler.cpp | 59 ++++++++++++++++++++++++++++++- test/app/intercept_test_stand.cpp | 2 +- test/run.sh | 13 ++++--- test/tool/tool.cpp | 4 +++ 7 files changed, 82 insertions(+), 11 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 91fd6703..d94ee76f 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -141,10 +141,16 @@ usage() { echo " --heartbeat - to print progress heartbeats [0 - disabled]" echo "" echo " --stats - generating kernel execution stats, file .stats.csv" - echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file viewable in chrome tracing" + echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" echo " Generated files: .hsa_stats.txt .json" - echo " --hip-trace - to trace HIP, generates API execution stats and JSON file viewable in chrome tracing" - echo " Generated files: .hip_stats.txt .json" + echo " Traced API list can be set by input .txt or .xml files." 
+ echo " Input .txt:" + echo " hsa: hsa_queue_create hsa_amd_memory_pool_allocate" + echo " Input .xml:" + echo " " + echo " " + echo " " + echo " " echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" diff --git a/bin/run_tool.sh b/bin/run_tool.sh index 5af6d1a1..5ee438c0 100755 --- a/bin/run_tool.sh +++ b/bin/run_tool.sh @@ -27,7 +27,7 @@ fi export HSA_TOOLS_REPORT_LOAD_FAILURE=1 export HSA_VEN_AMD_AQLPROFILE_LOG=1 export ROCPROFILER_LOG=1 -# to prevent internal simple proxy queue +# ROC profiler metrics config file unset ROCP_PROXY_QUEUE # ROC profiler metrics config file export ROCP_METRICS=$BIN_DIR/lib/metrics.xml diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 6aeb26af..5449204b 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -64,6 +64,7 @@ uint32_t rocprofiler_version_minor(); typedef struct { uint32_t intercept_mode; + uint32_t memcopy_tracking; uint32_t sqtt_size; uint32_t sqtt_local; uint64_t timeout; diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index c3c4bd0c..dec62c5c 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -83,6 +83,9 @@ decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacqui decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; +decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; +decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; + ::HsaApiTable* kHsaApiTable; void SaveHsaApi(::HsaApiTable* table) { @@ -137,7 +140,8 @@ void * tool_handle = NULL; // Load profiling tool library // Return true if intercepting mode is enabled enum { - DISPATCH_INTERCEPT_MODE = 0x1 + DISPATCH_INTERCEPT_MODE = 0x1, + MEMCOPY_INTERCEPT_MODE = 0x2 }; uint32_t LoadTool() { uint32_t intercept_mode = 0; @@ -181,6 +185,7 @@ 
uint32_t LoadTool() { util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; + if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; } return intercept_mode; @@ -313,6 +318,50 @@ hsa_status_t CreateQueuePro( return HSA_STATUS_SUCCESS; } +bool async_copy_handler(hsa_signal_value_t value, void* arg) { + Tracker::entry_t* entry = reinterpret_cast(arg); + printf("%lu: async-copy time(%lu,%lu)\n", entry->index, entry->record->begin, entry->record->end); + return false; +} + +hsa_status_t hsa_amd_memory_async_copy_interceptor( + void* dst, hsa_agent_t dst_agent, const void* src, + hsa_agent_t src_agent, size_t size, uint32_t num_dep_signals, + const hsa_signal_t* dep_signals, hsa_signal_t completion_signal) +{ + Tracker* tracker = &Tracker::Instance(); + Tracker::entry_t* tracker_entry = tracker->Alloc(hsa_agent_t{}, completion_signal); + hsa_status_t status = hsa_amd_memory_async_copy_fn(dst, dst_agent, src, + src_agent, size, num_dep_signals, + dep_signals, tracker_entry->signal); + if (status == HSA_STATUS_SUCCESS) { + tracker->EnableMemcopy(tracker_entry, async_copy_handler, reinterpret_cast(tracker_entry)); + } else { + tracker->Delete(tracker_entry); + } + return status; +} + +hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( + const hsa_pitched_ptr_t* dst, const hsa_dim3_t* dst_offset, const hsa_pitched_ptr_t* src, + const hsa_dim3_t* src_offset, const hsa_dim3_t* range, hsa_agent_t copy_agent, + hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal) +{ + Tracker* tracker = &Tracker::Instance(); + Tracker::entry_t* tracker_entry = tracker->Alloc(hsa_agent_t{}, completion_signal); + hsa_status_t status = hsa_amd_memory_async_copy_rect_fn(dst, dst_offset, src, + src_offset, range, copy_agent, + dir, num_dep_signals, dep_signals, + 
tracker_entry->signal); + if (status == HSA_STATUS_SUCCESS) { + tracker->EnableMemcopy(tracker_entry, async_copy_handler, reinterpret_cast(tracker_entry)); + } else { + tracker->Delete(tracker_entry); + } + return status; +} + rocprofiler_properties_t rocprofiler_properties; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; @@ -363,6 +412,14 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa // Loading a tool lib and setting of intercept mode const uint32_t intercept_mode_mask = rocprofiler::LoadTool(); if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) intercept_mode = true; + if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) { + hsa_status_t status = hsa_amd_profiling_async_copy_enable(true); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable"); + rocprofiler::hsa_amd_memory_async_copy_fn = table->amd_ext_->hsa_amd_memory_async_copy_fn; + rocprofiler::hsa_amd_memory_async_copy_rect_fn = table->amd_ext_->hsa_amd_memory_async_copy_rect_fn; + table->amd_ext_->hsa_amd_memory_async_copy_fn = rocprofiler::hsa_amd_memory_async_copy_interceptor; + table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = rocprofiler::hsa_amd_memory_async_copy_rect_interceptor; + } // HSA intercepting if (intercept_mode) { diff --git a/test/app/intercept_test_stand.cpp b/test/app/intercept_test_stand.cpp index 7e6298e7..de3dbdaf 100644 --- a/test/app/intercept_test_stand.cpp +++ b/test/app/intercept_test_stand.cpp @@ -178,7 +178,7 @@ int main() { TestHsa::HsaInstantiate(0); for (unsigned ind = 0; ind < kiter; ++ind) { - printf("Iteration %u:\n", ind); + printf("Iterastion %u:\n", ind); ret_val = RunKernel(0, NULL, diter); if (ret_val) ret_val = RunKernel(0, NULL, diter); } diff --git a/test/run.sh b/test/run.sh index 550ad5b1..580f4713 100755 --- a/test/run.sh +++ b/test/run.sh @@ -56,11 +56,6 @@ if [ ! 
-e $ROCP_TOOL_LIB ] ; then export ROCP_TOOL_LIB=test/libtool.so fi -export ROCP_KITER=1 -export ROCP_DITER=4 -export ROCP_INPUT=input1.xml -eval ./test/ctrl - export ROCP_KITER=50 export ROCP_DITER=50 export ROCP_AGENTS=1 @@ -68,6 +63,14 @@ export ROCP_THRS=1 export ROCP_INPUT=input.xml eval ./test/ctrl +# Memcopies tracking +export ROCP_MCOPY_TRACKING=1 + +export ROCP_KITER=1 +export ROCP_DITER=4 +export ROCP_INPUT=input1.xml +eval ./test/ctrl + #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 4853fd86..d96ab12c 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -834,6 +834,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } it = opts.find("sqtt-local"); if (it != opts.end()) { settings->sqtt_local = (it->second == "on"); } + it = opts.find("memcopies"); + if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); } } } // Enable verbose mode @@ -852,6 +854,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_SQTT_SIZE", settings->sqtt_size); // Set SQTT local buffer check_env_var("ROCP_SQTT_LOCAL", settings->sqtt_local); + // Set memcopies tracking + check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking); is_sqtt_local = settings->sqtt_local; From 528000626865a16e4960b55347ac34b458a6ccf2 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Mar 2019 16:15:46 -0500 Subject: [PATCH 044/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 71b673c6..3868aeac 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ Options: By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. 
--basenames - to turn on/off truncating of the kernel full function names till the base ones [off] - --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] + --timestamp - to turn on/off the kernel dispatches timestamps, dispatch/begin/end/complete [off] --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] From 391bc82dce12a2cd1d9d7e65f1ecf76737ea1a78 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 2 Apr 2019 10:34:39 -0500 Subject: [PATCH 045/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3868aeac..f7409480 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ Options: hsa: hsa_queue_create hsa_amd_memory_pool_allocate Input .xml: - + From 11535ba1e88de3e218fc9357dca806273de1bfec Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 2 Apr 2019 20:12:41 -0500 Subject: [PATCH 046/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f7409480..9ce54bbd 100644 --- a/README.md +++ b/README.md @@ -40,8 +40,8 @@ profiling includes HW performance counters with complex performance metrics. cd .../rocprofiler mkdir build cd build - export CMAKE_PREFIX_PATH=/opt/rocm - cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm .. + export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm + cmake .. 
make make install ``` From cda8b42ffdd7c5e204de338ef70a76200f157e68 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 2 Apr 2019 20:21:32 -0500 Subject: [PATCH 047/168] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9ce54bbd..c36a99c2 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,7 @@ Options: --stats - generating kernel executino stats, file .stats.csv --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing + 'HCC_HOME' env va is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing Generated files: .hsa_stats.txt .json Traced API list can be set by input .txt or .xml files. From e8cc5960f8783f6ffdb48a86f44f3603411c09c5 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 2 Apr 2019 20:21:55 -0500 Subject: [PATCH 048/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c36a99c2..711bc86b 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ Options: --stats - generating kernel executino stats, file .stats.csv --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing - 'HCC_HOME' env va is required to be set to where 'hcc' is installed. + 'HCC_HOME' env var is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing Generated files: .hsa_stats.txt .json Traced API list can be set by input .txt or .xml files. 
From cc130bd76502d0ee0d335c2b8f66b69ef7d389e9 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 2 Apr 2019 23:18:25 -0500 Subject: [PATCH 049/168] fixing standalone interception for N-GPUs --- src/core/rocprofiler.cpp | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index dec62c5c..b4ba5d4a 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -230,13 +230,12 @@ hsa_status_t GetExcStatus(const std::exception& e) { : HSA_STATUS_ERROR; } - -inline size_t CreateEnableCmd(const hsa_agent_t& agent, packet_t* command, const size_t& slot_count) { - rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); - const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); +inline size_t CreateEnableCmd(const rocprofiler::util::AgentInfo* agent_info, packet_t* command, const size_t& slot_count) { const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); const size_t packet_count = (is_legacy) ?
Profile::LEGACY_SLOT_SIZE_PKT : 1; + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count"); // AQLprofile object @@ -288,9 +287,13 @@ hsa_status_t CreateQueuePro( uint32_t group_segment_size, hsa_queue_t **queue) { - static packet_t enable_cmd_packet[Profile::LEGACY_SLOT_SIZE_PKT]; - static size_t enable_cmd_size = 0; - static std::mutex enable_cmd_mutex; + typedef std::pair cmd_entry_t; + typedef std::vector cmd_vec_t; + static cmd_vec_t cmd_vec; + static uint32_t cmd_mask = 0; + static std::mutex cmd_mutex; + + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); // Create HSA queue hsa_status_t status = hsa_queue_create_fn( @@ -305,15 +308,30 @@ hsa_status_t CreateQueuePro( if (status != HSA_STATUS_SUCCESS) return status; // Create 'Enable' cmd packet - if (enable_cmd_size == 0) { - std::lock_guard lck(enable_cmd_mutex); - if (enable_cmd_size == 0) { - enable_cmd_size = CreateEnableCmd(agent, enable_cmd_packet, Profile::LEGACY_SLOT_SIZE_PKT); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + const uint32_t dev_index = 1 << agent_info->dev_index; + const uint32_t dev_mask = 1 << dev_index; + if ((cmd_mask & dev_mask) == 0) { + std::lock_guard lck(cmd_mutex); + + if ((cmd_mask & dev_mask) == 0) { + cmd_mask |= dev_mask; + // Allocating cmd vector + uint32_t mask = 1; + while (1) { + const uint32_t max = 1 << cmd_vec.size(); + if (mask >= max) cmd_vec.push_back({}); + if (((mask & dev_mask) != 0) || (mask == 0)) break; + mask <<= 1; + } + if (mask == 0) EXC_RAISING(status, "bad device index (" << dev_index << ")"); + // Creating cmd packets + cmd_vec[dev_index].second = CreateEnableCmd(agent_info, cmd_vec[dev_index].first, Profile::LEGACY_SLOT_SIZE_PKT); } } // Enable counters for the queue - rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, 
enable_cmd_packet, enable_cmd_size); + rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, cmd_vec[dev_index].first, cmd_vec[dev_index].second); return HSA_STATUS_SUCCESS; } From f92bc03fa273f8419a92114e401aafe1407f2f91 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 4 Apr 2019 21:29:57 -0500 Subject: [PATCH 050/168] 2.3 update --- bin/build_kernel.sh | 30 ++++ bin/rpl_run.sh | 2 + bin/run_tool.sh | 18 +- bin/tblextr.py | 4 +- inc/rocprofiler.h | 68 +++++++- src/core/context.h | 308 ++++++++++++++++++--------------- src/core/context_pool.h | 193 +++++++++++++++++++++ src/core/hsa_queue.h | 26 +-- src/core/intercept_queue.h | 4 +- src/core/metrics.h | 2 +- src/core/rocprofiler.cpp | 115 ++++++++---- src/core/tracker.h | 34 ++-- src/util/hsa_rsrc_factory.cpp | 173 +++++++++++++----- src/util/hsa_rsrc_factory.h | 76 ++++++-- src/util/logger.h | 21 ++- test/CMakeLists.txt | 4 +- test/app/intercept_test.cpp | 192 ++++++++++++++++---- test/run.sh | 49 +++++- test/tool/input1.xml | 13 +- test/tool/input2.xml | 5 + test/tool/tool.cpp | 4 +- test/util/hsa_rsrc_factory.cpp | 169 +++++++++++++----- test/util/hsa_rsrc_factory.h | 68 +++++++- 23 files changed, 1200 insertions(+), 378 deletions(-) create mode 100755 bin/build_kernel.sh create mode 100644 src/core/context_pool.h create mode 100644 test/tool/input2.xml diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh new file mode 100755 index 00000000..6c4afe6f --- /dev/null +++ b/bin/build_kernel.sh @@ -0,0 +1,30 @@ +#!/bin/sh + +TEST_NAME=$1 +DST_DIR=$2 + +if [ -z "$TEST_NAME" ] ; then + echo "Usage: $0 " + echo " Will look for .cl and will build .so dynamic object library" + exit 1 +fi + +if [ -z "$DST_DIR" ] ; then + DST_DIR=$(dirname TEST_NAME) +fi + +GFXIP=$(/opt/rocm/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p") +if [ -z "$GFXIP" ] ; then + echo "GPU is not found" + exit 1 +fi + +OBJ_PREF=$(echo $GFXIP | head -c 4) +OBJ_NAME=$(echo "_$(basename 
$TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') +OBJ_FILE=${OBJ_PREF}_${OBJ_NAME}.hsaco + +/opt/rocm/opencl/bin/x86_64/clang -cl-std=CL2.0 -cl-std=CL2.0 -include /opt/rocm/opencl/include/opencl-c.h -Xclang -mlink-bitcode-file -Xclang /opt/rocm/opencl/lib/x86_64/bitcode/opencl.amdgcn.bc -Xclang -mlink-bitcode-file -Xclang /opt/rocm/opencl/lib/x86_64/bitcode/ockl.amdgcn.bc -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $OBJ_FILE + +echo "'$OBJ_FILE' is generated for '$GFXIP'" + +exit 0 diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index d94ee76f..78edf446 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -142,6 +142,7 @@ usage() { echo "" echo " --stats - generating kernel execution stats, file .stats.csv" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" + echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" echo " Generated files: .hsa_stats.txt .json" echo " Traced API list can be set by input .txt or .xml files." 
echo " Input .txt:" @@ -302,6 +303,7 @@ while [ 1 ] ; do HSA_TRACE=1 elif [ "$1" = "--hip-trace" ] ; then ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 HIP_TRACE=1 elif [ "$1" = "--verbose" ] ; then diff --git a/bin/run_tool.sh b/bin/run_tool.sh index 5ee438c0..ed1609fa 100755 --- a/bin/run_tool.sh +++ b/bin/run_tool.sh @@ -1,26 +1,27 @@ #!/bin/sh BIN_DIR=`dirname $0` -BIN_DIR=`cd $BIN_DIR; pwd` -PKG_DIR=`echo $BIN_DIR | sed "s/\/bin\/*//"` -BIN_DIR=$PKG_DIR/bin +BIN_DIR=`realpath $BIN_DIR` +PKG_DIR=${BIN_DIR%/bin} # PATH to custom HSA libs HSA_PATH=$PKG_DIR/lib/hsa if [ -z "$1" ] ; then echo "Usage: $0 " -else + exit 1 +fi + # profiler plugin library test_app=$* # paths to ROC profiler and oher libraries -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH export PATH=.:$PATH # ROC profiler library loaded by HSA runtime export HSA_TOOLS_LIB=librocprofiler64.so.1 # tool library loaded by ROC profiler -if [ -z $ROCP_TOOL_LIB ] ; then +if [ -z "$ROCP_TOOL_LIB" ] ; then export ROCP_TOOL_LIB=libintercept_test.so fi # enable error messages @@ -30,7 +31,8 @@ export ROCPROFILER_LOG=1 # ROC profiler metrics config file unset ROCP_PROXY_QUEUE # ROC profiler metrics config file -export ROCP_METRICS=$BIN_DIR/lib/metrics.xml +if [ -z "$ROCP_METRICS" ] ; then + export ROCP_METRICS=$PKG_DIR/lib/metrics.xml +fi LD_PRELOAD=$ROCP_TOOL_LIB $test_app -fi diff --git a/bin/tblextr.py b/bin/tblextr.py index 4c4cc782..329ab0d8 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -308,7 +308,9 @@ def fill_ops_db(table_name, db, indir): db.insert_entry(table_handle, rec_vals) filtr[corr_id] = 1 - if not gpu_pid in dep_dict: dep_dict[gpu_pid] = {} + if not gpu_pid in dep_dict: + dep_dict[gpu_pid] = {} + dep_dict[gpu_pid]['to'] = {} dep_dict[gpu_pid]['to'][corr_id] = int(rec_vals[0]) / 1000 dep_dict[gpu_pid]['bsp'] = OPS_PID else: fatal("async-copy bad record") diff --git a/inc/rocprofiler.h 
b/inc/rocprofiler.h index 5449204b..1e74c464 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -26,8 +26,9 @@ THE SOFTWARE. // // The goal of the implementation is to provide a HW specific low-level // performance analysis interface for profiling of GPU compute applications. -// The profiling includes HW performance counters with derived -// performance metrics. +// The profiling includes HW performance counters (PMC) with complex +// performance metrics and thread traces (SQTT). The profiling is supported +// by the SQTT, PMC and Callback APIs. // // The library can be used by a tool library loaded by HSA runtime or by // higher level HW independent performance analysis API like PAPI. @@ -46,7 +47,7 @@ THE SOFTWARE. #include #include -#define ROCPROFILER_VERSION_MAJOR 6 +#define ROCPROFILER_VERSION_MAJOR 7 #define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus @@ -219,6 +220,7 @@ typedef struct { const hsa_queue_t* queue; // HSA queue uint64_t queue_index; // Index in the queue uint32_t queue_id; // Queue id + hsa_signal_t completion_signal; // Completion signal const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet const char* kernel_name; // Kernel name uint64_t kernel_object; // Kernel object pointer @@ -381,6 +383,66 @@ hsa_status_t rocprofiler_queue_create_profiled( void* data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue); +//////////////////////////////////////////////////////////////////////////////// +// Profiling pool +// +// Support for profiling contexts pool +// The API provide capability to create a contexts pool for a given agent and a set of features, +// to fetch/relase a context entry, to register a callback for the contexts completion. 
+ +// Profiling pool handle +typedef void rocprofiler_pool_t; + +// Profiling pool entry +typedef struct { + rocprofiler_t* context; // context object + void* payload; // payload data object +} rocprofiler_pool_entry_t; + +// Profiling handler, calling on profiling completion +typedef bool (*rocprofiler_pool_handler_t)(const rocprofiler_pool_entry_t* entry, void* arg); + +// Profiling preperties +typedef struct { + uint32_t num_entries; // pool size entries + uint32_t payload_bytes; // payload size bytes + rocprofiler_pool_handler_t handler; // handler on context completion + void* handler_arg; // the handler arg +} rocprofiler_pool_properties_t; + +// Open profiling pool +hsa_status_t rocprofiler_pool_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t*); // pool properties + +// Close profiling pool +hsa_status_t rocprofiler_pool_close( + rocprofiler_pool_t* pool); // profiling pool handle + +// Fetch profiling pool entry +hsa_status_t rocprofiler_pool_fetch( + rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry); // [out] empty profiling pool entry + +// Release profiling pool entry +hsa_status_t rocprofiler_pool_release( + rocprofiler_pool_entry_t* entry); // released profiling pool entry + +// Iterate fetched profiling pool entries +hsa_status_t rocprofiler_pool_iterate( + rocprofiler_pool_t* pool, // profiling pool handle + hsa_status_t (*callback)(rocprofiler_pool_entry_t* entry, void* data), // callback + void *data); // [in/out] data passed to callback + +// Flush completed entries in profiling pool +hsa_status_t rocprofiler_pool_flush( + rocprofiler_pool_t* pool); // profiling pool handle + +//////////////////////////////////////////////////////////////////////////////// #ifdef __cplusplus } // 
extern "C" block #endif // __cplusplus diff --git a/src/core/context.h b/src/core/context.h index 6eb391a8..a59effd0 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -153,149 +153,31 @@ class Context { public: typedef std::map info_map_t; - Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, - const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) - : agent_(agent_info->dev_id), - agent_info_(agent_info), - queue_(queue), - hsa_rsrc_(&util::HsaRsrcFactory::Instance()), - api_(hsa_rsrc_->AqlProfileApi()), - metrics_(NULL), - handler_(handler), - handler_arg_(handler_arg) + static void Create(Context* obj, const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) { - if (info_count == 0) { - set_.push_back(Group(agent_info_, this, 0)); - return; - } - - metrics_ = MetricsDict::Create(agent_info); - if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); - - if (Initialize(info, info_count) == false) { - fprintf(stdout, "\nInput metrics out of HW limit. 
Proposed metrics group set:\n"); fflush(stdout); - MetricsGroupSet(agent_info, info, info_count).Print(stdout); - fprintf(stdout, "\n"); fflush(stdout); - EXC_RAISING(HSA_STATUS_ERROR, "Metrics list exceeds HW limits"); - } - Finalize(); - - if (handler != NULL) { - for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { - set_[group_index].ResetRefsCount(); - const profile_vector_t profile_vector = GetProfiles(group_index); - for (auto& tuple : profile_vector) { - // Handler for stop packet completion - hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, - &set_[group_index]); - } - } - } - } - - ~Context() { - for (const auto& v : info_map_) { - const std::string& name = v.first; - const rocprofiler_feature_t* info = v.second; - if ((info->kind == ROCPROFILER_FEATURE_KIND_METRIC) && - (metrics_map_.find(name) == metrics_map_.end())) { - delete info; - } - } + new (obj) Context(agent_info, queue, info, info_count, handler, handler_arg); + obj->Construct(agent_info, queue, info, info_count, handler, handler_arg); } - // Initialize rocprofiler context - bool Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { - // Register input features to not duplicate by features referencing - for (unsigned i = 0; i < info_count; ++i) { - rocprofiler_feature_t* info = &info_array[i]; - if (!info->name) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); - info_map_[info->name] = info; - } - - // Adding zero group, always present - if (info_count) set_.push_back(Group(agent_info_, this, 0)); - - // Processing input features - for (unsigned i = 0; i < info_count; ++i) { - rocprofiler_feature_t* info = &info_array[i]; - const rocprofiler_feature_kind_t kind = info->kind; - const char* name = info->name; - - if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { // Processing metrics features - const Metric* metric = metrics_->Get(name); - if (metric == NULL) - EXC_RAISING(HSA_STATUS_ERROR, "input 
metric '" << name << "' is not found"); -#if 0 - std::cout << " " << name << (metric->GetExpr() ? " = " + metric->GetExpr()->String() : " counter") << std::endl; -#endif + static void Release(Context* obj) { obj->Destruct(); } - auto ret = metrics_map_.insert({name, metric}); - if (!ret.second) - EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name - << "' is registered more then once"); - - counters_vec_t counters_vec = metric->GetCounters(); - if (counters_vec.empty()) - EXC_RAISING(HSA_STATUS_ERROR, "bad metric '" << name << "' is empty"); - - for (const counter_t* counter : counters_vec) { - // For metrics expressions checking that there is no the same counter in the input metrics - // and also that the counter wasn't registered already by another input metric expression - if (metric->GetExpr()) { - if (info_map_.find(counter->name) != info_map_.end()) { - continue; - } else { - info = NewCounterInfo(counter); - info_map_[info->name] = info; - } - } - - const event_t* event = &(counter->event); - const block_des_t block_des = {event->block_name, event->block_index}; - auto ret = groups_map_.insert({block_des, {}}); - block_status_t& block_status = ret.first->second; - if (block_status.max_counters == 0) { - profile_t query = {}; - query.agent = agent_; - query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; - query.events = event; - - uint32_t block_counters; - hsa_status_t status = api_->hsa_ven_amd_aqlprofile_get_info( - &query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); - if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); - block_status.max_counters = block_counters; - } - if (block_status.counter_index >= block_status.max_counters) { - return false; - - block_status.counter_index = 0; - block_status.group_index += 1; - } - block_status.counter_index += 1; - if (block_status.group_index >= set_.size()) { - set_.push_back(Group(agent_info_, this, block_status.group_index)); - } - const uint32_t group_index 
= block_status.group_index; - set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); - } - } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features - set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); - } else { - EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); - } + static Context* Create(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) + { + Context* obj = new Context(agent_info, queue, info, info_count, handler, handler_arg); + if (obj == NULL) EXC_RAISING(HSA_STATUS_ERROR, "allocation error"); + try { + obj->Construct(agent_info, queue, info, info_count, handler, handler_arg); + } catch(...) { + delete obj; + obj = NULL; + throw; } - - return true; + return obj; } - void Finalize() { - for (unsigned index = 0; index < set_.size(); ++index) { - const hsa_status_t status = set_[index].Finalize(); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); - } - } + static void Destroy(Context* obj) { if (obj != NULL) delete obj; } void Reset(const uint32_t& group_index) { set_[group_index].ResetRefsCount(); } @@ -415,6 +297,160 @@ class Context { rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } private: + Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) + : agent_(agent_info->dev_id), + agent_info_(agent_info), + queue_(queue), + hsa_rsrc_(&util::HsaRsrcFactory::Instance()), + api_(hsa_rsrc_->AqlProfileApi()), + metrics_(NULL), + handler_(handler), + handler_arg_(handler_arg) + {} + + ~Context() { Destruct(); } + + void Destruct() { + for (const auto& v : info_map_) { + const std::string& name = v.first; + const rocprofiler_feature_t* info = v.second; + if ((info->kind == 
ROCPROFILER_FEATURE_KIND_METRIC) && + (metrics_map_.find(name) == metrics_map_.end())) { + delete info; + } + } + } + + void Construct(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, + const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) + { + if (info_count == 0) { + set_.push_back(Group(agent_info_, this, 0)); + return; + } + + metrics_ = MetricsDict::Create(agent_info); + if (metrics_ == NULL) EXC_RAISING(HSA_STATUS_ERROR, "MetricsDict create failed"); + + if (Initialize(info, info_count) == false) { + fprintf(stdout, "\nInput metrics out of HW limit. Proposed metrics group set:\n"); fflush(stdout); + MetricsGroupSet(agent_info, info, info_count).Print(stdout); + fprintf(stdout, "\n"); fflush(stdout); + EXC_RAISING(HSA_STATUS_ERROR, "Metrics list exceeds HW limits"); + } + Finalize(); + + if (handler != NULL) { + for (unsigned group_index = 0; group_index < set_.size(); ++group_index) { + set_[group_index].ResetRefsCount(); + const profile_vector_t profile_vector = GetProfiles(group_index); + for (auto& tuple : profile_vector) { + // Handler for stop packet completion + hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, + &set_[group_index]); + } + } + } + } + + // Initialize rocprofiler context + bool Initialize(rocprofiler_feature_t* info_array, const uint32_t info_count) { + // Register input features to not duplicate by features referencing + for (unsigned i = 0; i < info_count; ++i) { + rocprofiler_feature_t* info = &info_array[i]; + const rocprofiler_feature_kind_t kind = info->kind; + const char* name = info->name; + if (!name) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); + info_map_[name] = info; + if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { + auto ret = metrics_map_.insert({name, NULL}); + if (!ret.second) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name + << "' is registered more then once"); + } + } + + // Adding zero group, always 
present + if (info_count) set_.push_back(Group(agent_info_, this, 0)); + + // Processing input features + for (unsigned i = 0; i < info_count; ++i) { + rocprofiler_feature_t* info = &info_array[i]; + const rocprofiler_feature_kind_t kind = info->kind; + const char* name = info->name; + + if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { // Processing metrics features + const Metric* metric = metrics_->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); +#if 0 + std::cout << " " << name << (metric->GetExpr() ? " = " + metric->GetExpr()->String() : " counter") << std::endl; +#endif + + metrics_map_[name] = metric; + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.empty()) + EXC_RAISING(HSA_STATUS_ERROR, "bad metric '" << name << "' is empty"); + + for (const counter_t* counter : counters_vec) { + // For metrics expressions checking that there is no the same counter in the input metrics + // and also that the counter wasn't registered already by another input metric expression + if (metric->GetExpr()) { + if (info_map_.find(counter->name) != info_map_.end()) { + continue; + } else { + info = NewCounterInfo(counter); + info_map_[info->name] = info; + } + } + + const event_t* event = &(counter->event); + const block_des_t block_des = {event->block_name, event->block_index}; + auto ret = groups_map_.insert({block_des, {}}); + block_status_t& block_status = ret.first->second; + if (block_status.max_counters == 0) { + profile_t query = {}; + query.agent = agent_; + query.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC; + query.events = event; + + uint32_t block_counters; + hsa_status_t status = api_->hsa_ven_amd_aqlprofile_get_info( + &query, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "get block_counters info"); + block_status.max_counters = block_counters; + } + if (block_status.counter_index >= 
block_status.max_counters) { + return false; + + block_status.counter_index = 0; + block_status.group_index += 1; + } + block_status.counter_index += 1; + if (block_status.group_index >= set_.size()) { + set_.push_back(Group(agent_info_, this, block_status.group_index)); + } + const uint32_t group_index = block_status.group_index; + set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); + } + } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features + set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + } else { + EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); + } + } + + return true; + } + + void Finalize() { + for (unsigned index = 0; index < set_.size(); ++index) { + const hsa_status_t status = set_[index].Finalize(); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); + } + } + // Getting profling packets profile_vector_t GetProfiles(const uint32_t& index) { profile_vector_t vec; diff --git a/src/core/context_pool.h b/src/core/context_pool.h new file mode 100644 index 00000000..3056cccc --- /dev/null +++ b/src/core/context_pool.h @@ -0,0 +1,193 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*******************************************************************************/ + +#ifndef SRC_CORE_CONTEXT_POOL_H_ +#define SRC_CORE_CONTEXT_POOL_H_ + +#include "inc/rocprofiler.h" + +#include + +#include "core/context.h" + +namespace rocprofiler { +class ContextPool { + public: + typedef uint64_t index_t; + typedef std::mutex mutex_t; + + struct entry_t { + ContextPool* pool; + Context* context; + std::atomic completed; + }; + + static ContextPool* Create( + uint32_t num_entries, + uint32_t payload_bytes, + const util::AgentInfo* agent_info, + rocprofiler_feature_t* info, + const uint32_t info_count, + rocprofiler_pool_handler_t handler, + void* handler_arg) + { + ContextPool* obj = new ContextPool(num_entries, payload_bytes, agent_info, info, info_count, handler, handler_arg); + if (obj == NULL) EXC_RAISING(HSA_STATUS_ERROR, "allocation error"); + return obj; + } + + static void Destroy(ContextPool* pool) { delete pool; } + + void Fetch(rocprofiler_pool_entry_t* pool_entry) { + if (constructed_ == false) { + Construct(agent_info_, info_, info_count_); + } + const index_t write_index = write_index_.fetch_add(entry_size_bytes_, std::memory_order_relaxed); + while (write_index >= (read_index_.load(std::memory_order_acquire) + array_size_bytes_)) { + check_completed(); + std::this_thread::yield(); + } + entry_t* entry = GetPoolEntry(write_index, pool_entry); + if (entry->completed.load(std::memory_order_relaxed) != false) EXC_RAISING(HSA_STATUS_ERROR, "Corrupted pool entry"); + } + + void Flush() { + 
check_completed(); + } + + private: + static unsigned aligned64(const unsigned& size) { return (size + 0x3f) & ~0x3fu; } + + static bool context_handler(rocprofiler_group_t group, void* arg) { + entry_t* entry = reinterpret_cast(arg); + entry->completed.store(true, std::memory_order_release); + entry->pool->check_completed(); + return true; + } + + ContextPool( + uint32_t num_entries, + uint32_t payload_bytes, + const util::AgentInfo* agent_info, + rocprofiler_feature_t* info, + const uint32_t info_count, + rocprofiler_pool_handler_t pool_handler, + void* pool_handler_arg + ) : + payload_off_(aligned64(sizeof(entry_t))), + entry_size_bytes_(payload_off_ + aligned64(payload_bytes)), + array_size_bytes_(entry_size_bytes_ * num_entries), + array_(NULL), + read_index_(0), + write_index_(0), + sync_flag_(false), + + agent_info_(agent_info), + info_(info), + info_count_(info_count), + pool_handler_(pool_handler), + pool_handler_arg_(pool_handler_arg), + constructed_(false) + {} + + void Construct(const util::AgentInfo* agent_info, rocprofiler_feature_t* info, const uint32_t info_count) { + std::lock_guard lck(mutex_); + + if (constructed_ == false) { + array_data_ = (char*) malloc(array_size_bytes_ + 0x3f); + array_ = reinterpret_cast(((intptr_t)array_data_ + 0x3f) >> 6 << 6); + if (((intptr_t)array_ & 0x3f) != 0) EXC_RAISING(HSA_STATUS_ERROR, "Pool array is not aligned"); + memset(array_, 0, array_size_bytes_); + + const char* end = array_ + array_size_bytes_; + for (char* ptr = array_; ptr < end; ptr += entry_size_bytes_) { + entry_t* entry = reinterpret_cast(ptr); + entry->pool = this; + entry->context = Context::Create(agent_info, NULL, info, info_count, ContextPool::context_handler, ptr); + } + + constructed_ = true; + } + } + + ~ContextPool() { + const char* end = array_ + array_size_bytes_; + for (char* ptr = array_; ptr < end; ptr += entry_size_bytes_) { + entry_t* entry = reinterpret_cast(ptr); + Context::Destroy(entry->context); + } + free(array_); + } + + 
char* GetArrayPtr(const uint32_t& index) { return array_ + (index % array_size_bytes_); } + + entry_t* GetPoolEntry(const uint32_t& index, rocprofiler_pool_entry_t* pool_entry) { + char* ptr = GetArrayPtr(index); + entry_t* entry = reinterpret_cast(ptr); + void* payload = ptr + payload_off_; + *pool_entry = rocprofiler_pool_entry_t{}; + pool_entry->context = reinterpret_cast(entry->context); + pool_entry->payload = payload; + return entry; + } + + void check_completed() { + if (sync_flag_.test_and_set(std::memory_order_acquire) == false) { + index_t read_index = read_index_.load(std::memory_order_relaxed); + const index_t write_index = write_index_.load(std::memory_order_relaxed); + while(read_index < write_index) { + rocprofiler_pool_entry_t pool_entry{}; + entry_t* entry = GetPoolEntry(read_index, &pool_entry); + if (entry->completed.load(std::memory_order_acquire) == true) { + pool_handler_(&pool_entry, pool_handler_arg_); + entry->completed.store(false, std::memory_order_relaxed); + read_index += entry_size_bytes_; + read_index_.store(read_index, std::memory_order_release); + } else { + break; + } + } + sync_flag_.clear(std::memory_order_release); + } + } + + const uint32_t payload_off_; + const uint32_t entry_size_bytes_; + const uint32_t array_size_bytes_; + char* array_data_; + char* array_; + volatile std::atomic read_index_; + volatile std::atomic write_index_; + volatile std::atomic_flag sync_flag_; + + const util::AgentInfo* agent_info_; + rocprofiler_feature_t* info_; + const uint32_t info_count_; + rocprofiler_pool_handler_t pool_handler_; + void* pool_handler_arg_; + + bool constructed_; + mutex_t mutex_; +}; +} // namespace rocprofiler + +#endif // SRC_CORE_CONTEXT_POOL_H_ diff --git a/src/core/hsa_queue.h b/src/core/hsa_queue.h index 12ef97bb..7c7d96c6 100644 --- a/src/core/hsa_queue.h +++ b/src/core/hsa_queue.h @@ -35,31 +35,7 @@ class HsaQueue : public Queue { HsaQueue(const util::AgentInfo* agent_info, hsa_queue_t* queue) : queue_(queue) {} void 
Submit(const packet_t* packet) { - // Compute the write index of queue and copy Aql packet into it - const uint64_t que_idx = hsa_queue_load_write_index_relaxed(queue_); - // Increment the write index - hsa_queue_store_write_index_relaxed(queue_, que_idx + 1); - - const uint32_t mask = queue_->size - 1; - - // Copy packet to the queue - const packet_word_t* src = reinterpret_cast(packet); - packet_t* slot = reinterpret_cast(queue_->base_address) + (que_idx & mask); - packet_word_t* dst = reinterpret_cast(slot); - const uint32_t nwords = sizeof(packet_t) / sizeof(packet_word_t); - for (unsigned i = 1; i < nwords; ++i) { - dst[i] = src[i]; - } - - // To maintain global order to ensure the prior copy of the packet contents is made visible - // before the header is updated. - // With in-order CP it will wait until the first packet in the blob will be valid - std::atomic* header_atomic_ptr = - reinterpret_cast*>(&dst[0]); - header_atomic_ptr->store(src[0], std::memory_order_release); - - // Doorbell signaling - hsa_signal_store_relaxed(queue_->doorbell_signal, que_idx); + rocprofiler::util::HsaRsrcFactory::Instance().Submit(queue_, packet); } private: diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index e41dcd0f..5a6234ab 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -72,7 +72,7 @@ class InterceptQueue { if (tracker_on || tracker_on_) { if (tracker_ == NULL) tracker_ = &Tracker::Instance(); - status = hsa_amd_profiling_set_profiler_enabled(*queue, true); + status = rocprofiler::util::HsaRsrcFactory::HsaApi()->hsa_amd_profiling_set_profiler_enabled(*queue, true); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_set_profiler_enabled()"); } @@ -138,6 +138,7 @@ class InterceptQueue { if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (dispatch_callback_ != NULL)) { const hsa_kernel_dispatch_packet_t* dispatch_packet = reinterpret_cast(packet); + const hsa_signal_t completion_signal = 
dispatch_packet->completion_signal; // Adding kernel timing tracker Tracker::entry_t* tracker_entry = NULL; @@ -155,6 +156,7 @@ class InterceptQueue { obj->queue_, user_que_idx, obj->queue_id, + completion_signal, dispatch_packet, kernel_name, kernel_symbol, diff --git a/src/core/metrics.h b/src/core/metrics.h index 46806dcf..cb55d189 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -170,7 +170,7 @@ class MetricsDict { const_iterator_t End() const { return cache_.end(); } xml::Xml::nodes_t GetNodes(const std::string& scope) const { - return xml_->GetNodes("top." + scope + ".metric"); + return (xml_ != NULL) ? xml_->GetNodes("top." + scope + ".metric") : xml::Xml::nodes_t(); } private: diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index b4ba5d4a..de16fa19 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -28,6 +28,7 @@ THE SOFTWARE. #include #include "core/context.h" +#include "core/context_pool.h" #include "core/hsa_queue.h" #include "core/intercept_queue.h" #include "core/proxy_queue.h" @@ -89,6 +90,8 @@ decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; ::HsaApiTable* kHsaApiTable; void SaveHsaApi(::HsaApiTable* table) { + util::HsaRsrcFactory::InitHsaApiTable(table); + kHsaApiTable = table; hsa_queue_create_fn = table->core_->hsa_queue_create_fn; hsa_queue_destroy_fn = table->core_->hsa_queue_destroy_fn; @@ -230,11 +233,12 @@ hsa_status_t GetExcStatus(const std::exception& e) { : HSA_STATUS_ERROR; } -inline size_t CreateEnableCmd(const rocprofiler::util::AgentInfo* agent_info, packet_t* command, const size_t& slot_count) { - const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); - const size_t packet_count = (is_legacy) ? 
Profile::LEGACY_SLOT_SIZE_PKT : 1; +inline size_t CreateEnableCmd(const hsa_agent_t& agent, packet_t* command, const size_t& slot_count) { rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); + const size_t packet_count = (is_legacy) ? Profile::LEGACY_SLOT_SIZE_PKT : 1; if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count"); @@ -287,13 +291,9 @@ hsa_status_t CreateQueuePro( uint32_t group_segment_size, hsa_queue_t **queue) { - typedef std::pair cmd_entry_t; - typedef std::vector cmd_vec_t; - static cmd_vec_t cmd_vec; - static uint32_t cmd_mask = 0; - static std::mutex cmd_mutex; - - rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + static packet_t enable_cmd_packet[Profile::LEGACY_SLOT_SIZE_PKT]; + static size_t enable_cmd_size = 0; + static std::mutex enable_cmd_mutex; // Create HSA queue hsa_status_t status = hsa_queue_create_fn( @@ -308,30 +308,15 @@ hsa_status_t CreateQueuePro( if (status != HSA_STATUS_SUCCESS) return status; // Create 'Enable' cmd packet - const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); - const uint32_t dev_index = 1 << agent_info->dev_index; - const uint32_t dev_mask = 1 << dev_index; - if ((cmd_mask & dev_mask) == 0) { - std::lock_guard lck(cmd_mutex); - - if ((cmd_mask & dev_mask) == 0) { - cmd_mask |= dev_mask; - // Allocating cmd vector - uint32_t mask = 1; - while (1) { - const uint32_t max = 1 << cmd_vec.size(); - if (mask >= max) cmd_vec.push_back({}); - if (((mask & dev_mask) != 0) || (mask == 0)) break; - mask <<= 1; - } - if (mask == 0) EXC_RAISING(status, "bad device index (" << dev_index << ")"); - // Creating cmd packets - cmd_vec[dev_index].second = CreateEnableCmd(agent_info, cmd_vec[dev_index].first, 
Profile::LEGACY_SLOT_SIZE_PKT); + if (enable_cmd_size == 0) { + std::lock_guard lck(enable_cmd_mutex); + if (enable_cmd_size == 0) { + enable_cmd_size = CreateEnableCmd(agent, enable_cmd_packet, Profile::LEGACY_SLOT_SIZE_PKT); } } // Enable counters for the queue - rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, cmd_vec[dev_index].first, cmd_vec[dev_index].second); + rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, enable_cmd_packet, enable_cmd_size); return HSA_STATUS_SUCCESS; } @@ -383,11 +368,11 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( rocprofiler_properties_t rocprofiler_properties; uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M bool SqttProfile::output_buffer_local_ = true; -Tracker* Tracker::instance_ = NULL; +std::atomic Tracker::instance_{}; Tracker::mutex_t Tracker::glob_mutex_; Tracker::counter_t Tracker::counter_ = 0; util::Logger::mutex_t util::Logger::mutex_; -util::Logger* util::Logger::instance_ = NULL; +std::atomic util::Logger::instance_{}; } /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -494,8 +479,9 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_ } } - *handle = new rocprofiler::Context(agent_info, queue, features, feature_count, properties->handler, - properties->handler_arg); + rocprofiler::Context** context_ret = reinterpret_cast(handle); + *context_ret = rocprofiler::Context::Create(agent_info, queue, features, feature_count, + properties->handler, properties->handler_arg); API_METHOD_SUFFIX } @@ -503,7 +489,7 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_ PUBLIC_API hsa_status_t rocprofiler_close(rocprofiler_t* handle) { API_METHOD_PREFIX rocprofiler::Context* context = reinterpret_cast(handle); - if (context) delete context; + if (context) rocprofiler::Context::Destroy(context); API_METHOD_SUFFIX } @@ -625,6 +611,64 @@ PUBLIC_API hsa_status_t 
rocprofiler_iterate_trace_data( API_METHOD_SUFFIX } +//////////////////////////////////////////////////////////////////////////////// +// Open profiling pool +PUBLIC_API hsa_status_t rocprofiler_pool_open(hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t* properties) // pool properties +{ + API_METHOD_PREFIX + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + if (agent_info == NULL) { + EXC_RAISING(HSA_STATUS_ERROR, "agent is not found"); + } + + rocprofiler::ContextPool* obj = rocprofiler::ContextPool::Create( + properties->num_entries, + properties->payload_bytes, + agent_info, + features, + feature_count, + properties->handler, + properties->handler_arg + ); + *pool = reinterpret_cast(obj); + API_METHOD_SUFFIX +} + +// Close profiling pool +PUBLIC_API hsa_status_t rocprofiler_pool_close(rocprofiler_pool_t* pool) // profiling pool handle +{ + API_METHOD_PREFIX + rocprofiler::ContextPool* obj = reinterpret_cast(pool); + rocprofiler::ContextPool::Destroy(obj); + API_METHOD_SUFFIX +} + +// Fetch profiling pool entry +PUBLIC_API hsa_status_t rocprofiler_pool_fetch(rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry) // [out] empty profling pool entry +{ + API_METHOD_PREFIX + rocprofiler::ContextPool* context_pool = reinterpret_cast(pool); + context_pool->Fetch(entry); + API_METHOD_SUFFIX +} + +// Fetch profiling pool entry +PUBLIC_API hsa_status_t rocprofiler_pool_flush(rocprofiler_pool_t* pool) // profiling pool handle +{ + API_METHOD_PREFIX + rocprofiler::ContextPool* context_pool = reinterpret_cast(pool); + context_pool->Flush(); + API_METHOD_SUFFIX +} + 
+//////////////////////////////////////////////////////////////////////////////// // Return the info for a given info kind PUBLIC_API hsa_status_t rocprofiler_get_info( const hsa_agent_t *agent, @@ -687,6 +731,7 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_info( info.metric.name = strdup(name.c_str()); info.metric.description = strdup(descr.c_str()); info.metric.expr = expr.empty() ? NULL : strdup(expr.c_str()); + info.metric.instances = 1; if (expr.empty()) { // Getting the block name diff --git a/src/core/tracker.h b/src/core/tracker.h index 0cada86f..ffc06b85 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -66,13 +66,19 @@ class Tracker { static Tracker* Create() { std::lock_guard lck(glob_mutex_); - if (instance_ == NULL) instance_ = new Tracker; - return instance_; + Tracker* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new Tracker; + if (obj == NULL) EXC_ABORT(HSA_STATUS_ERROR, "Tracker creation failed"); + instance_.store(obj, std::memory_order_release); + } + return obj; } static Tracker& Instance() { - if (instance_ == NULL) instance_ = Create(); - return *instance_; + Tracker* obj = instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(); + return *obj; } static void Destroy() { @@ -99,9 +105,9 @@ class Tracker { entry->record = record; // Creating a proxy signal - status = hsa_signal_create(1, 0, NULL, &(entry->signal)); + status = hsa_api_.hsa_signal_create(1, 0, NULL, &(entry->signal)); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - status = hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); // Adding antry to the list @@ -115,7 +121,7 @@ class Tracker { // Delete tracker entry void Delete(entry_t* entry) { - 
hsa_signal_destroy(entry->signal); + hsa_api_.hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); mutex_.unlock(); @@ -151,7 +157,8 @@ class Tracker { private: Tracker() : outstanding_(0), - hsa_rsrc_(&(util::HsaRsrcFactory::Instance())) + hsa_rsrc_(&(util::HsaRsrcFactory::Instance())), + hsa_api_(*(hsa_rsrc_->HsaApi())) {} ~Tracker() { @@ -181,13 +188,13 @@ class Tracker { // Query begin/end and complete timestamps if (entry->is_memcopy) { hsa_amd_profiling_async_copy_time_t async_copy_time{}; - hsa_status_t status = hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time); + hsa_status_t status = hsa_api_.hsa_amd_profiling_get_async_copy_time(entry->signal, &async_copy_time); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_async_copy_time"); record->begin = hsa_rsrc_->SysclockToNs(async_copy_time.start); record->end = hsa_rsrc_->SysclockToNs(async_copy_time.end); } else { hsa_amd_profiling_dispatch_time_t dispatch_time{}; - hsa_status_t status = hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); + hsa_status_t status = hsa_api_.hsa_amd_profiling_get_dispatch_time(entry->agent, entry->signal, &dispatch_time); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); record->begin = hsa_rsrc_->SysclockToNs(dispatch_time.start); record->end = hsa_rsrc_->SysclockToNs(dispatch_time.end); @@ -204,9 +211,9 @@ class Tracker { orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; - const hsa_signal_value_t new_value = hsa_signal_load_relaxed(orig) - 1; + const hsa_signal_value_t new_value = hsa_api_.hsa_signal_load_relaxed(orig) - 1; if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); - hsa_signal_store_screlease(orig, signal_value); + hsa_api_.hsa_signal_store_screlease(orig, signal_value); } } @@ -260,7 +267,7 @@ class Tracker { } // 
instance - static Tracker* instance_; + static std::atomic instance_; static mutex_t glob_mutex_; static counter_t counter_; @@ -273,6 +280,7 @@ class Tracker { std::atomic outstanding_; // HSA resources factory util::HsaRsrcFactory* hsa_rsrc_; + const util::hsa_pfn_t& hsa_api_; // Handling ordering enabled static const bool ordering_enabled_ = false; // Enable tracing diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 2d64bae0..a47062dd 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -77,13 +77,13 @@ static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); if (HSA_AMD_SEGMENT_GLOBAL != segment) { return HSA_STATUS_SUCCESS; } - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; @@ -117,14 +117,16 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize cpu_pool_ = NULL; kern_arg_pool_ = NULL; + InitHsaApiTable(NULL); + // Initialize the Hsa Runtime if (initialize_hsa_) { - status = hsa_init(); + status = hsa_api_.hsa_init(); CHECK_STATUS("Error in hsa_init", status); } // Discover the set of Gpu devices available on the platform - status = hsa_iterate_agents(GetHsaAgentsCallback, this); + status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", 
HSA_STATUS_ERROR); if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); @@ -134,17 +136,17 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else - status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; - status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer - timer_ = new HsaTimer; + timer_ = new HsaTimer(&hsa_api_); CHECK_STATUS("HSA timer allocation failed", (timer_ == NULL) ? 
HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); @@ -158,11 +160,95 @@ HsaRsrcFactory::~HsaRsrcFactory() { for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { - hsa_status_t status = hsa_shut_down(); + hsa_status_t status = hsa_api_.hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); } } +void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { + std::lock_guard lck(mutex_); + + if (hsa_api_.hsa_init == NULL) { + if (table != NULL) { + hsa_api_.hsa_init = table->core_->hsa_init_fn; + hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; + hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; + hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; + + hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; + hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + + hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; + hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; + hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; + hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; + hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; + hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; + + hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; + hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; + hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; + hsa_api_.hsa_executable_freeze = 
table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + + hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; + hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; + hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; + hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; + hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; + hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; + + hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn; + hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; + hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; + } else { + hsa_api_.hsa_init = hsa_init; + hsa_api_.hsa_shut_down = hsa_shut_down; + hsa_api_.hsa_agent_get_info = hsa_agent_get_info; + hsa_api_.hsa_iterate_agents = hsa_iterate_agents; + + hsa_api_.hsa_queue_create = hsa_queue_create; + hsa_api_.hsa_queue_destroy = hsa_queue_destroy; + hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; + hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + + hsa_api_.hsa_signal_create = hsa_signal_create; + hsa_api_.hsa_signal_destroy = hsa_signal_destroy; + hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; + hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed; + hsa_api_.hsa_signal_wait_scacquire = 
hsa_signal_wait_scacquire; + hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; + + hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; + hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; + hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; + hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + + hsa_api_.hsa_system_get_info = hsa_system_get_info; + hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; + hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; + hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; + hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; + hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; + + hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled; + hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; + hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; + } + } +} + hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { void* handle = dlopen(kAqlProfileLib, RTLD_NOW); if (handle == NULL) { @@ -204,7 +290,7 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { AgentInfo* agent_info = NULL; hsa_device_type_t type; - status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); CHECK_STATUS("Error Calling hsa_agent_get_info", status); if (type == HSA_DEVICE_TYPE_CPU) { @@ -213,9 +299,9 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_type = 
HSA_DEVICE_TYPE_CPU; agent_info->dev_index = cpu_list_.size(); - status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; - status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; @@ -227,28 +313,28 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info = new AgentInfo{}; agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_GPU; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); strncpy(agent_info->gfxip, agent_info->name, 4); agent_info->gfxip[4] = '\0'; - hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); - hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); - hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? 
true : false; - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); - hsa_agent_get_info(agent, + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); agent_info->cpu_pool = {}; agent_info->kern_arg_pool = {}; - status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); // Set GPU index @@ -339,7 +425,7 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; - status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, queue); return (status == HSA_STATUS_SUCCESS); } @@ -350,7 +436,7 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, // @return bool true if successful, false otherwise bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* 
signal) { hsa_status_t status; - status = hsa_signal_create(value, 0, NULL, signal); + status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); return (status == HSA_STATUS_SUCCESS); } @@ -363,7 +449,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } @@ -378,11 +464,11 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? 
buffer : NULL; @@ -398,11 +484,11 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; @@ -426,7 +512,7 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { while (1) { const hsa_signal_value_t signal_value = - hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); if (signal_value == 0) { break; } else { @@ -439,7 +525,7 @@ void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { SignalWait(signal); - hsa_signal_store_relaxed(const_cast(signal), signal_value); + hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } // Copy data from GPU to host memory @@ -447,12 +533,12 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; - status = hsa_signal_create(1, 0, NULL, &s); + status = hsa_api_.hsa_signal_create(1, 0, NULL, &s); CHECK_STATUS("hsa_signal_create()", status); - status = 
hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); SignalWait(s); - status = hsa_signal_destroy(s); + status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); @@ -494,29 +580,29 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Create code object reader hsa_code_object_reader_t code_obj_rdr = {0}; - status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); if (status != HSA_STATUS_SUCCESS) { std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; return false; } // Create executable. - status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + status = hsa_api_.hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. - status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, + status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, NULL, NULL); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. - status = hsa_executable_freeze(*executable, ""); + status = hsa_api_.hsa_executable_freeze(*executable, ""); CHECK_STATUS("Error in freezing executable object", status); // Get symbol handle. 
hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, + status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); @@ -554,9 +640,9 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue - const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); - hsa_queue_store_write_index_relaxed(queue, write_idx + 1); - while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { + const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue); + hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { sched_yield(); } @@ -573,7 +659,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { header_atomic_ptr->store(slot_data[0], std::memory_order_release); // ringdoor bell - hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); + hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); return write_idx; } @@ -595,9 +681,10 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; +std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; +hsa_pfn_t HsaRsrcFactory::hsa_api_{}; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index b3f3cf0d..f982ddde 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -26,6 +26,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#define SRC_UTIL_HSA_RSRC_FACTORY_H_ #include +#include #include #include #include @@ -35,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -69,6 +71,46 @@ static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; +struct hsa_pfn_t { + decltype(hsa_init)* hsa_init; + decltype(hsa_shut_down)* hsa_shut_down; + decltype(hsa_agent_get_info)* hsa_agent_get_info; + decltype(hsa_iterate_agents)* hsa_iterate_agents; + + decltype(hsa_queue_create)* hsa_queue_create; + decltype(hsa_queue_destroy)* hsa_queue_destroy; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + + decltype(hsa_signal_create)* hsa_signal_create; + decltype(hsa_signal_destroy)* hsa_signal_destroy; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; + + decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt; + decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; + decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + + decltype(hsa_system_get_info)* hsa_system_get_info; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; + decltype(hsa_amd_memory_pool_allocate)* 
hsa_amd_memory_pool_allocate; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; + + decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled; + decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time; +}; + // Encapsulates information about a Hsa Agent such as its // handle, name, max queue size, max wavefront size, etc. struct AgentInfo { @@ -128,9 +170,9 @@ class HsaTimer { static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; - HsaTimer() { + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { timestamp_t sysclock_hz = 0; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; } @@ -146,7 +188,7 @@ class HsaTimer { // Return timestamp in 'ns' timestamp_t timestamp_ns() const { timestamp_t sysclock; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); return sysclock_to_ns(sysclock); } @@ -154,6 +196,8 @@ class HsaTimer { private: // Timestamp frequency factor freq_t sysclock_factor_; + // HSA API table + const hsa_pfn_t* const hsa_api_; }; class HsaRsrcFactory { @@ -164,17 +208,20 @@ class HsaRsrcFactory { static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); - if (instance_ == NULL) { - instance_ = new 
HsaRsrcFactory(initialize_hsa); + HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new HsaRsrcFactory(initialize_hsa); + instance_.store(obj, std::memory_order_release); } - return instance_; + return obj; } static HsaRsrcFactory& Instance() { - if (instance_ == NULL) instance_ = Create(false); - hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + HsaRsrcFactory* obj = instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(false); + hsa_status_t status = (obj != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); - return *instance_; + return *obj; } static void Destroy() { @@ -276,6 +323,10 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + // Initialize HSA API table + void static InitHsaApiTable(HsaApiTable* table); + static const hsa_pfn_t* HsaApi() { return &hsa_api_; } + // Return AqlProfile API table typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } @@ -293,7 +344,7 @@ class HsaRsrcFactory { static void SetTimeoutNs(const timestamp_t& time) { std::lock_guard lck(mutex_); timeout_ns_ = time; - if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } private: @@ -322,7 +373,7 @@ class HsaRsrcFactory { // HSA was initialized const bool initialize_hsa_; - static HsaRsrcFactory* instance_; + static std::atomic instance_; static mutex_t mutex_; // Used to maintain a list of Hsa Gpu Agent Info @@ -336,6 +387,9 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; + // HSA runtime API table + static hsa_pfn_t hsa_api_; + // AqlProfile API table aqlprofile_pfn_t aqlprofile_api_; diff 
--git a/src/util/logger.h b/src/util/logger.h index d37f6567..527589f6 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -76,8 +76,16 @@ class Logger { static Logger* Create() { std::lock_guard lck(mutex_); - if (instance_ == NULL) instance_ = new Logger(); - return instance_; + Logger* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new Logger(); + if (obj == NULL) { + std::cerr << "ROCProfiler: log object creation failed" << std::endl << std::flush; + abort(); + } + instance_.store(obj, std::memory_order_release); + } + return obj; } static void Destroy() { @@ -87,8 +95,9 @@ class Logger { } static Logger& Instance() { - Create(); - return *instance_; + Logger* obj = instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(); + return *obj; } private: @@ -179,10 +188,10 @@ class Logger { bool messaging_; bool error_; std::string session_dir_; + std::map message_; static mutex_t mutex_; - static Logger* instance_; - std::map message_; + static std::atomic instance_; }; } // namespace util diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c7d86ccf..7f128e86 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -57,12 +57,12 @@ set ( CTRL_SRC ## Dummy kernel set ( DUMMY_NAME dummy_kernel ) -execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${DUMMY_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) +execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME} ${PROJECT_BINARY_DIR}" ) ## Test kernel set ( TEST_NAME simple_convolution ) set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) -execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/${TEST_NAME}/*.hsaco ${PROJECT_BINARY_DIR}" ) +execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${TEST_NAME}/${TEST_NAME} ${PROJECT_BINARY_DIR}" ) ## Building standalone test executable add_executable ( ${STEXE_NAME} ${STTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) diff --git 
a/test/app/intercept_test.cpp b/test/app/intercept_test.cpp index 87e00d64..876b3102 100644 --- a/test/app/intercept_test.cpp +++ b/test/app/intercept_test.cpp @@ -25,9 +25,10 @@ THE SOFTWARE. #include #include +#include #include +#include #include -#include #include "ctrl/run_kernel.h" #include "ctrl/test_aql.h" @@ -36,6 +37,7 @@ THE SOFTWARE. #include "dummy_kernel/dummy_kernel.h" #include "simple_convolution/simple_convolution.h" #include "util/test_assert.h" +#include "util/xml.h" #define PUBLIC_API __attribute__((visibility("default"))) #define CONSTRUCTOR_API __attribute__((constructor)) @@ -45,6 +47,9 @@ THE SOFTWARE. pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; // Tool is unloaded volatile bool is_loaded = false; +// Profiling features +//rocprofiler_feature_t* features = NULL; +//unsigned feature_count = 0; // Error handler void fatal(const std::string msg) { @@ -72,8 +77,19 @@ struct context_entry_t { rocprofiler_callback_data_t data; }; +// Context callback arg +struct callbacks_arg_t { + rocprofiler_pool_t** pools; +}; + +// Handler callback arg +struct handler_arg_t { + rocprofiler_feature_t* features; + unsigned feature_count; +}; + // Dump stored context entry -void dump_context_entry(context_entry_t* entry) { +void dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features, unsigned feature_count) { volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); while (valid->load() == false) sched_yield(); @@ -97,26 +113,44 @@ void dump_context_entry(context_entry_t* entry) { rocprofiler_group_t& group = entry->group; if (group.context == NULL) { - fprintf(stderr, "tool error: context is NULL\n"); - abort(); + fatal("context is NULL\n"); + } + if (feature_count > 0) { + hsa_status_t status = rocprofiler_group_get_data(&group); + check_status(status); + status = rocprofiler_get_metrics(group.context); + check_status(status); } - rocprofiler_close(group.context); + for (unsigned i = 0; i < feature_count; ++i) { + 
const rocprofiler_feature_t* p = &features[i]; + fprintf(stdout, "> %s ", p->name); + switch (p->data.kind) { + // Output metrics results + case ROCPROFILER_DATA_KIND_INT64: + fprintf(stdout, "= (%lu)\n", p->data.result_int64); + break; + default: + fprintf(stderr, "Undefined data kind(%u)\n", p->data.kind); + abort(); + } + } } // Profiling completion handler // Dump and delete the context entry // Return true if the context was dumped successfully -bool context_handler(rocprofiler_group_t group, void* arg) { - context_entry_t* entry = reinterpret_cast(arg); +bool context_handler(const rocprofiler_pool_entry_t* entry, void* arg) { + // Context entry + context_entry_t* ctx_entry = reinterpret_cast(entry->payload); + handler_arg_t* handler_arg = reinterpret_cast(arg); if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); } - dump_context_entry(entry); - delete entry; + dump_context_entry(ctx_entry, handler_arg->features, handler_arg->feature_count); if (pthread_mutex_unlock(&mutex) != 0) { perror("pthread_mutex_unlock"); @@ -125,35 +159,65 @@ bool context_handler(rocprofiler_group_t group, void* arg) { return false; } +#if 0 +// Profiling completion handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler1(rocprofiler_group_t group, void* arg) { + context_entry_t* ctx_entry = reinterpret_cast(arg); + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(ctx_entry, features, feature_count); + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} +#endif // Kernel disoatch callback -hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* /*user_data*/, +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* arg, rocprofiler_group_t* group) { + // Passed tool data + hsa_agent_t agent = callback_data->agent; 
// HSA status hsa_status_t status = HSA_STATUS_ERROR; - // Profiling context - rocprofiler_t* context = NULL; - - // Context entry - context_entry_t* entry = new context_entry_t(); - +#if 1 + // Open profiling context + const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; + callbacks_arg_t* callbacks_arg = reinterpret_cast(arg); + rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id]; + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + check_status(status); + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast(pool_entry.payload); +#else + // Open profiling context // context properties + context_entry_t* entry = new context_entry_t{}; + rocprofiler_t* context = NULL; rocprofiler_properties_t properties{}; - properties.handler = context_handler; + properties.handler = context_handler1; properties.handler_arg = (void*)entry; - - // Open profiling context - status = rocprofiler_open(callback_data->agent, NULL, 0, + status = rocprofiler_open(agent, features, feature_count, &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); check_status(status); - +#endif // Get group[0] status = rocprofiler_get_group(context, 0, group); check_status(status); // Fill profiling context entry - entry->agent = callback_data->agent; + entry->agent = agent; entry->group = *group; entry->data = *callback_data; entry->data.kernel_name = strdup(callback_data->kernel_name); @@ -162,26 +226,90 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, return HSA_STATUS_SUCCESS; } +unsigned metrics_input(rocprofiler_feature_t** ret) { + // Profiling feature objects + const unsigned feature_count = 9; + rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; + memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); + + // PMC events + features[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + 
features[0].name = "GRBM_COUNT"; + features[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[1].name = "GRBM_GUI_ACTIVE"; + features[2].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[2].name = "GPUBusy"; + features[3].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[3].name = "SQ_WAVES"; + features[4].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[4].name = "SQ_INSTS_VALU"; + features[5].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[5].name = "VALUInsts"; + features[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[6].name = "TCC_HIT_sum"; + features[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[7].name = "TCC_MISS_sum"; + features[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; + features[8].name = "WRITE_SIZE"; + + *ret = features; + return feature_count; +} + void initialize() { - // Getting GPU device info - const AgentInfo* agent_info = NULL; - if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) { - fprintf(stderr, "GetGpuAgentInfo failed\n"); - abort(); - } + // Available GPU agents + const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents(); + + // Getting profiling features + rocprofiler_feature_t* features = NULL; + unsigned feature_count = metrics_input(&features); + + // Handler arg + handler_arg_t* handler_arg = new handler_arg_t{}; + handler_arg->features = features; + handler_arg->feature_count = feature_count; + + // Context properties + rocprofiler_pool_properties_t properties{}; + properties.num_entries = 100; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_handler; + properties.handler_arg = handler_arg; // Adding dispatch observer + callbacks_arg_t* callbacks_arg = new callbacks_arg_t{}; + callbacks_arg->pools = new rocprofiler_pool_t* [gpu_count]; + for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == 
false) { + fprintf(stderr, "GetGpuAgentInfo failed\n"); + abort(); + } + + // Open profiling pool + rocprofiler_pool_t* pool = NULL; + hsa_status_t status = rocprofiler_pool_open(agent_info->dev_id, features, feature_count, + &pool, 0/*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + callbacks_arg->pools[gpu_id] = pool; + } + rocprofiler_queue_callbacks_t callbacks_ptrs{}; callbacks_ptrs.dispatch = dispatch_callback; - rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL); + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg); } void cleanup() { // Unregister dispatch callback rocprofiler_remove_queue_callbacks(); - - // Dump stored profiling output data - fflush(stdout); + // CLose profiling pool +#if 0 + hsa_status_t status = rocprofiler_pool_flush(pool); + check_status(status); + status = rocprofiler_pool_close(pool); + check_status(status); +#endif } // Tool constructor diff --git a/test/run.sh b/test/run.sh index 580f4713..ed5bbe9a 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,6 +22,23 @@ # THE SOFTWARE. ################################################################################ +# test check routin +test_status=0 +test_number=0 +eval_test() { + label=$1 + cmdline=$2 + echo "$label: \"$cmdline\"" + eval "$cmdline" + if [ $? 
!= 0 ] ; then + echo "$label: FAILED" + test_status=$(($test_status + 1)) + else + echo "$label: PASSED" + fi + test_number=$(($test_number + 1)) +} + # enable tools load failure reporting export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # paths to ROC profiler and oher libraries @@ -37,12 +54,22 @@ export ROCP_METRICS=metrics.xml # test trace export ROC_TEST_TRACE=1 +## Intercepting usage model test + # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so -../bin/run_tool.sh ./test/ctrl +export ROCP_KITER=50 +export ROCP_DITER=50 +export ROCP_AGENTS=1 +export ROCP_THRS=1 +eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl" + +## Standalone sampling usage model test unset ROCP_TOOL_LIB -eval ./test/standalone_test +eval_test "Standalone sampling usage model test" ./test/standalone_test + +## Libtool test # tool library loaded by ROC profiler export ROCP_TOOL_LIB=libtool.so @@ -61,18 +88,28 @@ export ROCP_DITER=50 export ROCP_AGENTS=1 export ROCP_THRS=1 export ROCP_INPUT=input.xml -eval ./test/ctrl +eval_test "'rocprof' libtool test" ./test/ctrl + +export ROCP_KITER=10 +export ROCP_DITER=10 +export ROCP_AGENTS=1 +export ROCP_THRS=10 +export ROCP_INPUT=input1.xml +eval_test "'rocprof' libtool test n-threads" ./test/ctrl + +## Libtool test, counter sets # Memcopies tracking export ROCP_MCOPY_TRACKING=1 export ROCP_KITER=1 export ROCP_DITER=4 -export ROCP_INPUT=input1.xml -eval ./test/ctrl +export ROCP_INPUT=input2.xml +eval_test "libtool test, counter sets" ./test/ctrl #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. 
-exit 0 +echo "$test_number tests total / $test_status tests failed" +exit $test_status diff --git a/test/tool/input1.xml b/test/tool/input1.xml index 254c83dc..f4ecd178 100644 --- a/test/tool/input1.xml +++ b/test/tool/input1.xml @@ -1,5 +1,14 @@ +# Filter by dispatches range, GPU index and kernel names + + # List of metrics diff --git a/test/tool/input2.xml b/test/tool/input2.xml new file mode 100644 index 00000000..254c83dc --- /dev/null +++ b/test/tool/input2.xml @@ -0,0 +1,5 @@ +# List of metrics + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index d96ab12c..0eee2348 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -309,7 +309,7 @@ struct trace_data_arg_t { hsa_agent_t agent; }; -// Trace data callback for getting trace data from GPU local mamory +// Trace data callback for getting trace data from GPU local memory hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; @@ -616,7 +616,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, kernel_properties_ptr->vgpr_count = kernel_code->reserved_vgpr_count; kernel_properties_ptr->sgpr_count = kernel_code->reserved_sgpr_count; kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; - kernel_properties_ptr->signal = packet->completion_signal; + kernel_properties_ptr->signal = callback_data->completion_signal; // context properties rocprofiler_properties_t properties{}; diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 0293c6c4..35568ba0 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -71,13 +71,13 @@ static hsa_status_t FindGlobalPool(hsa_amd_memory_pool_t pool, void* data, bool return HSA_STATUS_ERROR_INVALID_ARGUMENT; } - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); + err = 
HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); if (HSA_AMD_SEGMENT_GLOBAL != segment) { return HSA_STATUS_SUCCESS; } - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); + err = HsaRsrcFactory::HsaApi()->hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); CHECK_STATUS("hsa_amd_memory_pool_get_info", err); uint32_t karg_st = flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT; @@ -111,14 +111,16 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize cpu_pool_ = NULL; kern_arg_pool_ = NULL; + InitHsaApiTable(NULL); + // Initialize the Hsa Runtime if (initialize_hsa_) { - status = hsa_init(); + status = hsa_api_.hsa_init(); CHECK_STATUS("Error in hsa_init", status); } // Discover the set of Gpu devices available on the platform - status = hsa_iterate_agents(GetHsaAgentsCallback, this); + status = hsa_api_.hsa_iterate_agents(GetHsaAgentsCallback, this); CHECK_STATUS("Error Calling hsa_iterate_agents", status); if (cpu_pool_ == NULL) CHECK_STATUS("CPU memory pool is not found", HSA_STATUS_ERROR); if (kern_arg_pool_ == NULL) CHECK_STATUS("Kern-arg memory pool is not found", HSA_STATUS_ERROR); @@ -128,13 +130,13 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize #ifdef ROCP_LD_AQLPROFILE status = LoadAqlProfileLib(&aqlprofile_api_); #else - status = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_AQLPROFILE, hsa_ven_amd_aqlprofile_VERSION_MAJOR, sizeof(aqlprofile_api_), &aqlprofile_api_); #endif CHECK_STATUS("aqlprofile API table load failed", status); // Get Loader API table loader_api_ = {0}; - status = 
hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); + status = hsa_api_.hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, sizeof(loader_api_), &loader_api_); CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer @@ -152,11 +154,93 @@ HsaRsrcFactory::~HsaRsrcFactory() { for (auto p : cpu_list_) delete p; for (auto p : gpu_list_) delete p; if (initialize_hsa_) { - hsa_status_t status = hsa_shut_down(); + hsa_status_t status = hsa_api_.hsa_shut_down(); CHECK_STATUS("Error in hsa_shut_down", status); } } +void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { + std::lock_guard lck(mutex_); + + if (hsa_api_.hsa_init == NULL) { + if (table != NULL) { + hsa_api_.hsa_init = table->core_->hsa_init_fn; + hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; + hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; + + hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; + + hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; + hsa_api_.hsa_queue_destroy = table->core_->hsa_queue_destroy_fn; + hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; + hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; + hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; + hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; + hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; + hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; + hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; + hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; + + hsa_api_.hsa_system_get_major_extension_table = 
table->core_->hsa_system_get_major_extension_table_fn; + + hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; + hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; + hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; + hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; + hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; + hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; + hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; + + hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; + + hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; + hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; + hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; + } else { + hsa_api_.hsa_init = hsa_init; + hsa_api_.hsa_shut_down = hsa_shut_down; + hsa_api_.hsa_agent_get_info = hsa_agent_get_info; + + hsa_api_.hsa_iterate_agents = hsa_iterate_agents; + + hsa_api_.hsa_queue_create = hsa_queue_create; + hsa_api_.hsa_queue_destroy = hsa_queue_destroy; + hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; + hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; + hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + hsa_api_.hsa_signal_create = hsa_signal_create; + hsa_api_.hsa_signal_destroy = hsa_signal_destroy; + hsa_api_.hsa_signal_store_relaxed = 
hsa_signal_store_relaxed; + hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; + hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; + hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; + hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; + + hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; + + hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + + hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; + hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; + hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; + hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + + hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; + hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; + hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; + hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; + hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; + } + } +} + hsa_status_t HsaRsrcFactory::LoadAqlProfileLib(aqlprofile_pfn_t* api) { void* handle = dlopen(kAqlProfileLib, RTLD_NOW); if (handle == NULL) { @@ -198,7 +282,7 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { AgentInfo* agent_info = NULL; hsa_device_type_t type; - status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); + status = hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type); CHECK_STATUS("Error Calling hsa_agent_get_info", status); if (type == HSA_DEVICE_TYPE_CPU) { @@ -207,9 +291,9 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->dev_type = HSA_DEVICE_TYPE_CPU; 
agent_info->dev_index = cpu_list_.size(); - status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->cpu_pool); if ((status == HSA_STATUS_INFO_BREAK) && (cpu_pool_ == NULL)) cpu_pool_ = &agent_info->cpu_pool; - status = hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindKernArgPool, &agent_info->kern_arg_pool); if ((status == HSA_STATUS_INFO_BREAK) && (kern_arg_pool_ == NULL)) kern_arg_pool_ = &agent_info->kern_arg_pool; agent_info->gpu_pool = {}; @@ -221,28 +305,28 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info = new AgentInfo{}; agent_info->dev_id = agent; agent_info->dev_type = HSA_DEVICE_TYPE_GPU; - hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_NAME, agent_info->name); strncpy(agent_info->gfxip, agent_info->name, 4); agent_info->gfxip[4] = '\0'; - hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); - hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); - hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &agent_info->max_wave_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &agent_info->max_queue_size); + hsa_api_.hsa_agent_get_info(agent, HSA_AGENT_INFO_PROFILE, &agent_info->profile); agent_info->is_apu = (agent_info->profile == HSA_PROFILE_FULL) ? 
true : false; - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT), &agent_info->cu_num); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU), &agent_info->waves_per_cu); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU), &agent_info->simds_per_cu); - hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ENGINES), &agent_info->se_num); - hsa_agent_get_info(agent, + hsa_api_.hsa_agent_get_info(agent, static_cast(HSA_AMD_AGENT_INFO_NUM_SHADER_ARRAYS_PER_SE), &agent_info->shader_arrays_per_se); agent_info->cpu_pool = {}; agent_info->kern_arg_pool = {}; - status = hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); + status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); // Set GPU index @@ -333,7 +417,7 @@ bool HsaRsrcFactory::GetCpuAgentInfo(uint32_t idx, const AgentInfo** agent_info) bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, hsa_queue_t** queue) { hsa_status_t status; - status = hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, + status = hsa_api_.hsa_queue_create(agent_info->dev_id, num_pkts, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, queue); return (status == HSA_STATUS_SUCCESS); } @@ -344,7 +428,7 @@ bool HsaRsrcFactory::CreateQueue(const AgentInfo* agent_info, uint32_t num_pkts, // @return bool true if successful, false otherwise bool HsaRsrcFactory::CreateSignal(uint32_t value, hsa_signal_t* 
signal) { hsa_status_t status; - status = hsa_signal_create(value, 0, NULL, signal); + status = hsa_api_.hsa_signal_create(value, 0, NULL, signal); return (status == HSA_STATUS_SUCCESS); } @@ -357,7 +441,7 @@ uint8_t* HsaRsrcFactory::AllocateLocalMemory(const AgentInfo* agent_info, size_t hsa_status_t status = HSA_STATUS_ERROR; uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(agent_info->gpu_pool, size, 0, reinterpret_cast(&buffer)); uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; return ptr; } @@ -372,11 +456,11 @@ uint8_t* HsaRsrcFactory::AllocateKernArgMemory(const AgentInfo* agent_info, size uint8_t* buffer = NULL; if (!cpu_agents_.empty()) { size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; - status = hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(*kern_arg_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the kernel arguments if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? 
buffer : NULL; @@ -392,11 +476,11 @@ uint8_t* HsaRsrcFactory::AllocateSysMemory(const AgentInfo* agent_info, size_t s uint8_t* buffer = NULL; size = (size + MEM_PAGE_MASK) & ~MEM_PAGE_MASK; if (!cpu_agents_.empty()) { - status = hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); + status = hsa_api_.hsa_amd_memory_pool_allocate(*cpu_pool_, size, 0, reinterpret_cast(&buffer)); // Both the CPU and GPU can access the memory if (status == HSA_STATUS_SUCCESS) { hsa_agent_t ag_list[1] = {agent_info->dev_id}; - status = hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); + status = hsa_api_.hsa_amd_agents_allow_access(1, ag_list, NULL, buffer); } } uint8_t* ptr = (status == HSA_STATUS_SUCCESS) ? buffer : NULL; @@ -420,7 +504,7 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { while (1) { const hsa_signal_value_t signal_value = - hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); if (signal_value == 0) { break; } else { @@ -432,7 +516,7 @@ void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { SignalWait(signal); - hsa_signal_store_relaxed(const_cast(signal), signal_value); + hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } // Copy data from GPU to host memory @@ -440,12 +524,12 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src hsa_status_t status = HSA_STATUS_ERROR; if (!cpu_agents_.empty()) { hsa_signal_t s = {}; - status = hsa_signal_create(1, 0, NULL, &s); + status = hsa_api_.hsa_signal_create(1, 0, NULL, &s); CHECK_STATUS("hsa_signal_create()", status); - status = 
hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); + status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); SignalWait(s); - status = hsa_signal_destroy(s); + status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } return (status == HSA_STATUS_SUCCESS); @@ -487,29 +571,29 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br // Create code object reader hsa_code_object_reader_t code_obj_rdr = {0}; - status = hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); + status = hsa_api_.hsa_code_object_reader_create_from_file(file_handle, &code_obj_rdr); if (status != HSA_STATUS_SUCCESS) { std::cerr << "Failed to create code object reader '" << filename << "'" << std::endl; return false; } // Create executable. - status = hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, + status = hsa_api_.hsa_executable_create_alt(HSA_PROFILE_FULL, HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, executable); CHECK_STATUS("Error in creating executable object", status); // Load code object. - status = hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, + status = hsa_api_.hsa_executable_load_agent_code_object(*executable, agent_info->dev_id, code_obj_rdr, NULL, NULL); CHECK_STATUS("Error in loading executable object", status); // Freeze executable. - status = hsa_executable_freeze(*executable, ""); + status = hsa_api_.hsa_executable_freeze(*executable, ""); CHECK_STATUS("Error in freezing executable object", status); // Get symbol handle. 
hsa_executable_symbol_t kernelSymbol; - status = hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, + status = hsa_api_.hsa_executable_get_symbol(*executable, NULL, kernel_name, agent_info->dev_id, 0, &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); @@ -547,9 +631,9 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { const uint32_t slot_size_b = CMD_SLOT_SIZE_B; // adevance command queue - const uint64_t write_idx = hsa_queue_load_write_index_relaxed(queue); - hsa_queue_store_write_index_relaxed(queue, write_idx + 1); - while ((write_idx - hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { + const uint64_t write_idx = hsa_api_.hsa_queue_load_write_index_relaxed(queue); + hsa_api_.hsa_queue_store_write_index_relaxed(queue, write_idx + 1); + while ((write_idx - hsa_api_.hsa_queue_load_read_index_relaxed(queue)) >= queue->size) { sched_yield(); } @@ -566,7 +650,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet) { header_atomic_ptr->store(slot_data[0], std::memory_order_release); // ringdoor bell - hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); + hsa_api_.hsa_signal_store_relaxed(queue->doorbell_signal, write_idx); return write_idx; } @@ -588,6 +672,7 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -HsaRsrcFactory* HsaRsrcFactory::instance_ = NULL; +std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; +hsa_pfn_t HsaRsrcFactory::hsa_api_{}; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index 738a8e2f..552789cc 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -26,6 +26,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#define TEST_UTIL_HSA_RSRC_FACTORY_H_ #include +#include #include #include #include @@ -35,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -67,6 +69,44 @@ static const size_t MEM_PAGE_BYTES = 0x1000; static const size_t MEM_PAGE_MASK = MEM_PAGE_BYTES - 1; typedef decltype(hsa_agent_t::handle) hsa_agent_handle_t; +struct hsa_pfn_t { + decltype(hsa_init)* hsa_init; + decltype(hsa_shut_down)* hsa_shut_down; + decltype(hsa_agent_get_info)* hsa_agent_get_info; + + decltype(hsa_iterate_agents)* hsa_iterate_agents; + + decltype(hsa_queue_create)* hsa_queue_create; + decltype(hsa_queue_destroy)* hsa_queue_destroy; + decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; + decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; + decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + decltype(hsa_signal_create)* hsa_signal_create; + decltype(hsa_signal_destroy)* hsa_signal_destroy; + decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed; + decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; + + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; + decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; + + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + + decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file; + decltype(hsa_executable_create_alt)* hsa_executable_create_alt; + decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; + decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + + 
decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; + decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; + decltype(hsa_amd_profiling_get_dispatch_time)* hsa_amd_profiling_get_dispatch_time; + decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; +}; + // Encapsulates information about a Hsa Agent such as its // handle, name, max queue size, max wavefront size, etc. struct AgentInfo { @@ -133,7 +173,7 @@ class HsaTimer { sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; } - // Methids for system-clock/ns conversion + // Methods for system-clock/ns conversion timestamp_t sysclock_to_ns(const timestamp_t& sysclock) const { return timestamp_t((freq_t)sysclock * sysclock_factor_); } @@ -162,17 +202,20 @@ class HsaRsrcFactory { static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); - if (instance_ == NULL) { - instance_ = new HsaRsrcFactory(initialize_hsa); + HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed); + if (obj == NULL) { + obj = new HsaRsrcFactory(initialize_hsa); + instance_.store(obj, std::memory_order_release); } - return instance_; + return obj; } static HsaRsrcFactory& Instance() { - if (instance_ == NULL) instance_ = Create(false); - hsa_status_t status = (instance_ != NULL) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; + HsaRsrcFactory* obj = instance_.load(std::memory_order_acquire); + if (obj == NULL) obj = Create(false); + hsa_status_t status = (obj != NULL) ? 
HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; CHECK_STATUS("HsaRsrcFactory::Instance() failed", status); - return *instance_; + return *obj; } static void Destroy() { @@ -274,6 +317,10 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + // Initialize HSA API table + void static InitHsaApiTable(HsaApiTable* table); + static const hsa_pfn_t* HsaApi() { return &hsa_api_; } + // Return AqlProfile API table typedef hsa_ven_amd_aqlprofile_pfn_t aqlprofile_pfn_t; const aqlprofile_pfn_t* AqlProfileApi() const { return &aqlprofile_api_; } @@ -291,7 +338,7 @@ class HsaRsrcFactory { static void SetTimeoutNs(const timestamp_t& time) { std::lock_guard lck(mutex_); timeout_ns_ = time; - if (instance_ != NULL) instance_->timeout_ = instance_->timer_->ns_to_sysclock(time); + if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } private: @@ -320,7 +367,7 @@ class HsaRsrcFactory { // HSA was initialized const bool initialize_hsa_; - static HsaRsrcFactory* instance_; + static std::atomic instance_; static mutex_t mutex_; // Used to maintain a list of Hsa Gpu Agent Info @@ -334,6 +381,9 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; + // HSA runtime API table + static hsa_pfn_t hsa_api_; + // AqlProfile API table aqlprofile_pfn_t aqlprofile_api_; From 6bc8b939bf6e1e85841ba3e120ba057a78e886ce Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 4 Apr 2019 21:39:31 -0500 Subject: [PATCH 051/168] Update README.md --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 711bc86b..311ae05e 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,24 @@ Options: > -o - output CSV file [.csv] + The output CSV file columns meaning in the columns order: + Index - kernels dispatch order index + KernelName - the dispatched kernel name 
+ gpu-id - GPU id the kernel was submitted to + queue-id - the ROCm queue unique id the kernel was submitted to + queue-index - The ROCm queue write index for the submitted AQL packet + tid - system application thread id which submitted the kernel + grd - the kernel's grid size + wgr - the kernel's work group size + lds - the kernel's LDS memory size + scr - the kernel's scratch memory size + vgpr - the kernel's VGPR size + sgpr - the kernel's SGPR size + fbar - the kernel's barriers limitation + sig - the kernel's completion signal + The columns with the counters values per kernel dispatch + DispatchNs/BeginNs/EndNs/CompleteNs timestamp columns if time-stamping was enabled + -d - directory where profiler store profiling data including thread treaces [/tmp] The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. -t - to change the temporary directory [/tmp] From 2efc120ec5a785edc903be34d64f6aa12855f456 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 4 Apr 2019 21:40:37 -0500 Subject: [PATCH 052/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 311ae05e..316597ae 100644 --- a/README.md +++ b/README.md @@ -119,8 +119,8 @@ Options: sgpr - the kernel's SGPR size fbar - the kernel's barriers limitation sig - the kernel's completion signal - The columns with the counters values per kernel dispatch - DispatchNs/BeginNs/EndNs/CompleteNs timestamp columns if time-stamping was enabled + ... - The columns with the counters values per kernel dispatch + DispatchNs/BeginNs/EndNs/CompleteNs - timestamp columns if time-stamping was enabled -d - directory where profiler store profiling data including thread treaces [/tmp] The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. 
From 83703a97fa9188ff01d1dbb2a8b134ac3697566f Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 4 Apr 2019 21:47:25 -0500 Subject: [PATCH 053/168] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 316597ae..a3ca363b 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,12 @@ Options: --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] --timestamp - to turn on/off the kernel dispatches timestamps, dispatch/begin/end/complete [off] + Four kernel timestamps in nanoseconds are reported: + DispatchNs - the time when the kernel AQL dispatch packet was written to the queue + BeginNs - the kernel execution begin time + EndNs - the kernel execution end time + CompleteNs - the time when the completion signal of the AQL dispatch packet was received + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] From 44198f37db96a11827dbc0397613ccb61efce7cb Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 4 Apr 2019 22:55:18 -0500 Subject: [PATCH 054/168] N-GPU standalone intercept --- src/core/rocprofiler.cpp | 42 ++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index de16fa19..81e146a0 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -233,13 +233,12 @@ hsa_status_t GetExcStatus(const std::exception& e) { : HSA_STATUS_ERROR; } - -inline size_t CreateEnableCmd(const hsa_agent_t& agent, packet_t* command, const size_t& slot_count) { - rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); - const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); +inline size_t CreateEnableCmd(const rocprofiler::util::AgentInfo* agent_info, packet_t* command, const size_t& slot_count) { 
const bool is_legacy = (strncmp(agent_info->name, "gfx8", 4) == 0); const size_t packet_count = (is_legacy) ? Profile::LEGACY_SLOT_SIZE_PKT : 1; + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + if (packet_count > slot_count) EXC_RAISING(HSA_STATUS_ERROR, "packet_count > slot_count"); // AQLprofile object @@ -291,9 +290,13 @@ hsa_status_t CreateQueuePro( uint32_t group_segment_size, hsa_queue_t **queue) { - static packet_t enable_cmd_packet[Profile::LEGACY_SLOT_SIZE_PKT]; - static size_t enable_cmd_size = 0; - static std::mutex enable_cmd_mutex; + typedef std::pair cmd_entry_t; + typedef std::vector cmd_vec_t; + static cmd_vec_t cmd_vec; + static uint32_t cmd_mask = 0; + static std::mutex cmd_mutex; + + rocprofiler::util::HsaRsrcFactory* hsa_rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); // Create HSA queue hsa_status_t status = hsa_queue_create_fn( @@ -308,15 +311,30 @@ hsa_status_t CreateQueuePro( if (status != HSA_STATUS_SUCCESS) return status; // Create 'Enable' cmd packet - if (enable_cmd_size == 0) { - std::lock_guard lck(enable_cmd_mutex); - if (enable_cmd_size == 0) { - enable_cmd_size = CreateEnableCmd(agent, enable_cmd_packet, Profile::LEGACY_SLOT_SIZE_PKT); + const rocprofiler::util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(agent); + const uint32_t dev_index = 1 << agent_info->dev_index; + const uint32_t dev_mask = 1 << dev_index; + if ((cmd_mask & dev_mask) == 0) { + std::lock_guard lck(cmd_mutex); + + if ((cmd_mask & dev_mask) == 0) { + cmd_mask |= dev_mask; + // Allocating cmd vector + uint32_t mask = 1; + while (1) { + const uint32_t max = 1 << cmd_vec.size(); + if (mask >= max) cmd_vec.push_back({}); + if (((mask & dev_mask) != 0) || (mask == 0)) break; + mask <<= 1; + } + if (mask == 0) EXC_RAISING(status, "bad device index (" << dev_index << ")"); + // Creating cmd packets + cmd_vec[dev_index].second = CreateEnableCmd(agent_info, cmd_vec[dev_index].first, Profile::LEGACY_SLOT_SIZE_PKT); 
} } // Enable counters for the queue - rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, enable_cmd_packet, enable_cmd_size); + rocprofiler::util::HsaRsrcFactory::Instance().Submit(*queue, cmd_vec[dev_index].first, cmd_vec[dev_index].second); return HSA_STATUS_SUCCESS; } From d36407aa9e31787291875399ba9141cb4a360422 Mon Sep 17 00:00:00 2001 From: Rene van Oostrum Date: Thu, 11 Apr 2019 11:12:36 -0500 Subject: [PATCH 055/168] fix overwriting of LD_LIBRARY_PATH; instead, prepend to old path --- bin/rpl_run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 78edf446..527c99c1 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -43,7 +43,7 @@ HIP_TRACE=0 # Generate stats GEN_STATS=0 -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$ROCTRACER_PATH/lib:$ROCTRACER_PATH/tool:$HSA_PATH +export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$ROCTRACER_PATH/lib:$ROCTRACER_PATH/tool:$HSA_PATH:$LD_LIBRARY_PATH export PATH=.:$PATH # enable error logging From 968f3bc0787e0b50a0eb1f6d7a9d8855eb31d976 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 8 Jul 2019 19:46:42 -0500 Subject: [PATCH 056/168] 2.6 update --- bin/rpl_run.sh | 22 ++++---- bin/tblextr.py | 13 +++-- cmake_modules/env.cmake | 2 + inc/rocprofiler.h | 7 ++- src/core/context.h | 46 ++++++++++------ src/core/profile.h | 18 ++++--- src/core/rocprofiler.cpp | 12 ++--- src/core/types.h | 12 +++++ test/app/standalone_test.cpp | 17 ++++++ test/run.sh | 26 ++++++--- test/tool/tool.cpp | 102 ++++++++++++++++++++++++++--------- 11 files changed, 195 insertions(+), 82 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 78edf446..f29582a8 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -30,12 +30,12 @@ RUN_DIR=`pwd` TMP_DIR="/tmp" DATA_DIR="rpl_data_${time_stamp}_$$" +RPL_PATH=$PKG_DIR/lib +TLIB_PATH=$PKG_DIR/tool + # PATH to custom HSA and OpenCl runtimes HSA_PATH=$PKG_DIR/lib/hsa -# roctracer path -if [ -z "$ROCTRACER_PATH" ] ; then 
ROCTRACER_PATH=$ROOT_DIR/roctracer; fi - # runtime API trace HSA_TRACE=0 HIP_TRACE=0 @@ -43,7 +43,7 @@ HIP_TRACE=0 # Generate stats GEN_STATS=0 -export LD_LIBRARY_PATH=$PKG_DIR/lib:$PKG_DIR/tool:$ROCTRACER_PATH/lib:$ROCTRACER_PATH/tool:$HSA_PATH +export LD_LIBRARY_PATH=$HSA_PATH:$LD_LIBRARY_PATH export PATH=.:$PATH # enable error logging @@ -54,9 +54,9 @@ unset ROCPROFILER_SESS # ROC Profiler environment # Loading of ROC Profiler by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so +export HSA_TOOLS_LIB=$RPL_PATH/librocprofiler64.so # Loading of the test tool by ROC Profiler -export ROCP_TOOL_LIB=libtool.so +export ROCP_TOOL_LIB=$TLIB_PATH/libtool.so # Enabling HSA dispatches intercepting by ROC PRofiler export ROCP_HSA_INTERCEPT=1 # Disabling internal ROC Profiler proxy queue (simple version supported for testing purposes) @@ -129,7 +129,7 @@ usage() { echo " >" echo "" echo " -o - output CSV file [.csv]" - echo " -d - directory where profiler store profiling data including thread treaces [/tmp]" + echo " -d - directory where profiler store profiling data including traces [/tmp]" echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." echo " -t - to change the temporary directory [/tmp]" echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." 
@@ -201,7 +201,6 @@ run() { fi API_TRACE="" - PRELOAD_LIBS="" if [ "$HSA_TRACE" = 1 ] ; then API_TRACE="hsa" fi @@ -211,14 +210,11 @@ run() { else API_TRACE="all" fi - if [ -z "$HCC_HOME" ] ; then error "env var HCC_HOME is not defined"; fi - PRELOAD_LIBS="$PRELOAD_LIBS $HCC_HOME/lib/libmcwamp_hsa.so" fi if [ -n "$API_TRACE" ] ; then API_TRACE=$(echo $API_TRACE | sed 's/all//') if [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE; fi - export HSA_TOOLS_LIB="libtracer_tool.so libroctracer64.so $HSA_TOOLS_LIB" - PRELOAD_LIBS="$PRELOAD_LIBS $HSA_TOOLS_LIB" + export HSA_TOOLS_LIB="$RPL_PATH/libroctracer64.so $TLIB_PATH/libtracer_tool.so $HSA_TOOLS_LIB" fi redirection_cmd="" @@ -228,7 +224,7 @@ run() { fi #unset ROCP_OUTPUT_DIR - CMD_LINE="LD_PRELOAD='$PRELOAD_LIBS' $APP_CMD $redirection_cmd" + CMD_LINE="$APP_CMD $redirection_cmd" eval "$CMD_LINE" } diff --git a/bin/tblextr.py b/bin/tblextr.py index 329ab0d8..818e85f2 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -56,6 +56,13 @@ def fatal(msg): sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); sys.exit(1) + +dbglog_count = 0 +def dbglog(msg): + global dbglog_count + dbglog_count += 1 + sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); + fatal("error") ############################################################# # parse results method @@ -224,7 +231,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_tid_list.append(int(rec_vals[3])) dep_id_list.append(record_id) record_id += 1 - else: fatal("hsa bad record") + else: fatal(api_name + " bad record: '" + record + "'") for (tid, from_ns) in dep_list: db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) @@ -267,7 +274,7 @@ def fill_copy_db(table_name, db, indir): else: fatal("bad async-copy entry") rec_vals.append(m.group(1)) db.insert_entry(table_handle, rec_vals) - else: fatal("async-copy bad record") + else: fatal("async-copy bad record: '" + record + "'") 
dep_dict[COPY_PID]['to'] = dep_to_us_dict ############################################################# @@ -313,7 +320,7 @@ def fill_ops_db(table_name, db, indir): dep_dict[gpu_pid]['to'] = {} dep_dict[gpu_pid]['to'][corr_id] = int(rec_vals[0]) / 1000 dep_dict[gpu_pid]['bsp'] = OPS_PID - else: fatal("async-copy bad record") + else: fatal("hcc ops bad record: '" + record + "'") return filtr ############################################################# diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 6bf6ed45..44fb0cd0 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -56,6 +56,8 @@ set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack" ) set ( CMAKE_SKIP_BUILD_RPATH TRUE ) +add_definitions ( -DNEW_TRACE_API=1 ) + ## CLANG options if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000" ) diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 1e74c464..b59acfdf 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -27,8 +27,7 @@ THE SOFTWARE. // The goal of the implementation is to provide a HW specific low-level // performance analysis interface for profiling of GPU compute applications. // The profiling includes HW performance counters (PMC) with complex -// performance metrics and thread traces (SQTT). The profiling is supported -// by the SQTT, PMC and Callback APIs. +// performance metrics and traces. // // The library can be used by a tool library loaded by HSA runtime or by // higher level HW independent performance analysis API like PAPI. 
@@ -66,8 +65,8 @@ uint32_t rocprofiler_version_minor(); typedef struct { uint32_t intercept_mode; uint32_t memcopy_tracking; - uint32_t sqtt_size; - uint32_t sqtt_local; + uint32_t trace_size; + uint32_t trace_local; uint64_t timeout; uint32_t timestamp_on; } rocprofiler_settings_t; diff --git a/src/core/context.h b/src/core/context.h index a59effd0..856c7024 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -83,7 +83,7 @@ class Group { Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), - sqtt_profile_(agent_info), + trace_profile_(agent_info), n_profiles_(0), refs_(1), context_(context), @@ -97,7 +97,7 @@ class Group { pmc_profile_.Insert(info); break; case ROCPROFILER_FEATURE_KIND_TRACE: - sqtt_profile_.Insert(info); + trace_profile_.Insert(info); break; default: EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); @@ -107,21 +107,21 @@ class Group { hsa_status_t Finalize() { hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_); if (status == HSA_STATUS_SUCCESS) { - status = sqtt_profile_.Finalize(start_vector_, stop_vector_, read_vector_); + status = trace_profile_.Finalize(start_vector_, stop_vector_, read_vector_); } if (status == HSA_STATUS_SUCCESS) { if (!pmc_profile_.Empty()) ++n_profiles_; - if (!sqtt_profile_.Empty()) ++n_profiles_; + if (!trace_profile_.Empty()) ++n_profiles_; } return status; } void GetProfiles(profile_vector_t& vec) { pmc_profile_.GetProfiles(vec); - sqtt_profile_.GetProfiles(vec); + trace_profile_.GetProfiles(vec); } - void GetTraceProfiles(profile_vector_t& vec) { sqtt_profile_.GetProfiles(vec); } + void GetTraceProfiles(profile_vector_t& vec) { trace_profile_.GetProfiles(vec); } info_vector_t& GetInfoVector() { return info_vector_; } const pkt_vector_t& GetStartVector() const { return start_vector_; } @@ -137,7 +137,7 @@ class Group { private: PmcProfile pmc_profile_; - SqttProfile sqtt_profile_; + 
TraceProfile trace_profile_; info_vector_t info_vector_; pkt_vector_t start_vector_; pkt_vector_t stop_vector_; @@ -361,9 +361,9 @@ class Context { rocprofiler_feature_t* info = &info_array[i]; const rocprofiler_feature_kind_t kind = info->kind; const char* name = info->name; - if (!name) EXC_RAISING(HSA_STATUS_ERROR, "input feature name is NULL"); - info_map_[name] = info; if (kind == ROCPROFILER_FEATURE_KIND_METRIC) { + if (name == NULL) EXC_RAISING(HSA_STATUS_ERROR, "metric name is NULL"); + info_map_[name] = info; auto ret = metrics_map_.insert({name, NULL}); if (!ret.second) EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name @@ -435,7 +435,19 @@ class Context { set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); } } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features - set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + if (info->parameters != NULL) { + set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); + } else { + const Metric* metric = metrics_->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.size() != 1) + EXC_RAISING(HSA_STATUS_ERROR, "trace bad metric '" << name << "' is not base counter"); + const counter_t* counter = counters_vec[0]; + const event_t* event = &(counter->event); + set_[0].Insert(profile_info_t{event, NULL, 0, info}); + } } else { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -484,15 +496,15 @@ class Context { if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0; rinfo->data.result_int64 += ainfo_data->pmc_data.result; rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64; - } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { + } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { if (rinfo->data.result_bytes.copy) { - const 
bool sqtt_local = SqttProfile::IsLocal(); + const bool trace_local = TraceProfile::IsLocal(); util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); if (sample_id == 0) { const uint32_t output_buffer_size = profile->output_buffer.size; const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t); const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); - void* ptr = (sqtt_local) ? hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : + void* ptr = (trace_local) ? hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : calloc(output_buffer_size64, sizeof(uint64_t)); rinfo->data.result_bytes.size = output_buffer_size; rinfo->data.result_bytes.ptr = ptr; @@ -500,19 +512,19 @@ class Context { } char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr); const char* end = result_bytes_ptr + rinfo->data.result_bytes.size; - const char* src = reinterpret_cast(ainfo_data->sqtt_data.ptr); - uint32_t size = ainfo_data->sqtt_data.size; + const char* src = reinterpret_cast(ainfo_data->trace_data.ptr); + uint32_t size = ainfo_data->trace_data.size; char* ptr = callback_data->ptr; uint32_t* header = reinterpret_cast(ptr); char* dest = ptr + sizeof(*header); if ((dest + size) >= end) { if (dest < end) size = end - dest; - else EXC_RAISING(HSA_STATUS_ERROR, "SQTT data out of output buffer"); + else EXC_RAISING(HSA_STATUS_ERROR, "Trace data out of output buffer"); } bool suc = true; - if (sqtt_local) { + if (trace_local) { suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size); } else { memcpy(dest, src, size); diff --git a/src/core/profile.h b/src/core/profile.h index 6d91192b..223e2e5e 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -236,21 +236,27 @@ class PmcProfile : public Profile { } }; -class SqttProfile : public Profile { +class TraceProfile : public Profile { public: static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } static inline uint32_t 
GetSize() { return output_buffer_size_; } static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } static inline bool IsLocal() { return output_buffer_local_; } - SqttProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { - profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_SQTT; + TraceProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE; } void Insert(const profile_info_t& info) { - Profile::Insert(info); - for (unsigned j = 0; j < info.parameter_count; ++j) { - Config(&profile_).Insert(info.parameters[j]); + if (info.parameters != NULL) { + Profile::Insert(info); + for (unsigned j = 0; j < info.parameter_count; ++j) { + Config(&profile_).Insert(info.parameters[j]); + } + } else if (info.event != NULL) { + Config(&profile_).Insert(*(info.event)); + } else { + EXC_ABORT(HSA_STATUS_ERROR, "invalid trace info inserted"); } } diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 81e146a0..090e5492 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -175,16 +175,16 @@ uint32_t LoadTool() { rocprofiler_settings_t settings{}; settings.intercept_mode = (intercept_mode != 0) ? 1 : 0; - settings.sqtt_size = SqttProfile::GetSize(); - settings.sqtt_local = SqttProfile::IsLocal() ? 1: 0; + settings.trace_size = TraceProfile::GetSize(); + settings.trace_local = TraceProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; if (handler) handler(); else if (handler_prop) handler_prop(&settings); - SqttProfile::SetSize(settings.sqtt_size); - SqttProfile::SetLocal(settings.sqtt_local != 0); + TraceProfile::SetSize(settings.trace_size); + TraceProfile::SetLocal(settings.trace_local != 0); util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; @@ -384,8 +384,8 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( } rocprofiler_properties_t rocprofiler_properties; -uint32_t SqttProfile::output_buffer_size_ = 0x2000000; // 32M -bool SqttProfile::output_buffer_local_ = true; +uint32_t TraceProfile::output_buffer_size_ = 0x2000000; // 32M +bool TraceProfile::output_buffer_local_ = true; std::atomic Tracker::instance_{}; Tracker::mutex_t Tracker::glob_mutex_; Tracker::counter_t Tracker::counter_ = 0; diff --git a/src/core/types.h b/src/core/types.h index ef8600f0..c72bb343 100644 --- a/src/core/types.h +++ b/src/core/types.h @@ -23,6 +23,8 @@ THE SOFTWARE. #ifndef SRC_CORE_TYPES_H_ #define SRC_CORE_TYPES_H_ +#include + #include namespace rocprofiler { @@ -33,6 +35,16 @@ typedef hsa_ven_amd_aqlprofile_profile_t profile_t; typedef hsa_ext_amd_aql_pm4_packet_t packet_t; typedef uint32_t packet_word_t; typedef uint64_t timestamp_t; + +inline std::ostream& operator<< (std::ostream& out, const event_t& event) { + out << "[block_name(" << event.block_name << "). block_index(" << event.block_index << "). counter_id(" << event.counter_id << ")]"; + return out; +} +inline std::ostream& operator<< (std::ostream& out, const parameter_t& parameter) { + out << "[parameter_name(" << parameter.parameter_name << "). 
value(" << parameter.value << ")]"; + return out; +} + } // namespace rocprofiler #endif // SRC_CORE_TYPES_H_ diff --git a/test/app/standalone_test.cpp b/test/app/standalone_test.cpp index f6fc965e..b173c4d3 100644 --- a/test/app/standalone_test.cpp +++ b/test/app/standalone_test.cpp @@ -104,6 +104,23 @@ int main() { feature[7].name = "TCC_MISS_sum"; feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; feature[8].name = "WRITE_SIZE"; +// feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[8].name = "TCC_EA_WRREQ_sum"; +// feature[9].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[9].name = "TCC_EA_WRREQ_64B_sum"; +#if 0 + // Tracing parameters + const unsigned parameter_count = 2; + rocprofiler_parameter_t parameters[parameter_count]; + feature[2].name = "THREAD_TRACE"; + feature[2].kind = ROCPROFILER_FEATURE_KIND_TRACE; + feature[2].parameters = parameters; + feature[2].parameter_count = parameter_count; + parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; + parameters[0].value = 0; + parameters[1].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; + parameters[1].value = 0; +#endif // Instantiate HSA resources HsaRsrcFactory::Create(); diff --git a/test/run.sh b/test/run.sh index ed5bbe9a..d1aa2b88 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,21 +22,31 @@ # THE SOFTWARE. ################################################################################ +test_filter=-1 +if [ -n "$1" ] ; then + test_filter=$1 +fi + # test check routin test_status=0 test_number=0 +xeval_test() { + test_number=$test_number +} eval_test() { label=$1 cmdline=$2 - echo "$label: \"$cmdline\"" - eval "$cmdline" - if [ $? != 0 ] ; then - echo "$label: FAILED" - test_status=$(($test_status + 1)) - else - echo "$label: PASSED" + if [ $test_filter = -1 -o $test_filter = $test_number ] ; then + echo "$label: \"$cmdline\"" + eval "$cmdline" + if [ $? 
!= 0 ] ; then + echo "$label: FAILED" + test_status=$(($test_status + 1)) + else + echo "$label: PASSED" + fi fi - test_number=$(($test_number + 1)) + test_number=$((test_number + 1)) } # enable tools load failure reporting diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 0eee2348..10facbd0 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -134,8 +134,10 @@ static uint32_t CTX_OUTSTANDING_MAX = 0; static uint32_t CTX_OUTSTANDING_MON = 0; // to truncate kernel names uint32_t to_truncate_names = 0; -// local SQTT buffer -bool is_sqtt_local = true; +// local trace buffer +bool is_trace_local = true; +// SPM trace enabled +bool is_spm_trace = false; static inline uint32_t GetPid() { return syscall(__NR_getpid); } static inline uint32_t GetTid() { return syscall(__NR_gettid); } @@ -281,7 +283,7 @@ void dealloc_context_entry(context_entry_t* entry) { // Dump trace data to file void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, const uint32_t& size) { if (result_prefix != NULL) { - // Open SQTT file + // Open file std::ostringstream oss; oss << result_prefix << "/thread_trace_" << label << "_se" << chunk << ".out"; FILE* file = fopen(oss.str().c_str(), "w"); @@ -298,11 +300,36 @@ void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, fprintf(file, "%04x\n", ptr[i]); } - // Close SQTT file + // Close file fclose(file); } } +// Dump trace data to file +void dump_spm_trace(const char* label, const void* data, const uint32_t& size) { + if (result_prefix != NULL) { + // Open trace file + std::ostringstream oss; + oss << result_prefix << "/spm_trace_" << label << ".out"; + const int fd = open(oss.str().c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0666); + if (fd == -1) { + std::ostringstream errmsg; + errmsg << "open error, file '" << oss.str().c_str() << "'"; + perror(errmsg.str().c_str()); + abort(); + } + // write trace binary data + if (write(fd, data, size) == -1) { + std::ostringstream errmsg; + errmsg << 
"write error, file '" << oss.str().c_str() << "'"; + perror(errmsg.str().c_str()); + abort(); + } + // Close file + close(fd); + } +} + struct trace_data_arg_t { FILE* file; const char* label; @@ -314,23 +341,43 @@ hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; trace_data_arg_t* arg = reinterpret_cast(data); - if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_SQTT_DATA) { - const void* data_ptr = info_data->sqtt_data.ptr; - const uint32_t data_size = info_data->sqtt_data.size; - fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); + if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { + if (is_spm_trace) { + if (info_data->sample_id != 0) { + fatal("Only one SPM sample expected"); + } + const void* data_ptr = info_data->trace_data.ptr; + const uint32_t data_size = info_data->trace_data.size; + fprintf(arg->file, " size(%u)\n", data_size); - if (is_sqtt_local) { + if (is_trace_local == false) fatal("SPM trace supports only local trace allocation"); HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); const uint32_t mem_size = data_size; void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { - fatal("SQTT data memcopy to host failed"); + fatal("Trace data memcopy to host failed"); } - dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); + dump_spm_trace(arg->label, buffer, data_size); HsaRsrcFactory::FreeMemory(buffer); } else { - dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); + const void* data_ptr = info_data->trace_data.ptr; + const uint32_t data_size = info_data->trace_data.size; + fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); + + if (is_trace_local) { + HsaRsrcFactory* hsa_rsrc = 
&HsaRsrcFactory::Instance(); + const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); + const uint32_t mem_size = data_size; + void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); + if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { + fatal("Trace data memcopy to host failed"); + } + dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); + HsaRsrcFactory::FreeMemory(buffer); + } else { + dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); + } } } else status = HSA_STATUS_ERROR; @@ -367,12 +414,12 @@ void output_results(const context_entry_t* entry, const char* label) { for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { const uint32_t chunk_size = *reinterpret_cast(ptr); const char* chunk_data = ptr + sizeof(uint32_t); - if (chunk_data >= end) fatal("SQTT data is out of the result buffer size"); + if (chunk_data >= end) fatal("Trace data is out of the result buffer size"); dump_sqtt_trace(label, i, chunk_data, chunk_size); const uint32_t off = align_size(chunk_size, sizeof(uint32_t)); ptr = chunk_data + off; - if (chunk_data >= end) fatal("SQTT data ptr is out of the result buffer size"); + if (chunk_data >= end) fatal("Trace data ptr is out of the result buffer size"); size += chunk_size; } fprintf(file, "size(%lu)\n", size); @@ -388,6 +435,7 @@ void output_results(const context_entry_t* entry, const char* label) { break; } default: + if (is_spm_trace) continue; fprintf(stderr, "RPL-tool: undefined data kind(%u)\n", p->data.kind); abort(); } @@ -821,19 +869,19 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (it != opts.end()) { CTX_OUTSTANDING_MAX = atol(it->second.c_str()); } it = opts.find("heartbeat"); if (it != opts.end()) { CTX_OUTSTANDING_MON = atol(it->second.c_str()); } - it = opts.find("sqtt-size"); + it = opts.find("trace-size"); if (it != opts.end()) { - std::string str = normalize_token(it->second, true, "option sqtt-size"); + 
std::string str = normalize_token(it->second, true, "option trace-size"); uint32_t multiplier = 1; switch (str.back()) { case 'K': multiplier = 1024; break; case 'M': multiplier = 1024 * 1024; break; } if (multiplier != 1) str = str.substr(0, str.length() - 1); - settings->sqtt_size = strtoull(str.c_str(), NULL, 0) * multiplier; + settings->trace_size = strtoull(str.c_str(), NULL, 0) * multiplier; } - it = opts.find("sqtt-local"); - if (it != opts.end()) { settings->sqtt_local = (it->second == "on"); } + it = opts.find("trace-local"); + if (it != opts.end()) { settings->trace_local = (it->second == "on"); } it = opts.find("memcopies"); if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); } } @@ -850,14 +898,14 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_TIMESTAMP_ON", settings->timestamp_on); // Set data timeout check_env_var("ROCP_DATA_TIMEOUT", settings->timeout); - // Set SQTT size - check_env_var("ROCP_SQTT_SIZE", settings->sqtt_size); - // Set SQTT local buffer - check_env_var("ROCP_SQTT_LOCAL", settings->sqtt_local); + // Set trace size + check_env_var("ROCP_TRACE_SIZE", settings->trace_size); + // Set trace local buffer + check_env_var("ROCP_TRACE_LOCAL", settings->trace_local); // Set memcopies tracking check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking); - is_sqtt_local = settings->sqtt_local; + is_trace_local = settings->trace_local; // Printing out info char* info_symb = getenv("ROCP_INFO"); @@ -941,7 +989,11 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) range_vec->push_back(*(range_vec->begin()) + 1); } - const unsigned feature_count = metrics_vec.size(); + // Getting traces + const auto traces_list = xml->GetNodes("top.trace"); + if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time"); + + const unsigned feature_count = metrics_vec.size() + traces_list.size(); rocprofiler_feature_t* features = new 
rocprofiler_feature_t[feature_count]; memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); From 12b64e432ee352f9f772925319bdf59a122d12c9 Mon Sep 17 00:00:00 2001 From: Nicholas Curtis Date: Wed, 10 Jul 2019 14:46:10 -0500 Subject: [PATCH 057/168] add flag to rpl_run.sh that enables the user to define custom metrics files --- bin/rpl_run.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index f29582a8..1c07997f 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -133,6 +133,7 @@ usage() { echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." echo " -t - to change the temporary directory [/tmp]" echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." + echo " -m - file defining custom metrics to use in-place of defaults." echo "" echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" @@ -258,6 +259,9 @@ while [ 1 ] ; do if [ "$OUTPUT_DIR" = "-" ] ; then DATA_PATH=$TMP_DIR fi + elif [ "$1" = "-m" ] ; then + unset ROCP_METRICS + export ROCP_METRICS="$2" elif [ "$1" = "--list-basic" ] ; then export ROCP_INFO=b eval "$PKG_DIR/tool/ctrl" From b71ad35d94d0817120c8f909755bdf7b1a4ba7f1 Mon Sep 17 00:00:00 2001 From: Ye Luo Date: Mon, 29 Jul 2019 23:41:33 -0500 Subject: [PATCH 058/168] Protect application arguments with spaces. 
Issue: `exe -g "2 2 1"` was interpreted as `exe -g 2 2 1` Now `"exe" "-g" "2 2 1"` --- bin/rpl_run.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 1c07997f..20880fcd 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -347,7 +347,12 @@ else csv_output=$RUN_DIR/${input_base}.csv fi -APP_CMD=$* +APP_CMD="" +for i in `seq 1 $#` +do + eval "arg=\${$i}" + APP_CMD=$APP_CMD" "\"$arg\" +done echo "RPL: profiling '$APP_CMD'" echo "RPL: input file '$INPUT_FILE'" From 21b6013aba9dc611c97ac9a92602b80011162660 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 11 Sep 2019 10:19:08 -0500 Subject: [PATCH 059/168] minor cosmetic fixes --- bin/rpl_run.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 20880fcd..d6398585 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
@@ -348,10 +348,12 @@ else fi APP_CMD="" -for i in `seq 1 $#` -do +for i in `seq 1 $#`; do + if [ -n "$APP_CMD" ] ; then + APP_CMD=$APP_CMD" " + fi eval "arg=\${$i}" - APP_CMD=$APP_CMD" "\"$arg\" + APP_CMD=$APP_CMD\"$arg\" done echo "RPL: profiling '$APP_CMD'" From f77f71e7880f7179757193d6ec7158e0367b5b93 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 26 Sep 2019 23:05:35 -0500 Subject: [PATCH 060/168] rocm2.7 update --- README.md | 3 +- bin/dform.py | 2 +- bin/rpl_run.sh | 45 ++++++++++++++++++++++++--- bin/sqlitedb.py | 13 ++++---- bin/tblextr.py | 38 ++++++++++++++--------- src/core/metrics.h | 6 +++- src/core/profile.h | 7 +++-- src/xml/xml.h | 76 +++++++++++++++++++++++++++++++++++++--------- test/run.sh | 1 + 9 files changed, 144 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index a3ca363b..82f525dc 100644 --- a/README.md +++ b/README.md @@ -37,10 +37,11 @@ profiling includes HW performance counters with complex performance metrics. ## To build with the current installed ROCM: ``` - To build and install to /opt/rocm/rocprofiler + export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm + cd .../rocprofiler mkdir build cd build - export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm cmake .. 
make make install diff --git a/bin/dform.py b/bin/dform.py index 5fc8d6fc..1e5c63b1 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -25,7 +25,7 @@ def gen_api_json_trace(db, table, start_us, outfile): db.execute('DROP VIEW B') def gen_ops_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", Name as name, ("gpu-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index d6398585..eeb62d20 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -26,24 +26,32 @@ time_stamp=`date +%y%m%d_%H%M%S` BIN_DIR=$(dirname $(realpath $0)) PKG_DIR=$(dirname $BIN_DIR) ROOT_DIR=$(dirname $PKG_DIR) +TT_DIR=$ROOT_DIR/roctracer RUN_DIR=`pwd` TMP_DIR="/tmp" DATA_DIR="rpl_data_${time_stamp}_$$" RPL_PATH=$PKG_DIR/lib TLIB_PATH=$PKG_DIR/tool +TTLIB_PATH=$TT_DIR/tool -# PATH to custom HSA and OpenCl runtimes -HSA_PATH=$PKG_DIR/lib/hsa +# Default HIP path +if [ -z "$HIP_PATH" ] ; then + export HIP_PATH=/opt/rocm/hip +fi +# Default HCC path +if [ -z "$HCC_HOME" ] ; then + export HCC_HOME=/opt/rocm/hcc +fi # runtime API trace HSA_TRACE=0 +SYS_TRACE=0 HIP_TRACE=0 # Generate stats GEN_STATS=0 -export LD_LIBRARY_PATH=$HSA_PATH:$LD_LIBRARY_PATH export PATH=.:$PATH # enable error logging @@ -143,6 +151,7 @@ usage() { echo "" echo " --stats - generating kernel execution stats, file .stats.csv" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" + echo " --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible" echo " --hip-trace - to trace HIP, generates API execution stats 
and JSON file chrome-tracing compatible" echo " Generated files: .hsa_stats.txt .json" echo " Traced API list can be set by input .txt or .xml files." @@ -199,12 +208,17 @@ run() { fi fi mkdir -p "$ROCP_OUTPUT_DIR" + + OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" fi API_TRACE="" if [ "$HSA_TRACE" = 1 ] ; then API_TRACE="hsa" fi + if [ "$SYS_TRACE" = 1 ] ; then + API_TRACE="sys" + fi if [ "$HIP_TRACE" = 1 ] ; then if [ -z "$API_TRACE" ] ; then API_TRACE="hip"; @@ -215,12 +229,14 @@ run() { if [ -n "$API_TRACE" ] ; then API_TRACE=$(echo $API_TRACE | sed 's/all//') if [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE; fi - export HSA_TOOLS_LIB="$RPL_PATH/libroctracer64.so $TLIB_PATH/libtracer_tool.so $HSA_TOOLS_LIB" + if [ "$API_TRACE" = "hip" -o "$API_TRACE" = "sys" ] ; then + OUTPUT_LIST="$ROCP_OUTPUT_DIR/" + fi + export HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" fi redirection_cmd="" if [ -n "$ROCP_OUTPUT_DIR" ] ; then - OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" redirection_cmd="2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" fi @@ -229,6 +245,19 @@ run() { eval "$CMD_LINE" } +merge_output() { + output_dir=$(echo "$1" | sed "s/\/[^\/]*$//") + for file_name in `ls $output_dir` ; do + output_name=$(echo $file_name | sed -n "/\.txt$/ s/^[0-9]*_//p") + if [ -n "$output_name" ] ; then + trace_file=$output_dir/$file_name + output_file=$output_dir/$output_name + touch $output_file + cat $trace_file >> $output_file + fi + done +} + # main echo "RPL: on '$time_stamp' from '$PKG_DIR' in '$RUN_DIR'" # Parsing arguments @@ -301,6 +330,11 @@ while [ 1 ] ; do export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 HSA_TRACE=1 + elif [ "$1" = "--sys-trace" ] ; then + ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 + GEN_STATS=1 + SYS_TRACE=1 elif [ "$1" = "--hip-trace" ] ; then ARG_VAL=0 export ROCP_TIMESTAMP_ON=1 @@ -398,6 +432,7 @@ done if [ -n "$csv_output" ] ; then if [ "$GEN_STATS" = "1" ] ; then db_output=$(echo $csv_output | sed "s/\.csv/.db/") + merge_output 
$OUTPUT_LIST python $BIN_DIR/tblextr.py $db_output $OUTPUT_LIST else python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 295fe7a7..e02d4136 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -121,12 +121,13 @@ def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dic for ind in range(len(from_tid)): if (len(corr_id_list) != 0): corr_id = corr_id_list[ind] else: corr_id = ind - from_ts = from_us_list[ind] - start_us - to_ts = to_us_dict[corr_id] - start_us - if from_ts > to_ts: from_ts = to_ts - fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%s,"tid":%s,"name":"dep"}\n' % (from_ts, dep_id, str(from_pid), from_tid[ind])) - fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) - dep_id += 1 + if corr_id in to_us_dict: + from_ts = from_us_list[ind] - start_us + to_ts = to_us_dict[corr_id] - start_us + if from_ts > to_ts: from_ts = to_ts + fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%s,"tid":%s,"name":"dep"}\n' % (from_ts, dep_id, str(from_pid), from_tid[ind])) + fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) + dep_id += 1 def dump_json(self, table_name, data_name, file_name): if not re.search(r'\.json$', file_name): diff --git a/bin/tblextr.py b/bin/tblextr.py index 818e85f2..057e984a 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -68,7 +68,7 @@ def dbglog(msg): # parse results method def parse_res(infile): global max_gpu_id - if not os.path.isfile(infile): fatal("Error: input file '" + infile + "' not found") + if not os.path.isfile(infile): return # fatal("Error: input file '" + infile + "' not found") inp = open(infile, 'r') beg_pattern = re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") @@ -256,6 +256,8 @@ def fill_copy_db(table_name, db, indir): ptrn_val = re.compile(r'(\d+):(\d+) (.*)$') ptrn_id = 
re.compile(r'^async-copy(\d+)$') + if not os.path.isfile(file_name): return 0 + if not COPY_PID in dep_dict: dep_dict[COPY_PID] = {} dep_to_us_dict = {} @@ -325,11 +327,12 @@ def fill_ops_db(table_name, db, indir): return filtr ############################################################# # main -if (len(sys.argv) < 3): fatal("Usage: " + sys.argv[0] + " ") +if (len(sys.argv) < 2): fatal("Usage: " + sys.argv[0] + " ") outfile = sys.argv[1] infiles = sys.argv[2:] indir = re.sub(r'\/[^\/]*$', r'', infiles[0]) +inext = re.sub(r'^[^\.]*', r'', infiles[0]) dbfile = '' csvfile = '' @@ -342,9 +345,9 @@ def fill_ops_db(table_name, db, indir): else: fatal("Bad output file '" + outfile + "'") -for f in infiles: parse_res(f) -if len(var_table) == 0: sys.exit(1) -merge_table() +if inext == '.txt': + for f in infiles: parse_res(f) + if len(var_table) != 0: merge_table() if dbfile == '': dump_csv(csvfile) @@ -356,8 +359,7 @@ def fill_ops_db(table_name, db, indir): db = SQLiteDB(dbfile) hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) - if hsa_trace_found: - fill_copy_db('COPY', db, indir) + hsa_activity_found = fill_copy_db('COPY', db, indir) ops_filtr = fill_ops_db('OPS', db, indir) hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) @@ -370,6 +372,7 @@ def fill_ops_db(table_name, db, indir): if hsa_trace_found: db.label_json(HSA_PID, "CPU HSA API", jsonfile) + if hsa_activity_found: db.label_json(COPY_PID, "COPY", jsonfile) if hip_trace_found: @@ -379,17 +382,19 @@ def fill_ops_db(table_name, db, indir): for ind in range(0, int(max_gpu_id) + 1): db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) - dform.post_process_data(db, 'A', csvfile) - dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') - if hsa_trace_found and 'BeginNs' in var_list: - dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) + if len(var_table) != 0: + 
dform.post_process_data(db, 'A', csvfile) + dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') + if hsa_trace_found and 'BeginNs' in var_list: + dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) if hsa_trace_found: statfile = re.sub(r'stats', r'hsa_stats', statfile) dform.post_process_data(db, 'HSA') dform.gen_table_bins(db, 'HSA', statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) - + + if hsa_activity_found: dform.post_process_data(db, 'COPY') dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) @@ -398,7 +403,7 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'HIP') dform.gen_table_bins(db, 'HIP', statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) - + dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) @@ -414,12 +419,15 @@ def fill_ops_db(table_name, db, indir): dep_id = 0 for (to_pid, dep_str) in dep_dict.items(): if 'inv' in dep_str: continue + if not 'to' in dep_str: continue + + to_us_dict = dep_str['to'] + from_us_list = dep_str['from'] from_pid = dep_str['pid'] tid_list = dep_str['tid'] - from_us_list = dep_str['from'] - to_us_dict = dep_str['to'] corr_id_list = [] if 'id' in dep_str: corr_id_list = dep_str['id'] + db.flow_json(dep_id, from_pid, tid_list, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) dep_id += len(tid_list) diff --git a/src/core/metrics.h b/src/core/metrics.h index cb55d189..f1532dcf 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -184,7 +184,11 @@ class MetricsDict { xml_->AddConst("top.const.metric", "SIMD_NUM", agent_info->simds_per_cu * agent_info->cu_num); xml_->AddConst("top.const.metric", "SE_NUM", agent_info->se_num); ImportMetrics(agent_info, "const"); - ImportMetrics(agent_info, agent_info->gfxip); + if (std::string("gfx906") == agent_info->name) { + ImportMetrics(agent_info, agent_info->name); + } else { + 
ImportMetrics(agent_info, agent_info->gfxip); + } ImportMetrics(agent_info, "global"); } } diff --git a/src/core/profile.h b/src/core/profile.h index 223e2e5e..9ed03375 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -140,13 +140,14 @@ class Profile { if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start"); status = api->hsa_ven_amd_aqlprofile_stop(&profile_, &stop); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_stop"); + hsa_status_t rd_status = HSA_STATUS_ERROR; #ifdef AQLPROF_NEW_API - hsa_status_t rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); + if (profile_.type == HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC) { + rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); + } #if 0 // Read API returns error if disabled if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); #endif -#else - hsa_status_t rd_status = HSA_STATUS_ERROR; #endif // Set completion signal diff --git a/src/xml/xml.h b/src/xml/xml.h index 933cd2b6..31ed100b 100644 --- a/src/xml/xml.h +++ b/src/xml/xml.h @@ -49,6 +49,7 @@ class Xml { std::string tag; nodes_t nodes; opts_t opts; + const level_t* copy; }; typedef std::vector nodes_vec_t; typedef std::map map_t; @@ -239,7 +240,7 @@ class Xml { if (error) { fprintf(stderr, "XML PreProcess failed, line '%s'\n", buf); - exit(1); + abort(); } lseek(fd_, 0, SEEK_SET); @@ -252,9 +253,9 @@ class Xml { token_t token = (remainder.size()) ? 
remainder : NextToken(); remainder.clear(); - // token_t token1 = token; - // token1.push_back('\0'); - // std::cout << "> " << &token1[0] << std::endl; + // token_t token1 = token; + // token1.push_back('\0'); + // std::cout << ">>> " << &token1[0] << std::endl; // End of file if (token.size() == 0) break; @@ -312,14 +313,18 @@ class Xml { if (token[j] == '=') break; if (j == token.size()) BadFormat(token); token[j] = '\0'; - const char* key = &token[0]; - const char* value = &token[j + 1]; - AddOption(key, value); + const std::string key = &token[0]; + const std::string value = &token[j + 1]; + if (key == "base") { + Inherit(value); + } else { + AddOption(key, value); + } } break; default: std::cout << "XML parser error: wrong state: " << state_ << std::endl; - exit(1); + abort(); } } } @@ -406,11 +411,11 @@ class Xml { token.push_back('\0'); std::cout << "Error: " << file_name_ << ", line " << file_line_ << ", bad XML token '" << &token[0] << "'" << std::endl; - exit(1); + abort(); } void AddLevel(const std::string& tag) { - level_t* level = new level_t; + level_t* level = new level_t{}; level->tag = tag; if (level_) { level_->nodes.push_back(level); @@ -418,11 +423,7 @@ class Xml { } level_ = level; - std::string global_tag; - for (level_t* level : stack_) { - global_tag += level->tag + "."; - } - global_tag += tag; + std::string global_tag = GlobalTag(tag); (*map_)[global_tag].push_back(level_); } @@ -431,8 +432,53 @@ class Xml { stack_.pop_back(); } + void Copy(const level_t* from, level_t* to) { + level_t* level = to; + if (level == NULL) { + AddLevel(from->tag); + level = level_; + level->copy = from; + } + level->opts = from->opts; + + for (auto node : from->nodes) { + bool found = false; + const std::string global_tag = GlobalTag(level->tag) + "." 
+ node->tag; + for (auto item : (*map_)[global_tag]) { + if (node == item->copy) { + found = true; + break; + } + } + if (found == false) Copy(node, NULL); + } + + if (to == NULL) UpLevel(); + } + + void Inherit(const std::string& tag) { + std::string global_tag = GlobalTag(tag); + auto it = map_->find(global_tag); + if (it == map_->end()) { + fprintf(stderr, "Node \"%s\": Base not found \"%s\"\n", level_->tag.c_str(), tag.c_str()); + abort(); + } + for (auto node : it->second) { + Copy(node, level_); + } + } + std::string CurrentLevel() const { return level_->tag; } + std::string GlobalTag(const std::string& tag) const { + std::string global_tag; + for (level_t* level : stack_) { + global_tag += level->tag + "."; + } + global_tag += tag; + return global_tag; + } + void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } const std::string file_name_; diff --git a/test/run.sh b/test/run.sh index d1aa2b88..be59b128 100755 --- a/test/run.sh +++ b/test/run.sh @@ -22,6 +22,7 @@ # THE SOFTWARE. 
################################################################################ +# test filter input test_filter=-1 if [ -n "$1" ] ; then test_filter=$1 From c30ee0e83d177f178c1b6d5fe58121e878d0514c Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 26 Sep 2019 23:35:54 -0500 Subject: [PATCH 061/168] 2.8 update --- bin/rpl_run.sh | 5 +++- src/core/metrics.h | 58 ++++++++++++++++++++++++++++-------- src/core/tracker.h | 2 +- src/util/hsa_rsrc_factory.h | 2 +- src/util/logger.h | 2 +- src/xml/xml.h | 57 ++++++++++++++++++++--------------- test/app/test.cpp | 2 +- test/ctrl/test_hsa.cpp | 2 +- test/run.sh | 4 ++- test/tool/gfx_metrics.xml | 34 +++++++++++++++++++++ test/tool/metrics.xml | 55 ++++++++++++++++++++++++++++++---- test/tool/tool.cpp | 1 - test/util/hsa_rsrc_factory.h | 2 +- 13 files changed, 174 insertions(+), 52 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index eeb62d20..2e0ba8ba 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -81,6 +81,7 @@ fatal() { echo "$0: Error: $1" echo "" usage + exit 1 } error() { @@ -420,11 +421,13 @@ if [ -n "$csv_output" ] ; then rm -f $csv_output fi +RET=0 for name in $input_list; do run $name $OUTPUT_DIR $APP_CMD if [ -n "$ROCPROFILER_SESS" -a -e "$ROCPROFILER_SESS/error" ] ; then echo "Error found, profiling aborted." csv_output="" + RET=1 break fi done @@ -450,4 +453,4 @@ if [ "$DATA_PATH" = "$TMP_DIR" ] ; then fi fi -exit 0 +exit $RET diff --git a/src/core/metrics.h b/src/core/metrics.h index f1532dcf..547156de 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -30,12 +30,14 @@ THE SOFTWARE. 
#include #include #include +#include #include #include #include "core/types.h" #include "util/exception.h" #include "util/hsa_rsrc_factory.h" +#include "util/logger.h" #include "xml/expr.h" #include "xml/xml.h" @@ -186,6 +188,8 @@ class MetricsDict { ImportMetrics(agent_info, "const"); if (std::string("gfx906") == agent_info->name) { ImportMetrics(agent_info, agent_info->name); + } else if (std::string("gfx908") == agent_info->name) { + ImportMetrics(agent_info, agent_info->name); } else { ImportMetrics(agent_info, agent_info->gfxip); } @@ -210,9 +214,16 @@ class MetricsDict { } void ImportMetrics(const util::AgentInfo* agent_info, const std::string& scope) { - auto metrics_list = xml_->GetNodes("top." + scope + ".metric"); + auto arr = xml_->GetNodes("top." + scope + ".metric"); + xml::Xml::node_list_t metrics_list(arr.begin(), arr.end()); + uint32_t metrics_number = metrics_list.size(); + bool do_lookup = true; if (!metrics_list.empty()) { - for (auto node : metrics_list) { + uint32_t it_number = metrics_number; + auto it = metrics_list.begin(); + auto end = metrics_list.end(); + while (it != end) { + auto node = *it; const std::string name = node->opts["name"]; const std::string expr_str = node->opts["expr"]; std::string descr = node->opts["descr"]; @@ -242,20 +253,41 @@ class MetricsDict { AddMetric(name, alias, counter); } } else { - xml::Expr* expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); + xml::Expr* expr_obj = NULL; + try { + expr_obj = new xml::Expr(expr_str, new ExprCache(&cache_)); + } catch(const xml::exception_t& exc) { + if (do_lookup) metrics_list.push_back(node); + else throw(exc); + } + if (expr_obj) { #if 0 - std::cout << "# " << descr << std::endl; - std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; + std::cout << "# " << descr << std::endl; + std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; #endif - counters_vec_t counters_vec; - for (const std::string var : expr_obj->GetVars()) { - auto it = 
cache_.find(var); - if (it == cache_.end()) - EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var - << "' is not found"); - it->second->GetCounters(counters_vec); + counters_vec_t counters_vec; + for (const std::string var : expr_obj->GetVars()) { + auto it = cache_.find(var); + if (it == cache_.end()) { + EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var << "' is not found"); + } + it->second->GetCounters(counters_vec); + } + AddMetric(name, counters_vec, expr_obj); + } + } + + auto cur = it++; + metrics_list.erase(cur); + if (--it_number == 0) { + it_number = metrics_list.size(); + if (it_number < metrics_number) { + metrics_number = it_number; + } else if (it_number == metrics_number) { + do_lookup = false; + } else { + EXC_RAISING(HSA_STATUS_ERROR, "Internal error"); } - AddMetric(name, counters_vec, expr_obj); } } } diff --git a/src/core/tracker.h b/src/core/tracker.h index ffc06b85..c4d619c9 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -83,7 +83,7 @@ class Tracker { static void Destroy() { std::lock_guard lck(glob_mutex_); - if (instance_ != NULL) delete instance_; + if (instance_ != NULL) delete instance_.load(); instance_ = NULL; } diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index f982ddde..af031895 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -226,7 +226,7 @@ class HsaRsrcFactory { static void Destroy() { std::lock_guard lck(mutex_); - if (instance_) delete instance_; + if (instance_) delete instance_.load(); instance_ = NULL; } diff --git a/src/util/logger.h b/src/util/logger.h index 527589f6..8c9cbfd3 100644 --- a/src/util/logger.h +++ b/src/util/logger.h @@ -90,7 +90,7 @@ class Logger { static void Destroy() { std::lock_guard lck(mutex_); - if (instance_ != NULL) delete instance_; + if (instance_ != NULL) delete instance_.load(); instance_ = NULL; } diff --git a/src/xml/xml.h b/src/xml/xml.h index 31ed100b..608f3c54 100644 --- 
a/src/xml/xml.h +++ b/src/xml/xml.h @@ -43,7 +43,10 @@ class Xml { typedef std::vector token_t; struct level_t; - typedef std::vector nodes_t; + typedef std::vector node_vect_t; + typedef std::list node_list_t; + + typedef node_vect_t nodes_t; typedef std::map opts_t; struct level_t { std::string tag; @@ -143,6 +146,7 @@ class Xml { struct print_func { bool fun(const std::string& global_tag, level_t* node) { + std::cout << global_tag << ":" << std::endl; for (auto& opt : node->opts) { std::cout << global_tag << "." << opt.first << " = " << opt.second << std::endl; } @@ -216,14 +220,11 @@ class Xml { if (strncmp(buf, "#include \"", 10) == 0) { for (ind = 0; (ind < size) && (buf[ind] != '\n'); ++ind) {} - if (ind == size) { - fprintf(stderr, "XML PreProcess failed, line size limit %zu\n", kBufSize); - error = true; - break; + if (ind < size) { + buf[ind] = '\0'; + size = ind; + lseek(fd_, pos + ind + 1, SEEK_SET); } - buf[ind] = '\0'; - size = ind; - lseek(fd_, pos + ind + 1, SEEK_SET); for (ind = 10; (ind < size) && (buf[ind] != '"'); ++ind) {} if (ind == size) { @@ -291,6 +292,8 @@ class Xml { if (node_begin) { AddLevel(tag); } else { + Inherit(GetOption("base")); + if (strncmp(CurrentLevel().c_str(), tag, strlen(tag)) != 0) { token.back() = '>'; BadFormat(token); @@ -315,11 +318,7 @@ class Xml { token[j] = '\0'; const std::string key = &token[0]; const std::string value = &token[j + 1]; - if (key == "base") { - Inherit(value); - } else { - AddOption(key, value); - } + AddOption(key, value); } break; default: @@ -437,15 +436,16 @@ class Xml { if (level == NULL) { AddLevel(from->tag); level = level_; - level->copy = from; } + level->copy = from; level->opts = from->opts; for (auto node : from->nodes) { bool found = false; + const std::string name = GetOption("name", node); const std::string global_tag = GlobalTag(level->tag) + "." 
+ node->tag; for (auto item : (*map_)[global_tag]) { - if (node == item->copy) { + if ((name == GetOption("name", item)) || (node == item->copy)) { found = true; break; } @@ -457,14 +457,16 @@ class Xml { } void Inherit(const std::string& tag) { - std::string global_tag = GlobalTag(tag); - auto it = map_->find(global_tag); - if (it == map_->end()) { - fprintf(stderr, "Node \"%s\": Base not found \"%s\"\n", level_->tag.c_str(), tag.c_str()); - abort(); - } - for (auto node : it->second) { - Copy(node, level_); + if (!tag.empty()) { + const std::string global_tag = GlobalTag(tag); + auto it = map_->find(global_tag); + if (it == map_->end()) { + fprintf(stderr, "Node \"%s\": Base not found \"%s\"\n", level_->tag.c_str(), tag.c_str()); + abort(); + } + for (auto node : it->second) { + Copy(node, level_); + } } } @@ -479,7 +481,14 @@ class Xml { return global_tag; } - void AddOption(const std::string& key, const std::string& value) { level_->opts[key] = value; } + void AddOption(const std::string& key, const std::string& value) { + level_->opts[key] = value; + } + std::string GetOption(const std::string& key, const level_t* level = NULL) { + level = (level != NULL) ? level : level_; + auto it = level->opts.find(key); + return (it != level->opts.end()) ? 
it->second : ""; + } const std::string file_name_; unsigned file_line_; diff --git a/test/app/test.cpp b/test/app/test.cpp index 796ba1eb..54067973 100644 --- a/test/app/test.cpp +++ b/test/app/test.cpp @@ -73,7 +73,7 @@ int main(int argc, char** argv) { TestHsa::HsaInstantiate(); - std::thread t[thrs]; + std::vector t(thrs); for (int n = 0; n < thrs; ++n) { t[n] = std::thread(thread_fun, kiter, diter, agents_number); } diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp index d006d19c..3cb5dee7 100644 --- a/test/ctrl/test_hsa.cpp +++ b/test/ctrl/test_hsa.cpp @@ -62,7 +62,7 @@ bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) { if (!hsa_rsrc_->GetGpuAgentInfo(agent_id, &agent_info_)) { agent_info_ = NULL; std::cerr << "> error: agent[" << agent_id << "] is not found" << std::endl; - return NULL; + return false; } } std::clog << "> Using agent[" << agent_info_->dev_index << "] : " << agent_info_->name << std::endl; diff --git a/test/run.sh b/test/run.sh index be59b128..4ba2110a 100755 --- a/test/run.sh +++ b/test/run.sh @@ -30,6 +30,7 @@ fi # test check routin test_status=0 +test_runnum=0 test_number=0 xeval_test() { test_number=$test_number @@ -39,6 +40,7 @@ eval_test() { cmdline=$2 if [ $test_filter = -1 -o $test_filter = $test_number ] ; then echo "$label: \"$cmdline\"" + test_runnum=$((test_runnum + 1)) eval "$cmdline" if [ $? != 0 ] ; then echo "$label: FAILED" @@ -122,5 +124,5 @@ eval_test "libtool test, counter sets" ./test/ctrl #valgrind --tool=massif $tbin #ms_print massif.out. 
-echo "$test_number tests total / $test_status tests failed" +echo "$test_number tests total / $test_runnum tests run / $test_status tests failed" exit $test_status diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index fecfe7b9..698826c6 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -67,3 +67,37 @@ + + + # EA1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index 0b53b72e..c340a439 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -1,12 +1,12 @@ #include "gfx_metrics.xml" - + - + @@ -15,15 +15,16 @@ - + + - + - + @@ -34,7 +35,42 @@ - + + + + + # EA1 + + + + + + + + + + # both EA0 and EA1 should be included + + + + + + + + + + + + + + + + + +# VG20 + +# MI100 + # GPUBusy The percentage of time GPU was busy. @@ -149,6 +185,13 @@ expr=WRITE_SIZE > + # MemWrites32B The total number of effective 32B write transactions to the memory + + # L2CacheHit The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). 
lck(mutex_); - if (instance_) delete instance_; + if (instance_) delete instance_.load(); instance_ = NULL; } From 8fc0d419e07b851c016e69880958f08fea24e7f7 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 18 Oct 2019 20:05:16 -0500 Subject: [PATCH 062/168] Create rocprof.md --- doc/rocprof.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/rocprof.md diff --git a/doc/rocprof.md b/doc/rocprof.md new file mode 100644 index 00000000..b4b7a848 --- /dev/null +++ b/doc/rocprof.md @@ -0,0 +1 @@ +# rocprof From 4adacd14f0c8b031315038321a0274b908cc912c Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 18 Oct 2019 20:51:27 -0500 Subject: [PATCH 063/168] Update rocprof.md --- doc/rocprof.md | 382 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 382 insertions(+) diff --git a/doc/rocprof.md b/doc/rocprof.md index b4b7a848..648648fb 100644 --- a/doc/rocprof.md +++ b/doc/rocprof.md @@ -1 +1,383 @@ # rocprof +## 1. Overview +The rocProf is a command line tool implemented on the top of rocProfiler and rocTracer APIs. Source code for rocProf may be found here: +GitHub: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/bin/rocprof +This command line tool is implemented as a script which is setting up the environment for attaching the profiler and then run the provided application command line. The tool uses two profiling plugins loaded by ROC runtime and based on rocProfiler and rocTracer for collecting metrics/counters, HW traces and runtime API/activity traces. The tool consumes an input XML or text file with counters list or trace parameters and provides output profiling data and statistics in various formats as text, CSV and JSON traces. Google Chrome tracing can be used to visualize the JSON traces with runtime API/activity timelines and per kernel counters data. +## 2. 
Profiling Modes +‘rocprof’ can be used for GPU profiling using HW counters and application tracing. +### 2.1. GPU profiling +GPU profiling is controlled with an input file which defines a list of metrics/counters and a profiling scope. An input file is provided using option ‘-i <input file>’. An output CSV file with a line per submitted kernel is generated. Each line has the kernel name, kernel parameters and counter values. With option ‘--stats’ the kernel execution stats can be generated in CSV format. Currently, profiling has the limitation of serializing submitted kernels. +An example of an input file: +``` + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts + # Perf counters group 2 + pmc : TCC_HIT[0], TCC_MISS[0] + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 +``` +An example of a profiling command line for the ‘MatrixTranspose’ application: +``` +$ rocprof -i input.txt MatrixTranspose +RPL: on '191018_011134' from '/…./rocprofiler_pkg' in '/…./MatrixTranspose' +RPL: profiling '"./MatrixTranspose"' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_011134_9695' +RPL: result dir '/tmp/rpl_data_191018_011134_9695/input0_results_191018_011134' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_011134_9695/input0.xml" + gpu_index = + kernel = + range = + 4 metrics + L2CacheHit, VFetchInsts, VWriteInsts, MemUnitStalled + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] +PASSED! + +ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_191018_011134_9695/input0_results_191018_011134 +RPL: '/…./MatrixTranspose/input.csv' is generated +``` +#### 2.1.1. Counters and metrics +There are two profiling features, metrics and traces. Hardware performance counters are treated as the basic metrics, and formulas can be defined for derived metrics.
+Counters and metrics can be dynamically configured using XML configuration files with counters and metrics tables: + - Counters table entry, basic metric: counter name, block name, event id + - Derived metrics table entry: metric name, an expression for calculation the metric from the counters + +Metrics XML File Example: +``` + + + + . . . + + + + . . . + + + + + +``` +##### 2.1.1.1. Metrics query +Available counters and metrics can be queried by options ‘—list-basic’ for counters and ‘—list-derived’ for derived metrics. The output for counters indicates number of block instances and number of block counter registers. The output for derived metrics prints the metrics expressions. +Examples: +``` +$ rocprof --list-basic +RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCProfiler: rc-file '/…./rpl_rc.xml' +Basic HW counters: + gpu-agent0 : GRBM_COUNT : Tie High - Count Number of Clocks + block GRBM has 2 counters + gpu-agent0 : GRBM_GUI_ACTIVE : The GUI is Active + block GRBM has 2 counters + . . . + gpu-agent0 : TCC_HIT[0-15] : Number of cache hits. + block TCC has 4 counters + gpu-agent0 : TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. + block TCC has 4 counters + . . . + +$ rocprof --list-derived +RPL: on '191018_015911' from '/opt/rocm/rocprofiler' in '/home/evgeny/work/BUILD/0_MatrixTranspose' +ROCProfiler: rc-file '/home/evgeny/rpl_rc.xml' +Derived metrics: + gpu-agent0 : TCC_HIT_sum : Number of cache hits. Sum over TCC instances. + TCC_HIT_sum = sum(TCC_HIT,16) + gpu-agent0 : TCC_MISS_sum : Number of cache misses. Sum over TCC instances. + TCC_MISS_sum = sum(TCC_MISS,16) + gpu-agent0 : TCC_MC_RDREQ_sum : Number of 32-byte reads. Sum over TCC instaces. + TCC_MC_RDREQ_sum = sum(TCC_MC_RDREQ,16) + . . . +``` +##### 2.1.1.2. Metrics collecting +Counters and metrics accumulated per kernel can be collected using input file with a list of metrics, see an example in 2.1. 
+Currently, profiling has the limitation of serializing submitted kernels. +The number of counters which can be dumped by one run is limited by the GPU HW, namely by the number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. +###### 2.1.1.2.1. Blocks instancing +GPU blocks are implemented as several identical instances. To dump counters of a specific instance, square brackets can be used, see an example in 2.1. +The number of block instances can be queried, see 2.1.1.1. +###### 2.1.1.2.2. HW limitations +The number of counters which can be dumped by one run is limited by the GPU HW, namely by the number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. + - Metrics groups + +To dump a list of metrics exceeding HW limitations, the metrics list can be split into groups. +The tool supports automatic splitting into optimal metric groups: +``` +$ rocprof -i input.txt ./MatrixTranspose +RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +RPL: profiling './MatrixTranspose' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_032645_12106' +RPL: result dir '/tmp/rpl_data_191018_032645_12106/input0_results_191018_032645' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_032645_12106/input0.xml" + gpu_index = + kernel = + range = + 20 metrics + Wavefronts, VALUInsts, SALUInsts, SFetchInsts, FlatVMemInsts, LDSInsts, FlatLDSInsts, GDSInsts, VALUUtilization, FetchSize, WriteSize, L2CacheHit, VWriteInsts, GPUBusy, VALUBusy, SALUBusy, MemUnitStalled, WriteUnitStalled, LDSBankConflict, MemUnitBusy + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] + +Input metrics out of HW limit.
Proposed metrics group set: + group1: L2CacheHit VWriteInsts MemUnitStalled WriteUnitStalled MemUnitBusy FetchSize FlatVMemInsts LDSInsts VALUInsts SALUInsts SFetchInsts FlatLDSInsts GPUBusy Wavefronts + group2: WriteSize GDSInsts VALUUtilization VALUBusy SALUBusy LDSBankConflict + +ERROR: rocprofiler_open(), Construct(), Metrics list exceeds HW limits + +Aborted (core dumped) +Error found, profiling aborted. +``` + - Collecting with multiple runs + +To collect several metric groups, a full application replay is used by defining several ‘pmc:’ lines in the input file, see 2.1. + +### 2.2. Application tracing +Supported application tracing includes runtime API and GPU activity tracing. +Supported runtimes are: ROCr (HSA API) and HIP +Supported GPU activity: kernel execution, async memory copy, barrier packets. +The trace is generated in JSON format compatible with Chrome tracing. +The trace consists of several sections with timelines for API trace per thread and GPU activity. The timeline events show the event name and parameters. +Supported options: ‘--hsa-trace’, ‘--hip-trace’, ‘--sys-trace’, where ‘sys trace’ is for HIP and HSA combined trace. +#### 2.2.1. HIP runtime trace +The trace is generated by option ‘--hip-trace’ and includes HIP API timelines and GPU activity at the runtime level. +#### 2.2.2. ROCr runtime trace +The trace is generated by option ‘--hsa-trace’ and includes ROCr API timelines and GPU activity at AQL queue level. It can also provide counters per kernel. +#### 2.2.3. KFD driver trace +This trace is planned; it will include Thunk API trace and memory allocations/migration tracing. +#### 2.2.4. Code annotation +Application code annotation is supported. +Start/stop API is supported to programmatically control the profiling. +A ‘roctx’ library provides the annotation API. Annotation is visualized in the JSON trace as a separate "Markers and Ranges" timeline section. +##### 2.2.4.1.
Start/stop API +``` +// Tracing start API +void roctracer_start(); + +// Tracing stop API +void roctracer_stop(); +``` +##### 2.2.4.2. rocTX basic markers API +``` +// A marker created by given ASCII message +void roctxMark(const char* message); + +// Returns the 0 based level of a nested range being started by given message associated to this range. +// A negative value is returned on the error. +int roctxRangePush(const char* message); + +// Marks the end of a nested range. +// Returns the 0 based level of the range. +// A negative value is returned on the error. +int roctxRangePop(); +``` +### 2.3. Multiple GPUs profiling +The profiler supports profiling of multiple GPUs and provides the GPU id for counters and kernels data in the CSV output file. Also, the GPU id is indicated for the respective GPU activity timeline in the JSON trace. +## 3. Profiling control +Profiling can be controlled by specifying a profiling scope, by filtering trace events and specifying interesting time intervals. +### 3.1. Profiling scope +Counters profiling scope can be specified by GPU id list, kernel name substrings list and dispatch range. +Supported range formats examples: "3:9", "3:", "3". You can see an example of an input file in 2.1. +### 3.2. Tracing control +Tracing can be filtered by event names using the profiler input file and by enabling interesting time intervals with a command line option. +#### 3.2.1. Filtering traced APIs +A list of traced API names can be specified in the profiler input file. +An example of an input file line for ROCr runtime trace (HSA API): +hsa: hsa_queue_create hsa_amd_memory_pool_allocate +#### 3.2.2. Tracing time period +Trace can be dumped periodically with initial delay, dumping period length and rate: +``` +--trace-period +``` +### 3.3. Concurrent kernels +Concurrent kernels profiling is not currently supported; it is a planned feature. Kernels are serialized. +### 3.4. Multi-processes profiling +Multi-processes profiling is not currently supported. +### 3.5.
Errors logging +Profiler errors are logged to global logs: +``` +/tmp/aql_profile_log.txt +/tmp/rocprofiler_log.txt +/tmp/roctracer_log.txt +``` +## 4. 3rd party visualization tools +‘rocprof’ is producing JSON trace compatible with Chrome Tracing, which is an internal trace visualization tool in Google Chrome. +### 4.1. Chrome tracing +Good review can be found by the link: https://aras-p.info/blog/2017/01/23/Chrome-Tracing-as-Profiler-Frontend/ +## 5. Command line options +The command line options can be printed with option ‘-h’: +``` +$ rocprof -h +RPL: on '191018_023018' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package. +Full path: /opt/rocm/rocprofiler/bin/rocprof +Metrics definition: /opt/rocm/rocprofiler/lib/metrics.xml + +Usage: + rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + # Perf counters group 2 + pmc : WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 + + Input file .xml format, for single profiling run: + + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + + + # Filter by dispatches range, GPU index and kernel names + + + -o - output CSV file [.csv] + -d - 
directory where profiler store profiling data including traces [/tmp] + The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. + + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] + --ctx-wait - to wait for outstanding contexts on profiler exit [on] + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + + --stats - generating kernel execution stats, file .stats.csv + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible + Generated files: .hsa_stats.txt .json + Traced API list can be set by input .txt or .xml files. + Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + + + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: + +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. + An example of 'rpl_rc.xml': + +``` +## 6. Publicly available counters and metrics +The following counters are publicly available for commercially available VEGA10/20 GPUs. 
+ +Counters: +``` +• GRBM_COUNT : Tie High - Count Number of Clocks +• GRBM_GUI_ACTIVE : The GUI is Active +• SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) +• SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) +• SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) +• SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) +• SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) +• SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) +• SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) +• SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) +• SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd) +• SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) +• TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. +• TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. +• TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. +• TCC_HIT[0-15] : Number of cache hits. 
+• TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. +• TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. +• TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. +• TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. +• TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) +• TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests +• TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. +``` + +The following derived metrics have been defined and the profiler metrics XML specification can be found at: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/test/tool/metrics.xml. + +Metrics: +``` +• TA_BUSY_avr : TA block is busy. Average over TA instances. +• TA_BUSY_max : TA block is busy. Max over TA instances. +• TA_BUSY_min : TA block is busy. Min over TA instances. +• TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. +• TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. +• TCC_HIT_sum : Number of cache hits. Sum over TCC instances. +• TCC_MISS_sum : Number of cache misses. Sum over TCC instances. +• TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. +• TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. +• TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. +• TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. 
+• TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. +• TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. +• FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• GPUBusy : The percentage of time GPU was busy. +• Wavefronts : Total wavefronts. +• VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). +• SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). +• VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. +• SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). +• VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. +• FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. +• LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. +• FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). +• GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). 
+• VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). +• VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• Mem32Bwrites : +• FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). +• MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). +• MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). +• WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). +• ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). +• LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). 
+``` From b7a108677ec367c31ea14d8c43423e67ad7c4925 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 22 Nov 2019 17:57:59 -0600 Subject: [PATCH 064/168] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 82f525dc..b8f26bd2 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,9 @@ Options: --heartbeat - to print progress heartbeats [0 - disabled] --stats - generating kernel executino stats, file .stats.csv + --roctx-trace - to enable rocTX trace: + Will show the application code instrumentation rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace + "Markers and Ranges" section. Application code needs to be explicitely instrumented with rocTX events. --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing 'HCC_HOME' env var is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing From 908ea02e256818f27065777c44c136003abb07a8 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 22 Nov 2019 18:47:13 -0600 Subject: [PATCH 065/168] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b8f26bd2..3eabe823 100644 --- a/README.md +++ b/README.md @@ -140,9 +140,6 @@ Options: --heartbeat - to print progress heartbeats [0 - disabled] --stats - generating kernel executino stats, file .stats.csv - --roctx-trace - to enable rocTX trace: - Will show the application code instrumentation rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace - "Markers and Ranges" section. Application code needs to be explicitely instrumented with rocTX events. 
--hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing 'HCC_HOME' env var is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing @@ -155,6 +152,11 @@ Options: + + --roctx-trace - to enable rocTX applicatin code annotation trace; should be use in addition to the trace optins above. + Will show the application code annotation with rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace + "Markers and Ranges" section. + Application code needs to be explicitely instrumented using rocTX events APIs. Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: From b49880d25093ae3c1bf52296eff2d03e510de011 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 22 Nov 2019 18:48:09 -0600 Subject: [PATCH 066/168] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3eabe823..e1bc47f3 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ Options: Will show the application code annotation with rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace "Markers and Ranges" section. Application code needs to be explicitely instrumented using rocTX events APIs. + See roctracer documentation on rocTX API details. Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:/home/evgeny: From 54aaa3da88fff0f7fc336e3730f15689afd47a73 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 22 Nov 2019 18:48:48 -0600 Subject: [PATCH 067/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e1bc47f3..2f63df0b 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,7 @@ Options: - --roctx-trace - to enable rocTX applicatin code annotation trace; should be use in addition to the trace optins above. + --roctx-trace - to enable rocTX applicatin code annotation trace; should be use in addition to the trace options above. Will show the application code annotation with rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace "Markers and Ranges" section. Application code needs to be explicitely instrumented using rocTX events APIs. From 4e0158a8efeb4de4df7666ae306dcf56f3eb5c15 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sat, 23 Nov 2019 00:25:30 -0600 Subject: [PATCH 068/168] rocm2.10 update --- bin/dform.py | 5 +++ bin/rpl_run.sh | 56 +++++++++++++++++++-------- bin/tblextr.py | 83 ++++++++++++++++++++++++++++++++++++---- src/core/metrics.h | 14 ++++++- src/core/rocprofiler.cpp | 4 +- 5 files changed, 133 insertions(+), 29 deletions(-) diff --git a/bin/dform.py b/bin/dform.py index 1e5c63b1..f797d637 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -24,6 +24,11 @@ def gen_api_json_trace(db, table, start_us, outfile): db.dump_json('B', table, outfile) db.execute('DROP VIEW B') +def gen_ext_json_trace(db, table, start_us, outfile): + db.execute('create view B as select Name as name, pid, tid, (BeginNs/1000 - %d) as ts, ((EndNs - BeginNs)/1000) as dur from %s order by ts asc;' % (start_us, table)); + db.dump_json('B', table, outfile) + db.execute('DROP VIEW B') + def gen_ops_json_trace(db, table, base_pid, start_us, outfile): db.execute('create view B as select "Index", Name as name, ("dev-id" + 
%d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); db.dump_json('B', table, outfile) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 2e0ba8ba..a6299b66 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -45,6 +45,7 @@ if [ -z "$HCC_HOME" ] ; then fi # runtime API trace +ROCTX_TRACE=0 HSA_TRACE=0 SYS_TRACE=0 HIP_TRACE=0 @@ -142,7 +143,6 @@ usage() { echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." echo " -t - to change the temporary directory [/tmp]" echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." - echo " -m - file defining custom metrics to use in-place of defaults." echo "" echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" @@ -151,6 +151,7 @@ usage() { echo " --heartbeat - to print progress heartbeats [0 - disabled]" echo "" echo " --stats - generating kernel execution stats, file .stats.csv" + echo " --roctx-trace - to enable rocTX trace" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" echo " --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible" echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" @@ -214,23 +215,22 @@ run() { fi API_TRACE="" - if [ "$HSA_TRACE" = 1 ] ; then - API_TRACE="hsa" + if [ "$ROCTX_TRACE" = 1 ] ; then + API_TRACE=${API_TRACE}":roctx" fi - if [ "$SYS_TRACE" = 1 ] ; then - API_TRACE="sys" + if [ "$HSA_TRACE" = 1 ] ; then + API_TRACE=${API_TRACE}":hsa" fi if [ "$HIP_TRACE" = 1 ] ; then - if [ -z "$API_TRACE" ] ; then - API_TRACE="hip"; - else - API_TRACE="all" - fi + 
API_TRACE=${API_TRACE}":hip" + fi + if [ "$SYS_TRACE" = 1 ] ; then + API_TRACE=${API_TRACE}":sys" fi + if [ -n "$API_TRACE" ] ; then - API_TRACE=$(echo $API_TRACE | sed 's/all//') - if [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE; fi - if [ "$API_TRACE" = "hip" -o "$API_TRACE" = "sys" ] ; then + export ROCTRACER_DOMAIN=$API_TRACE + if [ "$HSA_TRACE" = 0 ] ; then OUTPUT_LIST="$ROCP_OUTPUT_DIR/" fi export HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" @@ -241,7 +241,6 @@ run() { redirection_cmd="2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" fi - #unset ROCP_OUTPUT_DIR CMD_LINE="$APP_CMD $redirection_cmd" eval "$CMD_LINE" } @@ -259,6 +258,29 @@ merge_output() { done } +convert_time_val() { + local time_maxumim_us=$((0xffffffff)) + local __resultvar=$1 + eval "local val=$"$__resultvar + val_m=`echo $val | sed -n "s/^\([0-9]*\)m$/\1/p"` + val_s=`echo $val | sed -n "s/^\([0-9]*\)s$/\1/p"` + val_ms=`echo $val | sed -n "s/^\([0-9]*\)ms$/\1/p"` + val_us=`echo $val | sed -n "s/^\([0-9]*\)us$/\1/p"` + if [ -n "$val_m" ] ; then val_us=$((val_m*60000000)) + elif [ -n "$val_s" ] ; then val_us=$((val_s*1000000)) + elif [ -n "$val_ms" ] ; then val_us=$((val_ms*1000)) + fi + + if [ -z "$val_us" ] ; then + error_message="invalid time value format ($val)" + elif [ "$val_us" -gt "$time_maxumim_us" ] ; then + error_message="time value exceeds maximum supported ($val > ${time_maxumim_us}us)" + else + eval $__resultvar="'$val_us'" + fi +} + +################################################################################################ # main echo "RPL: on '$time_stamp' from '$PKG_DIR' in '$RUN_DIR'" # Parsing arguments @@ -289,9 +311,6 @@ while [ 1 ] ; do if [ "$OUTPUT_DIR" = "-" ] ; then DATA_PATH=$TMP_DIR fi - elif [ "$1" = "-m" ] ; then - unset ROCP_METRICS - export ROCP_METRICS="$2" elif [ "$1" = "--list-basic" ] ; then export ROCP_INFO=b eval "$PKG_DIR/tool/ctrl" @@ -326,6 +345,9 @@ while [ 1 ] ; do ARG_VAL=0 export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 + elif [ "$1" = 
"--roctx-trace" ] ; then + ARG_VAL=0 + ROCTX_TRACE=1 elif [ "$1" = "--hsa-trace" ] ; then ARG_VAL=0 export ROCP_TIMESTAMP_ON=1 diff --git a/bin/tblextr.py b/bin/tblextr.py index 057e984a..895f41b1 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -36,7 +36,8 @@ OPS_PID = 1 HSA_PID = 2 HIP_PID = 3 -GPU_BASE_PID = 4 +EXT_PID = 4 +GPU_BASE_PID = 5 max_gpu_id = -1 START_US = 0 @@ -186,10 +187,66 @@ def fill_kernel_db(table_name, db): db.insert_entry(table_handle, val_list) ############################################################# -# fill HSA DB -hsa_table_descr = [ +# Fill Ext DB +ext_table_descr = [ + ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'Index'], + {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'Index':'INTEGER'} +] +def fill_ext_db(table_name, db, indir, trace_name, api_pid): + file_name = indir + '/' + trace_name + '_trace.txt' + ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(.*)$') + + if not os.path.isfile(file_name): return 0 + + range_stack = {} + + record_id = 0 + table_handle = db.add_table(table_name, ext_table_descr) + with open(file_name, mode='r') as fd: + for line in fd.readlines(): + record = line[:-1] + m = ptrn_val.match(record) + if m: + tms = int(m.group(1)) + pid = m.group(2) + tid = m.group(3) + cid = int(m.group(4)) + msg = m.group(5) + + rec_vals = [] + + if cid != 2: + rec_vals.append(tms) + rec_vals.append(tms + 1) + rec_vals.append(api_pid) + rec_vals.append(tid) + rec_vals.append(msg) + rec_vals.append(record_id) + + if cid == 1: + if not pid in range_stack: range_stack[pid] = {} + pid_stack = range_stack[pid] + if not tid in pid_stack: pid_stack[tid] = [] + rec_stack = pid_stack[tid] + rec_stack.append(rec_vals) + continue + + if cid == 2: + pid_stack = range_stack[pid] + rec_stack = pid_stack[tid] + rec_vals = rec_stack.pop() + rec_vals[1] = tms + + db.insert_entry(table_handle, rec_vals) + record_id += 1 + + return 1 +############################################################# + 
+# Fill API DB +api_table_descr = [ ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index'], - {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} + {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER'} ] def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): file_name = indir + '/' + api_name + '_api_trace.txt' @@ -211,14 +268,15 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep START_US = 0 record_id = 0 - table_handle = db.add_table(table_name, hsa_table_descr) + table_handle = db.add_table(table_name, api_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): record = line[:-1] m = ptrn_val.match(record) if m: rec_vals = [] - for ind in range(1,7): + rec_len = len(api_table_descr[0]) + for ind in range(1,rec_len): rec_vals.append(m.group(ind)) rec_vals[2] = api_pid rec_vals.append(record_id) @@ -332,7 +390,8 @@ def fill_ops_db(table_name, db, indir): outfile = sys.argv[1] infiles = sys.argv[2:] indir = re.sub(r'\/[^\/]*$', r'', infiles[0]) -inext = re.sub(r'^[^\.]*', r'', infiles[0]) +inext = re.sub(r'\s+$', r'', infiles[0]) +inext = re.sub(r'^.*(\.[^\.]+)$', r'\1', inext) dbfile = '' csvfile = '' @@ -358,6 +417,8 @@ def fill_ops_db(table_name, db, indir): with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) + ext_trace_found = fill_ext_db('rocTX', db, indir, 'roctx', EXT_PID) + hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) hsa_activity_found = fill_copy_db('COPY', db, indir) @@ -366,10 +427,13 @@ def fill_ops_db(table_name, db, indir): fill_kernel_db('A', db) - any_trace_found = hsa_trace_found | hip_trace_found + any_trace_found = ext_trace_found | hsa_trace_found | hip_trace_found if any_trace_found: db.open_json(jsonfile) + if ext_trace_found: + 
db.label_json(EXT_PID, "Markers and Ranges", jsonfile) + if hsa_trace_found: db.label_json(HSA_PID, "CPU HSA API", jsonfile) if hsa_activity_found: @@ -382,6 +446,9 @@ def fill_ops_db(table_name, db, indir): for ind in range(0, int(max_gpu_id) + 1): db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) + if ext_trace_found: + dform.gen_ext_json_trace(db, 'rocTX', START_US, jsonfile) + if len(var_table) != 0: dform.post_process_data(db, 'A', csvfile) dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') diff --git a/src/core/metrics.h b/src/core/metrics.h index 547156de..57ec7c31 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -171,11 +171,20 @@ class MetricsDict { const_iterator_t Begin() const { return cache_.begin(); } const_iterator_t End() const { return cache_.end(); } + std::string GetAgentName() const { return agent_name_; } + + xml::Xml::nodes_t GetNodes() const { + auto nodes_vec = GetNodes(agent_name_); + auto global_vec = GetNodes("global"); + nodes_vec.insert(nodes_vec.end(), global_vec.begin(), global_vec.end()); + return nodes_vec; + } + + private: xml::Xml::nodes_t GetNodes(const std::string& scope) const { return (xml_ != NULL) ? xml_->GetNodes("top." 
+ scope + ".metric") : xml::Xml::nodes_t(); } - private: MetricsDict(const util::AgentInfo* agent_info) : xml_(NULL), agent_info_(agent_info) { const char* xml_name = getenv("ROCP_METRICS"); if (xml_name != NULL) { @@ -186,11 +195,13 @@ class MetricsDict { xml_->AddConst("top.const.metric", "SIMD_NUM", agent_info->simds_per_cu * agent_info->cu_num); xml_->AddConst("top.const.metric", "SE_NUM", agent_info->se_num); ImportMetrics(agent_info, "const"); + agent_name_ = agent_info->name; if (std::string("gfx906") == agent_info->name) { ImportMetrics(agent_info, agent_info->name); } else if (std::string("gfx908") == agent_info->name) { ImportMetrics(agent_info, agent_info->name); } else { + agent_name_ = agent_info->gfxip; ImportMetrics(agent_info, agent_info->gfxip); } ImportMetrics(agent_info, "global"); @@ -327,6 +338,7 @@ class MetricsDict { xml::Xml* xml_; const util::AgentInfo* agent_info_; + std::string agent_name_; cache_t cache_; static map_t* map_; diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 090e5492..cbfbc432 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -738,9 +738,7 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_info( case ROCPROFILER_INFO_KIND_METRIC: { const rocprofiler::MetricsDict* dict = rocprofiler::GetMetrics(agent_info->dev_id); - auto nodes_vec = dict->GetNodes(agent_info->gfxip); - auto global_vec = dict->GetNodes("global"); - nodes_vec.insert(nodes_vec.end(), global_vec.begin(), global_vec.end()); + auto nodes_vec = dict->GetNodes(); for (auto* node : nodes_vec) { const std::string& name = node->opts["name"]; From 8914bf04559ea18401612570cd9df15d9b370d29 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 27 Nov 2019 17:37:54 -0600 Subject: [PATCH 069/168] Update LICENSE --- LICENSE | 1 + 1 file changed, 1 insertion(+) diff --git a/LICENSE b/LICENSE index 9e78331e..8384c985 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,5 @@ Copyright (c) 2018 Advanced 
Micro Devices, Inc. All rights reserved. +[MITx11 License] Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 8703877d2373c530c83945ccd96aeda3867ea966 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 3 Dec 2019 16:47:39 -0600 Subject: [PATCH 070/168] custom metrics file rocprof option; profiled cmd line quoting option; --- bin/rpl_run.sh | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index a6299b66..6a7bbf80 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -106,6 +106,7 @@ usage() { echo " --verbose - verbose mode, dumping all base counters used in the input metrics" echo " --list-basic - to print the list of basic HW counters" echo " --list-derived - to print the list of derived metrics with formulas" + echo " --cmd-qts - quoting profiled cmd line [on]" echo "" echo " -i <.txt|.xml file> - input file" echo " Input file .txt format, automatically rerun application for every pmc line:" @@ -143,6 +144,7 @@ usage() { echo " The data directory is renoving autonatically if the directory is matching the temporary one, which is the default." echo " -t - to change the temporary directory [/tmp]" echo " By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory." + echo " -m - file defining custom metrics to use in-place of defaults." 
echo "" echo " --basenames - to turn on/off truncating of the kernel full function names till the base ones [off]" echo " --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off]" @@ -311,6 +313,9 @@ while [ 1 ] ; do if [ "$OUTPUT_DIR" = "-" ] ; then DATA_PATH=$TMP_DIR fi + elif [ "$1" = "-m" ] ; then + unset ROCP_METRICS + export ROCP_METRICS="$2" elif [ "$1" = "--list-basic" ] ; then export ROCP_INFO=b eval "$PKG_DIR/tool/ctrl" @@ -366,6 +371,10 @@ while [ 1 ] ; do elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 + elif [ "$1" = "--cmd-qts" ] ; then + if [ "$2" = "off" ] ; then + CMD_QTS=0 + fi else break fi @@ -404,14 +413,18 @@ else csv_output=$RUN_DIR/${input_base}.csv fi -APP_CMD="" -for i in `seq 1 $#`; do - if [ -n "$APP_CMD" ] ; then - APP_CMD=$APP_CMD" " - fi - eval "arg=\${$i}" - APP_CMD=$APP_CMD\"$arg\" -done +# Profiled cmd line string +APP_CMD=$* +if [ "$CMD_QTS" = 1 ] ; then + APP_CMD="" + for i in `seq 1 $#`; do + if [ -n "$APP_CMD" ] ; then + APP_CMD=$APP_CMD" " + fi + eval "arg=\${$i}" + APP_CMD=$APP_CMD\"$arg\" + done +fi echo "RPL: profiling '$APP_CMD'" echo "RPL: input file '$INPUT_FILE'" From b4d4c031fd54630a64b49a7f339bff739f77cd0e Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 17 Dec 2019 01:58:30 -0600 Subject: [PATCH 071/168] 3.0 update --- CMakeLists.txt | 2 +- bin/dform.py | 20 ++- bin/rpl_run.sh | 64 ++++++++-- bin/sqlitedb.py | 12 +- bin/tblextr.py | 76 ++++++++--- bin/txt2xml.sh | 27 ++-- inc/rocprofiler.h | 3 +- src/core/intercept_queue.h | 25 ++-- src/core/rocprofiler.cpp | 12 +- src/core/tracker.h | 8 +- src/util/hsa_rsrc_factory.cpp | 59 +++++++++ src/util/hsa_rsrc_factory.h | 19 +++ test/CMakeLists.txt | 3 + test/app/intercept_test_stand.cpp | 2 +- test/ocl/SimpleConvolution | Bin 0 -> 132704 bytes test/ocl/SimpleConvolution_Kernels.cl | 175 ++++++++++++++++++++++++++ test/run.sh | 8 +- test/tool/tool.cpp | 19 +-- test/util/hsa_rsrc_factory.cpp | 97 +++++++++++--- 
test/util/hsa_rsrc_factory.h | 51 +++++--- 20 files changed, 577 insertions(+), 105 deletions(-) create mode 100755 test/ocl/SimpleConvolution create mode 100644 test/ocl/SimpleConvolution_Kernels.cl diff --git a/CMakeLists.txt b/CMakeLists.txt index 18bbee13..8b81b5a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -126,7 +126,7 @@ set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/pos ## RPM package specific variables set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) -set ( CPACK_RPM_PRE_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) +set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) include ( CPack ) diff --git a/bin/dform.py b/bin/dform.py index f797d637..93194608 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -1,13 +1,18 @@ #!/usr/bin/python from sqlitedb import SQLiteDB -def post_process_data(db, table_name, outfile = ''): +def gen_message(outfile): + if outfile != '': + print("File '" + outfile + "' is generating") + +def post_process_data(db, table_name, outfile = ''): # db.add_data_column('A', 'DispDurNs', 'INTEGER', 'BeginNs - DispatchNs') # db.add_data_column('A', 'ComplDurNs', 'INTEGER', 'CompleteNs - EndNs') # db.add_data_column('A', 'TotalDurNs', 'INTEGER', 'CompleteNs - DispatchNs') # db.add_data_column(table_name, 'TimeNs', 'INTEGER', 'BeginNs - %d' % start_ns) db.add_data_column(table_name, 'DurationNs', 'INTEGER', 'EndNs - BeginNs') if outfile != '': db.dump_csv(table_name, outfile) + gen_message(outfile) def gen_data_bins(db, outfile): db.execute('create view C as select Name, Calls, TotalDurationNs, TotalDurationNs/Calls as AverageNs, TotalDurationNs*100.0/(select sum(TotalDurationNs) from %s) as Percentage from %s order by TotalDurationNs desc;' % ('B', 'B')); @@ -18,24 +23,29 @@ def gen_table_bins(db, table, outfile, name_var, dur_ns_var): db.execute('create view B as 
select (%s) as Name, count(%s) as Calls, sum(%s) as TotalDurationNs from %s group by %s' % (name_var, name_var, dur_ns_var, table, name_var)) gen_data_bins(db, outfile) db.execute('DROP VIEW B') + gen_message(outfile) def gen_api_json_trace(db, table, start_us, outfile): - db.execute('create view B as select "Index", Name as name, pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (start_us, table)); + db.execute('create view B as select "Index", Name as name, pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') + gen_message(outfile) def gen_ext_json_trace(db, table, start_us, outfile): - db.execute('create view B as select Name as name, pid, tid, (BeginNs/1000 - %d) as ts, ((EndNs - BeginNs)/1000) as dur from %s order by ts asc;' % (start_us, table)); + db.execute('create view B as select Name as name, pid, tid, (BeginNs/1000 - %d) as ts, ((EndNs - BeginNs)/1000) as dur from %s;' % (start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') + gen_message(outfile) def gen_ops_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') + gen_message(outfile) def gen_kernel_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s order by ts asc;' % (base_pid, start_us, table)); + db.execute('create view B as select "Index", 
KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_us, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') + gen_message(outfile) ############################################################################################## diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 6a7bbf80..86383d14 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -46,6 +46,7 @@ fi # runtime API trace ROCTX_TRACE=0 +KFD_TRACE=0 HSA_TRACE=0 SYS_TRACE=0 HIP_TRACE=0 @@ -53,6 +54,9 @@ HIP_TRACE=0 # Generate stats GEN_STATS=0 +# Quoting profiled cmd line +CMD_QTS=1 + export PATH=.:$PATH # enable error logging @@ -91,6 +95,13 @@ error() { exit 1 } +error_message="" +errck() { + if [ -n "$error_message" ]; then + fatal "$1 : $error_message" + fi +} + # usage method usage() { bin_name=`basename $0` @@ -109,7 +120,7 @@ usage() { echo " --cmd-qts - quoting profiled cmd line [on]" echo "" echo " -i <.txt|.xml file> - input file" - echo " Input file .txt format, automatically rerun application for every pmc line:" + echo " Input file .txt format, automatically rerun application for every profiling features line:" echo "" echo " # Perf counters group 1" echo " pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize" @@ -154,6 +165,7 @@ usage() { echo "" echo " --stats - generating kernel execution stats, file .stats.csv" echo " --roctx-trace - to enable rocTX trace" + echo " --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" echo " --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible" echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" @@ -167,6 +179,10 @@ usage() { echo " 
" echo " " echo "" + echo " --trace-period - to enable trace with initial delay, with periodic sample length and rate" + echo " Supported time formats: " + echo " --obj-tracking - to turn on/off kernels code objects tracking [off]" + echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." @@ -212,7 +228,6 @@ run() { fi fi mkdir -p "$ROCP_OUTPUT_DIR" - OUTPUT_LIST="$OUTPUT_LIST $ROCP_OUTPUT_DIR/results.txt" fi @@ -220,8 +235,9 @@ run() { if [ "$ROCTX_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":roctx" fi - if [ "$HSA_TRACE" = 1 ] ; then - API_TRACE=${API_TRACE}":hsa" + if [ "$KFD_TRACE" = 1 ] ; then + API_TRACE=${API_TRACE}":kfd" + export LD_PRELOAD="libkfdwrapper64.so libhsakmt.so.1" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" @@ -230,11 +246,12 @@ run() { API_TRACE=${API_TRACE}":sys" fi - if [ -n "$API_TRACE" ] ; then + if [ "$HSA_TRACE" = 1 ] ; then + export ROCTRACER_DOMAIN=$API_TRACE":hsa" + export HSA_TOOLS_LIB="$HSA_TOOLS_LIB $TTLIB_PATH/libtracer_tool.so" + elif [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE - if [ "$HSA_TRACE" = 0 ] ; then - OUTPUT_LIST="$ROCP_OUTPUT_DIR/" - fi + OUTPUT_LIST="$ROCP_OUTPUT_DIR/" export HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" fi @@ -245,6 +262,8 @@ run() { CMD_LINE="$APP_CMD $redirection_cmd" eval "$CMD_LINE" + + unset LD_PRELOAD } merge_output() { @@ -353,6 +372,11 @@ while [ 1 ] ; do elif [ "$1" = "--roctx-trace" ] ; then ARG_VAL=0 ROCTX_TRACE=1 + elif [ "$1" = "--kfd-trace" ] ; then + ARG_VAL=0 + export ROCP_TIMESTAMP_ON=1 + GEN_STATS=1 + KFD_TRACE=1 elif [ "$1" = "--hsa-trace" ] ; then ARG_VAL=0 export ROCP_TIMESTAMP_ON=1 @@ -368,6 +392,26 @@ while [ 1 ] ; do export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 HIP_TRACE=1 + elif [ "$1" = "--trace-period" ] ; 
then + period_expr="^\([^:]*\):\([^:]*\):\([^:]*\)$" + period_ck=`echo "$2" | sed -n "s/"${period_expr}"/ok/p"` + if [ -z "$period_ck" ] ; then + fatal "Wrong option '$1 $2'" + fi + period_delay=`echo "$2" | sed -n "s/"${period_expr}"/\1/p"` + period_len=`echo "$2" | sed -n "s/"${period_expr}"/\2/p"` + period_rate=`echo "$2" | sed -n "s/"${period_expr}"/\3/p"` + convert_time_val period_delay + errck "Option '$ARG_IN', delay value" + convert_time_val period_len + errck "Option '$ARG_IN', length value" + convert_time_val period_rate + errck "Option '$ARG_IN', rate value" + export ROCP_CTRL_RATE="$period_delay:$period_len:$period_rate" + elif [ "$1" = "--obj-tracking" ] ; then + if [ "$2" = "on" ] ; then + export ROCP_OBJ_TRACKING=1 + fi elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 @@ -475,9 +519,7 @@ if [ -n "$csv_output" ] ; then else python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST fi - if [ "$?" -eq 0 ] ; then - echo "RPL: '$csv_output' is generated" - else + if [ "$?" 
-ne 0 ] ; then echo "Data extracting error: $OUTPUT_LIST'" fi fi diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index e02d4136..cd649e6a 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -6,7 +6,7 @@ class SQLiteDB: def __init__(self, file_name): self.connection = sqlite3.connect(file_name) self.tables = {} - self.json_arg_list_enabled = 0 + self.section_index = 0 def __del__(self): self.connection.close() @@ -83,6 +83,9 @@ def _get_raw_by_id(self, table_name, req_id): raise Exception('Index is not unique, table "' + table_name + '"') return list(raws[0]) + def table_get_raws(self, table_name): + return self._get_raws(table_name) + # dump CSV table def dump_csv(self, table_name, file_name): if not re.search(r'\.csv$', file_name): @@ -111,7 +114,8 @@ def label_json(self, pid, label, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: - fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name"}\n' %(label, pid)); + fd.write(',{"args":{"name":"%s %s"},"ph":"M","pid":%s,"name":"process_name"}\n' %(self.section_index, label, pid)); + self.section_index += 1 def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): if not re.search(r'\.json$', file_name): @@ -137,9 +141,9 @@ def dump_json(self, table_name, data_name, file_name): name_ptrn = re.compile(r'(name|Name)') table_fields = self._get_fields(table_name) - table_raws = self._get_raws_indexed(table_name) + table_raws = self._get_raws(table_name) data_fields = self._get_fields(data_name) - data_raws = self._get_raws_indexed(data_name) + data_raws = self._get_raws(data_name) with open(file_name, mode='a') as fd: table_raws_len = len(table_raws) diff --git a/bin/tblextr.py b/bin/tblextr.py index 895f41b1..490cdb8b 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -32,15 +32,20 @@ # SQ_WAVES (4096) # SQ_INSTS_VMEM_RD (36864) -COPY_PID = 0 
-OPS_PID = 1 -HSA_PID = 2 -HIP_PID = 3 -EXT_PID = 4 -GPU_BASE_PID = 5 +EXT_PID = 0 +COPY_PID = 1 +HIP_PID = 2 +HSA_PID = 3 +KFD_PID = 4 +OPS_PID = 5 +GPU_BASE_PID = 6 +NONE_PID = -1 + max_gpu_id = -1 START_US = 0 +hsa_activity_found = 0 + # dependencies dictionary dep_dict = {} kern_dep_list = [] @@ -166,6 +171,8 @@ def dump_csv(file_name): if ind != dispatch_number: fatal("Dispatch #" + ind + " index mismatch (" + dispatch_number + ")\n") val_list = [entry[var] for var in var_list] fd.write(','.join(val_list) + '\n'); + + print("File '" + file_name + "' is generating") ############################################################# # fill kernels DB @@ -249,6 +256,12 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER'} ] def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): + global hsa_activity_found + copy_raws = [] + if (hsa_activity_found): copy_raws = db.table_get_raws('COPY') + copy_csv = '' + copy_index = 0 + file_name = indir + '/' + api_name + '_api_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') ptrn_ac = re.compile(r'hsa_amd_memory_async_copy') @@ -288,6 +301,16 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_from_us_list.append(from_us) dep_tid_list.append(int(rec_vals[3])) dep_id_list.append(record_id) + + if len(copy_raws) != 0: + copy_data = list(copy_raws[copy_index]) + args_str = rec_vals[5] + args_str = re.sub(r'\(', r'', args_str) + args_str = re.sub(r'\).*$', r'', args_str) + copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + rec_vals[4] + ', ' + args_str + copy_csv += str(copy_index) + ', ' + copy_line + '\n' + copy_index += 1 + record_id += 1 else: fatal(api_name + " bad record: '" + record + "'") @@ -295,11 +318,18 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, 
dep_list, dep db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) record_id += 1 - if not dep_pid in dep_dict: dep_dict[dep_pid] = {} - dep_dict[dep_pid]['pid'] = api_pid - dep_dict[dep_pid]['tid'] = dep_tid_list - dep_dict[dep_pid]['from'] = dep_from_us_list - if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + if dep_pid != NONE_PID: + if not dep_pid in dep_dict: dep_dict[dep_pid] = {} + dep_dict[dep_pid]['pid'] = api_pid + dep_dict[dep_pid]['tid'] = dep_tid_list + dep_dict[dep_pid]['from'] = dep_from_us_list + if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + + if copy_csv != '': + file_name = os.environ['PWD'] + '/results_mcopy.csv' + with open(file_name, mode='w') as fd: + print("File '" + file_name + "' is generating") + fd.write(copy_csv) return 1 ############################################################# @@ -337,6 +367,8 @@ def fill_copy_db(table_name, db, indir): else: fatal("async-copy bad record: '" + record + "'") dep_dict[COPY_PID]['to'] = dep_to_us_dict + + return 1 ############################################################# # fill HCC ops DB @@ -419,29 +451,35 @@ def fill_ops_db(table_name, db, indir): ext_trace_found = fill_ext_db('rocTX', db, indir, 'roctx', EXT_PID) - hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) + kfd_trace_found = fill_api_db('KFD', db, indir, 'kfd', KFD_PID, NONE_PID, [], {}, 0) + hsa_activity_found = fill_copy_db('COPY', db, indir) + hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) ops_filtr = fill_ops_db('OPS', db, indir) hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) fill_kernel_db('A', db) - any_trace_found = ext_trace_found | hsa_trace_found | hip_trace_found + any_trace_found = ext_trace_found | kfd_trace_found | hsa_trace_found | hip_trace_found if any_trace_found: db.open_json(jsonfile) if ext_trace_found: db.label_json(EXT_PID, 
"Markers and Ranges", jsonfile) + if hip_trace_found: + db.label_json(HIP_PID, "CPU HIP API", jsonfile) + if hsa_trace_found: db.label_json(HSA_PID, "CPU HSA API", jsonfile) + + if kfd_trace_found: + db.label_json(KFD_PID, "CPU KFD API", jsonfile) + if hsa_activity_found: db.label_json(COPY_PID, "COPY", jsonfile) - if hip_trace_found: - db.label_json(HIP_PID, "CPU HIP API", jsonfile) - if any_trace_found and max_gpu_id >= 0: for ind in range(0, int(max_gpu_id) + 1): db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" + str(ind), jsonfile) @@ -474,6 +512,12 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) + if kfd_trace_found: + statfile = re.sub(r'stats', r'kfd_stats', statfile) + dform.post_process_data(db, 'KFD') + dform.gen_table_bins(db, 'KFD', statfile, 'Name', 'DurationNs') + dform.gen_api_json_trace(db, 'KFD', START_US, jsonfile) + if any_trace_found: for (to_pid, dep_str) in dep_dict.items(): if 'bsp' in dep_str: diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh index 27bbe8c4..126337ed 100755 --- a/bin/txt2xml.sh +++ b/bin/txt2xml.sh @@ -64,29 +64,32 @@ parse() { gpu_index=$line fi else - output=$outdir/input${index}.xml - header="# $timestamp '$output' generated with '$0 $*'" - echo $header > $output + found=$(echo $feature | sed -n "/^\(pmc\|sqtt\|hsa\)$/ p") + if [ -n "$found" ] ; then + output=$outdir/input${index}.xml + header="# $timestamp '$output' generated with '$0 $*'" + echo $header > $output - if [ "$feature" == "pmc" ] ; then - line=`echo "$line" | sed -e "s/ /,/g"` - cat >> $output <> $output < EOF - fi + fi - if [ "$feature" == "sqtt" ] ; then - cat >> $output <> $output < EOF - fi + fi - if [ "$feature" == "hsa" ] ; then - cat >> $output <> $output < EOF + fi fi fi diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index b59acfdf..313f7f42 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -64,6 +64,7 @@ uint32_t rocprofiler_version_minor(); 
typedef struct { uint32_t intercept_mode; + uint32_t code_obj_tracking; uint32_t memcopy_tracking; uint32_t trace_size; uint32_t trace_local; @@ -222,7 +223,7 @@ typedef struct { hsa_signal_t completion_signal; // Completion signal const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet const char* kernel_name; // Kernel name - uint64_t kernel_object; // Kernel object pointer + uint64_t kernel_object; // Kernel object address const amd_kernel_code_t* kernel_code; // Kernel code pointer int64_t thread_id; // Thread id const rocprofiler_dispatch_record_t* record; // Dispatch record diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 5a6234ab..f639b3e5 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -148,9 +148,20 @@ class InterceptQueue { } // Prepareing dispatch callback data - const amd_kernel_code_t* kernel_code = GetKernelCode(dispatch_packet); - const uint64_t kernel_symbol = kernel_code->runtime_loader_kernel_symbol; - const char* kernel_name = GetKernelName(kernel_symbol); + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + + const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1); + if (kernel_object_flag == 0) { + if (!util::HsaRsrcFactory::IsExecutableTracking()) { + fprintf(stderr, "Error: V3 code object detected - code objects tracking should be enabled\n"); + abort(); + } + } + const char* kernel_name = (util::HsaRsrcFactory::IsExecutableTracking()) ? + util::HsaRsrcFactory::GetKernelName(kernel_object) : + GetKernelName(kernel_code->runtime_loader_kernel_symbol); + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, obj->queue_, @@ -159,7 +170,7 @@ class InterceptQueue { completion_signal, dispatch_packet, kernel_name, - kernel_symbol, + kernel_object, kernel_code, syscall(__NR_gettid), (tracker_entry) ? 
tracker_entry->record : NULL}; @@ -243,14 +254,14 @@ class InterceptQueue { return static_cast((*header >> HSA_PACKET_HEADER_TYPE) & header_type_mask); } - static const amd_kernel_code_t* GetKernelCode(const hsa_kernel_dispatch_packet_t* dispatch_packet) { + static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) { const amd_kernel_code_t* kernel_code = NULL; hsa_status_t status = util::HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( - reinterpret_cast(dispatch_packet->kernel_object), + reinterpret_cast(kernel_object), reinterpret_cast(&kernel_code)); if (HSA_STATUS_SUCCESS != status) { - kernel_code = reinterpret_cast(dispatch_packet->kernel_object); + kernel_code = reinterpret_cast(kernel_object); } return kernel_code; } diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index cbfbc432..61fd4619 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -144,7 +144,8 @@ void * tool_handle = NULL; // Return true if intercepting mode is enabled enum { DISPATCH_INTERCEPT_MODE = 0x1, - MEMCOPY_INTERCEPT_MODE = 0x2 + CODE_OBJ_TRACKING_MODE = 0x2, + MEMCOPY_INTERCEPT_MODE = 0x4, }; uint32_t LoadTool() { uint32_t intercept_mode = 0; @@ -188,6 +189,7 @@ uint32_t LoadTool() { util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; + if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE; if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; } @@ -432,7 +434,13 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa // Loading a tool lib and setting of intercept mode const uint32_t intercept_mode_mask = rocprofiler::LoadTool(); - if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) intercept_mode = true; + if (intercept_mode_mask & rocprofiler::DISPATCH_INTERCEPT_MODE) { + intercept_mode = true; + } + 
if (intercept_mode_mask & rocprofiler::CODE_OBJ_TRACKING_MODE) { + if (intercept_mode == false) EXC_RAISING(HSA_STATUS_ERROR, "code objects tracking without intercept mode enabled"); + rocprofiler::util::HsaRsrcFactory::EnableExecutableTracking(table); + } if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) { hsa_status_t status = hsa_amd_profiling_async_copy_enable(true); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "hsa_amd_profiling_async_copy_enable"); diff --git a/src/core/tracker.h b/src/core/tracker.h index c4d619c9..e366c761 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -105,9 +105,10 @@ class Tracker { entry->record = record; // Creating a proxy signal - status = hsa_api_.hsa_signal_create(1, 0, NULL, &(entry->signal)); + const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1; + status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); // Adding antry to the list @@ -210,9 +211,6 @@ class Tracker { amd_signal_t* prof_signal_ptr = reinterpret_cast(entry->signal.handle); orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; - - const hsa_signal_value_t new_value = hsa_api_.hsa_signal_load_relaxed(orig) - 1; - if (signal_value != new_value) EXC_ABORT(HSA_STATUS_ERROR, "Tracker::Complete bad signal value"); hsa_api_.hsa_signal_store_screlease(orig, signal_value); } } diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index a47062dd..65f94357 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ 
b/src/util/hsa_rsrc_factory.cpp @@ -193,6 +193,8 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; + hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; @@ -231,6 +233,8 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; + hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; hsa_api_.hsa_system_get_info = hsa_system_get_info; hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; @@ -337,6 +341,11 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); + // GFX8 and GFX9 SGPR/VGPR block sizes + agent_info->sgpr_block_dflt = (strcmp(agent_info->gfxip, "gfx8") == 0) ? 
1 : 2; + agent_info->sgpr_block_size = 8; + agent_info->vgpr_block_size = 4; + // Set GPU index agent_info->dev_index = gpu_list_.size(); gpu_list_.push_back(agent_info); @@ -681,10 +690,60 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } +const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { + std::lock_guard lck(mutex_); + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); + abort(); + } + return strdup(it->second); +} + +void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { + std::lock_guard lck(mutex_); + executable_tracking_on_ = true; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; +} + +hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { + hsa_symbol_kind_t value = (hsa_symbol_kind_t)0; + hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); + CHECK_STATUS("Error in getting symbol info", status); + if (value == HSA_SYMBOL_KIND_KERNEL) { + uint64_t addr = 0; + uint32_t len = 0; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); + CHECK_STATUS("Error in getting kernel object", status); + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); + CHECK_STATUS("Error in getting name len", status); + char *name = new char[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + CHECK_STATUS("Error in getting kernel name", status); + name[len] = 0; + auto ret = symbols_map_->insert({addr, name}); + if (ret.second == false) { + delete[] ret.first->second; + ret.first->second = name; + } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t 
HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) { + std::lock_guard lck(mutex_); + if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + CHECK_STATUS("Error in iterating executable symbols", status); + return hsa_api_.hsa_executable_freeze(executable, options);; +} + std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; hsa_pfn_t HsaRsrcFactory::hsa_api_{}; +bool HsaRsrcFactory::executable_tracking_on_ = false; +HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index af031895..bf7f5fcf 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -95,6 +95,8 @@ struct hsa_pfn_t { decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(hsa_executable_freeze)* hsa_executable_freeze; decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; + decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; decltype(hsa_system_get_info)* hsa_system_get_info; decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; @@ -160,6 +162,11 @@ struct AgentInfo { // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; + + // SGPR/VGPR block sizes + uint32_t sgpr_block_dflt; + uint32_t sgpr_block_size; + uint32_t vgpr_block_size; }; // HSA timer class @@ -323,6 +330,11 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + // Enable executables 
loading tracking + static bool IsExecutableTracking() { return executable_tracking_on_; } + static void EnableExecutableTracking(HsaApiTable* table); + static const char* GetKernelName(uint64_t addr); + // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); static const hsa_pfn_t* HsaApi() { return &hsa_api_; } @@ -387,6 +399,13 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; + // Executables loading tracking + typedef std::map symbols_map_t; + static symbols_map_t* symbols_map_; + static bool executable_tracking_on_; + static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); + // HSA runtime API table static hsa_pfn_t hsa_api_; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7f128e86..9212f2af 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -88,3 +88,6 @@ set ( TEST_LIB_SRC ${TEST_DIR}/tool/tool.cpp ${UTIL_SRC} ) add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) + +## Copy OCL test +execute_process ( COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test" ) diff --git a/test/app/intercept_test_stand.cpp b/test/app/intercept_test_stand.cpp index de3dbdaf..4f46f65e 100644 --- a/test/app/intercept_test_stand.cpp +++ b/test/app/intercept_test_stand.cpp @@ -73,7 +73,7 @@ void dump_context_entry(context_entry_t* entry) { const rocprofiler_dispatch_record_t* record = entry->data.record; fflush(stdout); - fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str()); + fprintf(stdout, "kernel-object(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str()); if (record) fprintf(stdout, ", 
gpu-id(%u), time(%lu,%lu,%lu,%lu)", HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, record->dispatch, diff --git a/test/ocl/SimpleConvolution b/test/ocl/SimpleConvolution new file mode 100755 index 0000000000000000000000000000000000000000..be4c1332a279c55c80b0461daa2fb94f1f7344e7 GIT binary patch literal 132704 zcmb?^349bq_J0S0Mnoqny77+g=z=0997;Ua2}z)%K_kbSRWS)UNFx`{^=1-!`r`>m>;>2zjTcmKc1CsX}iRlR!e z)vH&ps;hhE=j4s+myn=YpMKgOHA0PJJpv`aHLYl0w2Vg^qaCDqv{dbQ?EsV-fPbr; zM_pOzURBLXlTenQ{`gPC|CoM+n0lB*(&GfAX;x~r&Xny%KOS`{)8rRqNh{T~RFO|C zdEK;1=5Jb6EbvzP0WJu6LC{9CCdA7$hJWR*X;p9c_O4-+k0 zd`DB3Q7G|u;GBFEYh+*GjTBs z|J4d{jY?~gdhuV6|JnF&!2h-Qw?5b5;(Gkg#s3ZXZ^Zus{NIRw>*L48P557k|A0aU zk%rXuZ7OX>dI$dRQm~cWgNu9dzgQveL;6?z-;e(#_j6EKH!{xjk>7&?{}QH_OA;rJu~H> z?-tBY>|Fbgi#}50y@vG$?iFz7scnF=^fH9aleiTuti_JAb(OtEcYXQ8oY1N6t(+GkeU9kAKv; z`>21eI-~BYL(aMP;D2veT;1|j>Es!|yjnWrpsaTu9rx8SkKteYs`He!Pj7zy+T0m0 z&X|AS+_5=tfBErenTK5CD|oeT?!&J?Qf(i|AC=7Z|4m`I)Bdg(I>2^yDEL)lcR&b+qrM+H3i2V z@!9B250tFh0JX*Z1P{@)!(-$;76-p`K5Pg)|Mr2g%U8wG1IKABJ_pBXcUv4fug1Y|jw9D6 zaqySH3CH56J5Ils$HD(2j$9iD#im~nhyS66#xDP09Qt?1p|duQyjgJ0vE&^Tr~LRh z{oWI&{K0YfIW>+RIPb*b=chRM6XW0)#u+a=Gg*VZ`YcgE@W9dY*{JArAi0ICS>L;pfCS{rg!QdwvUwiPgV(aqxG>;peD0 zdiyp``LS{Mc_)rPxjat!Yvbsxe;hiO#<9cW;_!cV9Q@cg{azAhynGf%pFhOuugl}~ z?~FL|J{m_qH^kvH4g93y-}$MH)9$%({O$X3_`e{I-bR9-RA-}r6VWfR@Mpy7uao2G z`Jp&?R~&v;q3u}qe@q-bB*gKPcORx{CuzyrlD}jM=~BeEru{>-d#ILny+>e1pCU8X z!=Im|9jmpiFBXVYiPr8{cH4NKM`2`gCGafYxv_WoJHY(O+M!y__p&_aKYm_Q?RH-7 z5pXlcAwTts&l<(2RsN4IO*;(yx6P3JLyh9|B=Dr)a*V{^qf7iTpmUgZjMk|5cdPRA zp&#<8ZI*mCD*WjvPdXhHlFmeh9}asK_^Ty;xWZqJei!m?kaP+ZKKDrW*P&YKryha1 zSP4)HI~3&=KMktwOCO@p}r4M@-F$oBQS3(d=>OZ`>fd{%Qq@~0D7i9w5ond zQFsk>l4H<62K;l}6(~DLcZkm)pa-7_u6IuCMQUIt{O+FPkIERLeY0Cxtw+~0{US;wyFL?b@6G&c%i&$ zO8*M29f1NYzr<1gZ@`n!v`Q)0{feI+^ds?3Km3l>UrRiK^wVnGnMz(w`G>Pq!}lxw zIQ#ue;Kew7PSW{SmA@B?qWvc;`?vhLhJD0KYRYD1<#F#YT`Q>?TVGjKnqONzy|#EJ z!0~0?tm;|bvIcMNtnzBDFn98V!qT$Zvgws|-m==s6SAtRXO&GZo>or1I8e{9r%gbt`#dgh+EU8Cy-luoQ 
zR?aG}y;k;VoF1K2=9Tp`Yo~*v66&(rN^hBx3ITwCa^h3Iv;B{?~xORB2t z%0!LHe?tCfS*6Yk)0`Qvl2%way`iD7rmVKEdRB2&rT5yx*=fJM%~91DK{@#sk)M%; zxnQ0{YG!q5nV?ZUvZmgfQ-W*3(Qgn!6z!#~5l~!RWlVitG|4-D3A9U!2Cy{mk`^M|)DyNGMApz*;zot-E52LQlnK=`dQPNOc zSW{nHRyez|)>~g(1)8N*`83JNliA_QCgUrURz1nf9`sh!R?jJ{Dw{RkTTxh6TU%W# z6@-ytWAeZ2BvPkc{Ra)B)GRo3DNb=!RdosUTUgdmQdZ-wtez!UOh2ozkfwo=Vh!I| zLu_JHYE?RSa^d8(!kkHG73NF1Pnn z)U{wN#BnW-fmAu#TUJ#yz0bJ(jl=5f+A=uR*=0Et3%!8;k|s5+xU{gOxXvpa<^Q0q zvrDRH))d#4MF;F}v1_Y)COJHI)_>)(BUJ3uM*c;!qZ7yfGK6LSqdz%@X#%6`%F1R; z*|m`Q+3D{Gl+*Vu3G)sb`&a%!@(J~PVr`R~WIz4mW3k%%_`Yo~{~c|u`r zSxr@O2@Gr|RVRnSfAB1$#7vV9y+&0RnQPMvE&ude3`(~5-+F6C8`1fOOwXF5SjTk2 zKtja~nA4nmo^Y1jU z$Q(VUv|3KlDkQLXa#Vx=$)!-Os2S1fenKKkEZY?G+uO-bqlpWJmDiT_o-yp4|3>p< zUzNZad&_brijj$Vq6A*p(ewWWe^UMbBP)nL87WM>XwoP~s0dw0mvdzzhqc`;{6_gk z*V^Zu>Z-b&N-nXcmy{IN$rV=Nlv#5yH6eVjtDjlM%n}tj7uFys^OhHuRLs!oFeR00 z<+M1hoO1^H03lJy4CWS=7gttkwPmEEVV=ZfslgW^uCV5Jtk){NS~*>wHnVJI$;=v& z)Lg5Ti}+ZZscO}g6wfLLsp3+cBblL<&tYV*p{I*WwdrMEZ{^Ig>hjWJED{RqZ0v#N z%<9=?z>-lrnq6I0 z&(P)Ux-xHlO;(=lvls@AkAqO25`+N4Xy=HwNQ z%gvjdGqG@F!^pxKSk4HnoGyXvrIoV^>+8x&dzTvBFq#a(b9c08xjI5o>Jpby-eje!d9*V>FST##y5hYpv_lVlDT77{X$1nH1GARFgHc zl-%T#ms;T@RS%qv%yICNk;PgsQFUIx zET$*tMuZyKOUrhN($8{M9aT|QRTF<4jH<1lb#e8qNoD9M?$~7I$+f+*uo%o+MjDq{ z6&+&%V-<`l7u81g${5kBoXjd5 zF?>|yDm{8RV)*C?8ckwvxgSoECgN`Z(gETs5$ARLvS9DwpxNw&gLxoiehcc9j#;A=$r1GL==_71%hcUsYRM}ZhVlyiaNm+;`qNdFrj>{*`DNY5XTQ#} z;XBoRG2(e{*!o!aJ&C`~LL%){_d|*2d1LEi-KQg-XZNg+b$^riJ1rzq>zn}b)N|yc z?oSc_bOZ@qsQWU+x7zU5{Yv5&+wj)?ed6cX@F{IlKe(+A-A6yEHat2s@)=^o%aI_n z(rkF^ECO@J*zj!I`gm-3b?ZuG={7uhvOf7Xyt*|nvI=Z?bbRDfWW(dALF7|m!_zBR zpBfup-SQJz4K_RsG4g4&;c<&D@>yuZ`jj;+sP8y>gxBA+ETJnth~pQScD zZrw#bEjGM$50aU!Hav!02 z{yZCATN}0iKicp~Hv9!Pe6kJivEfr}_)Hr<)rKEy!w<3HvuyY@8$R2HA7jJk*zg`3 zew+=j+wkLU_!&lhw`8Ir|4PRixUv0w|+3+)L z_zD}o#fH~?Unc3n0eAaP(sX~DcYyKc0h*@!)+epA6~blD=o?X_=aeRVyH51rim>3+ z?CinklNMPfdUs%9$Mk{*Ew1II~T_I|(y%Gg}0_gD^uj zbBTc8C(KaIY!>hq!VJ;OMghMaBC}b*6A7P6xKY3t5@rZw)(Chk;UR>J1bhzR 
zp@j1VJc2Mo9n&M=(+D%fG1CNmGGT@`W~zXXA3`0l!a}A&J>6;4Oq1ikOW8 zeuXeY5VJ4gr5hm{Yjf zCg7cfIdz*Y0^UKGQ?|K8!0!|0RBbj3cne`p#b%>`Um?sX*sKxoTEd)q%_0G>Aj~P( z%op&pggMok9sxf^m{Y8oCg4X1b80nH1$-Z2PN`tymlNg`YBmdaBH?nvjRL-qFsDqj zM!;hUa{@Ap1bhzRO2YX99zpnO!X5!XJMSc|ycw|gOqmq;Sr7c6FZgmR<|sYzMX4T4 zJcaH=4>Y+q0hiupyn9jtvkX_Arup0IFVcfAyW2$Zoq><^;DJ5Kz>d~~O%m&CZ6oda z$*gnweXJ8$SAPtl3AYjQjw^Sa*u*m0O3sve=w&w#AREKC0tntJ+AY;b4ibz6w$`7c zFIZ3hu9&)xRHnoI2G(_L;`NLh1y3*WdbMA*^*2J_KCEe7j~PF~YOMzJz&hiVcPZO; z<79aA@@CHas~6zVw9yaVZN)-_2VsJ%d2pHYzqu2=UOwF7H~p*Jz#CWTfgQT<)9(Dq zx$SLy0zz+J!v_-d_I92(H}-(k_Ye`Ix^)>g)TZcXEJdRG_8zJSKAU}PD zjpygybP^w})F5TW}n(CP-L)5?Wc2hRfE0_!5N5n|(b$x|;Y}l1jGBxkK6~#ze zN8w9WwEEZA^O~#aNfe!5=IeRI)pVt(TP_;@S8L5I)E&FxW+bgw;!AOle)jcjcQsug z3Lh;Bf85&2hK_7~9m(87eLXu`pTw2RkMW>U_tO4-J#V<0-bD@9ZJDCpo2{=wE#41T ze2JuWWmH90`CnX3H;eL@it_tfx5Dzg|7h(-GWQ8zPfzQ5T-Bc?`*%Ul?=Wk*no3c4 z{)h!VA7M1Kc0l{Cra9=?1wGq|Xc35VAX?voF4sl-dqGbM>bEWw1#(cp*R!JaNtCI7 zn+<5KX~?ma_w~Hf+6<;$O(~-AyRA0>s$aTG*`u%L!`38`zZc`r*Yi>9570{e!f5`c z)_0&;SJPIJzq$2gLjF=vLmBk%n$~O3JbG2+uWh}OkUu+;|9a~PQGc<>?`TaG_1%&D zU9HJNm5k~`pe;AldqVXd1Yig=z(w2x9EY-jl1^3Z3o)*Kn`YvL^sl8 zJ@9@Wax&1U9z4!X@?B53PHe=0(lT8yCb}0M7{T9W^xG}`yT50W_hdcrvvKDx68uOH ztX-|O!{2`gGOfJ2f_99x-@}*~^F4t)9`CEj>*;WXySvsH51kNfZzyx=8|pw!#;2D(6f`IU5A2K60*&$k`f|4^93!H&#D= z0_n^d7{x{a(O?=WRBCGD(kd%|Y+c*-HDe|uneFeo0?^7ziUiEKqs&-K@ z`}-uB`&Nx*OBw52<-WwXDK#K2_286F;~>=S&BGpaM6PdNvddo&g@Nu9q}!_(T}@SxOG=nz zmvBO;J~=P&l`--gA>jm=OG+Oh??D67FQjkF^j*+1cc?YrNk-na#5Jl=GBbD}1xse`${!`H(iJ^i7BTP#U4wyq8O19+&XaBZt$PNCf)3~vZ zmTG{Rknw$e5G36(#M%kIrC875Dk7HA`jr%A3A+H}DKGF1eQ4&?%&RgB&HKnB<6FV2 zb5tR+LU3x0EOWdBj2~#P!M=r`$H<8keo_L)rZ|N^uv30rQT}V3!s)a;<;YCV&c+)k z+03z+7n+`o!RY%kB`@%Y7*mj|=^p3WiVc6sa}ghs2~qwH?c`5m>JW`pao23)cRH9@v=|_}W-a8^UCi z7yNzK6Vkph?@ZAaPD<{&N5XPmJPHMk1?Zk;%%>}O<6ig{4Q7v(mIrWNsMMWildZ`; z2H5mAPH;)0U-u$v+XHjnn4Or06;if4H81dq9>{hl2anKw>k>mZ_UzlYZ_k#KI!bkG^t-;jqb{RUt6R5w)M{&`U$3FsOU;}lI!M=s6WimuBJS+ z0H>{o=DJf|epBf6N_PrpXX29e_O_PS?0{FPDxBE)CFw#f 
zqAl}RF;}3zsLJ{IAgeYbk!+NUh9b{I>)xQ5W1%S5i`nipq0NP8!dxb(VSOOulbmGp zcodRSVP0jgO6i^_C79o>=zC*4L|vT=z1(_kQYsNOQv=`Vfyod%kfbli6CQZrvJTqS0gc1K>$1|+IVQ)$o zAENrda`^|)e1qp2FBlYprG!$`!p#kt+KLzPCD}iPv5#PNyI~@t6BP9jna+6Edk9(^ zWTb;87#IY%aS{zehm{>Cb|75-!`Y5+eF35tInGyb2R(dpWFC7Al~+^|mx@MZ+|vwP zcsPoP8Aame61Nq>g9N7vR~{osbTvH&V)XvrDtdnQYuguK923@36Co-4)0_fhip|g9 z82o54`1umOAATGBh}a4HA+RHZ77=4IXrSp@E_((5<3W@(js{tqKG6eE=>TH~xFod| zVvNwRRNtTn>p$-LRPt@bU$D~!UmB3fW}i)NYzM(7`avhpo$P9=AAlDVj)!}G2W}rx zE}dwN9;$K|!8A5vU}AK34HoMWc*rFETspg^Oh^YC$)R)octt5lhWWM~;|^gnUiT0h zPJa3p)W$NS%AHTsAHq8b#-W1u$tVJ==d;aK#cU*>=3XSsT|Nyxt6Z9Ubt<1UC@bCQ zNf|Dz9riSY28|lo?gAQgHf-6Q?jng6 z+hMj{Xn>yiZ4}K#G_41FSz~NN%ncksQy631z%faK6u|-by_J014H7fmmC!7Qs4&MF ztbq|WL=Rh*#=&YcOgTFh=_{gkS(D7c8^})*VsP4Vj8bF0Q?4VFTzo;BG(i7+gt)oB z4M}+!RrE6-KySI&lj!4UhJ5OffkQJQg>W>FqkE+k4o4G?IvC64SWZR{)IbJ3SvZ@% zA_S}4Zg#{(n544C?Job7w9-Xy0YhcFU z+{nqV>(#E!MknYRSKzC;?|y_>|5Vr}Lrfp(ZNc$Hq5NNcJAQTf&!r>W+dnt351a5D zxb5Q$I&t>`X+@h%YnC9*432YW2CquR@Je(oTX%jB9oKCSqDJUE+%7@I62rI&OkmjM z2HJB2&-&4GTy5oz(_KyFiO2wQN(7njYFZEE>gz!&C;h9O^iPewPz>xb z(cd<&BrkwC8apojZS%5oOEzF{BsVnd=BT{JjO`3zzG?f;bsGNvJfJ`J{3EIE8rC*8 zko|&&MR-DX#;*AtnZET2nF$|epIhoqtbZyG&Lt@)IIDqDhQS8Jy(%^Hjju3^Lg1vV zQ1%%brq`Unthov830nhm-NP~i^M<*W?ad5~8$zAvB|XO3onm%@>|?gmQk-6TZz$_()Iq5%-63u>G>e=$DDT zm2n}r=ymRaI_DUeriUcFlAU3CFCH5jG*Bo4oIUi1+>*_C zC7WgY(mJ*L>11FO=xIFf=&j!qBKCxf$K?->&3L*In!NjPE?I zyEbMfeed#fXFq+jiZ6qs76wM$qWji>>-pcBv&BU0(N>&{w!`@WYkvI{W{7Q=2P#g& zjMaOquNxg8C)9tUPIMb4{R58icH`)e;AGfIWFPCRk>{yJ!?QqQ(NJNVOlh#mu4Pxc zhY4%dOMW(1e8^d3p02_CJg%mlu$}a_uIDn_$EA8e6seo(Y7O73xRrpZFOis){1S&6 zIQIEsJC1$6XbWkLj@0boy-&EIql9#Qw|7QD8^{;C$>ym9FwLih8=w|es}_4^+l>Pm zZ9up*MAgZ1nooq#PG`M@xB@U0UJ}vkt-v|gKn=w4gom6M{*2K_Jst*q%4N-S`z%hh zTO=o+iFML;86WiKeUcw(3qsd2! 
za|nMs$vYN+*Ath)p4%6ZvPd=Aqc#EV712PiCKo|DPASu3GEizu+8czv_39{K-aeLikxV9{ki^?%PJ;(UH$oP<^1aCe@ zUv51Mh={$@GBe~$T6A_kc+VdfRQoA4z6TpZ2}4I_71`Ph)Scl z8RBK`nlJhruf8QHjpBy<#@xV0;~3ST2PL8nz6BR)HOL&}2-P3~*Ntt)9xCS?>a->CyZ{m;GXfWCnf~lZh20_w;=(`y-}{4 zUSUH97JGW&y$K=r#^}ycZty&}F;aBG%kCl+TQLKA3Qxorm6aEGT40T4OnjCMEM{E7 zs2Y7i01<4S*ax}~gv5SPC8j9n6mSM4b1q?yCJq$ri?0E|h5rJioN_-$*P7p>Nb~Au z%ss|C;6!Ytu=N(@z=Z!4`um--k!F>%gsI5+^=-*NbzBxke8xz){? zBeIbn*y{4{!@X$iKX8h%g-o&cH%Eabr{Jl?IRqaBT=ew~63lgYgIkh-GceFg|A7h5jaxi925OI=GY_ z>&7t50ek+R**IvmAq{V(1b^m81Ev!g<2WpE^Ma3w*$y|#jA2`lxt!eNGhaXbR=0s- zdd4Q#%{P*DRS%~yc&F>ZH^{>Cf|S*WaqZm@v(4q_`K=Yl<11W+W~Jet54QB+B1u$L zlsUg=4tn-gkc^meC)V!L1E9B?nAgwyVp z5(mxM2izjy!pnh_+|{GEO&^+ww2if+$Jo~q>H9ovA;8&s!X$+GgY#X!aR#M=&Xxch z?nF;wiEeC#FAp+0gtKLa@w@GkFagz*Ex42tUydGOPc}Ua!cK9%LZ(xk_lQGvyEq$w z3%?4a6z2)BW!?tEZFUm8KL){tF$j7|@b(D7DI{0|#nT$3n|ax-A=~J}W`gc}*)1YO zmp>0~n_v08ow=V3$rO{i86<+*iu=oRoNT6h|UIC8{%MR|n5 zp2QH;z6JTj#h*SWz;utou(lS)uBGrxCQNr45 zHg>!YwSs#s{0>UdRBMFwXS?;hkb46%z}*+d&qQ)7W}~a=T+B>jf?0#@>~M+X-#AF| zj;Fk}aE-6zx5rk{QK?>t~jHSE~*nm?d2j>be z`Mg+x5Wa{~!boNQH^!^Y(pz~zd&RMo<`zpD<93KiZu;Qp@$N zPtMEu0(Ejc`wec4;-`+08mq^dlHaM4)m20v$cDM(jlI z2$w0t>0l8!6A$I>ungyxkD^A;9Ah(ej@Ah%!^y{`G@Ll| z8@u7m0WLfcG^H{M&^P8!#CC<-*{3pXfMW_JJ{8(!DT%vi0wf>qf8?9&;;Z%hNdw6~+j2 zVuos&%~8R#V^Xir|EsTU=R*poI?p4{q0S+|g$IL_ROdnHX0wjI6wP*`Y}kXoHXivP ziZMnjMwm@Tx|xaz^7I&tv=isJN%b6Y4n`gVF1!~=$w(vAV(#P?6nX*2E1+B*+l#5# zzZ3ld^9+|O+Z+WM$nO#=t^$e^Zu)DiyMxwDhaBS!u^5uYteGyN!L*B&khMP3H z=4wjhH6-jnG10Zy7ABlbJQ^8dihG}-TiqQ3^L}_c$1f$r;fUimNwzpXW%P3eSi-@* z&xCQZO85tqCna%bGxfsYpibVdPBH!oc7!M2;A)cR6ehC_H;@Bs#58Z501eUT-+6^f zl5RjhcdmN~@Ba!MMmdcxFQe0S(?Jj+FNh@?8HKLc`ttYCx?FmY<($CB;Q9}n$>VvS zi@D>y6&j|I@ac%YjE|H!ZJ-ohY-xER2E3(Z<{0e=n#m3UvXhh<*Z_`_!BGxxDW##Q zyx=hRMZUQh94>#D4oPV+v$YL$5i*DO+VuGLMwr5I_*-W{S4An6ELckUHUoNINcP<` z!ix03CVfwb9?I`AJSZzI>+^P-zO2r~?Bd_{6WW>oWAywBX{S=!B^(0}ifsakG13`Z zK}u%RBGQNmy=j5rzx&%Z5G8tVi*bhv?aa*4VJ-GK#t(-Ji3 z)LRgQoqC%?oI`IVz=fwqit$VVPMQPJ|H9$o&E 
z;G#@^3Gq;sJrDV1*Si#L%l;Xc`M-wqGx>#LT_3YvDw|ZBUb1zU|0ojX7sp`{S9;j` z!Liww^Jo8mAq3N%IwG+PB$vH-tGL5mhh zwhG+G0{jw92rFCUZegl)b{7lqONgQ}U~e6kG74CLUsNg|gl0Mo>FO0YQ)6scBQZR4 zKoX6rjb>XLCCD@R#R48xAm3JCEDP|91yWUkG+Tj_Sb$$FutXR>#9*q#;1kv2v1-AC z%77TnyySeg9bud)z8+=K2l@3EMX*SOorm=N3++zuUc;T*euXa1CXN`i1ccVs#NgOT}7h#`!#P zdeaSM3H=7a`U>9Q2@gY5AcLBhgeLE1({VROMag)(q9H%)%FR(c=Cjc#14T*uV04@w ze1WRT3;Bip@a$q>d%FO@h-O^IGNF(#pImUoF2k|GHnF~qbY_B(Fm7j#g^qIk=*;DZ zSQeCtqtY~1!I2*(wIiSjb`cn^~`j9uVD7RB8a|0?KIK7%$JL`Ba|K$G)i=aLcG z+y4d+;tWgp3b#v;cYeZDXeI4e##l!jY2(Vq?t+_CIkC& zWROKa{2|C0UqP*8{_zz^mF4b*gD{#fSQNrYVGtU}W(Tpu9Q%b!iE|u%SOlE&=tCoM zH-JAWaIxa$Xz(J=edxidn{3BXbMev`Jyf>!3hU7(oW1X}-#m!2q-=gUynw!|u^CvfVxWCo(H|RiHqD0U zSk@V-Vy!RQhHsp<-_{>Pb+-O5&S*j5-EspTcRUJV*<25wLUHwnlcQ(@G%74$!3+>(l^VKX|81*`kEd6jDu}@ z>j;UKMNl}YPs?|#-=6~Cd`Y(aM~Iyh%o)aKY;%JzkQUBT<_tmelxRY50v=YlTDw|4 zG;_5!p`b7Lvg`x)%Zjd8`7x@5)ILf;SK;*zN(l#` zhNb=?w+SUILB4TupO$Z3w?8Gk_M%k62HCP3>hIe!-F~FyTT#Pmxj?qOlr6u7(bHSz z!}r_rWYzK=zq#eZZ7ny-mRs2J<$YRSjAJPAN6z~%NSUXi<;>ueqRe1UX++x-3Y6J} zVHQ!T??D|)>K55{E8G4E#@$=$txWhd>K3s$?*0+0<;#y?rKJm zt{-tSP=-e|AcI^iG(KDU@{*x3=@WJ zt^YzV5T*0r(#evng#99kkS*^;R^wRqjsw}fl57ph-r0*RQ6QUYy!ua}5()c75+UmX zS#XuSTj`?R|JL_%#xt@7wN+`zGLonV->SvX38ti_uxw1_&48$O7C4YYEBFGh6**&3 zgP19r4nq$*_sDfN8@ziAi_2OvfJ!4CLCn^c$PHIPpUPs{2;Y~fZWitNlY8Zzi#*H- zO`VwL!Avvs5PqEIQj3wL5XJ$FDS{#s#91av{3j^P842A3qRvR@M&j5Lw$1Wcz=aP4 zQVMhlhXVvU5CX}FP)>PjP0)cIZMnoS-vA5p+zv*z{W;9z*h&>~(dRIeVzm4{xDJnq zwEVVGWoQ}pPr3y0CJ^bno&7SUEg=2q8Rw;>wt#wm*K zhyMV5CGshV-(@gHo{VRsdmzR*1`lyoH0Pkpom{;`oTHc50_W`IXNX(OULFd~S&Zli zPmGba=Gz$y$7y~gG~;akGB)opiY($B%083Lr$ZmoD3W0#RQ9H)q3qAGwBWk&NZRTX z)*{^a1=DPUbi1y6Ci>h(#pX@qVTS}aRwMD>%$3*O)UbS=v zSqpyG*IrYpv!3YE#kh-cXB?f~j9z!@?DsJkbH!lnXVeXkjxhECR7e#p`a2l=?prd} z2#r~@Xt;)aO}Fv&^Gosh`UcWD`3jO(hbhe=4-SV>Lfl``viVWB7=*F)e_jl7Lt>B{ zOx!ahcRR>Anc7W89lf}NI7ctO23(lO3#EZAgFdOO=}$pf{$H$}ivoZv_=eB$VrWYi zU4)wQ>Kx?e1|Q(Q_=*%3>W&y=H40e=D2=r!N8$sK%9@&}DV9r_0kL>8Uxn#7g&9i5 z9U4B8j5{=(KwK+TGWVOlR6cL+5A*wgew-;}Ld=DLijrC+O621*R 
z&wDDjy<|zs)5skRs<0YD3EuX_E|$FQn_`K_ooWUWQ^)~XI^Xa=>N(hU4S!2=Bv+Gei7cwek9385j8M3j9CMMJ<|3tc-6?F4Y@ zG_--lISg$#XoOu6Lwgt6Hhts(%L_=)Kn5}zv`5D}seTuts-#w%3m*;yzA?lpRe0)X%OqP%_0(^s!W)QQilLw#; z?k7a7letC;j6~kGBM60wB%-iC!**z$O}zc<)YzBEbk2lZiF0UdJ#gWfk(uyuFlRPm zGUAn2F2Y6VN_8Kt;d!E8AUYdka5-W(sa|5B zUXeuD{Ru>bdcTB8JJlP4EKc=aOPoW!Wx$0OMbtYE*2Gv~-@|HUE20glfS63wQ<9h? z)9*zzYx2v<+qbCWK86TVehHJ+T`XnYNR#dh?1dECBt zw?tjmgg|Fr;M-_N%zi|wIOOlP`U2wftpAs=?YzJ~_Cq>D)!-_Yplb^3jjq92U-SW; zI2~IKcsAEUXsI*SiB`a_I(5P?Iwa8|Jyau|twb1%AJG^y>4BI;dOQiOIhFYsagKeLC!)-$=y&t*2%mkMbNTsJe(`Gl)$=%CpN=SQbu-yL30IW$XrwfjHy*Nb z4pcc8pfNM)Cm{)aO7t)f-l_v=#$ubemb zE#dh1z*SqJ%T@?uKz2ww26|5J-be%wu_V6u%DR*C2i!$gh6< z5(l34lI>mm`kG%Rzdps6Y_(aMk4UUmN3vD%^)d^;#II-h^$&h6<(K17j}RbVk`jqU z%J>n5`6XLRlSM>g71_Wd{E|hIWf75BMIK=heo+ngsdRRvN{y@{601rztMH3e&P3{n zY;;C2Dfc5EwGuN1fm>dUVculJm4Nej#POM`DPd|V<;-)$u z()^{XubG_lAaDh2B)S84Dsc`oIUKm~5kSgevWvF&5a>z$$hQ3=v4p)5rfc%c{yvt; zQ}od7i_sijr1!IUkSKjO?p+n>!PT9>I^WP@%zyxT2nS~6`|3R5#ZbkHm+{3j;%UZT zIkknn$p1#gSJATaXw%=IPM)$}0~6PSE7^)33brDV&oNO>BhhLm+y>Ht4aPcpK}GjK z5w|(%UaIJt2!x|_mqzFUtr<^?zC_iqA?ePr(Orf(k>?jJ*(AR|{#Ek(Pw*SeN{x`l zE88T)!1FkNaxlw{WN7EJpnQD zx5CbGe{d6WSYV30U25#0Mc}4rxEZ{OlU?X2<7qMCU~W89$MuPjTV!XuJ+cGGfJbrG z>vpM6a$@{`v8BKa?5pDL{WOab!qF!zLhx94bXC#m{)?(>pH)y&*B^4JPhN(mQv*^u9!2I_2F8oKxQQ#9c^wUBB*+zx!j* zTNs0$m$*XG`zz=fQvk}5&}&3Jcn>XZyD=NR*R{RtLp50Nm@sECyh`lDuFkIZt_@x5 zy4G~9=DATG^HmcTU!^QOBHsb~vzWCSFl%w=q1UWcfhUGuU_+R-TXH*a68?s6PF~O1Ca(#BA1B~9YdD1^#hSu0tvT!j z3;%XifOdDTdB@)D&d4*!Qp0fDma}-dD^#<8^~0qw`s} z*s5LMV0#rujB}_@Q!K}Px)?a82u;NCB`^PBKD#moJv|1!QN#sFZ~ybzpcwS_!W^9Z zeM;Poq__Y1>}8a5%DXfMy+y>`NqYOA&#Ge3D~Lfao49|F-u~ya<6_WDib3yt^m+I` z(%b)h_Aiuk%G=7ij+<+b5!XU`oX_BqxD14w7XAo7h%p)Ya;(r>V}dz?W0;N(1L`s^ z0s2fZ58o~)f`g<76$C@UO`vO=V17X%Nq4cNyOIN0(d}Wt)`xDcqPs_W2t~IgLYEVa z@mPfJCP}x%M)%2>bich#^81;cZm(&@xHv*pt#uBuQN1)K)xRpLZ%3)}HP&KL!Y|_| z6p$L=90JWQ$HLdrz^{mi`)XjkqS+p$$ya6ACJ*DW2vyD)pju+1`eaP1--e|8FN3OB z2a1J@TIvL!6MB|2g>62FxunTY-<&yxXF!cZV7igXfZ+$->sGo~)y2ofWTjlOGDiy7 
zK|M_jvKH!x)q>`Z2$}81+X%3vp2UJi`Ro%(2B&3bgA5)2GT1B~|LZ?lq=e(w1tD_w zkqYDNj{p5zF@hY9e>Mm^N7XdooTKVu;`VS<<)BH%cTP&fVo*wnK`D_qzAeT4`NzKe zd`=!6UVIC1PJY6~ts|8eVo+HUgUYQjs5B7A*Jp{HC8tD}$DlGU29*)Sy+JBNWAHLK z1{Ey^l`mlx;jN_d$zH_a{|4w}XftsR<-EXZ4j1?^aob61F-SS7_+n70jzQ%L;=U!7 zi7}|0AA`!!7*viVj;}j02Z4&PY_1rh;Yju^Kx1Wb9PENCi>_^5?{&Q^3yPpht}ov0 zdY>zcu2o$tyOwvo)b&Ewb6x-JTGsUsYkeCW@5YF@?PH8EEVCWgJisSUiB`6y6$3Gn7Toc_)W<`j$z4Ldg{@DpBN z8_XG!9r!5IwQOy6LTBi_{zEs)^J#%Kd}(lIa8MvL_yD}#He>HD-F4cUU74;wCpPbL z6>O^iFw?gdFPz<$lks-_a$sKB|lK=A1k1`6>jDzjIK|u$0~)y+|bDjKFP-sGOsFj8;GUYF%C+4Nrt6 z;r2v|-kz0=`r5{<6il*Nc%ens7~s+*ZVaBq$jaw+=AL(vJp|bW8?y?Cu8~C=G{`s* zc{=hM^mgA8;_=z&Tgn8OHm3OG`hLZWQL-xZkjUET;|$CKolNoB=;I;)eE1eK35f#8 z7sqZzKDWGBBaJC919+LDD9V}}eLN?|{Ct(a(N`ca0pC_xj08|=uY_0}&=4jeQ529a zp$3^(A)&=g+I=-h_}S=VHZjc#gNh4T223N_=wk)ux2b$cD==UfDT*0PAbTXA2x2sJ zIzOPeRTkyXW^MHGoB63Kf1{7DJrvT&Vg!+&AIUEe`QTb%Krw}7ASx2bpz`@RH9z25 zV4wq}D8>Sy*b>Qa75U&=VL-8hWkRAR`cLFT6%xtfD6&*S%%o5f60b5sPe^D8Lo>*Z zL~_DxA+5rYca{nHbR>{n<$F~AMu8`No-!lvOdxwCp9o@_6$V8sSO&Z!f$S>3Mdfew zwF(TRL5gAoQNJUSzg6UeYlcrG4pC)M=Ag%jM~zgK53U6!;M*jNu>kTtk$hd`xBH3| z1{C+oqO7^mS0VDD3Zzs86G$u}W&$dZ&|(QKlu#=Z$c}^`NXsi=S`>!7vrNdh6$xZl z`5h{Mqrj8?CRuDRlXhQsB%cW88%SMEjRZozA@E8lm5PKPXn6Uc^&mM!KX@gCZ4~t$09H zTYf2MU~%$0UKjcd2(F|3R5gPl@__kslS@Sb)r2%-GnhyDLqW7=k3iQg;nm zN&$)uLV2mUG%k`P^JN8HRS+7xhAf>f^J`>&h05p5v0Nuh2h03snZFQ;jis-BlBF^V zElW^65tgcjmX-pgSZcwg(TZBAK%WUI=^a@?u+%`77R!8kP-;Qtiw>I4{07x(If&3r zqL7Xx%2&H@3}Dry2bV^UBuN*F(iKS9B)e{Th$K&u`E;SkU#Rj=VtymFFhJ(hg(80` zlFU{-nV~Je6SZM6pWP@ehFwn4WI6gsJ6D?kD|6|xiTYzyeuOXM62%n_4eij0q8#0&ovQ-CimM`A8b?d2bg(E* zugO-a&E=1fs~;B#uIXTrPglwOKQh0AT)ijr>0pshAIbbESId#l(M9v7^k^I9=pb1R zy{#>OP?)j-lwzzKmqw!`oGL4%Ac+v}xdsgv$$XE@AEWXokYMs6;OZPbZ9gdfsUN7`GrZ;?g+&hqU{W&>PCUlaacb4#9U0ULl$^0l!j_ zx>`7+hv)H>Ndx>_0e=3e2*F}SJ`SMpg_ugsX|vTc_6Q(7Nt9@#Cm9~LC>8J!38w+B z@Br2&oDaC62ylgjYc#`{j}8s~X-IBpUMgvmv*rJyMqiUTI&wVB;cwe4-@=^dk)wxl zQgPeA4R)W|ep4gq;gc8Gog27mA+C&v&f?><^NMmadhm*mhi_mr^E|m3zXG~TLaDhK 
zzX0+J=#P5F&j1<#aGIz=Dphe0F|kyk6fs-$1NJ`k!2h2rK7`JxhVYjZ@&a@nMGx;H zl+Iob<+%4N^3$X3B0n?NF7gryDUt6KkP_JsfFj?3REqp4F?~f2ANqgN=j}Ok6_{_? z!IU={WQ|KC$5=(aI&{3AfO%6ad0OuR-t-`<1n0q+Ql6*V$!VFCiuCivm)z{tW2 z1Ha-oi`L=p*I?{+;IMlSUPaT-_>zj@t7!Pkm;rwym~u6}hTgz0DfzeIt^CR4liMsi z-$K@H%K`Ci@weTN3%vyS+|x*jgD$YDwI8nFm7leV3fTci3~G-v4j+ zUDl9^q_Kg-CvgNbAt&RD`3o}ni%tpry`K!Pd$7wtz6X0|a1MCQH9AP1*}9|w+c8t% zKwoG?hHKdrOdmJs_+2FNW`Z~B#cr%Wh&*{m;8Iwuta&7A;^%x`(@w$=?fr8;w?k(6 zb3O&QivFC>G%u>)ZrFvh!3g8waQ@vExs7vTFp_UEq6>s(ErkQK%O;maee+=9Bc zn{KD0u=&ZamOZ7nnVtJ+t#jR};S@Az{B)B7jb;(QAcOmMY4Wc7t>n(Jn>++9I5#K< z69@V1Pupt1g?GUMZR~tNMM3ICklxG#TkI(`D1OdW{gP)2+T>ngH_94+Cj&_&9M$%d zz)8JNAejLqgYrvQFUHTy-AaK2>Q{&612En}Njbj6Sp^I%J=XcUF*vEvRg22dRo} z?lF)|YfLiz6q)NVF@j7tRAO8lA!A4~XV}PG6O+tJXpG!sNp2Xr8@~f-A!9h@9>LA$ zT$CwpPK!y#tH?Ypxmipyo;r(+N0NESM&=U{gGTizb#sOyGeB~)gk*k<2vs4;6xhhz zACt`IXqiG?CCMx$nP($p7E3aR*~mlId9f{kB$qU4ZD(l z>uvNc-C(HNdh-BYR6Qp0>m)A!Sel@3y^bG%j``k!$3V~7ZZq3(+$5Sy+$a$1x+x}| zD`U{nW6&8z-1(qmp4+p(^Js%&(Ax_$aq{&kap#a;7aVeY{l1KHPC1vxptp!PZd90m z-7meW81xEa(90%n9O+%OUwX&IpqCVb-uLL}a2DzP%(-~DbwA5#Pya$Wr@XDK>*&|V zh|49tzk{BW%1tq-)W)E46>)i_66*m8PYf!=GC9~ztO1LTUTj5V z1Xoi!uGrx59dO2Qd_i!S%|i?FNH!;|Y;i!f=8VthT`_MB|(jv-A$S=`u= zyJJXGxk-tKiLraF|`-()TeCN7q?RTz%HQ%`o^P~muqUEQ5<=-;@1oIU%7WH@kkF_^} zkD}PxhkKGVge}Qc)Ch>91`R4xQ6VC%EGZiVN;NMgSoalnytFIJaYs@nCM5Q03P&mQU;SenXt;;VyN8B5SaZ`r%(BJ4 z88CZ*xsov65X+j%XXT>ed!xq0tT4Y7!F%Vv`sY+v!)Rz`dB(jnI^tx%v0>;l((&$J zY2hVJpzCf;kAo8H{(u9u@5X_3?bMD(U(ekHtp;f0Lk&%hO0Nae3eo$;Mk`rlemlM|BWNEJU@G9kgttdQ$AUFd3uBuZ(Io@3VN_O_<6MUUy`N zn~b`#I$ZPCzvpm`y2EX}?gZ~6kP5-NiB7Iq{Ou-63x9LJ<@dK~^tVH}Ph|zQap)K5 zZ>_M6y>lH3?d|?{3}AMDa}c%*RqOSjTQuJD0zQX8lr2ktjjR71t}pz7jFx@!{jHDV zVrCm3*xUhpyqr0q8~!+;072pp7x>~f4lOteo+S-BS+wA@w}`s1G-A3GF@iSYOCXdU zjL@p#c4*d#-V6PD*p1SkhO#w61u(l&zMBcy8#Kxs^v^TfO!u+1r$wl&5bfz@3^!O^ zpXr^q)9;v{l!j|hkAh})@t155h+vbUT;r%XivRLprU9`*g_-Q(`{%) zY)_wRA7OkP;A8#7UB?gzuHO_j)uAHo=6x3VcJtl~nBBasglRPIaiD_z(}oT6;ppun 
zaF8*n8NDZMxDvLs#iTnSK1`$jz7bSz2NlDmo2hrlzbQn+pSmy)$CTB&f2mKt}_X{ zlU95Sg4n4H4MOFZAXFTLJt5jZsEEE-^a_-|S??20=&r>8Xc4MaoEo}SJdN&*lxZJp zjm9=r9>PF)>*bgzD?ti4LJBQH7vWA@0raxB138ok?6r-mv2}DKbP-%Oim^?XU_M(* zp$Pd%nf~wM!`l16@KiEH>*#{Xj1jjsQWyf=zh!o?{$06g(21hPEM@)1&MjIKD0Dv5 z`9SA=ovS+U?z}6z_<(!U*OA`Se+7K0an*{7uPL@th@TAUIjdMv$>b&}10yG0xqW`*<{TIHFbP5g0 zNrQ3np+GIR`4_mGoHpQmKi*%hSEC$*?wetZL8B7>_7F6Jr2p?j&|P$04nfZhIr_vS zW9aQBpeALQkz>!RUX(+Ds04lJnHbn>MF$K$X`p#4bpsvs5|}F9_Yew<@j#XwuDCH! zb_a~#fXF5q=O4Z4Ue16Sr%ghC%H~ypfeOtY=O2@n7ZMa|5XaSm(47v)V$t5Sg@6sf z#KJypkWnOU;KrDq5REb<1Tk znEt#D7M#qoV*be0@;M{*G5^scbr*&Nk|qbdr1_{$bG%8@95-G7n!)4Bg0i47B~JSY zV+c|`fWeg2?)V-BjHZ2mDPl@AQ;mYQu7Vj@UPnG%;z4-63@q8j-?=w^73sYZf}?-* zudlgU{%H9HUnxsXtmJ+aN8LmyI zo)E7MKo{2I@>XxhEvdm&q|*|nRc|8@Ur1+JCd!ONwUL-76HN&XOyEZ%i)9k?)i4-8 zsKlWVClscU$h5eZixglj2QW@A!Fvz>_&xGfJaf-CRAoV2%&XQOVI5#_lgBuY18mju zgDWym^Nf#g-1}_fj)<1=@!A$qf6EdYP%Nvp7Wc%d(MV#;F?WodOq|ifti#G;oM+rmYeFCiOxXxwzd|B*sJr@&+LU^(0ZNw2==RpWp zYjvL1F%2BA07uN_xIAOsW4bUV5cKFQ+Ulvy*{AKjgy)XVrP^)eg}iBK=-SXlXj8Pm zv^(l&BULq;1nvL6pzx`7x1g}4^Fw+B3?ZIA<)mNTV(mUy!h0lNRNjicMQ{L#J&_vnm;E<{#$>ov~yhXBydmsiQ{Pw?tYc{QW`d0q`>3cQ zjg=ywK@It+XZ=fAt?AWF?+0A_9F>kG^A(>t(TX`;w~CW%Vz$nWc|Bvc?j3CGle2Yl z+U5-OMVz)d8dd74g?dM`HbFMN*QV8NF0&b*7 zSkj{{>10dVrKh#yM03->$cN8mk^u?wtal01nAG`{>E&3}Gl<$MWrZQCnI(FKTWDgm z1uEfcU8bLC_I8`{PFxrLDO$g>O;e6=Yp&r zQl5#D66%tIwo%JMjRgf1>zZWxQl8euE4UtnL*FYqjR|0aG2BE%SbTgAQ+gQlWhF#w z5x`|*0t5y?hmIN_dC+tcV$zpr=iS7?59pr>bilZalRxGgyX%=gWPyu4oWP+Ik$e`M zXdvAk>~`Xs_=FoyBzWphv>b55iB=)aAMsaP(rYZ~wU+coOL~W%*4`Fv3c>=wZ?2L7 z3G%Ev(QYPnKAiXFk506gsHl6&3LB6Dt}(R8VFkj8m_a9!XK72=oIRA?kM`((BvBV& zEI~hdRuIB>4s}jsaU|>b{b-hO0>4eElJKJ+CRzMQa@2)|;${s~ld~@rel&xARINS7 zEBFzFljAe#O1^x6TYg|4@C>@5b*I7(?Hi4VuC)p8Q{5jq^@zGwMpS%G>SC-Pq%>T| z^@??>2&i-Q1&ckoq>1$puoUrB4lT7W4p5+H2*Fhl(1kpLb^MJYpS2Ehc`U7~BpSsB z@Fah5)oBYMfz^O(QODHe#2k`faK$H@A(eJF0q`U#A_4r--tX z5M}SmmQfZESO@Nl+P6kozzvt$O&t7S%+CZm#L;vZ71(b{AF`wsjNAAVQ5NLsP|xbh 
z@5D9BlC9xTly0XHWdS!FY8BG_5r4HMy~dIjPqFAY8!hl1dRnUxEegT{K(j6983cJY zqAZg-4Gn4jM3f~e{D88;2Bd&%RD(tbVU!gJh_d1=Z5blU)(icOA}=QF7qsA0S$Ztk zD`@C)9>G9J%@EP_DJh@M99Y8O>`JN3s}qG!&7g58l{O(u`jiwA z*chBjpK2pF8E(_X2xw&k)oAQ=;wHt_nyRmcl@o1DXZ_di*5C*(AY>82>-*Wb|KYmM ze!zpS>m2n9Cq49a9TgY;`$2q*QClm@_3XABbOuJj$d}7OgK?o%0#ozifR`W!$Sz-k zh}%VUk=klu0d|%U@*}$z0jSp)sMO~p_8Svnd2JzBbgt;Uqx1I8TRU&*yvgNRz&lFr z`!cp%q_SkoZf^~4p}ezkAkGNlBukw7k2*hP3XPN_ELN{E($bEdAPVbRc+X*R;E8c4 z?f_4lZ{7jQ;acYl%oYPTWS6-EO8?6HTkbld1bLKThLj+-8Dk^7S(VhKAnc+kh|yJR zt%0N-ClzoKV;OL|819wzIg+t|)vk5xSDq}LIFhjnP6 z%6b@AaSeUPwdo5r%h3sUjBVnZUd`4i-S!*l;oDhl9(Z`?`*@~)i}!o1J$f}9MCCD< zz!aa&v6zA}#CyK(XCj~guuXhAB{ z7O(2?AzryByK}8SzF|h7MX;Qw;cW#9W(4q7hsHM?S}j6=7kg-Y+VLP~uJ7aB5AK#f z3tS$ow{-N}aR>CIVhgF5C-0D}^-OHu|0>GmzzmL7SB|Qy1`R8}tI-vtVp`&38T40B z3D4EZ0OmIcE$?Y`E-id+A zSfCO%!rizvnJ-Q5$2lKw0>GSE;)SKg+br|b+$zS`%3lK)@?=K4TOenZ(~U{%jt=PL zYn|qBt%U^meU3*l_PEAuiCy?78jJQ#Dk}&>BbR)eL-QV2Ye84tW$-85{Wx7%kHTvI zA(q?@yLBYTGwEunMhve`-k$6vY_!NuP9_|2yLtv-(Hajgs!xdY#n;@=9klQeH_`+9|k7kuMW5c(^^{OQ!~#l*Y_xhu9pp zTnLa3?;Erc0SeyN=^FEK9lnDuTm1;g#4i_&W7KiI0Ag06pPx}SZfMop> z!q1LF+vNukwnlfGKXkQ@+k*#tqFjv~(c*Q|BR6)wzwUSl*j|W-U}amM3i4QL!uK)j zbx1lO>jn^OKCsxh&jLg=?9SUateuylXh)&rUrWF1TP==GDXql4>1Per| zd6t<`tVeKKRK5Fkr-&||ws>x=>u89>36WQ6_&m4q6=#b8i+4|HPg7%$T7eSncNmg$ z;xM3WoV_fGX>(=S5iI*|$g(Hf%d(%*#-gmW*D9l=b{b$Q>`oABmfnTt$myODBfrPn zXx!~I34YhPTCh~t27h-)UV?E@wyewKo!S5n*=SZqU;m;LteNXKEqlM`99U6 zghFcTt4`PlZ)uZW!%df}=mhfk+-#hsdI9iEN^of_#?ulA4MH%{qNtCnMRP||_r`f; zu=~fTOPA-EBtgDT$u) zWiEV7B+H@I9feR=-%lFT14X#+28%IRk4T7*UBvxpxWt0YH%Mi3K_)dWws`{VPYl<1 zF2t~|U9nBs0P59TBMsGwv4YUlZx-!gDkyYr+=82sPcgy$CD`&cLayt#F(jss!Sipi zH(tvT5nO<8T8z?H8?FKigbLex7A^8#WXj7&*Gzfu!-YlOfckw7d9r?Q)9d$aA-4L` z%VF!&($tOmd{sC?Kz&B*Ag62WxCDHN21;oMF2Hc03@Kw@okzV8yu>F%P2UL!4Wf1} zVFl}1g&^!bu`6IafcZBrj*WcV)xufY1)}=Jl9y;q@H&9$?^uof*46TcHUN^a(IE#7 
z&N~$3Q2(*dIk4=7yNlE~@uGi!`vQFu|5Q7v8);HyHAjQAwiy<*(7j&LC6nmb=MdqJ0ueWUJoQm^?bqp67|D+Rd>nv)M*XQE8^9PPM!Ux5M+6;LRi%H0MdDjfBc5^GmK6Ss*$I~~8F*{mhl1S^gDD6#Z=NUEvj4467 z#-&w^nHEH0FRywM^ribaCs!h} zSTn*Wb&F^hlN<@G!?Gb-sD=V94*& zRmb8w^2Nl@64LXiw{E3zk9m$K-B7UgJpxKI_m7g*lf{}r?Rie9WnRl5)T-ZOwEvQu zXPKB-K**FYr%h|o>Rni$rC)X*Us;SU?o^_*mm!jOUl+pwo@?gaOO*C3Vf-kNwg=wp zZrR$|k1h0Cq`V7ZJ5$9$-Bo;q)$39Jw}mS9dxs}eMcswydY*o3nAKvc*so1o({U{- zzB*b~#4R)&Dr6w88I2lwO}Eloz%Q$1F9=>BSYAllmq=Ru@st5C(OXq`BeK%D<3K606MgQ<|W}6HIhxu6Uuk9bOGr?_-_C7pH_}MkE z)yRXwc|nBZ(yN&T`F}oFoIzDfJWs0VWT4=)?Az*Gu9gq7T0S?vourjA3#DF1g>&SK z4p82nDi0OiT7}og%o>3m z4lv+oSIcl$N2Z9gmf(-kGHw{q<-~2zxE8UY-`);LAGkRIb1hlML#%A51M;W_fM^=c=m29_WVZD99OWtLLO!ApIG zDN7r#QI=9Of|q)-d#OJKdzix<(bdPo_M!ck+8t|+0a?dapd4JvHjeDe_0eYuxauJtw_i}=b~TPRSwuNeG+9?4uW_yHGf>yoI8Yrdh& zIFy8$`)jpq{xH%&4k&Pb2S$^D%X62gLpbCh{TAb1-0q>nWW8JbQpCG;!$?2gr2i3Z zPWngd^w+Zk1pQire&^k+FIHkN4r%seai@j%F_0lV+jBt{+An$?j7dknj1UOzF|6Yq zwVrS`;kGdgFRx0}meQrM1A7uV+&dB%D3;uQ4ytZlHV$yVzFBy~RkBusjtahDegYuD zxGiD3kIC%UyV@17u$S3lUp2P;J+!`oBu;8=3LaWTY6G<>$t6cPjY)M z(5{A_x-&HmM|EwW?d5o(2^C_u*W+L+w#s3yLHKdQOD$v(+svn1BojXa*8KpJLH+$3 zM!*MOI05+Zu%92QkA{TYpzOe+4=1hKb6^Dai<3vN7Cvpy$Mt$h<8K^?`?3XYd9Iwar(y zKR~yDqa@2cdwDWm|9~oN{=V*F`y(0)p|kzdZZ(9#M%Fi!Tn1SCW&{=4Rxf}vQj%X$ z5#m{9?w3wW7(~Bj>5ba5*YgcOp zVK%iE0oFdpRI3a=L$#b#%LNIv_ko~Yh(qHaj=Bl*Guc<5swEp2GFof(H>|8cAr}53 z3BH*C9PN0FkGKWpAKw82XIHeTD1cEwr&E4ecZx}=3s#aeA3*WW4Wb6w6`=mu86~mG z;ch)3VdqN}CRWZm_5ulSCH!vPu{6O?rU@Q5DdvNs7rXJhk1yE!81Ho{Ynj{Eg@qJ! 
zIekTR;%)u3kuNuPInd7g#x`9I+ch7&_H=j4dnqjk+EN<-bhtgUF=Z5i$%%$*lwmTJ zHl~EwLx9~*D~KsFp&C1tIIX{~$)DfC^bcU~B}ZZRT!3mX=-d4EwHbIJQu@g6JNu>J zfke#wN2iQ=@0yXg6^WnVIosLruVM9Hs+t68}wQ-RmPJY?35#}Z+&egaLLkT{$Qw>lPUROT|S#;@SqxDsC7{Bms5TH*3&2f})$!7Be{7%W>iSbu$f$6|&McDwHk zX*WJf9^P)zCYOtJyB+JFkN6w*8=-Uj(%@*Z9|+qFIDVL6K=XNs;Q*yK;X&FRpwP~D zKhgop+4Tso$rDC6*m<0P2rcWE{_}naJKTA?w8PY%mRc1B4FBOfBk4bi$q4>)5FLu} zpAPuXcF^Bt`p<|E{xd@7^t-{yEV=;OxC6W=9j+$5$5eh4KtlPkRGx3-r1BNG`IO2} z)RjL<#!FEz;q<<5sMLEf+O4R6)<%Hde2FLZjy8Fq{)OJw`Ztw0vi>^9eFjIf=z?t^ zY%{@lc#F*h!vRWfIvCr$5V2akN>0>2;_LrQ{U=I0{D8I?+zxz2ov444(f_gjbxvc;A%tM)##qC;(!}9BIC5|eddro`kBkVG6${b1u$ZjeL`&WLxS|y zGRtsP#xcp`H#$+wV!O9El zvAyL$5b|;j3AzY<-i}?$&whm0=ac|_u&n6R5~WV>dQnfH+5iU7BoeR2K&4Rdx+`3s zM=3O)dUtiCqrv0Goui+6Om+XN%>n_}Iuz#bI@cMKu@h-$>>}}YsFuxmVJn-(=R;7h z6yBl+KQ1flN!X{Euis|xy^cfN_7~9J_4_^_a`BuA?Lj_n0h23~ms1kP(v%_d9TjNI715X-%2H+}a|T-F(OX#EV6-)EjZ^CSH>{6mBDQG{kFvhmx|IDg^4x}P5f$9`f6 zdYeAlTKbtm{|uBg15Ao|?h}KQ*ydZ&K*<*`fCoi~_ZD26Q8-1;2lAn(YSAEw$J;4~ zw-VRu#!!en1QqE;BOrF5L9b!$dQiIGo@UIzbv7MYz#GC@QKt8748R`s0ht*W6;=Ngx1dYRni^d%ix+$b&7sZ|OoInsmWiw?EU{rCi-waUgDrOEgMu)R4#NBZ ztlNGmn0H<;`dwJ5uW6+S-AYfM7-%JpR=OBB*TT5&kXAw!d< zMtkNqaC$sH;kxR5NU7UEd|g$4YLWdzJj!@^F#0=7xw0_jQdo`$y_{Y8k5DUi>30BT zm;M>TDk=THK+JzXXqA8Od!gkYjiB!3A&C5|!<4(9^7CMpo9Lu#aR}EQrMuRzg958} z23_ku#9rZA^TYY=neYsO#R*;bZKCP7J#^xC+K4YF@pOYYUJen$Z)2MnQngo4kPMn8 z;L2|O%fQ`k{Rx2Ct$!|Ix6%3=_3?zAQvV>7lpvJ8gE;N2q%;SVu>NN8Bj>xJ>rZq^ zJcRJ0WuR-&>-fNWm_|8|GUPlyWP6nRrnTiz6B=QYh;9CPco10! 
z29dQVVgIG9H;FqdG0*3Zv!mY%T@Syq9(aIaqfNzn>xwOn4^(V96&u5e1Xcj=e?vLg z^>nZeunQHNYgcRvG`3g5c);wHkVM#XRBSBZ-&-s7j@zRIwhMrovrKi z^|66E#a|1ZZo;khU(@NhP&)k=8FroS1kA3}Lc+FFr#PEVFTN34r;n%;5723bsZ(EF zr=x5-O`}dPVansL>6D6U4q8tiBEzoJcEId9{g1FWsna;zi&^VQd;PEJRATDXPuJ;R z$B>d`%uk)XxCIkBop7Y~R0Wd+)u~4ioqhtcU8he7+eMwq4g~7d|1auPMxDf9gqwqT ztMTvtfjVuZPRAHJT@^A&>%ngYOhTQy1;0^{)m~47gXq+ou)Wl2+0TJGCBKI25aUbJ z_q27QP9|2Hm36GH*iA|-4Kh9Vus&q6R(3I&cf}S zFhrk_HC&gcy#yPwb~Effz$945Zh&OK>;^cKupeoF=YAC8%`?S|)y4DvBT&2<6mJ=3 zxP*9v!_<@Lqz33BcI%{`LWW($m4MkrY$1$mGTx(wi2jau!^@B?gadv?lk?ze^)$dk z%y#t;U9F>RYAvT)FX2{KsC7@`5&GU#R2nVcO|1`+VOMKAV0N|sM_449x_8`uO6gZi z+YwqVej7x7Af(l^*s9i1x>{r7NX+j{JE<1u(}Y^?FlxoZ(Xy$fRV4)@9LXpoEFM=FtO1G zghf^CI~7>19>Tjp#OuUzt$@M@jCs3eFNo>zaI4O9jr@sZ#epA81#D+tJT9BdNZUB z(COc1qn}6mDF*$TaP(h=>cT$PNctPC_8F+tKi5XTjr2b|K`%LeXc+&z!k}L)>F==8 zAEncOzfYijc98z{2K~#z(eGo>|6bDHX|>NFo&FpfedPwwKgpoKDjfaCpt`V6FJiTN zxHH+b{|P$%V{PNDC+hU4*yyhzeTPB+(s1-W2L10Oea%YWsnb8$D^Pxo^zX%8u#oJYaP-eG z=tmQ))w5rxFXF>VI{n*h^b;FFKgFP59ghC1P+drIwWNQ*O8;b?{<${#Gf4k4ISmlo zoP0(Y`(I(uFOu{RS?zO*PXGPrK>I8w{p$_-mxQC=$DrRS=_}}GP5s<39-#p;pk5^=wBu2$64h+ zRj2=4&p`X+k$#au|AI5a*uN8^3HfJ8`u(l+Pt)m7vC(fJeTPARdN}$XgZ{UYe!P`_ zqE7!{k3jj?kp8{$E-5jH(NAm!{m*a{D@uCc3-(cux%PF1LBCMaA7Q0GQm6mE8fc#xq<_6Z|J`0; z==U+`p9?9t8#P|+ay~;+=0@jucq+axVq?)p7K05IPlZ!qHMAFsHpNg7%W`Hr{l>$G zf*y!u3VZm`_PD}h@W)ryp|JK#RG@-_`zqWC3%bvTQ}A?y`)PQv#IN9)y6INfD6gen z83yH5J;Ru8Crl#vUk1w1D;bi?W!kS<-$Jj~A_Dc&+2TW-#MVv-#(f8Ff(6}c!dW%n z;C_2=t6sx&y|&PxmLai5y($gL#o_ci1fc}~#h?tmT&7+zrd|gefqLm|p%<~W^kCfg z>oWw+<>BHlU#m*Y!HuM%hKZ+HprE%G?l6FPB02U!>(( z&vhy3hnLa4gkEV%V14Oqp_f?2x|M2MtSY!C8{8L%)9V!oE@b=?t=r$eM(BEtwNai& zy}p(=>#@zRh109rp!}YU8)c?m3+Y}$uL@rmQvSH1vxQ#7)?T987QF=bGK2dYeZ$n( zPY_(_HB_|jr-fBp@f3)bi1IF`UH}UPkA(2?Iqe468lNb4_;Umtj}Gx^>k{O;e|?va z8#sx9cYySbV#Xd}A3h~=bPcGCgFf8&_cPb5TPLQOLy>b0|EIzrFr$4nEQw`g?GAm4 zp!wgl53DkSJK!wkc^nnMCli$Izz0v3gQ%!^FBV($@VQ*0Ap?XykA z?nCW*&lCdtr@p>_mb#e}E0%em*yf^NQGf@#0A)c0m_%l_wc=#LK1DS2zVZ>+v~2M{ 
zBD~(uzA_B^VZVbt57w!=e}*ZLbqED`Ag84aJeE1Ws4xY`Qc)hP1>SiE*C8zPZ;Hl) zwZMI03bar(9*pSSn#D)X@K7c8=N-qm@W7v4*q?{3w|gNtKC;~QXCU>oFx2A@f;taY zagGjCAn!L6;K5qp^QS3Za960Jg?O+Qcq&W*CoRN-wLn9d0_RW%9;^j&!xSi?4m?;3 z3=UJEkvj0e0u5+!Y}?=e57&Bl(YNd{!iW~motkpHX6wW}vxSe*TX^d4GHh)GdV87P z!hbMs#&0o3tR#jOz7uDog>OfT(CYB$q-f!M#>L*idcqm(E&OK`u($9}0c)=}TlmJq zUD$C5quc2{&$1O1)H%=C;&?yH+8XSQge8KW_xKN_T{a@7Sc5{-0WY5XdLFuazeUQ< zC?^P`^Mf!tm9Qu>TC*FBFmF83PAW18seRyUmw69ieMzcRCxy@2NG}c$d@Kd;`VE4Q zL@FpsJW=)#B_@b&HDF{n;$=Yunn2jG6zJVuA#^(}2&tokkn%y6_Q538qLT`)TcHNQ zPp06LkqWBYIHLFt0c8|Yc4Oa$vUc5WAak38R1)?RBl@BDC8Gx6;CNRm&*JS#cY=vN zQsC=o1Z<%Wa~f}biEVo98wgZMf$l}hE|3~TpdW#37wBWcey2dQC=dewRCuhN+VjA& zt9O48YBv(5k=pU3hB&bbYB5f1`>#}&*Ls{-gQSQPISlSRkiEtZ7H^h-3lf(uXd;?R zWO$%8&Vh$NM11sALt=&(@KxFi;!p@PQeXZe!^8FgrT%`9Jm;;c0N8R$ARUj3-8arNCR{W2w!}KdVJ_0 zP(DBk6EO3HEc)irJYAh*bahy~J+1{$N^j9N;bs$0=r@b+c$!TXu@X3#B;YXeKj12c zk?CN-VJ3I}*8^x*?LFk%Rog_EO|>TgYk$#HZ7C|;8&B0(*T7|6fAF|emDfgHBPr_o z9*+8WFnS#Wjm5FS{)&Td7=*W>zxVRsr(oGV z`Y>~OFxXPgk-D6;sV*l?C*&ObGYarvvJDjWNW2iXAWyLFM{4q|t^^t{F&? 
zNN^R$6Hg`HXyTcU<~MKCiA4GQXP_K|l-;-HqpV%UQo!sgUP##aRB_K+QlM^~T2c^F z1A>t1LD+bbYST#t*KOwK5PT>FzXYkEx}_4O2T{f#W!G&bGVHpw0A|;%p0E)V$hTAI z=KWkmw~2gMK<*PiUvKKC|0pBZUIvxb`oQSD2>oVz5j_-jz+6VI{Wu!~BYY`L(Wd{T z4~!nim0jE)kZBk9L&8AS8o{;$)_x7dl@aV=nA1D*M+nY40-&<_{IpuP7_N=xgQRFa zmtr95&BR78e^<(j1%@tV<-@?UDJ)R19xH^`!(O78-uwt zMjXzDF`k7iS`L_qQ6g{h?Z)^FnRa8mP8g_Ijqw6t?aih!?xZgy?3c#a3sf58tB0ko zyw;7uni0k*XUyY47=utgzhUI|W$(v>#SWL4b{MDI;TXKniZ}UA$kv{ycV`=M=Pm5e zO#R^$%%vSh;2dCw445&b9X>#&-45Fb0~M zbvv+9gdN1|wt4XTFWx_I*ddCy7d+4oUkNLDqEOT{#W}hudgDDQG(|Ep+skxQJd0ap zVTz;(X$t1j6nTP#oaqrRF&35#X^J!a&7piiZGezs)qoB6x`RA3s2Lv=1k6 zX^OWWl$!EdHwEiOnBoe$1P?TYLMU9qG}LeCP`y0R6rUj1KWq3;-3aRU(g-t<+rCOS z!WvAP2qSO}kd&EABUIxo{Q(;m&WHU3*4fPUtQSb)X{tPE#V#aXJGYZ5%o<{zkol+t zE;<<87kwwWHz3R8KFZ*pE9hd%iDz(c69i-xaO;q^7Q%}vNWzu9RZfI>_9{4!Fi^2p z!6|^XuQsb73eM?W`kkx-ip(m=eLz~6*LoGu6ru_c(tKVXSZB$4fx&tKUh4(&i~^ev zqW}+fiSK|qc8Qw_LrJT|R|3|4w<&QxtnK}%Q%dM07x=&kgWE40fbakx=scG)HXw^i zU3I@K!fRbdT39G`FO}kfr9_uEgIOSK>U|dp^FS0amP*!9Oikw#g$Mc&*^q$AV>)wr zAOW%MDR|%7dk?DmbvTk&L5id{>ZEq^%qKU$`4A|N66Is0jzDi6k*^{08X^aycRNuo zCQ9p(>HV9?!-za16unHM^d`zbk4$d>kw5+x$Z?_Q{fb!uZ5>g*=lno;`@KTsRw8c< zHJR`*QT|Pol}Dy`6_Ni*Lr?{p%6?*+0m6uq8Ad4(v4cSBze9#Hjk z;%8U+t5A1h!z-FE#J#$gha(v15zy2sNc|pC59SaHiQ*>80-FG5hY{dm3c!QCr9TNu z_LjZ^F#Bk45n)I3KKLIUQjNJ*<=5Wh7uT-*63FT0!Or`vAiPfu!n+S)1Ihc=?S3|F z!G?jGs-c>87VfmYGo_WiKeNNIRL`+tB%DAAd9W7v>TW+TyGrv}fCsxuGa!Q9oLPX` z%{iK|Q>fDBZQZF12tuVt5Gp@?11e{b$^uZKZzTrxnS4xLXvYBA>*6E^@Pfy;(~0-} z*T8cUFPQuFB+4s9Is6__ECzmdrN4;XUXRfIc(B`R1cb8NYY?@v%`!z3b~ZJ=d8^bQ z!2QOA;oVB!JlJ_Z11;>lR{>_{y@arTlJ~LX9n2~}z6+~7^A)V}4N@?0H}|{!E`L$G zbtY0m9<1h^8K%HhEWm?ZrOzOOU8UCnv#az1VHs2@bxU_DD}qp26og7GVL7C7c(c$t zg!{eo4y<$Am$1&uNQHL4$B0)+ynBfk%+QO8GLb0P62;~J>Cw8T0q(c%&d|J&EU> z1_hxK9fZn(&q3u%Qn?mXLPx(zm@mFTI?l(3J?;FXR2RrZQpoN|H zeSq0{FD2{-@{T9(U{?A0O<3i*y|9WGsbJCXj@$i3?bgYrggjW?uPjV~1uVdWU8OG| zf?cII0kf;Lo-ppf^QO0Vr?N5#m6jk>>Iu7rRD3T9twTh=_uhbYmVXNCyoyxl==TKi zs)=_$@q!upKSaqUN&`_W?l<~2e-XQZA4iwu<-u;Rr$NbXuX_Nq+si}P3Tk?cu4#b# 
zy?ASA-fiT~gPr$zL3p1Mgm)}q50LjAFZ$U8M8CS%VU_bffmQyER514&5JtirO2~uN zoZe-AUUqX{!vZ|mRVsl9c5~(dW;f?J!k(Z?+t+ueGB^m8-a)7w{1{Z8C6xwH2_5~W zyawwW4`go^QlZ^1iFiML1iS>|1#`bXM0tZK4x(7x?}c0ZMeO!^0v)xN2fMw_hER5U zIjNN``t>F3C2D%xI;lZ``~5Qv@0-Y*2RrYz(8A990l@6M+X&<8ymtV32eZmAufQs6 z_rNNhNCk7h6*v2f+O3mI33;%Z^WR|#aNHvXKX#SAf(Uk%-U7_7(o2NBNtH5R=x&&M zf>7}Up)!}SU8LdwmC)|D>t$Hy&JSUo*O3bCeoqpwhIkJWFPNd15G9u=Hz4JIUy0>= zX5xMC_^i*OINX%;afgmY@ksI4v1pK)a_T7{Ci;^j%>2=2YMhx$HdAS4iu+zA?JP5u zXQn2ZscB~F5;HZ!OqG}^?q8JjD$G>1nX1!MkeUms60yNdHJPc!e#AC^a=Aab%AZ{A zPpAQ9cJn^fA&r!u~ihW;9ImC$qSI|ShUx~{L+u9`IGzo$wMZpf@yC4 zIu=DCMKEX@A51}wn%R6O1AoZo*E;#@SmZQQ31*64*WeF`BaqZdj`l;7{YjTUndeVV z^CxHclV$#7wLdw}pKS9dm-~~e{K?h+!3p`aRl6_^&G%6!C#S5%&N$4IKe$?) zp4cHy&uUbtC4EFz>r2rn;c88v=W2avIiVftqj6HUrza~N=|YQ+^hubURbJpbw>#3O z(E=Um!Uspb-H|>+SfnMr3@;Y#NUxS<=aG(+Y5k>&9qHIIE2xX70y@&;uEyo|^b-7+ z;(sO!D0}bV3qT)~ax||327lX|DLgWsSc{XZqj@7Q@!Q@^b&&DwZk&*@mzVf$Z>ECC zcgGN`ViCGGC=ZEs$Jq|(vc#uI)UoH%rd*I)B;dvl9CEtaSM$dgB&;-%V; zZG`2u;qP- z#MO~Hf`~Y~^mDR)&ePA+OBM~=V-VRCdF&pN1`X=Dx8F*`T_R#%Vj^FC=m4ygbizBNA-*XhA)mOTP zc1~=~>Io{l4cK^>lU+xo_!xQX)uX3- zN1=viwuQv6B|Z@KGFl{Eq~+7;-iwi8>+j|Fy|P+|%*l5ulU=RDuEn{)-Et5wUC$c& z6~CL~+H^Ql`{oVM!2A52Blj5ZXTuhJy#F-~6nySrX;6eE<>tirVPDelzt!kpYmg&Db%bJA=u_^dlCWqejT;i>;Nfra9VnX zJI75fR*Evy3Q5V&EOh6jrBAa1yVVgf?JDcckQJcI)ZF}ug=y~W^qc~l03s(dKX-Co zH(DagJqZT4bIHug9iIX>&d4Z852%OqDd{74T7(%Lhf?#kjMC9k5W zZe~@@Z0F3X`pOa(5bTnisYR8QRdvpyk`iZCwIHS>IkQV=SJljS7FARj9E8BVm0@LP zI+bC=%1SG$m9+BO>WZTI&bqSlT4#ABsS9a(E0-75mKPV+*430(&N~16qU!3>$`ZvX zekiQeTyDdtDFvb8QUymLq`SmfTUJ$HQR2M3)Hxyq*jZPFh+bPXtF*UYJEgcHzqF#X zsJ1k_} z%v6lLB72^0+3fP#S{T3?kbkmrG74AB#DBdqM=3~~=$unpQ>#}Z46f*x7OhR0m61s! 
zlw4RdHOo1qBxy*Al9HV^EF~{*80ZX}GcRd=(p5>dut8>Nol)WLv|7In0y0ozW=@9i zD5_jhRaD}xDL*5=Wr$c2yL}3rv9Q@V@fcj0LNIW!0J>gDs!o4N7?o)0*dXkPG;7{Y6 zrf=v(4Cy2p)J`x;`K7g`b#?TZSv6Jl)y~@Tt4f6v7FVRA@z=qnXU{IGEV;10w7%5T zyr6t`bp^eBPE|!c+i&4SQF*mV#TE97vs9~L7Tu7d76bHh_U_fIm*~};S-F`sy}+>> z{&H5obV)tSj896f5G|*wvY@oOsHW)h05PR0jM}oWR%ZE~*H5cN1{-P5T0b1f-C)HP z85Q-lWqza*(dL!uhye=_0~Vl}U20xks$8jDSzlgS$Cg(dHzH zF{|tAoCujU<@1!X((+klb&5-zf?~{8W*607L7xaDL#RQe9AXt0RTL132NIfFRddBK z;ZRCmQEe@{jo!Ljuujt}ucSBSqi-2sKXWG9tkI&<=am-2pWuy#*Pt;e&hom_8WH7d zr5L)%LO7>jEZuKX`Rvlf+S1~o3cFSRaH}vSD=RlublBzy5@TilfQOM$hC9oh!?8<7 zZue0zhOQWBX6B@)VW2v`Q18+G0|<_&3JWIj8%%}i`T4o|9P<{6;Y0yQT80#opE2eN z7$2vl7o_I9^Csoy3$X%+1%(B!l>Br#(#cQ9cp^OqgL{qxh^G$#@=`E5<(NdM2w^#l zwbJn;q~exflFSe6H4C!~i>oT(KlR0R&dU0V3Mad1(Hlvzl;LGnvrC7U&Y4wOIe$2u z55C84;H;z(!)uEe6l#Z8p|2>e7>>V@N%Mwx3l+nYk^=t9fm~lwIvivs*OgZcPtD>f zsjRx1EY0AXK8HP_-YLv2hfB+$ODcvl=VQkt2Q)zj7H2whCTC?yzbCyRwdXqv3#+SY zor!bH%8SbwXNO!>Kh*gzF{BvwFFHFKabT`a|9oetUd}d-3W0A4;r0STex1IIE&g63 z$&L&?r1IyV@9*eEK+uN@)QUAX6A_~hFBkZ$_$5{K2$(GJ*XS2j!m@Rxvr20OlfPUJ zA)=xRt-PeDuBdK)H8=_%Lw6%$b4}Ikl*$rm2BVYLLqu;UnBb?^w=o=&!B!tk)+RZX zs>+J_&XW;)Pj(`NoQwc06NthmJ8SE!tE+0zarX|2XUR7B3>(H+>_kWC95zghR^+%< z4nAhr*VTjYd}rxA_;PLe9AQGMoWq96@yp*PU>!!leG>ico%grKsGRKt9lY*1fq1yB zNkcqxr}>c~S!n$1lxbp0VDhB&!pXSnEBG7b{I^}^4O?LGBk;A|(OEc$mN!pC#C!z2 zDY&zSrc>Q%lU#+V2$}ycM)%u+|P!_<7tSp+Q@nxBwwR9YeR z6RvOJ7LK2_oGaa(>9Tms5zC$Rf8?i^`@ix_OV69+`iuI}9{Mbrr2<0Rk@y%QVFqlH zK4D{J6s8<}YJ5sTdLbPW-i48(rC^wH7#gOfOiC$aAeZI3d1bhID8Ps{3-haah}Jn7 zZfmdz8{cKa{`=+h*(ov8o095E7jm1LhSt+w4kK+jFJ+cxNBP&w2O8-wmE&Yu0Jm`b z=m)SU2J{#sTc+;<354bwP~O)1!|~-Eio9tF?Ad9Zzg8}YnqllJZ&gxr^YhVd^0q;r zh5xI1`fr;iVRANKtP*5RCM7F{(00PG#GR6Zxz=Z)8Z*6r6`ynUe%#P><@nE!DcmWj-SXX7yum0c zpA+r20-@L4`z9NgzmNe??13|oJ}%G(Bmf4jVGF!;C2 z+XjPwyS!~M__xd327`aQylpTTw!Bz5!;MFV8$){ZHdtY__+sdK$C^wI?v7G}J8Q3jCn+8`gd>Nh!FZ zyt-N}fYoxs8!JO%D$H2*;e->HH|Ex0PWdFz&CHuDXUe!#l!W=k3M_Zda$a6kd_?ZgLqXaH3rgClWZ46(Z;R$}64b^!!wsNMab_ z$ATY%IVMGsq&csT~x|m#I87f1U 
zTs8}~X2u1ak*1w(^lFVoHc3q23X0+Bjrtq(MhSYw6$Pbrk~x;ejc^{|$9h}|j@R8E zg5z2Jj0D5-O~ZOqC702y@jN(P^0%UgqFaqU5v3(!%}&=xUoFc?%P*>&RT^M_RNd5~ z^16(w8nHTpIdgp5)h0PAqrJdO6QsnOgF65%Eb2b*8lAHxt zMyM&R#Toz>%EZ18Nu4cfidfclBD=n#4j8kF%CWA7Yb80 zTxQU>D3s5}J`=H`Lk2Rq*Xk9S#T^p?cwAb*mL060LIa~OPzpq6F<)$);36`%`haD5 zB|AClT3j`|8vcPLSh2qXU7iR)y7G!5HxW3E5g4`{5T(>vk44?-BY2TBY@~Cp=xK9A zUn_TE7zLpE95`SRmX$80k~%%9%wMLa>u#e(Q}{Q1_!o(V$@JQj7C{r$?I!Dm+9MPW zGvl4pvX{ZBw=r3F&S6d#J!0Gz>-hRmG}k|X_4E7dyvWa7_Y7?C!2&qZL*fSw(0>nq zl#GLVXwf@3BLXK@%S|z|Ra@l1Zm!(R|BdZ2CZ@Q9F}hh(zwUTJLrWNZYm+v_@Y|z1 z{eZ3@4Ben1F${jtP-Nl<+eed#)@N?6G2&wItzvx3Ct_{M6(i5ZNI!LU2?j58$aWEO>(kT3}uy$x2B&}zG?&MUlMsg zACXd&Bc#(F?(&_D7tp7z?(#L^T!-`HfY&_&I35(4 z^aOV5GJ)_{O7xTLXNsie7-+%PAJ496)N;B_4yvaxuhI$ocsS9a6D_4c7@Nk0Oy3+KHqCN zKTzxQCG-UU2EcKC0Oz}KPQz1zU*fzK&q{wuz3_1QILNmZuPdy8JO^+tfql=$L;ZK* z`~c2h;>^c?_;Xy9uQ=w#IR^Ia8QtdS8Atda{BL}y%Xc~v^j~m%@IyZF+wSi2ZR9g@ zF>x6&@e^YE%#ChPE*LQO^s^F9CJB~L!~e8tKA(83-eKWA?Y?oPKj|wr1p-9M~Re}XqTi% ze10-$^;Dm)C(ai90^r{W{0xDg?#E99Ui9N#e+ggc4}1;yM^hgCN73t#BwTbVI?qav4w6Baau z{-8Ib*yme=Ym0xuZn}SEq0wFo|4I$;ue6v%xCd-09gc;iKP&;gji7f5Nt#kI(K#UGRwTKfMmXK-VkX^auLXdGM(qb&w+b>j3EG&Gh+dY5y4k z^q^BWJGo=pA|ek3CzxTiKl`zH6+YjzzMy-309}9E4)9w?WKxfyf|`EIxc4IZm4R%j zcm03$IOgit!*yfi(x_H-Nsoz*JtLp$(HnhnYM)5=(MfR^%h9y7Zw1RA#yE-g_+pK) zhu-(O7)Ntr8XS?C5iyG$5veh4jo)y3!)zBKj^sqdtaU_8h}j78J3xM?Bb(u6ad7ue>k~OM3I)=DKVHWt8H{_t ze-g$kWwAcrk4P)oW?b~g8o#ecyt)Okc{_Tr*U*K%j+pQU6nQfyTJ&-Ydn{2~qn1YA z7$M@#wQLMOm)O7wrG^vM~p`v$9O#tZ?O|?N3_E-jPT@7w0~SAkMa3N zAr3^Xl=Q>&$yqUNQIYk*`?-lm{ANAzYaEGqRyhX)+^D@rq?;MDGY-98kZuOi`%KWg z1oWcs?DBOX9d-B_tDj`W>{laJnSPSeXAp26fG~^UC+8w9<9xNLcYlDkB7ZIDoyvTS zW-mo<8j>FY-iC6eg!}!=-ws`j0WOXwvtyELf=MjK@drfsd@>5jH=@+9qLjT+cn1KG2A*^V8iqq%s48DOhA7I-5$ZQl$~O`BfFQC* zU6rUPiz3vm{glHIV=&B%Qh)8Iyc2aVvN!Zpf9t0#jXqCNZttaj(NFoJ*GkkbO3j*f z6Pisu3@QJ#)e7xz)#r4Wr#V;=3J?2l4k>!a*cv3S0-hk9opG<9`FUu99WiWkpp zj8-4)t32KdTp#Vtg17b|%7cAWlzg@iQMUC_LH3V6PQhxziw^adp33JA6|WMzCqn(X 
zr}9+9Sw2;{A@bk9tI9o5>MyGDc$A87%(tqL;cZm~_-j=K%Ap|T7bkzlnjWiQgJUS2dDAl%@X zi#xp~4)wm{luZtG`*BLALw)!-<*^8LO}z3_q`Koc<+n)nmE)BAqtunijZ#+}r)*Z$ z$KsXid#De^E06V1?~PYFd#HzxrHS_gzo)t(Ub!Cz0z6uMHeT7>OMM6Uz125>-&=hd z@ZRdyc;%fw9qgthouaHr`2GY%xnumet#0Mh z@ef6IUZUKbq5gb{vOQzD0~bGJsvo+P`(04udzX43Pr2Kz?wqdda;q<2rYxSI?#x$S zn4qq`RJnbk`rr&@$3%6*h01Re)vqs69?eo;nyl=}QkNAfOS07!1LHNKR$rW= ztj-bi-_3mjJ<@Gc)U8vLMN>N?!19KR)ekOK{&%tZ$;HZ})7AH;D^FhX9lT)E3{k>c zr2g-6<&De5#ld2AU5WC3iMZ%0RX5L6KAR~nTFTVd%apInZX?MhSEz4QC@)>1E}E_U zbj3J8KC4h4ovl1nqrO$6JY1{ZQm1@W3mxyTQ~y`5?5;y{TfMq^u5z$m-8olzY!22K z9-61#JYU%}4|(^^SD(5{*)<=@tyifJFHnBDN_}&I^61q-d+-|drfZcCuR-3u*Q#r- zQ{KN8$t~BZ4_)t24qT_cfzu-m4q!fTgF|g=bSNL(-~h&|Mu+-jlS6s0kyo3W9O{D$ z9m>x*zmD_jW|p{rkwd+4u|wIth&cBwcBubFRld8JSDRWK>H{8!@)ORl;r#G_SmHi3 znWe1`W!Dno+}-L>pIGWp-f894#-$GR{x*m5BhIhl{Lqao0WUx9RaL#AxAJ{Yby08S z#ppDo5A>D|b=0ptc8EV#?N*eV`lt>4l(l`3Z0f6i-dB07 zZ;>GQYXn;B&1Wf_Bh=Qjlsgc*&QkVAst1yk|3<0bB`G(n>RU<57pl4@NqM6OA>Z^+ zzZs=;^kjN(Pj&k!r7e0O($6E1jZ*eUpNaJ9UTV`QWo_?7q`&K}-hP&Hcb_#;xcD$u z{bHnYa~uob6Q}MOseBwa?d6fmjYo|Gr0b{wNI!72>Kmbaa`ZT)pX^U~!!at-YmXVT zX{2)a7aJnR&;e9MbDf9`VO;<@HlY z=fElId&8CO2`u|*!dSwGF#X{W^_k(y;!_VhaIqv&Jvdz1nD{?l{5DiQn4~;$`ZT0l z&rtt7Ls@?YrFeT7>gxTG)CQG#e^lxf=*!?-8OpB?^uAvcU4CRZyZvlIKjQVJX z(iw9<(ktTBmQ3YP97^8t549;%S$q_dj~u18XDEk{nu!UXqt(|ll#TrnX};~RzMY}$ zI0n7UFUP36GL&t{0=n;5^}bAH!*NJ@k5eDcR9=Wj@{4$Nd8YF0@koApyt*<|d1L^R z?+j3H$y8PjM6zR`dRvC_+d!!MpF!%aE@kB)^`}gwQ>1S=L47k**?a=wADo~*o~hh1 znCUfx)dyY5HzNJ>V0EQSS$5*29WLbsCv&$s)z31NUz}-3?>p&cxccG*^}!6KBjI^y z+I*_IDnohpR2LllH1(ql<(bnC1H5*qx+O!=h8CggJzd?Ap){P)g{Zb_)Cr*S$|&`| z@k-NK=VIh+u3Soy3%ltiu7~mR4mO<9y?cEI9_QvkN8iYr@lH?`TjiOzc~6% z2avuPqwYOlx##?GNN*ntZSNX42&FcUQ(yb1(lu^-1TJ2=Ky64?9!pk{em+^njK??0 z_c7y^)c>!&tB;PWxbCwnk7R;n%LdCrHfGbLK1mKNyDMoWQ%asCud%>58%sDYZS%5T zt>gtstFyZjHvJL=oG;=~<2Wt}i7~{4khD(Y7%(4J0vOXY1oGh$LJ|?E@oB;#gkVT| zfd1~xoz;79-ve7s|L8fMbENy;Z|CdXxpU{vym@!z_LSJwb#v&I<>IR;%H!FURM4lp z<<^VD6Wvr0n>96su!6~upKXF;x&`2NSU%Szo(SXeH(`0CNqoHqm;YWPC!54)XW(-040#v>zC^Ci 
zN%^BDv5h45u3GtYlX#ZOy0%XKyh(hm?rM~^w_g6BNxWE(GOnLZ!~7$&arxYAd0&&b zc@8f3&yi0yiI?bdOM`q05gI5$gWTIB9-G??$($?qHHm+k2Xx;&d3Td|dLGJn9kSgd zzI@te&=z=}dA|<-0b&4~;~6P#l#k#V=*L14aCDWttwD?-`He888pQqdyibMX?gnu! zG4vMjwn2QCE{}%gt_HD{c>6Ckaz}&s30)qmk=q-@PGa-@GvrMT;(5BfR?06lh<_$F zLsx&XLA*kjTWjUc26104E`L%hpPeHP%zO~@Bh+PA=z^T$^G6qm$3nQgH!NRl6#pH5 z0sR^_(H9not#w~Uz4y-~3+o4`fTu5>B5#8Ubt-cE>8bK)qqt_)b%_1RIr2!O*hRhl zK!e=5Kpbnh6xPIC`Naib&U?{>r^)9U#eJuJ2H8D!y8LCM*fAe`+BskDULYQ&%O~c` zhZ@D_8*zDSqujed{H$@Fs%GL}Lw`V*gFXGdI`KN$mDknE7wW{mTIdTr(y!{om&l~N zy-vPZCyuan`Tjj2`CHhkVY$Cfe7r`&$LkMkE^*_lve?SuOrj)PNiLcfE zHLmZZ)EMGa!8^hwxjQ1R4#_ttcV~YybR;BxAHwCyaFPQjIdGB#CpmDE11C9fk^}!= zIFRnLR8`ZRn#TBEf%pe|RRu5Ygh@)l;C=P(`>Xk$UeQokJ6lsM8LF`EJWElICiY&j zOx@#^oeGEU&nSxNlL}@}38o(^JjuUf;-SJ!rmwv5W`Qnuge@Y%m}cXR`W_P?72YHD zJzgcKZ~-iO3Ro&v0hYW9^O+g~V_$~}T|Z`nn8_K@Iy?nTF;u|pOoepJQqw6Yx_{to{X4_|=j15!-&@djZr9yR_c1-l z^f1#SOph_G{WD$vMy5-cCYi2ax|V4_(=n!-nQmvgo9RBL2bmsbdW7jQrnTpD`AnBE zO)_1CQ>^a#^qOl$v~%V)ZTX_Dy*rfZq@GaX~Pndx?> zyP57|dXVX1rbn0_V_JIwm(O$w(EpXnIW%}lp5-OY3#(}PS8Gd;re7}Hvd%V)ZTX_Dy*rfZq@GaX~P zndx?>yP57|dXVX1rbn0_V_Lh6%V)ZTX_Bcp;V}Mw>T|KjfzCWid7PL_PHZmxNsg+!_}|Bq(O7G=?Y$b0Bx3PoEE!wsD#3eGgVC-dYTr###wK>@p@tT$ZL(5Ez-XDoXWWG&Zj|4usz_$=ZaZ38S$I?rAEvXJ3sG- zpC)KKqBqpwZMLy5oWIqG`NG&0ZoE-U`iyrqVuA1*2Wv!=cM~@zR^#AIRPciR${JCXm|MVo#@qtEoWt7U93?w5C8@(OwI} zz4jsPF9?Z+V*F8EPpSun9{h!T_<0p@%E#E96qmwg{Dpkl<;%eRp6g!Vl+SpVg)6+; zsZm5c4?_4yF0>Tzn1;n#RLl=Q$arj}j=zfIpBm=;8NZS7HH_czI~`x`4aRT(#(1RL zjklm)RBr4djh}|N6t*#*W?b#{1^yc2lb2W&O?+Sdhth??NBQhoZ7IYY)bSz4V;|Hw zy@;gnamF9-vD7{7m8I|<#wUMase9TZL*ZN)%9MY))1q6kMEw`y1Hh?$ulBSrwX-pB{kIUSGd_tN{Ek6%M)j8%q^Rdk!hkHwAwA z&>l-a{A--gkf*Y=dYPw|Uwz0emj-m5h_VqcCf(e|@?U->;ru2*B?Pz~^G3=9dp%Ac+X-yy>_(;j0AH>>zl_QM+* z7P|uS`L6(c4kq@*hjkt~Spr-=SG_Km_GVM~5b$|;-W&E>>R#F2ii+d;Z@FOob4Z04`Ln!)vund0XXfU z^^5 z^gQW@@lK1PiTA1h*lrMz|Ly?%o&fxJ0eA{bqjE=fTB<}EzbR}~xW0D&-x7e|37noc zoptNmgTU!|C;53JoLM6bSBk&$6-yyMX1pCZl2nB?0eCI|e+dmY538@<`R3ID@qZqG 
z&%pw0q^dSTybHKrJ(Ivmt{!K7YviXVAbt)w@gXup^X*+BUD!6pN4Q@Y`8+>B4(|xa z|N9D`EhZnaRL1Xv{XyY30r6*Hf!go6;=t#k9mOLVmA--NxiTRBUje6fZ#bmm8$JAw z3dch*f7(=TnC}GSbB5{{9LYh>3Bdm$0RL(L{zw4+0&ubqy>{0hfcweqJut zAbuQzO6@hm?KQ4)7b36v5#Kn-mH_;Dh0hiiueanl&8q?N6Bq~l_;5`C{sZ8|&+%iH z%2|CQiK|+?$nnc38i21U>^Oxd~ASh<`0`zjpjKa2g+XvOSD$rNWb(4}IL~ z4A{8jThE2)TBL_3f2!ltw=5LSV?1)ZrS9q5D+-q@Twgo?e=`98Q2_qy0K5hUj2}N2 z2jC+C_%`4XX9f9sPeA-b0r(dIM;%wlgZ}0Z#+f~4+y<&OYU(~lyDEn@e zPj!VRIsb;oE%hUP>qO!8fcT3R>vo^;w0jG1YRAZPmP$`{359jQBkIciZ$QPz_hOdi z`e8G0dM@K{0b;MhtpWMm&-u7A!OhPD@N>`dZ{P6%d|Lp1R{&lY(fUNNJ{;$Y^#G@K z@y-*=j0?`6e37;h2VKds|1Fe=Q|B2E6C>w~f$AQ$6H;tW?ftOR4m9Sk{xY^Nxq; zh)+*KyAsr$)?D1vVrx-v?QpWaoet;9lsBZtoM<9(rLXV1R64yBdc{^Q@SF+a9qQb! z{tP^_sbgkvc3|rAbUNNf(T&40JKwjaCvFcpQg)OJ==AV6MDjHqHuc)+73eS>`9jG? zpUL$swNnnZCEJC~Uh?hI=z6=`?)g9`PUFO>RCE+U%f=Q|rl-8B!?yDUQex0+E9jsC z8$Q&bQnh|H*B7q7)K0DB2rHMbA-!2D*xJdHu$TV@9Hw@60qwn{tCE3BHTL002T&@b zTU2VO&%LtM38GZAlB0e*yWtAUx^Pu@XP2FbwnSS+e>pceI0Yr5B;fEs$I!ZUHpL?S zS!jW?zf{)!L&a~F@@!~X?A@8pB1D>adzI>LhhwYJk*-w`l49=4{BWPU>`H#xpW1O` zm`JD`s|s>SVLYka<*`$=w?&mxz<$hq|L`9|c8qq;2OKih%Tgv$)zZm5tY?4hJp$S)wIrz0|&zEfSgy<99p}zSU3#c$UNN}~a ztE{TJM-xyPIr7RYyfSZ(!0~8;S{t=HQJ-AxWINzoNYX1VW~l;+)s@Z&<>SZC6D@Hm z#&mt?D3$3Jb1JkVfbcX(z9L-`$cG2IFeMO`S}w1urQ5@+J8Aa^a1d%=e%K~AjVRjb zCA8#<>f(CaM&~G0E!)JZfbJJ>Q4SC{tjkq!7;-elr`(F;ItP8bijRHMYxl-r z;UsOBKtTm&H)d?(wcW-MU^#UzsbzJoTZi)_?0B@rU%1t1S$R^Izqfc2(o5&bD*u41 z^251QKAnas?fX#i7^&+#4voDEy<0t5B4*p=4RB#-gINRR4YsnpyuAUU;W{?VB^pWq zr_~K`1nt0Z3gdaJn^(FYR2~25DYdtII%6!_p%gS0*_V+zHumaUDOY#XI=Qpl-ES)0 zx#C~a^Ayu~27m3S>)xhyvsSP{`rT7|C6iU1qNhNr3f^dQ#d7yh3YL7K*V9Z(NzQr- zZT)njX|;CjUAsCC@q&}Y%=-LTJWet)2xV>4QGUo7ecC1b)YIZiEAm(vz#!uo4YV0X+|G-&0X1K28|jN6fnNc9&cw0O&ca%d2Zu(O~VE zslcISu2l`C=zql8N}qR4V@Sno#yc7)eeqae6ZQ5^N2nd;R~s)>1yZIXo2D8euFNza zT^eu2DWILo%Xv4sD%RDBbFNs4e&(=zPR!DBTbHDG`<3zMQ+!bL%%}4AKt?(iXudo$IuU)IOQC|Yruo9sTtam?WMO(A{c^cf; zP|&@xxA%|P25;5ws{6a$qPwcuU&`eib1c;@VCCVU3YvV`y!<}$;hy0@9ez$ 
zQn{-0z#PrOm$kP~^<}&Yh0Gb(2u#LaDgSnZFvdZLIyr9V#?$N7tyq`sOZm)$h61dT zx0@}ul8Kv2L#TaBp}6hVcS?)p+o;D3qy~{Dp~iw9-1p^+ZVu~FUVAaG8j^gZLmwC9 zmJXX{u+$l8p=4@&uNpnn1O^ct?J)H;m_G*?c074D_CYzRPF&Ab*^7>{O^1kgkW6Zc zro)iP7NH5MBWm0|h_xhKXE9@u4ChMd=^CHKjIM;fnDMNzVEF(hpy2_QvRjSC2cfnL z{WgZdft=kxR7UYvx!NF_pj920js6y6yela^hhs9|+6^i39hn>~KTgqyjeCjB>I9<` zYV9*Z*@OA@Svy-ARhm!N=6DPh-?lsPI<}KT_u^PCr-ls&Locr($aG-2H>2xlz>XZPzcM ztgwEmMk&9cCJ~3x<7@G#4)8Whjac~0Ws0eaB7|X#m4Cx7k4+q0ab=&YQOxzoRD;4` zDV4A8nfW$a1|IN;6UoIiEJZ!0P(Q4ecB80yMrO`^b2GlX_Yy2KnfH%a|E9N(y>vE- zurD2^MPnE#rO_c`Ojj2csMTV#Lxzr1bI|-P3k_M$rPlb=)t*us^!5_ma9BN6eJiDv zN^%TmX%vdxX=5LCVB&yd5OT#f^`8zSfT@;BDPv4e@98aSYBgrw1H+>AW%5jg8-mxxox2;H#0rvWSwwETXuK4is=fFC9cwDV%7rpkBE|a~pVr#D+d(V(yT8 z7AtgWAsgEdNIj~|^xg@bhq=Byi-agRGnCIFU7<_`0--lLQ5b{*?>UF)m@D`0kZIpx zo0JIQ&+I$u2wS+rc@qp^+mb6#TxWO}Zty8eC4Fz$!X11UbNYz7oBC(I!!Sr37tZwM zu%&(~b9%EcTY>oJO}`x2F?`!^>c1mw0U^HziijJSVjH6hyiJtvgjkI$+7e~bo9{~B z!wJ@5e=!BxmPpTT(qr473TyG7wzQe_=KIre?5Ct){3Yl+ork$5`eFQ$uL6_ae5d-r z**aE{8E)!t;u(4cqLJ?db8o(9jhPLZoSy1WyfgLJxeIX>IWlzK>1EG+_KaQ2lAImpACJb@v%aF!&%k#t&fuwz_$hvzjX&yx6wLRChwj%gz5Mo; z|2)#*Q;kY`^Ih|koZcJXd;c=hQc0$K^ZoPWai)I_X-Py)dh?z1QB!|zSR=-!eDhp8 zJms73sb4#wpWnnsG!@Lfp&>+b*1z)I^@L78Vk#Dp{vDq5W?%XEgF5}YHR04kL@HPM zg7(`}T}(0Nd&%1#(g_cF1m2r|KB7^~n9i>9-=uG?Ao@b*s?!ic>fMyBW9a|qzpi60 yHVODg&#fxk^gU<#C%OJ3*q2TL+eroKNW*udKdFEFmA~?7UDEF&h~Nm`^8W|42Fy(W literal 0 HcmV?d00001 diff --git a/test/ocl/SimpleConvolution_Kernels.cl b/test/ocl/SimpleConvolution_Kernels.cl new file mode 100644 index 00000000..08dcde88 --- /dev/null +++ b/test/ocl/SimpleConvolution_Kernels.cl @@ -0,0 +1,175 @@ +/********************************************************************** +Copyright ©2015 Advanced Micro Devices, Inc. All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +• Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +• Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +********************************************************************/ + +/** + * The kernel has two implementations of convolution. + * 1. Non-Separable Convolution + * 2. Separable Convolution +*/ + + +/** + * NonSeparableConvolution + * is where each pixel of the output image + * is the weighted sum of the neighbourhood pixels of the input image + * The neighbourhood is defined by the dimensions of the mask and + * weight of each neighbour is defined by the mask itself.
+ * @param input Padded Input matrix on which convolution is to be performed + * @param mask mask matrix using which convolution was to be performed + * @param output Output matrix after performing convolution + * @param inputDimensions dimensions of the input matrix + * @param maskDimensions dimensions of the mask matrix + * @param nExWidth Size of padded input width + */ + +__kernel void simpleNonSeparableConvolution(__global uint * input, + __global float * mask, + __global int * output, + const uint2 inputDimensions, + const uint2 maskDimensions, + const uint nExWidth) +{ + uint tid = get_global_id(0); + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint x = tid%width; + uint y = tid/width; + + uint maskWidth = maskDimensions.x; + uint maskHeight = maskDimensions.y; + + if(x >= width || y >= height) + return; + + /* + * initializing weighted sum value + */ + float sumFX = 0.0f; + int m = 0, n = 0; + + //performing weighted sum within the mask boundaries + for(uint j = y ; j < (y + maskHeight); ++j, m++) + { + n = 0; + for(uint i = x; i < (x + maskWidth); ++i, n++) + { + uint maskIndex = m * maskWidth + n; + uint index = j * nExWidth + i; + + sumFX += ((float)input[index] * mask[maskIndex]); + } + } + + sumFX += 0.5f; + output[tid] = (int)sumFX; +} + + + + +/** + * SeparableConvolution + * is the product of 2 one-dimensional convolutions. + * A 2-dimensional convolution operation is separated into 2 one-dimensional convolutions. + * SeparableConvolution is implemented in two passes. + * The first pass is called Row-wise convolution. + * And second pass is called Column-wise convolution.
+ */ + + /** + * First Pass - Row-wise convolution + * @param input Input matrix on which convolution is to be performed + * @param rowFilter rowFilter vector using which row-wise convolution was to be performed + * @param tmpOutput Output matrix after performing first pass convolution + * @param inputDimensions dimensions of the input matrix + * @param filterSize length of row filter vector + * @param exInputDimensions dimensions of padded input + */ + __kernel void simpleSeparableConvolutionPass1(__global uint * input, + __global float * rowFilter, + __global float * tmpOutput, + const uint2 inputDimensions, + const uint filterSize, + const uint2 exInputDimensions) +{ + int i = 0, cnt = 0; + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint tid = get_global_id(0); + uint x = tid%width; + uint y = tid/width; + + if(x >= width || y >= (height+filterSize-1)) + return; + + /* + * initializing weighted sum value + */ + float sum = 0.0f; + + for(uint i = x; i < (x + filterSize); ++i) { + sum = mad((float)input[y * exInputDimensions.x + i], rowFilter[cnt++], sum); + } + + /* Transposed save */ + tmpOutput[x * exInputDimensions.y + y] = sum; +} + +/** + * Second Pass - Column-wise convolution + * @param input Input matrix on which convolution is to be performed + * @param colFilter colFilter vector using which column-wise convolution was to be performed + * @param Output Output matrix after performing second pass convolution + * @param inputDimensions dimensions of the input matrix + * @param filterSize length of col filter vector + * @param exInputDimensions dimensions of padded input + */ + __kernel void simpleSeparableConvolutionPass2(__global float * input, + __global float * colFilter, + __global int * output, + const uint2 inputDimensions, + const uint filterSize, + const uint2 exInputDimensions) +{ + int i = 0, cnt = 0; + + uint width = inputDimensions.x; + uint height = inputDimensions.y; + + uint tid = get_global_id(0); + uint x = 
tid%height; + uint y = tid/height; + + if(y >= width || x >= height) + return; + + /* + * initializing weighted sum value + */ + float sum = 0.0f; + + for(uint i = x; i < (x + filterSize); ++i) { + sum = mad(input[y * exInputDimensions.y + i], colFilter[cnt++], sum); + } + + /* Transposed save */ + sum += 0.5f; + output[x * width + y] = (int)sum; +} diff --git a/test/run.sh b/test/run.sh index 4ba2110a..dfd15e34 100755 --- a/test/run.sh +++ b/test/run.sh @@ -74,7 +74,7 @@ export ROCP_TOOL_LIB=./test/libintercept_test.so export ROCP_KITER=50 export ROCP_DITER=50 export ROCP_AGENTS=1 -export ROCP_THRS=1 +export ROCP_THRS=3 eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl" ## Standalone sampling usage model test @@ -120,6 +120,12 @@ export ROCP_DITER=4 export ROCP_INPUT=input2.xml eval_test "libtool test, counter sets" ./test/ctrl +## OpenCL test + +export ROCP_OBJ_TRACKING=1 +export ROCP_INPUT=input1.xml +eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution + #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 73aa245a..81626a2a 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -473,19 +473,20 @@ bool dump_context_entry(context_entry_t* entry) { const uint32_t index = entry->index; FILE* file_handle = entry->file_handle; const std::string nik_name = (to_truncate_names == 0) ?
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); + const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), tid(%lu), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", index, - HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + agent_info->dev_index, entry->data.queue_id, entry->data.queue_index, entry->data.thread_id, entry->kernel_properties.grid_size, entry->kernel_properties.workgroup_size, - entry->kernel_properties.lds_size, + (entry->kernel_properties.lds_size * (128 * 4)), entry->kernel_properties.scratch_size, - entry->kernel_properties.vgpr_count, - entry->kernel_properties.sgpr_count, + (entry->kernel_properties.vgpr_count + 1) * agent_info->vgpr_block_size, + (entry->kernel_properties.sgpr_count + agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, entry->kernel_properties.fbarrier_count, entry->kernel_properties.signal.handle, nik_name.c_str()); @@ -658,10 +659,10 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; if (workgroup_size > UINT32_MAX) abort(); kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; - kernel_properties_ptr->lds_size = packet->group_segment_size; + kernel_properties_ptr->lds_size = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE); // packet->group_segment_size; kernel_properties_ptr->scratch_size = packet->private_segment_size; - kernel_properties_ptr->vgpr_count = kernel_code->reserved_vgpr_count; - kernel_properties_ptr->sgpr_count = kernel_code->reserved_sgpr_count; + kernel_properties_ptr->vgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); + 
kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; kernel_properties_ptr->signal = callback_data->completion_signal; @@ -881,6 +882,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } it = opts.find("trace-local"); if (it != opts.end()) { settings->trace_local = (it->second == "on"); } + it = opts.find("obj-tracking"); + if (it != opts.end()) { settings->code_obj_tracking = (it->second == "on"); } it = opts.find("memcopies"); if (it != opts.end()) { settings->memcopy_tracking = (it->second == "on"); } } @@ -901,6 +904,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_TRACE_SIZE", settings->trace_size); // Set trace local buffer check_env_var("ROCP_TRACE_LOCAL", settings->trace_local); + // Set code objects tracking + check_env_var("ROCP_OBJ_TRACKING", settings->code_obj_tracking); // Set memcopies tracking check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking); diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 35568ba0..d23a445d 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -140,7 +140,7 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize CHECK_STATUS("loader API table query failed", status); // Instantiate HSA timer - timer_ = new HsaTimer; + timer_ = new HsaTimer(&hsa_api_); CHECK_STATUS("HSA timer allocation failed", (timer_ == NULL) ? 
HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); @@ -167,7 +167,6 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_init = table->core_->hsa_init_fn; hsa_api_.hsa_shut_down = table->core_->hsa_shut_down_fn; hsa_api_.hsa_agent_get_info = table->core_->hsa_agent_get_info_fn; - hsa_api_.hsa_iterate_agents = table->core_->hsa_iterate_agents_fn; hsa_api_.hsa_queue_create = table->core_->hsa_queue_create_fn; @@ -175,36 +174,39 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_queue_load_write_index_relaxed = table->core_->hsa_queue_load_write_index_relaxed_fn; hsa_api_.hsa_queue_store_write_index_relaxed = table->core_->hsa_queue_store_write_index_relaxed_fn; hsa_api_.hsa_queue_load_read_index_relaxed = table->core_->hsa_queue_load_read_index_relaxed_fn; + hsa_api_.hsa_signal_create = table->core_->hsa_signal_create_fn; hsa_api_.hsa_signal_destroy = table->core_->hsa_signal_destroy_fn; hsa_api_.hsa_signal_load_relaxed = table->core_->hsa_signal_load_relaxed_fn; hsa_api_.hsa_signal_store_relaxed = table->core_->hsa_signal_store_relaxed_fn; - hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; hsa_api_.hsa_signal_wait_scacquire = table->core_->hsa_signal_wait_scacquire_fn; - - hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; + hsa_api_.hsa_signal_store_screlease = table->core_->hsa_signal_store_screlease_fn; hsa_api_.hsa_code_object_reader_create_from_file = table->core_->hsa_code_object_reader_create_from_file_fn; hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; + hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; + 
hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; + + hsa_api_.hsa_system_get_info = table->core_->hsa_system_get_info_fn; + hsa_api_.hsa_system_get_major_extension_table = table->core_->hsa_system_get_major_extension_table_fn; hsa_api_.hsa_amd_agent_iterate_memory_pools = table->amd_ext_->hsa_amd_agent_iterate_memory_pools_fn; hsa_api_.hsa_amd_memory_pool_get_info = table->amd_ext_->hsa_amd_memory_pool_get_info_fn; hsa_api_.hsa_amd_memory_pool_allocate = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; hsa_api_.hsa_amd_agents_allow_access = table->amd_ext_->hsa_amd_agents_allow_access_fn; - hsa_api_.hsa_amd_memory_async_copy = table->amd_ext_->hsa_amd_memory_async_copy_fn; hsa_api_.hsa_amd_signal_async_handler = table->amd_ext_->hsa_amd_signal_async_handler_fn; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = table->amd_ext_->hsa_amd_profiling_set_profiler_enabled_fn; hsa_api_.hsa_amd_profiling_get_async_copy_time = table->amd_ext_->hsa_amd_profiling_get_async_copy_time_fn; hsa_api_.hsa_amd_profiling_get_dispatch_time = table->amd_ext_->hsa_amd_profiling_get_dispatch_time_fn; } else { hsa_api_.hsa_init = hsa_init; hsa_api_.hsa_shut_down = hsa_shut_down; hsa_api_.hsa_agent_get_info = hsa_agent_get_info; - hsa_api_.hsa_iterate_agents = hsa_iterate_agents; hsa_api_.hsa_queue_create = hsa_queue_create; @@ -212,31 +214,35 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_queue_load_write_index_relaxed = hsa_queue_load_write_index_relaxed; hsa_api_.hsa_queue_store_write_index_relaxed = hsa_queue_store_write_index_relaxed; hsa_api_.hsa_queue_load_read_index_relaxed = hsa_queue_load_read_index_relaxed; + hsa_api_.hsa_signal_create = hsa_signal_create; hsa_api_.hsa_signal_destroy = hsa_signal_destroy; + hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; hsa_api_.hsa_signal_store_relaxed = hsa_signal_store_relaxed; hsa_api_.hsa_signal_wait_scacquire = hsa_signal_wait_scacquire; - - 
hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; - hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; - hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; - hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; - - hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; - - hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + hsa_api_.hsa_signal_store_screlease = hsa_signal_store_screlease; hsa_api_.hsa_code_object_reader_create_from_file = hsa_code_object_reader_create_from_file; hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; + hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; + hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; + + hsa_api_.hsa_system_get_info = hsa_system_get_info; + hsa_api_.hsa_system_get_major_extension_table = hsa_system_get_major_extension_table; + + hsa_api_.hsa_amd_agent_iterate_memory_pools = hsa_amd_agent_iterate_memory_pools; + hsa_api_.hsa_amd_memory_pool_get_info = hsa_amd_memory_pool_get_info; + hsa_api_.hsa_amd_memory_pool_allocate = hsa_amd_memory_pool_allocate; + hsa_api_.hsa_amd_agents_allow_access = hsa_amd_agents_allow_access; + hsa_api_.hsa_amd_memory_async_copy = hsa_amd_memory_async_copy; hsa_api_.hsa_amd_signal_async_handler = hsa_amd_signal_async_handler; + hsa_api_.hsa_amd_profiling_set_profiler_enabled = hsa_amd_profiling_set_profiler_enabled; hsa_api_.hsa_amd_profiling_get_async_copy_time = hsa_amd_profiling_get_async_copy_time; hsa_api_.hsa_amd_profiling_get_dispatch_time = hsa_amd_profiling_get_dispatch_time; - hsa_api_.hsa_signal_load_relaxed = hsa_signal_load_relaxed; - hsa_api_.hsa_signal_store_screlease = 
hsa_signal_store_screlease; } } } @@ -329,6 +335,11 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { status = hsa_api_.hsa_amd_agent_iterate_memory_pools(agent, FindStandardPool, &agent_info->gpu_pool); CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); + // GFX8 and GFX9 SGPR/VGPR block sizes + agent_info->sgpr_block_dflt = (strcmp(agent_info->gfxip, "gfx8") == 0) ? 1 : 2; + agent_info->sgpr_block_size = 8; + agent_info->vgpr_block_size = 4; + // Set GPU index agent_info->dev_index = gpu_list_.size(); gpu_list_.push_back(agent_info); @@ -672,7 +683,57 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } +const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { + std::lock_guard lck(mutex_); + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); + abort(); + } + return strdup(it->second); +} + +void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { + std::lock_guard lck(mutex_); + executable_tracking_on_ = true; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; +} + +hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { + hsa_symbol_kind_t value = (hsa_symbol_kind_t)0; + hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); + CHECK_STATUS("Error in getting symbol info", status); + if (value == HSA_SYMBOL_KIND_KERNEL) { + uint64_t addr = 0; + uint32_t len = 0; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); + CHECK_STATUS("Error in getting kernel object", status); + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); + CHECK_STATUS("Error in getting name len", status); + char *name = new 
char[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + CHECK_STATUS("Error in getting kernel name", status); + name[len] = 0; + auto ret = symbols_map_->insert({addr, name}); + if (ret.second == false) { + delete[] ret.first->second; + ret.first->second = name; + } + } + return HSA_STATUS_SUCCESS; +} + +hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) { + std::lock_guard lck(mutex_); + if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + CHECK_STATUS("Error in iterating executable symbols", status); + return hsa_api_.hsa_executable_freeze(executable, options);; +} + std::atomic HsaRsrcFactory::instance_{}; HsaRsrcFactory::mutex_t HsaRsrcFactory::mutex_; HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MAX; hsa_pfn_t HsaRsrcFactory::hsa_api_{}; +bool HsaRsrcFactory::executable_tracking_on_ = false; +HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index 9c0207e2..151dab8e 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -73,7 +73,6 @@ struct hsa_pfn_t { decltype(hsa_init)* hsa_init; decltype(hsa_shut_down)* hsa_shut_down; decltype(hsa_agent_get_info)* hsa_agent_get_info; - decltype(hsa_iterate_agents)* hsa_iterate_agents; decltype(hsa_queue_create)* hsa_queue_create; @@ -81,30 +80,35 @@ struct hsa_pfn_t { decltype(hsa_queue_load_write_index_relaxed)* hsa_queue_load_write_index_relaxed; decltype(hsa_queue_store_write_index_relaxed)* hsa_queue_store_write_index_relaxed; decltype(hsa_queue_load_read_index_relaxed)* hsa_queue_load_read_index_relaxed; + decltype(hsa_signal_create)* hsa_signal_create; decltype(hsa_signal_destroy)* hsa_signal_destroy; + decltype(hsa_signal_load_relaxed)* 
hsa_signal_load_relaxed; decltype(hsa_signal_store_relaxed)* hsa_signal_store_relaxed; decltype(hsa_signal_wait_scacquire)* hsa_signal_wait_scacquire; - - decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; - decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; - decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate; - decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; - decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; - - decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; decltype(hsa_code_object_reader_create_from_file)* hsa_code_object_reader_create_from_file; decltype(hsa_executable_create_alt)* hsa_executable_create_alt; decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(hsa_executable_freeze)* hsa_executable_freeze; decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; + decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; + decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; + + decltype(hsa_system_get_info)* hsa_system_get_info; + decltype(hsa_system_get_major_extension_table)* hsa_system_get_major_extension_table; + + decltype(hsa_amd_agent_iterate_memory_pools)* hsa_amd_agent_iterate_memory_pools; + decltype(hsa_amd_memory_pool_get_info)* hsa_amd_memory_pool_get_info; + decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate; + decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access; + decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy; decltype(hsa_amd_signal_async_handler)* hsa_amd_signal_async_handler; + decltype(hsa_amd_profiling_set_profiler_enabled)* hsa_amd_profiling_set_profiler_enabled; decltype(hsa_amd_profiling_get_async_copy_time)* hsa_amd_profiling_get_async_copy_time; decltype(hsa_amd_profiling_get_dispatch_time)* 
hsa_amd_profiling_get_dispatch_time; - decltype(hsa_signal_load_relaxed)* hsa_signal_load_relaxed; - decltype(hsa_signal_store_screlease)* hsa_signal_store_screlease; }; // Encapsulates information about a Hsa Agent such as its @@ -156,6 +160,11 @@ struct AgentInfo { // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; + + // SGPR/VGPR block sizes + uint32_t sgpr_block_dflt; + uint32_t sgpr_block_size; + uint32_t vgpr_block_size; }; // HSA timer class @@ -166,9 +175,9 @@ class HsaTimer { static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; - HsaTimer() { + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { timestamp_t sysclock_hz = 0; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY)", status); sysclock_factor_ = (freq_t)1000000000 / (freq_t)sysclock_hz; } @@ -184,7 +193,7 @@ class HsaTimer { // Return timestamp in 'ns' timestamp_t timestamp_ns() const { timestamp_t sysclock; - hsa_status_t status = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); + hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &sysclock); CHECK_STATUS("hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP)", status); return sysclock_to_ns(sysclock); } @@ -192,6 +201,8 @@ class HsaTimer { private: // Timestamp frequency factor freq_t sysclock_factor_; + // HSA API table + const hsa_pfn_t* const hsa_api_; }; class HsaRsrcFactory { @@ -317,6 +328,11 @@ class HsaRsrcFactory { static uint64_t Submit(hsa_queue_t* queue, const void* packet); static uint64_t Submit(hsa_queue_t* queue, const void* packet, size_t size_bytes); + // Enable executables loading tracking + static bool IsExecutableTracking() { return executable_tracking_on_; } + static void EnableExecutableTracking(HsaApiTable* 
table); + static const char* GetKernelName(uint64_t addr); + // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); static const hsa_pfn_t* HsaApi() { return &hsa_api_; } @@ -381,6 +397,13 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; + // Executables loading tracking + typedef std::map symbols_map_t; + static symbols_map_t* symbols_map_; + static bool executable_tracking_on_; + static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); + // HSA runtime API table static hsa_pfn_t hsa_api_; From 3eb4a67795904a51f80ed93ea55147954bce49a2 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 19 Dec 2019 19:49:37 -0600 Subject: [PATCH 072/168] fixing stat file names eexpression --- bin/tblextr.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 490cdb8b..2c4e442b 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -446,6 +446,10 @@ def fill_ops_db(table_name, db, indir): statfile = re.sub(r'\.csv$', '.stats.csv', csvfile) jsonfile = re.sub(r'\.csv$', '.json', csvfile) + hsa_statfile = re.sub(r'\.stats\.csv$', r'.hsa_stats.csv', statfile) + hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile) + kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) + with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) @@ -494,9 +498,8 @@ def fill_ops_db(table_name, db, indir): dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) if hsa_trace_found: - statfile = re.sub(r'stats', r'hsa_stats', statfile) dform.post_process_data(db, 'HSA') - dform.gen_table_bins(db, 'HSA', statfile, 'Name', 'DurationNs') + dform.gen_table_bins(db, 'HSA', hsa_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) if hsa_activity_found: @@ -504,18 
+507,16 @@ def fill_ops_db(table_name, db, indir): dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) if hip_trace_found: - statfile = re.sub(r'stats', r'hip_stats', statfile) dform.post_process_data(db, 'HIP') - dform.gen_table_bins(db, 'HIP', statfile, 'Name', 'DurationNs') + dform.gen_table_bins(db, 'HIP', hip_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) if kfd_trace_found: - statfile = re.sub(r'stats', r'kfd_stats', statfile) dform.post_process_data(db, 'KFD') - dform.gen_table_bins(db, 'KFD', statfile, 'Name', 'DurationNs') + dform.gen_table_bins(db, 'KFD', kfd_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'KFD', START_US, jsonfile) if any_trace_found: From 3667c0439a776b3bc84a0fe769176e9b9f1ffd8b Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 20 Dec 2019 11:18:43 -0600 Subject: [PATCH 073/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f63df0b..9ed4010a 100644 --- a/README.md +++ b/README.md @@ -160,7 +160,7 @@ Options: See roctracer documentation on rocTX API details. Configuration file: - You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:$HOME: First the configuration file is looking in the current directory, then in your home, and then in the package directory. Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. 
An example of 'rpl_rc.xml': From fa28e6d677502db025e2fd7223afdb95242af97f Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 31 Dec 2019 06:16:33 -0600 Subject: [PATCH 074/168] spec update --- doc/rocprofiler_spec.md | 414 +++++++++++++++++++++++++++++++--------- 1 file changed, 326 insertions(+), 88 deletions(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index a8ef7a2b..496595f1 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -64,7 +64,6 @@ To check the conformance of used library APi header and the library binary the v macros and API methods can be used. Returning the error and error string methods: -- rocprofiler_errno - method for returning the error number - rocprofiler_error_string - method for returning the error string Library version: @@ -114,12 +113,18 @@ The library provides back compatibility if the library major version is less or Returned API status: - hsa_status_t - HSA status codes are used from hsa.h header +Loadable plugin on-load/unload methods: +- rocprofiler_settings_t – global properties +- OnLoadTool +- OnLoadToolProp +- OnUnloadTool + Info API: - rocprofiler_info_kind_t - profiling info kind - rocprofiler_info_query_t - profiling info query - rocprofiler_info_data_t - profiling info data - rocprofiler_get_info - return the info for a given info kind -- rocprofiler_iterate_info - iterate over the info for a given info kind +- rocprofiler_iterate_info - iterate over the info for a given info kind - rocprofiler_query_info - iterate over the info for a given info query Context API: @@ -137,6 +142,8 @@ Context API: - rocprofiler_get_group - return profiling group for a given index - rocprofiler_get_metrics - method for calculating the metrics data - rocprofiler_iterate_trace_data - method for iterating output trace data instances +- rocprofiler_time_id_t - supported time value ID enumeration +- rocprofiler_get_time – return time for a given time ID and profiling timestamp value Sampling API: - 
rocprofiler_start - start profiling @@ -152,10 +159,44 @@ Sampling API: Intercepting API: - rocprofiler_callback_t - profiling callback type - rocprofiler_callback_data_t - profiling callback data type +- rocprofiler_dispatch_record_t – dispatch record +- rocprofiler_queue_callbacks_t – queue callbacks, dispatch/destroy - rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks - rocprofiler_remove_queue_callbacks - remove queue callbacks + +Context pool API: +- rocprofiler_pool_t – context pool handle +- rocprofiler_pool_entry_t – context pool entry +- rocprofiler_pool_properties_t – context pool properties +- rocprofiler_pool_handler_t – context pool completion handler +- rocprofiler_pool_open - context pool open +- rocprofiler_pool_close - context pool close +- rocprofiler_pool_fetch – fetch and empty context entry to pool +- rocprofiler_pool_release – release a context entry +- rocprofiler_pool_iterate – iterated fetched context entries +- rocprofiler_pool_flush – flush completed context entries +``` +### 4.2. Loading and Configuring +``` +Loading and Configuring +The profiling properties can be set by profiler plugin on loading by ROC runtime. +The profiler library plugin can be set by ROCP_TOOL_LIB env var. + +Global properties: + +typedef struct { + uint32_t intercept_mode; + uint64_t timeout; + uint32_t timestamp_on; +} rocprofiler_settings_t; + +On load/unload methods defined in profiling tool library loaded by ROCP_TOOL_LIB env var: +extern "C" void OnLoadTool(); +extern "C" void OnLoadToolProp(rocprofiler_settings_t* settings); +extern "C" void OnUnloadTool(); + ``` -### 4.2. Info API +### 4.3. Info API ``` The profiling metrics are defined by name and the traces are defined by name and parameters. All supported features can be iterated using 'iterate_info/query_info' methods. The counter @@ -163,6 +204,7 @@ names are defined in counters table configuration file, each counter has a uniqu defined by block name and event id. 
The traces and trace parameters names are same as in the hardware documentation and the parameters codes are rocprofiler_feature_parameter_t values, see below in the "Context API" section. + Profiling info kind: typedef enum { @@ -220,7 +262,7 @@ has_status_t rocprofiler_query_info( hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback void *data); // data passed to callback ``` -### 4.3. Context API +### 4.4. Context API ``` Profiling context is accumulating all profiling information including profiling features which carry profiling data, required buffers for profiling command packets and output data. @@ -381,8 +423,22 @@ hsa_status_t rocprofiler_iterate_trace_data( hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate // the output data void* callback_data); // [in/out] passed to callback data + +Converting a profiling timestamp to a time value for a supported time ID. +Supported time value ID enumeration: +typedef enum { + ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time +} rocprofiler_time_id_t; + +Method for converting of profiling timestamp to time value for a given time ID: +hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, // identifier of the particular + // time to convert the timestamp + uint64_t timestamp, // profiling timestamp + uint64_t* value_ns); // [out] returned time ‘ns’ value ``` -### 4.4. Sampling API +### 4.5. Sampling API ``` The API supports the counters sampling usage model with start/read/stop methods and also lets to wait for the profiling data in the intercepting usage model with get_data method. @@ -423,7 +479,7 @@ hsa_status_t rocprofiler_group_read( hsa_status_t rocprofiler_group_get_data( rocprofiler_group_t* group); // [in/out] profiling group ``` -### 4.5. Intercepting API +### 4.6. 
Intercepting API ``` The library provides a callback API for enabling profiling for the kernels dispatched to HSA AQL queues. The API enables per-kernel profiling data collection. @@ -471,34 +527,101 @@ hsa_status_t rocprofiler_set_intercepting( hsa_status_t rocprofiler_remove_intercepting(); ``` +### 4.7. Profiling Context Pools +``` +The API provide capability to create a context pool for a given agent and a set of features, to fetch/release a context entry, to register a callback for pool’s contexts completion. +Profiling pool handle: +typename rocprofiler_pool_t; +Profiling pool entry: +typedef struct { + rocprofiler_t* context; // context object + void* payload; // payload data object +} rocprofiler_pool_entry_t; + +Profiling handler, calling on profiling completion: +typedef bool (*rocprofiler_pool_handler_t)(const rocprofiler_pool_entry_t* entry, void* arg); + +Profiling properties: +typedef struct { + uint32_t num_entries; // pool size entries + uint32_t payload_bytes; // payload size bytes + rocprofiler_pool_handler_t handler; // handler on context completion + void* handler_arg; // the handler arg +} rocprofiler_pool_properties_t; + +Open profiling pool: +hsa_status_t rocprofiler_pool_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t*); // pool properties + +Close profiling pool: +hsa_status_t rocprofiler_pool_close( + rocprofiler_pool_t* pool); // profiling pool handle + +Fetch profiling pool entry: +hsa_status_t rocprofiler_pool_fetch( + rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry); // [out] empty profiling pool entry + +Release profiling pool entry: +hsa_status_t rocprofiler_pool_release( + rocprofiler_pool_entry_t* entry); // released profiling pool entry + +Iterate fetched 
profiling pool entries: +hsa_status_t rocprofiler_pool_iterate( + rocprofiler_pool_t* pool, // profiling pool handle + hsa_status_t (*callback)(rocprofiler_pool_entry_t* entry, void* data), + // callback + void *data); // [in/out] data passed to callback + +Flush completed entries in profiling pool: +hsa_status_t rocprofiler_pool_flush( + rocprofiler_pool_t* pool); // profiling pool handle +``` ## 5. Application code examples ### 5.1. Querying available metrics ``` Info data callback: - hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { - switch (info.kind) { - case ROCPROFILER_INFO_KIND_METRIC: { - printf("metric %s, description %s\n", - info.metric.name, - info.metric.description); - break; - } - default: - printf("wrong info kind %u\n", kind); - return HSA_STATUS_ERROR; - } - return HSA_STATUS_SUCCESS; - } + hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { + switch (info.kind) { + case ROCPROFILER_INFO_KIND_METRIC: { + if (info.metric.expr != NULL) { + fprintf(stdout, "Derived counter: gpu-agent%d : %s : %s\n", + info.agent_index, info.metric.name, info.metric.description); + fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); + } else { + fprintf(stdout, "Basic counter: gpu-agent%d : %s", + info.agent_index, info.metric.name); + if (info.metric.instances > 1) { + fprintf(stdout, "[0-%u]", info.metric.instances - 1); + } + fprintf(stdout, " : %s\n", info.metric.description); + fprintf(stdout, " block %s has %u counters\n", + info.metric.block_name, info.metric.block_counters); + } + fflush(stdout); + break; + } + default: + printf("wrong info kind %u\n", info.kind); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; + } Printing all available metrics: - hsa_status_t status = rocprofiler_iterate_info( - agent, - ROCPROFILER_INFO_KIND_METRIC, - info_data_callback, - NULL); - + hsa_status_t status = rocprofiler_iterate_info( + agent, + ROCPROFILER_INFO_KIND_METRIC, + 
info_data_callback, + NULL); + ``` ### 5.2. Profiling code example ``` @@ -509,85 +632,200 @@ saved and then direct context method rocprofiler_get_data with default group ind can be used. hsa_status_t_dispatch_callback( - const rocprofiler_callback_data_t* callback_data, - void* user_data, - rocprofiler_group_t* group) + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t* group) { - hsa_status_t status = HSA_STATUS_SUCCESS; - // Profiling context - rocprofiler_t* context; - // Profiling info objects - rocprofiler_feature_t features* = new rocprofiler_feature_t[2]; - // Tracing parameters - rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; - - // Setting profiling features - features[0].type = ROCPROFILER_METRIC; - features[0].name = "L1_MISS_RATIO"; - features[1].type = ROCPROFILER_METRIC; - features[1].name = "DRAM_BANDWIDTH"; - - // Creating profiling context - status = rocprofiler_open(callback_data->dispatch.agent, features, 2, &context, - ROCPROFILER_MODE_SINGLEGROUP, NULL); - - - // Get the profiling group - // For general case with many groups there is rocprofiler_group_count() API - const uint32_t group_index = 0 - status = rocprofiler_get_group(context, group_index, group); - - - // In SINGLEGROUP mode the context handle itself can be saved, because there is just one group - - - return status; + hsa_status_t status = HSA_STATUS_SUCCESS; + // Profiling context + rocprofiler_t* context; + // Profiling info objects + rocprofiler_feature_t features* = new rocprofiler_feature_t[2]; + // Tracing parameters + rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; + + // Setting profiling features + features[0].type = ROCPROFILER_METRIC; + features[0].name = "L1_MISS_RATIO"; + features[1].type = ROCPROFILER_METRIC; + features[1].name = "DRAM_BANDWIDTH"; + + // Creating profiling context + status = rocprofiler_open(callback_data->dispatch.agent, features, 2, 
&context, + ROCPROFILER_MODE_SINGLEGROUP, NULL); + + + // Get the profiling group + // For general case with many groups there is rocprofiler_group_count() API + const uint32_t group_index = 0 + status = rocprofiler_get_group(context, group_index, group); + + + // In SINGLEGROUP mode the context handle itself can be saved, because there is just one group + + + return status; } +Profiling tool constructor is adding the dispatch callback: + void profiling_libary_constructor() { - // Defining callback data, no data in this simple example - void* callback_data = NULL; + // Defining callback data, no data in this simple example + void* callback_data = NULL; - // Adding observers - hsa_sttaus_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); - + // Adding observers + hsa_sttaus_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); + - // Dispatching profiled kernel - + // Dispatching profiled kernel + } void profiling_libary_destructor() { - > { - // In SINGLEGROUP mode the rocprofiler_get_group() method with default zero group - // index can be used, if context handle would be saved - status = rocprofiler_group_get_data(entry->group); - - status = rocprofiler_get_metrics(entry->group->context); - - status = rocprofiler_close(entry->group->context); - - - dispatch_data, entry->features, entry->features_count)>; - } + > { + // In SINGLEGROUP mode the rocprofiler_get_group() method with default zero group + // index can be used, if context handle would be saved + status = rocprofiler_group_get_data(entry->group); + + status = rocprofiler_get_metrics(entry->group->context); + + status = rocprofiler_close(entry->group->context); + + + dispatch_data, entry->features, entry->features_count)>; + } } ``` ### 5.3. Option to use completion callback ``` Creating profiling context with completion callback: - . . . 
- rocprofiler_properties_t properties = {}; - properties.callback = completion_callback; - properties.callback_arg = NULL; // no args defined - status = rocprofiler_open(agent, features, 3, &context, - ROCPROFILER_MODE_SINGLEGROUP, properties); - - . . . + . . . + rocprofiler_properties_t properties = {}; + properties.callback = completion_callback; + properties.callback_arg = NULL; // no args defined + status = rocprofiler_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . Definition of completion callback: void completion_callback(profiler_group_t group, void* arg) { - - hsa_status_t status = rocprofiler_close(group.context); - + + hsa_status_t status = rocprofiler_close(group.context); + +} +``` +### 5.4. Option to Use Context Pool +``` +Code example of context pool usage. +Creating profiling contexts pool: + . . . + rocprofiler_pool_properties_t properties{}; + properties.num_entries = 100; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_handler; + properties.handler_arg = handler_arg; + status = rocprofiler_pool_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Fetching a context entry: + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast + (pool_entry.payload); +``` +### 5.5. Standalone Sampling Usage Code Example +``` +The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. To enable the sampling mode, the profiling mode in all user queues should be enabled. It can be done by loading ROC-profiler library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. 
+ // Sampling rate + uint32_t sampling_rate = ; + // Sampling count + uint32_t sampling_count = ; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + // HSA agent + hsa_agent_t agent; + // Profiling context + rocprofiler_t* context = NULL; + // Profiling properties + rocprofiler_properties_t properties; + + // Getting HSA agent + + + // Profiling feature objects + const unsigned feature_count = 2; + rocprofiler_feature_t feature[feature_count]; + + // Counters and metrics + feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[0].name = "GPUBusy"; + feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[1].name = "SQ_WAVES"; + + // Creating profiling context with standalone queue + properties = {}; + properties.queue_depth = 128; + status = rocprofiler_open(agent, feature, feature_count, &context, + ROCPROFILER_MODE_STANDALONE| ROCPROFILER_MODE_CREATEQUEUE| + ROCPROFILER_MODE_SINGLEGROUP, &properties); + + + // Start counters and sample them in the loop with the sampling rate + status = rocprofiler_start(context, 0); + + + for (unsigned ind = 0; ind < sampling_count; ++ind) { + sleep(sampling_rate); + status = rocprofiler_read(context, 0); + + status = rocprofiler_get_data(context, 0); + + status = rocprofiler_get_metrics(context); + + print_results(feature, feature_count); + } + + // Stop counters + status = rocprofiler_stop(context, group_n); + + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + status = rocprofiler_close(context); + +``` +### 5.6. 
Printing Out Profiling Results +``` +Below is a code example for printing out the profiling results from profiling features array: +void print_results(rocprofiler_feature_t* feature, uint32_t feature_count) { + for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) + { + std::cout << (p - feature) << ": " << p->name; + switch (p->data.kind) { + case ROCPROFILER_DATA_KIND_INT64: + std::cout << " result_int64 (" << p->data.result_int64 << ")" + << std::endl; + break; + + case ROCPROFILER_DATA_KIND_BYTES: { + std::cout << " result_bytes ptr(" << p->data.result_bytes.ptr << + ") " << " size(" << p->data.result_bytes.size << ")" + << " instance_count(" << p->data.result_bytes.instance_count + << ")"; + break; + } + default: + std::cout << "bad result kind (" << p->data.kind << ")" + << std::endl; + + } + } } ``` From ca823d250b17b8b5d3989a29673b3db451337901 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 31 Dec 2019 06:20:30 -0600 Subject: [PATCH 075/168] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 496595f1..57bf2739 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -113,7 +113,7 @@ The library provides back compatibility if the library major version is less or Returned API status: - hsa_status_t - HSA status codes are used from hsa.h header -Loadable plugin on-load/unload methods: +Loading and Configuring, loadable plugin on-load/unload methods: - rocprofiler_settings_t – global properties - OnLoadTool - OnLoadToolProp From d93e1b447adec157321ab47ef8539fdc8234c46f Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 31 Dec 2019 07:26:27 -0600 Subject: [PATCH 076/168] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 9ed4010a..c1b7692f 100644 --- 
a/README.md +++ b/README.md @@ -4,6 +4,9 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. +['rocprof' cmdline tool specification](doc/rocprof.md) +[API specification](doc/rocprofiler_spec.md) + ## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 64bd4dec0e34ce59df8ba213593d11e17cc9547e Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 31 Dec 2019 07:27:17 -0600 Subject: [PATCH 077/168] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c1b7692f..52c5a27e 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ HW specific low-level performance analysis interface for profiling of GPU comput profiling includes HW performance counters with complex performance metrics. ['rocprof' cmdline tool specification](doc/rocprof.md) + [API specification](doc/rocprofiler_spec.md) ## Metrics From 95a4317bc0f61be8a64c4d4104cedb2dc7c9e210 Mon Sep 17 00:00:00 2001 From: Srinivasan Subramanian Date: Wed, 18 Sep 2019 18:26:18 -0700 Subject: [PATCH 078/168] multiple rocm version support, remove shared library conflicts Change-Id: Ic618c90be4c6274b4c6fbc43e46c321d60fe1c28 --- CMakeLists.txt | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b81b5a6..edc30d1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,9 @@ cmake_minimum_required ( VERSION 2.8.12 ) +# Install prefix +set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix default") + ## Verbose output. 
set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) @@ -45,13 +48,22 @@ message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" ) set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) -set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" ) set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) endif () set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) +set ( LIB_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( LIB_VERSION_MINOR ${VERSION_MINOR} ) +if ( ${ROCM_PATCH_VERSION} ) + set ( LIB_VERSION_PATCH ${ROCM_PATCH_VERSION} ) +else() + set ( LIB_VERSION_PATCH ${VERSION_PATCH} ) +endif() +set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" ) +message ( "-- LIB-VERSION STRING: ${LIB_VERSION_STRING}" ) + ## Set target and root/lib/test directory set ( TARGET_NAME "${ROCPROFILER_TARGET}" ) set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) @@ -63,7 +75,7 @@ include ( ${LIB_DIR}/CMakeLists.txt ) ## Set the VERSION and SOVERSION values set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION "${LIB_VERSION_STRING}" ) -set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" ) +set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" ) ## If the library is a release, strip the target library if ( "${CMAKE_BUILD_TYPE}" STREQUAL release ) From 6cc11fa0c5498ae9a8ffbb3c5866dc103d4da2d3 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 6 Jan 2020 13:37:10 -0600 Subject: [PATCH 079/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
52c5a27e..50b61cf7 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ profiling includes HW performance counters with complex performance metrics. ['rocprof' cmdline tool specification](doc/rocprof.md) -[API specification](doc/rocprofiler_spec.md) +['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) ## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 2904d293d704f5399f95b773daf908746d9c28e4 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 6 Jan 2020 13:38:02 -0600 Subject: [PATCH 080/168] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 50b61cf7..b85f5084 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,7 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. +## Documentation ['rocprof' cmdline tool specification](doc/rocprof.md) ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) From 145232e0def29a20a5d405a0a229c20f621c0319 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 6 Jan 2020 13:43:51 -0600 Subject: [PATCH 081/168] Update README.md --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b85f5084..331a6ffc 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,8 @@ HW specific low-level performance analysis interface for profiling of GPU comput profiling includes HW performance counters with complex performance metrics. 
## Documentation -['rocprof' cmdline tool specification](doc/rocprof.md) - -['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) +- ['rocprof' cmdline tool specification](doc/rocprof.md) +- ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) ## Metrics [The link to profiler default metrics XML specification](test/tool/metrics.xml) From 628051723ed4f757c2615dae32d9480a529214ae Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 7 Jan 2020 11:11:52 -0600 Subject: [PATCH 082/168] adding: get_time() API; C test; --- bin/rpl_run.sh | 8 +++- inc/rocprofiler.h | 14 ++++++- src/core/rocprofiler.cpp | 9 ++++ src/util/hsa_rsrc_factory.cpp | 5 +++ src/util/hsa_rsrc_factory.h | 79 +++++++++++++++++++++++++++++++++++ test/CMakeLists.txt | 10 +++-- test/app/c_test.c | 25 +++++++++++ test/run.sh | 21 +++++----- 8 files changed, 155 insertions(+), 16 deletions(-) create mode 100644 test/app/c_test.c diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 86383d14..ce492e81 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -179,6 +179,7 @@ usage() { echo " " echo " " echo "" + echo " --trace-start - to enable tracing on start [on]" echo " --trace-period - to enable trace with initial delay, with periodic sample length and rate" echo " Supported time formats: " echo " --obj-tracking - to turn on/off kernels code objects tracking [off]" @@ -232,12 +233,13 @@ run() { fi API_TRACE="" + LD_PRELOAD="" if [ "$ROCTX_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":roctx" fi if [ "$KFD_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":kfd" - export LD_PRELOAD="libkfdwrapper64.so libhsakmt.so.1" + export LD_PRELOAD="libkfdwrapper64.so libhsakmt.so.1 $LD_PRELOAD" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" @@ -392,6 +394,10 @@ while [ 1 ] ; do export ROCP_TIMESTAMP_ON=1 GEN_STATS=1 HIP_TRACE=1 + elif [ "$1" = "--trace-start" ] ; then + if [ "$2" = "off" ] ; then + export ROCP_CTRL_RATE="-1" + fi elif [ "$1" = "--trace-period" ] ; then 
period_expr="^\([^:]*\):\([^:]*\):\([^:]*\)$" period_ck=`echo "$2" | sed -n "s/"${period_expr}"/ok/p"` diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 313f7f42..31082cf4 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -199,6 +199,18 @@ hsa_status_t rocprofiler_close(rocprofiler_t* context); // [in] profiling conte hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling context uint32_t group_index); // group index +// Supported time value ID +typedef enum { + ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time +} rocprofiler_time_id_t; + +// Return time value for a given time ID and profiling timestamp +hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, // identifier of the particular time to convert the timestamp + uint64_t timestamp, // profiling timestamp + uint64_t* value_ns); // [out] returned time 'ns' value + //////////////////////////////////////////////////////////////////////////////// // Queue callbacks // @@ -376,7 +388,7 @@ hsa_status_t rocprofiler_query_info( hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback void *data); // [in/out] data passed to callback -// Creates a profiled queue. All dispatches on this queue will be profiled +// Create a profiled queue.
All dispatches on this queue will be profiled hsa_status_t rocprofiler_queue_create_profiled( hsa_agent_t agent_handle,uint32_t size, hsa_queue_type32_t type, void (*callback)(hsa_status_t status, hsa_queue_t* source, void* data), diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 61fd4619..3f1362a7 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -843,4 +843,13 @@ PUBLIC_API hsa_status_t rocprofiler_queue_create_profiled( return rocprofiler::InterceptQueue::QueueCreateTracked(agent, size, type, callback, data, private_segment_size, group_segment_size, queue); } +// Return in 'value_ns' the time value for a given time ID and profiling timestamp; forwards the status of HsaRsrcFactory::GetTime +PUBLIC_API hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, + uint64_t timestamp, + uint64_t* value_ns) +{ + return rocprofiler::util::HsaRsrcFactory::Instance().GetTime(time_id, timestamp, value_ns); +} + } // extern "C" diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 65f94357..9ce362d4 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -150,6 +150,11 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize CHECK_STATUS("HSA timer allocation failed", (timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + // Time correlation + const uint32_t corr_iters = 1000; + CorrelateTime(HsaTimer::TIME_ID_CLOCK_REALTIME, corr_iters); + CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters); + // System timeout timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index bf7f5fcf..0362bc2c 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -35,6 +35,7 @@ POSSIBILITY OF SUCH DAMAGE.
#include #include #include +#include #include #include @@ -177,6 +178,12 @@ class HsaTimer { static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; + enum time_id_t { + TIME_ID_CLOCK_REALTIME = 0, + TIME_ID_CLOCK_MONOTONIC = 1, + TIME_ID_NUMBER + }; + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { timestamp_t sysclock_hz = 0; hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); @@ -192,6 +199,11 @@ class HsaTimer { return timestamp_t((freq_t)time / sysclock_factor_); } + // Method for timespec/ns conversion + timestamp_t timespec_to_ns(const timespec& time) const { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; + } + // Return timestamp in 'ns' timestamp_t timestamp_ns() const { timestamp_t sysclock; @@ -200,6 +212,54 @@ class HsaTimer { return sysclock_to_ns(sysclock); } + // Return time in 'ns' + timestamp_t clocktime_ns(clockid_t clock_id) const { + timespec time; + clock_gettime(clock_id, &time); + return timespec_to_ns(time); + } + + // Return pair of correlated values of profiling timestamp and time with + // correlation error for a given time ID and number of iterations + void correlated_pair_ns(time_id_t time_id, uint32_t iters, + timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) { + clockid_t clock_id = 0; + switch (clock_id) { + case TIME_ID_CLOCK_REALTIME: + clock_id = CLOCK_REALTIME; + break; + case TIME_ID_CLOCK_MONOTONIC: + clock_id = CLOCK_MONOTONIC; + break; + default: + CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); + } + + std::vector ts_vec(iters); + std::vector tm_vec(iters); + const uint32_t steps = iters - 1; + + for (uint32_t i = 0; i < iters; ++i) { + hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]); + clock_gettime(clock_id, &tm_vec[i]); + } + + const timestamp_t ts_base = sysclock_to_ns(ts_vec.front()); + const timestamp_t tm_base = timespec_to_ns(tm_vec.front()); + const 
timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps); + + timestamp_t ts_accum = 0; + timestamp_t tm_accum = 0; + for (uint32_t i = 0; i < iters; ++i) { + ts_accum += (ts_vec[i] - ts_base); + tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base); + } + + *timestamp_v = (ts_accum / iters) + ts_base + error; + *time_v = (tm_accum / iters) + tm_base; + *error_v = error; + } + private: // Timestamp frequency factor freq_t sysclock_factor_; @@ -359,6 +419,21 @@ class HsaRsrcFactory { if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } + void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) { + timestamp_t timestamp_v = 0; + timestamp_t time_v = 0; + timestamp_t error_v = 0; + timer_->correlated_pair_ns(time_id, iters, ×tamp_v, &time_v, &error_v); + time_shift_[time_id] = time_v - timestamp_v; + time_error_[time_id] = error_v; + } + + hsa_status_t GetTime(uint32_t time_id, uint64_t value, uint64_t* time) { + if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; + *time = value + time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -423,6 +498,10 @@ class HsaRsrcFactory { // HSA timer HsaTimer* timer_; + // Time shift array to support time conversion + timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER]; + timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER]; + // CPU/kern-arg memory pools hsa_amd_memory_pool_t *cpu_pool_; hsa_amd_memory_pool_t *kern_arg_pool_; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9212f2af..1ae8a554 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -32,6 +32,10 @@ if ( NOT DEFINED TEST_DIR ) include ( env ) endif () +## C test +add_executable ( "c_test" ${TEST_DIR}/app/c_test.c ) +target_include_directories ( "c_test" PRIVATE ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) + ## Util sources file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" ) @@ 
-67,17 +71,17 @@ execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR} ## Building standalone test executable add_executable ( ${STEXE_NAME} ${STTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) target_include_directories ( ${STEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries( ${STEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +target_link_libraries ( ${STEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) ## Building intercept test executable add_library ( ${INEXE_NAME} SHARED ${INTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) target_include_directories ( ${INEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries( ${INEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +target_link_libraries ( ${INEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) ## Building ctrl test executable add_executable ( ${EXE_NAME} ${CTRL_SRC} ${UTIL_SRC} ${KERN_SRC} ) target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +target_link_libraries ( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/tool/*.xml ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "mkdir -p ${PROJECT_BINARY_DIR}/RESULTS" ) diff --git a/test/app/c_test.c b/test/app/c_test.c new file mode 100644 index 00000000..70c6d306 --- /dev/null +++ b/test/app/c_test.c @@ -0,0 +1,25 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include "inc/rocprofiler.h" +const int ret = 0; +int main() { return ret; } diff --git a/test/run.sh b/test/run.sh index dfd15e34..4612fa1c 100755 --- a/test/run.sh +++ b/test/run.sh @@ -56,8 +56,6 @@ eval_test() { export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # paths to ROC profiler and oher libraries export LD_LIBRARY_PATH=$PWD -# ROC profiler library loaded by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so # enable error messages logging to '/tmp/rocprofiler_log.txt' export ROCPROFILER_LOG=1 # ROC profiler metrics config file @@ -67,8 +65,17 @@ export ROCP_METRICS=metrics.xml # test trace export ROC_TEST_TRACE=1 -## Intercepting usage model test +## C test +eval_test "C test" ./test/c_test +## Standalone sampling usage model test +unset HSA_TOOLS_LIB +unset ROCP_TOOL_LIB +eval_test "Standalone sampling usage model test" ./test/standalone_test + +## Intercepting usage model test +# ROC profiler library loaded by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so export ROCP_KITER=50 @@ -77,13 +84,7 @@ export ROCP_AGENTS=1 export ROCP_THRS=3 eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl" -## Standalone sampling usage model test - -unset ROCP_TOOL_LIB -eval_test "Standalone sampling usage model test" ./test/standalone_test - ## Libtool test - # tool library loaded by ROC profiler export ROCP_TOOL_LIB=libtool.so # ROC profiler kernels timing @@ -111,7 +112,6 @@ export ROCP_INPUT=input1.xml eval_test "'rocprof' libtool test n-threads" ./test/ctrl ## Libtool test, counter sets - # Memcopies tracking export ROCP_MCOPY_TRACKING=1 @@ -121,7 +121,6 @@ export ROCP_INPUT=input2.xml eval_test "libtool test, counter sets" ./test/ctrl ## OpenCL test - export ROCP_OBJ_TRACKING=1 export ROCP_INPUT=input1.xml eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution 
From 294cb95794c0222af8c5c6dd01475cabb3d8cbf9 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 13:10:22 -0600 Subject: [PATCH 083/168] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 331a6ffc..f7aacd06 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,11 @@ profiling includes HW performance counters with complex performance metrics. ## To build with the current installed ROCM: ``` + - Python2.7 is required. + The required modules: CppHeaderParser, argparse. + To install: + sudo pip install CppHeaderParser argparse + - To build and install to /opt/rocm/rocprofiler export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm From 5c8dea1a6fdf7085fe5bc04e57fa8dd340156a81 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 13:12:19 -0600 Subject: [PATCH 084/168] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f7aacd06..85a51619 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ profiling includes HW performance counters with complex performance metrics. ## To build with the current installed ROCM: ``` + - ROCm is required. + - Python2.7 is required. The required modules: CppHeaderParser, argparse. To install: From c69d8367c2eaf172591895a4708bfd60d248c850 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 13:12:51 -0600 Subject: [PATCH 085/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 85a51619..0e065952 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,9 @@ profiling includes HW performance counters with complex performance metrics. ## To build with the current installed ROCM: ``` - - ROCm is required. + - ROCm is required. - - Python2.7 is required. + - Python2.7 is required. 
The required modules: CppHeaderParser, argparse. To install: sudo pip install CppHeaderParser argparse From 2c262c3efdd70929757b760d1967a46312ba004c Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 16:35:52 -0600 Subject: [PATCH 086/168] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0e065952..8281f6e1 100644 --- a/README.md +++ b/README.md @@ -151,10 +151,12 @@ Options: --heartbeat - to print progress heartbeats [0 - disabled] --stats - generating kernel executino stats, file .stats.csv + --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing 'HCC_HOME' env var is required to be set to where 'hcc' is installed. --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing - Generated files: .hsa_stats.txt .json + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible" + Generated files: ._stats.txt .json Traced API list can be set by input .txt or .xml files. Input .txt: hsa: hsa_queue_create hsa_amd_memory_pool_allocate @@ -173,12 +175,13 @@ Options: Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:$HOME: First the configuration file is looking in the current directory, then in your home, and then in the package directory. - Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. 
An example of 'rpl_rc.xml': ``` From 9560d8fc07780b6683a04a13c1d3afa2810dcbd8 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 16:37:02 -0600 Subject: [PATCH 087/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8281f6e1..4a8e949d 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ profiling includes HW performance counters with complex performance metrics. ## Profiling utility usage: ``` - rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] +rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] Options: -h - this help From 8f0e758589dfdc1b01ee91daa525b3eea5fddb4a Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 16:42:35 -0600 Subject: [PATCH 088/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a8e949d..c541d6db 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,6 @@ Configuration file: timestamp=off ctx-limit=0 heartbeat=0 - obj-tracking=0 + obj-tracking=off > ``` From 315a69e2f38e3cbd0fa89d39d5b5e7786aedbbc0 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 7 Jan 2020 17:08:45 -0600 Subject: [PATCH 089/168] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c541d6db..1aafa0a9 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,10 @@ ROC profiler library. Profiling with perf-counters and derived metrics. Library HW specific low-level performance analysis interface for profiling of GPU compute applications. The profiling includes HW performance counters with complex performance metrics. 
+To use the rocProfiler API you need the API header and to link your application with the rocprofiler .so library: + - the API header: /opt/rocm/rocprofiler/include/rocprofiler.h + - the .so library: /opt/rocm/lib/librocprofiler64.so + ## Documentation - ['rocprof' cmdline tool specification](doc/rocprof.md) - ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) From 3acd29ba630b8ffb3af3b166922bcc3d779a7221 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Sat, 11 Jan 2020 04:47:14 -0600 Subject: [PATCH 090/168] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 57bf2739..a7219cec 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -74,7 +74,6 @@ Library version: ``` ### 3.2. Returning the error and error string methods ``` -rocprofiler_errno_t rocprofiler_errno(); const char* rocprofiler_error_string(); ``` ### 3.3. Library version From 47a3ad58344f7f6efa0b84e9a210cf56b1e83dcc Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Sat, 11 Jan 2020 05:04:33 -0600 Subject: [PATCH 091/168] Update rocprof.md --- doc/rocprof.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/doc/rocprof.md b/doc/rocprof.md index 648648fb..717653eb 100644 --- a/doc/rocprof.md +++ b/doc/rocprof.md @@ -153,7 +153,8 @@ The trace is generated by option ‘—hip-trace’ and includes HIP API timelin #### 2.2.2. ROCr runtime trace The trace is generated by option ‘—hsa-trace’ and includes ROCr API timelines and GPU activity at AQL queue level. Also, can provide counters per kernel. #### 2.2.3. KFD driver trace -Is planned to include Thunk API trace and memory allocations/migration tracing. +The trace is generated by option ‘—kfd-trace’ and includes KFD Thunk API timeline. +It is planned to add memory allocations/migration tracing. #### 2.2.4.
Code annotation Support for application code annotation. Start/stop API is supported to programmatically control the profiling. @@ -230,6 +231,7 @@ Options: --verbose - verbose mode, dumping all base counters used in the input metrics --list-basic - to print the list of basic HW counters --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] -i <.txt|.xml file> - input file Input file .txt format, automatically rerun application for every pmc line: @@ -273,12 +275,16 @@ Options: --ctx-wait - to wait for outstanding contexts on profiler exit [on] --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] --stats - generating kernel execution stats, file .stats.csv - --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible - --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible - Generated files: .hsa_stats.txt .json + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible + Generated files: ._stats.txt .json Traced API list can be set by input .txt or .xml files. 
Input .txt: hsa: hsa_queue_create hsa_amd_memory_pool_allocate @@ -288,19 +294,21 @@ Options: + --trace-start - to enable tracing on start [on] --trace-period - to enable trace with initial delay, with periodic sample length and rate Supported time formats: Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:/home/evgeny: First the configuration file is looking in the current directory, then in your home, and then in the package directory. - Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. An example of 'rpl_rc.xml': ``` ## 6. Publicly available counters and metrics From 1b3ce5e0363dbf704de52f164bd4b8498b8f5b8b Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Sat, 11 Jan 2020 05:10:02 -0600 Subject: [PATCH 092/168] Update README.md --- README.md | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1aafa0a9..3d9da12e 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,7 @@ Options: --verbose - verbose mode, dumping all base counters used in the input metrics --list-basic - to print the list of basic HW counters --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] -i <.txt|.xml file> - input file Input file .txt format, automatically rerun application for every pmc line: @@ -153,28 +154,28 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] - --stats - generating kernel executino stats, file .stats.csv + --stats - generating kernel execution stats, file .stats.csv + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and 
Ranges" JSON trace section. --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible - --hip-trace - to trace HIP, generates API execution stats/trace and JSON file viewable in chrome tracing - 'HCC_HOME' env var is required to be set to where 'hcc' is installed. - --hsa-trace - to trace HSA, generates API execution stats/trace and JSON file viewable in chrome tracing - --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible" + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible Generated files: ._stats.txt .json Traced API list can be set by input .txt or .xml files. Input .txt: hsa: hsa_queue_create hsa_amd_memory_pool_allocate Input .xml: - + - - --roctx-trace - to enable rocTX applicatin code annotation trace; should be use in addition to the trace options above. - Will show the application code annotation with rocTX events: roctxMark, roctxRangePush, roctxRangePop in JSON trace - "Markers and Ranges" section. - Application code needs to be explicitely instrumented using rocTX events APIs. - See roctracer documentation on rocTX API details. + + --trace-start - to enable tracing on start [on] + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:$HOME: From 8b402d07f8d6a21397cfa37b98244eec81f40363 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Tue, 14 Jan 2020 11:11:05 -0500 Subject: [PATCH 093/168] Fix crash in the fill_ext_db --- bin/tblextr.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 2c4e442b..10120395 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -239,13 +239,17 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): continue if cid == 2: + if not pid in range_stack: range_stack[pid] = {} pid_stack = range_stack[pid] + if not tid in pid_stack: pid_stack[tid] = [] rec_stack = pid_stack[tid] - rec_vals = rec_stack.pop() - rec_vals[1] = tms + if len(rec_stack) != 0: + rec_vals = rec_stack.pop() + rec_vals[1] = tms - db.insert_entry(table_handle, rec_vals) - record_id += 1 + if len(rec_vals) != 0: + db.insert_entry(table_handle, rec_vals) + record_id += 1 return 1 ############################################################# From 38348e9e170839908e39ad6b1142e4bba1543971 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 16:56:55 -0600 Subject: [PATCH 094/168] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index a7219cec..566ce21f 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -1,4 +1,5 @@ # ROC Profiler Library Specification +API version 7 ## 1. 
High level overview ``` From c771b374326496fe755d6ade4c83f8fe994e8a4e Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 17:09:21 -0600 Subject: [PATCH 095/168] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 566ce21f..efbc727f 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -1,5 +1,5 @@ # ROC Profiler Library Specification -API version 7 +ROC Profiler API version 7 ## 1. High level overview ``` From 2cd889ade33b9ed4308e9e804dcbb10adaa7a002 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 21:58:09 -0600 Subject: [PATCH 096/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d9da12e..e706920e 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,6 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] - --obj-tracking - to turn on/off kernels code objects tracking [off] --stats - generating kernel execution stats, file .stats.csv @@ -176,6 +175,7 @@ Options: --trace-start - to enable tracing on start [on] --trace-period - to enable trace with initial delay, with periodic sample length and rate Supported time formats: + --obj-tracking - to turn on/off kernels code objects tracking [off] Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:$HOME: From b592051c632dd5f13d338747e3398f448b54c108 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 22:01:47 -0600 Subject: [PATCH 097/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e706920e..ec16a82d 100644 --- a/README.md +++ b/README.md @@ -154,9 +154,10 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] --stats - generating kernel execution stats, file .stats.csv - + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible @@ -175,7 +176,6 @@ Options: --trace-start - to enable tracing on start [on] --trace-period - to enable trace with initial delay, with periodic sample length and rate Supported time formats: - --obj-tracking - to turn on/off kernels code objects tracking [off] Configuration file: You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:$HOME: From f840d11c20c48b5c2f14f7200b1e019445f95961 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 23 Jan 2020 22:03:34 -0600 Subject: [PATCH 098/168] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index ec16a82d..06aa990f 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] --obj-tracking - to turn on/off kernels code objects tracking [off] + To support V3 code-object. --stats - generating kernel execution stats, file .stats.csv From b575b925acd22933da1edd257526735323a9aab4 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 24 Jan 2020 13:00:20 -0600 Subject: [PATCH 099/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 06aa990f..4c433aaf 100644 --- a/README.md +++ b/README.md @@ -46,8 +46,8 @@ To use the rocProfiler API you need the API header and to link your application ``` - ROCm is required. - - Python2.7 is required. - The required modules: CppHeaderParser, argparse. + - Python is required. + The required modules: sqlite3, CppHeaderParser, argparse. 
To install: sudo pip install CppHeaderParser argparse From 7df4f3a46ee0ea26d87b4aa258ce5f6e9edcaf12 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Mon, 27 Jan 2020 17:48:02 -0500 Subject: [PATCH 100/168] Update tblextr.py --- bin/tblextr.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 10120395..215f79b5 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -239,17 +239,15 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): continue if cid == 2: - if not pid in range_stack: range_stack[pid] = {} + if not pid in range_stack: fatal("ROCTX range begin not found, pid(" + pid + ")"); pid_stack = range_stack[pid] - if not tid in pid_stack: pid_stack[tid] = [] + if not tid in pid_stack: fatal("ROCTX range begin not found, tid(" + tid + ")"); rec_stack = pid_stack[tid] - if len(rec_stack) != 0: - rec_vals = rec_stack.pop() - rec_vals[1] = tms - - if len(rec_vals) != 0: - db.insert_entry(table_handle, rec_vals) - record_id += 1 + rec_vals = rec_stack.pop() + rec_vals[1] = tms + + db.insert_entry(table_handle, rec_vals) + record_id += 1 return 1 ############################################################# From 68113462b8de2bb30a50966aa642801fe498eaa7 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 29 Jan 2020 19:47:54 -0600 Subject: [PATCH 101/168] Update README.md non default python modules --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4c433aaf..4529c326 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ To use the rocProfiler API you need the API header and to link your application - ROCm is required. - Python is required. - The required modules: sqlite3, CppHeaderParser, argparse. + The required modules: CppHeaderParser, argparse. 
To install: sudo pip install CppHeaderParser argparse From 03279c28d0725355bca5881b6d09b7ecca5ee957 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 27 Feb 2020 15:07:49 -0600 Subject: [PATCH 102/168] 3.1 update --- CMakeLists.txt | 16 ++++++++++++++-- bin/rpl_run.sh | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b81b5a6..edc30d1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,9 @@ cmake_minimum_required ( VERSION 2.8.12 ) +# Install prefix +set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix default") + ## Verbose output. set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) @@ -45,13 +48,22 @@ message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" ) set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) -set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" ) set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) endif () set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) +set ( LIB_VERSION_MAJOR ${VERSION_MAJOR} ) +set ( LIB_VERSION_MINOR ${VERSION_MINOR} ) +if ( ${ROCM_PATCH_VERSION} ) + set ( LIB_VERSION_PATCH ${ROCM_PATCH_VERSION} ) +else() + set ( LIB_VERSION_PATCH ${VERSION_PATCH} ) +endif() +set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" ) +message ( "-- LIB-VERSION STRING: ${LIB_VERSION_STRING}" ) + ## Set target and root/lib/test directory set ( TARGET_NAME "${ROCPROFILER_TARGET}" ) set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) @@ -63,7 +75,7 @@ include ( ${LIB_DIR}/CMakeLists.txt ) ## Set the VERSION and SOVERSION values set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION 
"${LIB_VERSION_STRING}" ) -set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" ) +set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" ) ## If the library is a release, strip the target library if ( "${CMAKE_BUILD_TYPE}" STREQUAL release ) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index ce492e81..d34888cd 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -239,7 +239,7 @@ run() { fi if [ "$KFD_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":kfd" - export LD_PRELOAD="libkfdwrapper64.so libhsakmt.so.1 $LD_PRELOAD" + export LD_PRELOAD="$TT_DIR/lib/libkfdwrapper64.so libhsakmt.so.1 $LD_PRELOAD" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" From c65b74bb0d306565dc399c7b36f514959fac494c Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Wed, 4 Mar 2020 10:29:25 -0500 Subject: [PATCH 103/168] Update sqlitedb.py Add comments section to json with rocminfo and hipcc_version --- bin/sqlitedb.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index cd649e6a..3b494863 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -1,5 +1,6 @@ import csv, sqlite3, re, sys from functools import reduce +from txt2params import gen_params # SQLite Database class class SQLiteDB: @@ -97,12 +98,44 @@ def dump_csv(self, table_name, file_name): for raw in self._get_raws(table_name): fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') + # dump JSON trace def open_json(self, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) + status1, output1 = commands.getstatusoutput("/opt/rocm/bin/rocminfo > rocminfo.txt") + if status1 != 0 : + raise Exception('Could not run command: rocminfo') + params = gen_params('rocminfo.txt'); + + status2, output2 = commands.getstatusoutput("/opt/rocm/bin/hipcc --version > hipccversion.txt") + if 
status2 != 0 : + raise Exception('Could not run command: hipcc --version') + params2 = gen_params('hipccversion.txt'); + with open(file_name, mode='w') as fd: - fd.write('{ "traceEvents":[{}\n'); + cnt = 0 + fd.write('{\n') + fd.write('"comments": {\n') + fd.write(' "rocminfo": {\n') + for key in params: + cnt = cnt + 1 + if cnt == len(params): + fd.write(' "' + key + '": "' + params[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params[key] + '",\n') + fd.write(' },\n') + cnt = 0 + fd.write(' "hipcc_version": {\n') + for key in params2: + cnt = cnt + 1 + if cnt == len(params2): + fd.write(' "' + key + '": "' + params2[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params2[key] + '",\n') + fd.write(' }\n') + fd.write('},\n') + fd.write('"traceEvents":[{}\n'); def close_json(self, file_name): if not re.search(r'\.json$', file_name): From dc9bee75d645e23e648bd7d2bfdba8b2320c1b0f Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Wed, 4 Mar 2020 10:36:28 -0500 Subject: [PATCH 104/168] Create txt2params.py --- bin/txt2params.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 bin/txt2params.py diff --git a/bin/txt2params.py b/bin/txt2params.py new file mode 100644 index 00000000..ce5a2a8c --- /dev/null +++ b/bin/txt2params.py @@ -0,0 +1,88 @@ +#!/usr/bin/python + +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + +import os, sys, re + +def gen_params(txtfile): + fields = {} + parent_field = '' + nbr_indent = 0 + nbr_indent_prev = 0 + check_for_dims = False + with open(txtfile) as fp: + for line in fp: + mv = re.match(r'HCC clang version\s+(.*)',line) + if mv: + key = 'HCCclangversion' + val = mv.group(1) + fields[key] = val + continue + if check_for_dims == True: + mc = re.match(r'\s*([x|y|z])\s+(.*)',line) + if mc: + key_sav = mc.group(1) + if parent_field != '': + key = parent_field + '_' + mc.group(1) + else: + key = mc.group(1) + val = re.sub(r"\s+", "", mc.group(2)) + fields[key] = val + if key_sav == 'z': + check_for_dims = False + nbr_indent_prev = nbr_indent + mi = re.search(r'^(\s+)\w+', line) + md = re.search(':', line) + if mi: + nbr_indent = len(mi.group(1)) / 2 #indentation cnt + else: + if not md: + tmp = re.sub(r"\s+", "", line) + if tmp.isalnum(): + parent_field = tmp + continue + + if nbr_indent < nbr_indent_prev: + pos = parent_field.rfind('_') + if pos != -1: + parent_field = parent_field[:pos] # remove last _* + + for lin in line.split(';'): + lin = re.sub(r"\s+", "", lin) + m = re.match(r'(.*):(.*)', lin) + if m: + key, val = m.group(1), m.group(2) + if parent_field != '': + key = parent_field + '_' + key + if val == '': + mk = re.match(r'.*Dimension',key) + if mk: # expect x,y,z on next 3 lines + check_for_dims = True + parent_field = key + else: + fields[key] = val + else: + if nbr_indent != nbr_indent_prev and not check_for_dims : + parent_field = parent_field + '_' + lin.replace(':','') + + return fields From 3b9438c443fb28a2e4a8d70bc853db70317a82b0 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Fri, 6 Mar 2020 10:29:04 -0500 Subject: [PATCH 105/168] Update sqlitedb.py --- bin/sqlitedb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 3b494863..472efcc7 100644 --- 
a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -116,7 +116,7 @@ def open_json(self, file_name): with open(file_name, mode='w') as fd: cnt = 0 fd.write('{\n') - fd.write('"comments": {\n') + fd.write('"otherData": {\n') fd.write(' "rocminfo": {\n') for key in params: cnt = cnt + 1 From 65ece940f97173efeaa80bb6d579a2c316a93b46 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Fri, 13 Mar 2020 18:08:47 -0400 Subject: [PATCH 106/168] Update tblextr.py --- bin/tblextr.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 2c4e442b..fd064758 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -25,6 +25,7 @@ import os, sys, re from sqlitedb import SQLiteDB import dform +from txt2params import gen_params # Parsing results in the format: #dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): @@ -59,6 +60,48 @@ var_table = {} ############################################################# +def json_metadata_gen(sysinfo_file, index): + if not re.search(r'\.txt$', sysinfo_file): + raise Exception('wrong output file type: "' + sysinfo_file + '"' ) + if index == 1: + status, output = commands.getstatusoutput("/opt/rocm/bin/rocminfo > " + sysinfo_file) + if status != 0 : + raise Exception('Could not run command: rocminfo') + params = gen_params(sysinfo_file); + elif index == 2: + status, output = commands.getstatusoutput("/opt/rocm/bin/hipcc --version >" + sysinfo_file) + if status != 0 : + raise Exception('Could not run command: hipcc --version') + params = gen_params(sysinfo_file); + return params + +def json_metadata_write(jsonfile, params, params2): + with open(jsonfile, mode='a') as fd: + cnt = 0 + fd.write('],\n') + fd.write('"otherData": {\n') + fd.write(' "rocminfo": {\n') + for key in params: + cnt = cnt + 1 + if cnt == len(params): + fd.write(' "' + 
key + '": "' + params[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params[key] + '",\n') + if len(params2) == 0: + fd.write(' }\n') + return + fd.write(' },\n') + cnt = 0 + fd.write(' "hipcc_version": {\n') + for key in params2: + cnt = cnt + 1 + if cnt == len(params2): + fd.write(' "' + key + '": "' + params2[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params2[key] + '",\n') + fd.write(' }\n') + fd.write('}\n') + def fatal(msg): sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); sys.exit(1) @@ -239,11 +282,13 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): continue if cid == 2: + if not pid in range_stack: fatal("ROCTX range begin not found, pid(" + pid + ")"); pid_stack = range_stack[pid] + if not tid in pid_stack: fatal("ROCTX range begin not found, tid(" + tid + ")"); rec_stack = pid_stack[tid] rec_vals = rec_stack.pop() rec_vals[1] = tms - + db.insert_entry(table_handle, rec_vals) record_id += 1 @@ -449,6 +494,8 @@ def fill_ops_db(table_name, db, indir): hsa_statfile = re.sub(r'\.stats\.csv$', r'.hsa_stats.csv', statfile) hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile) kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) + sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo_stats.txt', statfile) + params = json_metadata_gen(sysinfo_file, 1) with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) @@ -514,6 +561,9 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) + sysinfo_file2 = re.sub(r'\.stats\.csv$', r'.sysinfo_stats2.txt', statfile) + params2 = json_metadata_gen(sysinfo_file2, 2) + if kfd_trace_found: dform.post_process_data(db, 'KFD') dform.gen_table_bins(db, 'KFD', kfd_statfile, 'Name', 'DurationNs') @@ -544,6 +594,7 @@ def fill_ops_db(table_name, db, indir): dep_id += len(tid_list) if any_trace_found: + json_metadata_write(jsonfile, params, params2) 
db.close_json(jsonfile); db.close() From e5048f68d45365a17dbdad85447477b4aba349b8 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Fri, 13 Mar 2020 18:11:30 -0400 Subject: [PATCH 107/168] Update sqlitedb.py --- bin/sqlitedb.py | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 472efcc7..484b6488 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -103,45 +103,14 @@ def dump_csv(self, table_name, file_name): def open_json(self, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) - status1, output1 = commands.getstatusoutput("/opt/rocm/bin/rocminfo > rocminfo.txt") - if status1 != 0 : - raise Exception('Could not run command: rocminfo') - params = gen_params('rocminfo.txt'); - - status2, output2 = commands.getstatusoutput("/opt/rocm/bin/hipcc --version > hipccversion.txt") - if status2 != 0 : - raise Exception('Could not run command: hipcc --version') - params2 = gen_params('hipccversion.txt'); - with open(file_name, mode='w') as fd: - cnt = 0 - fd.write('{\n') - fd.write('"otherData": {\n') - fd.write(' "rocminfo": {\n') - for key in params: - cnt = cnt + 1 - if cnt == len(params): - fd.write(' "' + key + '": "' + params[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params[key] + '",\n') - fd.write(' },\n') - cnt = 0 - fd.write(' "hipcc_version": {\n') - for key in params2: - cnt = cnt + 1 - if cnt == len(params2): - fd.write(' "' + key + '": "' + params2[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params2[key] + '",\n') - fd.write(' }\n') - fd.write('},\n') - fd.write('"traceEvents":[{}\n'); + fd.write('{ "traceEvents":[{}\n'); def close_json(self, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: - fd.write(']}\n'); + fd.write('}') def 
label_json(self, pid, label, file_name): if not re.search(r'\.json$', file_name): @@ -264,3 +233,4 @@ def add_csv_table(self, table_name, file_name, extra = ()): self.insert_table(table, reader) ############################################################################################## + From 30ea6b7bd54bfb087a8daaa9d465750e9c758535 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 18 Mar 2020 17:27:47 -0500 Subject: [PATCH 108/168] rocprofiler spec: correcting info_data_t --- doc/rocprofiler_spec.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index efbc727f..c276a1f9 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -221,7 +221,11 @@ typedef struct { union { struct { const char* name; // metric name + uint32_t instances; // instances number + const char* expr; // metric expression, NULL for basic counters const char* description; // metric description + const char* block_name; // block name + uint32_t block_counters; // number of block counters } metric; struct { const char* name; // trace name From c355f87c5ab2f07a67a6d736f5617ed04050544a Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 18 Mar 2020 17:37:45 -0500 Subject: [PATCH 109/168] rocprofiler spec: correcting get info code example --- doc/rocprofiler_spec.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index c276a1f9..25e61df7 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -595,11 +595,11 @@ Info data callback: switch (info.kind) { case ROCPROFILER_INFO_KIND_METRIC: { if (info.metric.expr != NULL) { - fprintf(stdout, "Basic counter: gpu-agent%d : %s : %s\n", + fprintf(stdout, "Derived counter: gpu-agent%d : %s : %s\n", info.agent_index, info.metric.name, info.metric.description); fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); } else { - fprintf(stdout, "Derived counter: gpu-agent%d : %s", + fprintf(stdout, 
"Basic counter: gpu-agent%d : %s", info.agent_index, info.metric.name); if (info.metric.instances > 1) { fprintf(stdout, "[0-%u]", info.metric.instances - 1); From 11c83187320d0f5db43f82b872beb473128d58a3 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 19 Mar 2020 00:39:38 -0500 Subject: [PATCH 110/168] kernel build fix --- bin/build_kernel.sh | 39 +++++++++++++++++++++++++++++---------- test/ctrl/test_hsa.cpp | 9 +-------- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh index 6c4afe6f..e89cf561 100755 --- a/bin/build_kernel.sh +++ b/bin/build_kernel.sh @@ -1,7 +1,9 @@ -#!/bin/sh +#!/bin/sh -x TEST_NAME=$1 DST_DIR=$2 +ROCM_DIR=$3 +TGT_LIST=$4 if [ -z "$TEST_NAME" ] ; then echo "Usage: $0 " @@ -13,18 +15,35 @@ if [ -z "$DST_DIR" ] ; then DST_DIR=$(dirname TEST_NAME) fi -GFXIP=$(/opt/rocm/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p") -if [ -z "$GFXIP" ] ; then - echo "GPU is not found" - exit 1 +if [ -z "$ROCM_DIR" ] ; then + ROCM_DIR=/opt/rocm fi -OBJ_PREF=$(echo $GFXIP | head -c 4) -OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') -OBJ_FILE=${OBJ_PREF}_${OBJ_NAME}.hsaco +if [ -z "$TGT_LIST" ] ; then + TGT_LIST=$(/opt/rocm/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p") +fi -/opt/rocm/opencl/bin/x86_64/clang -cl-std=CL2.0 -cl-std=CL2.0 -include /opt/rocm/opencl/include/opencl-c.h -Xclang -mlink-bitcode-file -Xclang /opt/rocm/opencl/lib/x86_64/bitcode/opencl.amdgcn.bc -Xclang -mlink-bitcode-file -Xclang /opt/rocm/opencl/lib/x86_64/bitcode/ockl.amdgcn.bc -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $OBJ_FILE +if [ -z "$TGT_LIST" ] ; then + echo "Error: GPU targets not found" + exit 1 +fi -echo "'$OBJ_FILE' is generated for '$GFXIP'" +OCL_VER="2.0" +OCL_DIR=$ROCM_DIR/opencl + +LLVM_DIR=$ROCM_DIR/hcc +CLANG=$LLVM_DIR/bin/clang 
+BITCODE_OPTS="\ + -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/opencl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/ockl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/ocml.amdgcn.bc" + +for GFXIP in $TGT_LIST ; do + OBJ_PREF=$GFXIP + OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') + OBJ_FILE=${OBJ_PREF}_${OBJ_NAME}.hsaco + $CLANG -cl-std=CL$OCL_VER -include $OCL_DIR/include/opencl-c.h $BITCODE_OPTS -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $DST_DIR/$OBJ_FILE + echo "'$OBJ_FILE' is generated for '$GFXIP'" +done exit 0 diff --git a/test/ctrl/test_hsa.cpp b/test/ctrl/test_hsa.cpp index 3cb5dee7..47f788cf 100644 --- a/test/ctrl/test_hsa.cpp +++ b/test/ctrl/test_hsa.cpp @@ -82,14 +82,7 @@ bool TestHsa::Initialize(int /*arg_cnt*/, char** /*arg_list*/) { // Obtain the code object file name std::string agentName(agent_info_->name); - if (agentName.compare(0, 4, "gfx8") == 0) { - brig_path_obj_.append("gfx8"); - } else if (agentName.compare(0, 4, "gfx9") == 0) { - brig_path_obj_.append("gfx9"); - } else { - TEST_ASSERT(false); - return false; - } + brig_path_obj_.append(agentName); brig_path_obj_.append("_" + name_ + ".hsaco"); return true; From 741b8707a234674aef409f18c505c9d4cb68045a Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Thu, 19 Mar 2020 17:19:12 -0400 Subject: [PATCH 111/168] Update sqlitedb.py --- bin/sqlitedb.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 484b6488..9fa1823c 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -97,7 +97,6 @@ def dump_csv(self, table_name, file_name): fd.write(','.join(fields) + '\n') for raw in self._get_raws(table_name): fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') - # dump JSON trace def open_json(self, file_name): @@ -232,5 +231,19 @@ def add_csv_table(self, table_name, 
file_name, extra = ()): table = self.add_table(table_name, descr, extra) self.insert_table(table, reader) + def metadata_json(self, jsonfile, sysinfo_file): + params = gen_params(sysinfo_file); + with open(jsonfile, mode='a') as fd: + cnt = 0 + fd.write('],\n') + fd.write('"otherData": {\n') + for key in params: + cnt = cnt + 1 + if cnt == len(params): + fd.write(' "' + key + '": "' + params[key] + '"\n') + else: + fd.write(' "' + key + '": "' + params[key] + '",\n') + fd.write(' }\n') + ############################################################################################## From 19d26f394aae9faa86ce7462bf24ae09fdccd2c9 Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Thu, 19 Mar 2020 17:22:39 -0400 Subject: [PATCH 112/168] Update tblextr.py --- bin/tblextr.py | 64 ++++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 47 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index fd064758..35aebee8 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -25,7 +25,6 @@ import os, sys, re from sqlitedb import SQLiteDB import dform -from txt2params import gen_params # Parsing results in the format: #dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): @@ -60,47 +59,16 @@ var_table = {} ############################################################# -def json_metadata_gen(sysinfo_file, index): - if not re.search(r'\.txt$', sysinfo_file): - raise Exception('wrong output file type: "' + sysinfo_file + '"' ) - if index == 1: - status, output = commands.getstatusoutput("/opt/rocm/bin/rocminfo > " + sysinfo_file) - if status != 0 : - raise Exception('Could not run command: rocminfo') - params = gen_params(sysinfo_file); - elif index == 2: - status, output = commands.getstatusoutput("/opt/rocm/bin/hipcc --version >" + sysinfo_file) - if status != 0 : - raise Exception('Could not run command: hipcc --version') - 
params = gen_params(sysinfo_file); - return params - -def json_metadata_write(jsonfile, params, params2): - with open(jsonfile, mode='a') as fd: - cnt = 0 - fd.write('],\n') - fd.write('"otherData": {\n') - fd.write(' "rocminfo": {\n') - for key in params: - cnt = cnt + 1 - if cnt == len(params): - fd.write(' "' + key + '": "' + params[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params[key] + '",\n') - if len(params2) == 0: - fd.write(' }\n') - return - fd.write(' },\n') - cnt = 0 - fd.write(' "hipcc_version": {\n') - for key in params2: - cnt = cnt + 1 - if cnt == len(params2): - fd.write(' "' + key + '": "' + params2[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params2[key] + '",\n') - fd.write(' }\n') - fd.write('}\n') +def metadata_gen(sysinfo_file, sysinfo_cmd): + if not re.search(r'\.txt$', sysinfo_file): + raise Exception('wrong output file type: "' + sysinfo_file + '"' ) + if re.search(r'rocminfo', sysinfo_cmd): + direct_str = " > " + else: + direct_str = " >> " + status, output = commands.getstatusoutput(sysinfo_cmd + direct_str + sysinfo_file) + if status != 0 : + raise Exception('Could not run command: ' + sysinfo_cmd) def fatal(msg): sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); @@ -288,7 +256,7 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): rec_stack = pid_stack[tid] rec_vals = rec_stack.pop() rec_vals[1] = tms - + db.insert_entry(table_handle, rec_vals) record_id += 1 @@ -495,7 +463,7 @@ def fill_ops_db(table_name, db, indir): hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile) kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo_stats.txt', statfile) - params = json_metadata_gen(sysinfo_file, 1) + metadata_gen(sysinfo_file, '/opt/rocm/bin/rocminfo') with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) @@ -561,8 +529,10 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'OPS') 
dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) - sysinfo_file2 = re.sub(r'\.stats\.csv$', r'.sysinfo_stats2.txt', statfile) - params2 = json_metadata_gen(sysinfo_file2, 2) + #sysinfo_file2 = re.sub(r'\.stats\.csv$', r'.sysinfo_stats2.txt', statfile) + #params2 = metadata_gen(sysinfo_file2, 2) + #params2 = + metadata_gen(sysinfo_file, '/opt/rocm/bin/hipcc --version') if kfd_trace_found: dform.post_process_data(db, 'KFD') @@ -594,7 +564,7 @@ def fill_ops_db(table_name, db, indir): dep_id += len(tid_list) if any_trace_found: - json_metadata_write(jsonfile, params, params2) + db.metadata_json(jsonfile, sysinfo_file) db.close_json(jsonfile); db.close() From d4e182cafd0aeae9388c7b28bfa0fb21a8619d7d Mon Sep 17 00:00:00 2001 From: rkebichi <54912798+rkebichi@users.noreply.github.com> Date: Thu, 19 Mar 2020 17:25:06 -0400 Subject: [PATCH 113/168] Update txt2params.py --- bin/txt2params.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/bin/txt2params.py b/bin/txt2params.py index ce5a2a8c..358acc3a 100644 --- a/bin/txt2params.py +++ b/bin/txt2params.py @@ -24,6 +24,11 @@ import os, sys, re +# gen_params() takes a text file like the output of rocminfo cmd and parses it into a map {key,value} +# where key is the param and value is the value of this param +# for example: Threadmodel : "posix" +# it also processes encompasing sections to generate a full param name such as (section names separated by '_'): +# "Agent2_PoolInfo_ISAInfo_ISA1_WorkgroupMaxSizeperDimension_x": "1024(0x400)", def gen_params(txtfile): fields = {} parent_field = '' @@ -32,12 +37,24 @@ def gen_params(txtfile): check_for_dims = False with open(txtfile) as fp: for line in fp: - mv = re.match(r'HCC clang version\s+(.*)',line) + me = re.match(r'\*\*\* Done \*\*\*',line) #Marks the end of cmd + if me: + parent_field = '' + nbr_indent = 0 + nbr_indent_prev = 0 + check_for_dims = False + continue + mv = re.match(r'HCC clang 
version\s+(.*)',line) # outlier: only line with a version number and no ':', special case if mv: key = 'HCCclangversion' val = mv.group(1) fields[key] = val continue + # Variable 'check_for_dims' is True for text like this: + # Workgroup Max Size per Dimension: + # x 1024(0x400) + # y 1024(0x400) + # z 1024(0x400) if check_for_dims == True: mc = re.match(r'\s*([x|y|z])\s+(.*)',line) if mc: @@ -62,11 +79,14 @@ def gen_params(txtfile): parent_field = tmp continue - if nbr_indent < nbr_indent_prev: + if nbr_indent < nbr_indent_prev: pos = parent_field.rfind('_') if pos != -1: - parent_field = parent_field[:pos] # remove last _* + parent_field = parent_field[:pos] + # Process lines such as : + # Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED + # Size: 131897644(0x7dc992c) KB for lin in line.split(';'): lin = re.sub(r"\s+", "", lin) m = re.match(r'(.*):(.*)', lin) @@ -86,3 +106,4 @@ def gen_params(txtfile): parent_field = parent_field + '_' + lin.replace(':','') return fields + From 8f856d9fa96c1f2ac90e36fea59d0e819a650061 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 20 Mar 2020 14:52:33 -0500 Subject: [PATCH 114/168] cleanup --- bin/tblextr.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index d3c470e1..3f47c65d 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -85,7 +85,7 @@ def dbglog(msg): # parse results method def parse_res(infile): global max_gpu_id - if not os.path.isfile(infile): return # fatal("Error: input file '" + infile + "' not found") + if not os.path.isfile(infile): return inp = open(infile, 'r') beg_pattern = re.compile("^dispatch\[(\d*)\], (.*) kernel-name\(\"([^\"]*)\"\)") @@ -529,9 +529,6 @@ def fill_ops_db(table_name, db, indir): dform.post_process_data(db, 'OPS') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) - #sysinfo_file2 = re.sub(r'\.stats\.csv$', r'.sysinfo_stats2.txt', statfile) - #params2 = metadata_gen(sysinfo_file2, 2) - #params2 = 
metadata_gen(sysinfo_file, '/opt/rocm/bin/hipcc --version') if kfd_trace_found: From 5fb023db4f547f183bd000dce307aad767f07e27 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Tue, 24 Mar 2020 19:48:12 -0500 Subject: [PATCH 115/168] adding labels sort index --- bin/sqlitedb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 9fa1823c..805c954c 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -115,7 +115,7 @@ def label_json(self, pid, label, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: - fd.write(',{"args":{"name":"%s %s"},"ph":"M","pid":%s,"name":"process_name"}\n' %(self.section_index, label, pid)); + fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)); self.section_index += 1 def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): From dd824f652a149ce31ea65300de4967d94f6556f8 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 30 Apr 2020 10:02:30 -0500 Subject: [PATCH 116/168] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4529c326..1e5df698 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,9 @@ To use the rocProfiler API you need the API header and to link your application - ROCm is required. - Python is required. - The required modules: CppHeaderParser, argparse. 
+ The required modules: CppHeaderParser, argparse, sqlite3 To install: - sudo pip install CppHeaderParser argparse + sudo pip install CppHeaderParser argparse sqlite3 - To build and install to /opt/rocm/rocprofiler export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm From a0a4bf10fd620271ae9be47be3f961a1085ae503 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 30 Apr 2020 17:04:28 -0500 Subject: [PATCH 117/168] intercept test removing not needed headers --- bin/build_kernel.sh | 40 +++++++++++++++++++++++++++---------- test/app/intercept_test.cpp | 22 ++++++++------------ 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh index e89cf561..9412a68e 100755 --- a/bin/build_kernel.sh +++ b/bin/build_kernel.sh @@ -1,4 +1,5 @@ #!/bin/sh -x +SO_EXT="hsaco" TEST_NAME=$1 DST_DIR=$2 @@ -7,9 +8,10 @@ TGT_LIST=$4 if [ -z "$TEST_NAME" ] ; then echo "Usage: $0 " - echo " Will look for .cl and will build .so dynamic object library" + echo " Will look for .cl and will build .$SO_EXT dynamic code object library" exit 1 fi +OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') if [ -z "$DST_DIR" ] ; then DST_DIR=$(dirname TEST_NAME) @@ -29,21 +31,37 @@ if [ -z "$TGT_LIST" ] ; then fi OCL_VER="2.0" -OCL_DIR=$ROCM_DIR/opencl -LLVM_DIR=$ROCM_DIR/hcc -CLANG=$LLVM_DIR/bin/clang +if [ -e $ROCM_DIR/llvm ] ; then + LLVM_DIR=$ROCM_DIR/llvm + LIB_DIR=$ROCM_DIR/lib +else + LLVM_DIR=$ROCM_DIR/hcc + LIB_DIR=$LLVM_DIR/lib +fi + +BC_DIR=$LIB_DIR/bitcode +if [ ! 
-d "$BC_DIR" ] ; then BC_DIR=$LIB_DIR; fi + +CLANG_ROOT=$LLVM_DIR/lib/clang +CLANG_DIR=`ls -d $CLANG_ROOT/* | head -n 1` +if [ "$CLANG_DIR" = "" ] ; then + echo "Error: LLVM clang library was not found" + exit 1 +fi + +BIN_DIR=$LLVM_DIR/bin +INC_DIR=$CLANG_DIR/include BITCODE_OPTS="\ - -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/opencl.amdgcn.bc \ - -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/ockl.amdgcn.bc \ - -Xclang -mlink-bitcode-file -Xclang $LLVM_DIR/lib/ocml.amdgcn.bc" + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/opencl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ockl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ocml.amdgcn.bc" for GFXIP in $TGT_LIST ; do OBJ_PREF=$GFXIP - OBJ_NAME=$(echo "_$(basename $TEST_NAME)" | sed -e 's/_./\U&\E/g' -e 's/_//g') - OBJ_FILE=${OBJ_PREF}_${OBJ_NAME}.hsaco - $CLANG -cl-std=CL$OCL_VER -include $OCL_DIR/include/opencl-c.h $BITCODE_OPTS -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $DST_DIR/$OBJ_FILE - echo "'$OBJ_FILE' is generated for '$GFXIP'" + OBJ_FILE="${OBJ_PREF}_${OBJ_NAME}.$SO_EXT" + $BIN_DIR/clang -cl-std=CL$OCL_VER -include $INC_DIR/opencl-c.h $BITCODE_OPTS -target amdgcn-amd-amdhsa -mcpu=$GFXIP -mno-code-object-v3 $TEST_NAME.cl -o $DST_DIR/$OBJ_FILE + echo "'$OBJ_FILE' generated" done exit 0 diff --git a/test/app/intercept_test.cpp b/test/app/intercept_test.cpp index 876b3102..c2905d1e 100644 --- a/test/app/intercept_test.cpp +++ b/test/app/intercept_test.cpp @@ -30,14 +30,8 @@ THE SOFTWARE. 
#include #include -#include "ctrl/run_kernel.h" -#include "ctrl/test_aql.h" -#include "ctrl/test_hsa.h" #include "inc/rocprofiler.h" -#include "dummy_kernel/dummy_kernel.h" -#include "simple_convolution/simple_convolution.h" -#include "util/test_assert.h" -#include "util/xml.h" +#include "util/hsa_rsrc_factory.h" #define PUBLIC_API __attribute__((visibility("default"))) #define CONSTRUCTOR_API __attribute__((constructor)) @@ -228,7 +222,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, unsigned metrics_input(rocprofiler_feature_t** ret) { // Profiling feature objects - const unsigned feature_count = 9; + const unsigned feature_count = 6; rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); @@ -245,12 +239,12 @@ unsigned metrics_input(rocprofiler_feature_t** ret) { features[4].name = "SQ_INSTS_VALU"; features[5].kind = ROCPROFILER_FEATURE_KIND_METRIC; features[5].name = "VALUInsts"; - features[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[6].name = "TCC_HIT_sum"; - features[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[7].name = "TCC_MISS_sum"; - features[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; - features[8].name = "WRITE_SIZE"; +// features[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// features[6].name = "TCC_HIT_sum"; +// features[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// features[7].name = "TCC_MISS_sum"; +// features[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// features[8].name = "WRITE_SIZE"; *ret = features; return feature_count; From fdd7d536ad1451fd51d4b699be5018e548581be0 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Fri, 5 Jun 2020 14:53:05 -0400 Subject: [PATCH 118/168] Adding reference instead of copy of container elements --- src/core/context.h | 2 +- src/core/metrics.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/context.h b/src/core/context.h index 
856c7024..f3ab1294 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -257,7 +257,7 @@ class Context { void GetMetricsData() const { const MetricArgs args(info_map_); - for (const auto v : metrics_map_) { + for (const auto &v : metrics_map_) { const std::string& name = v.first; const Metric* metric = v.second; const xml::Expr* expr = metric->GetExpr(); diff --git a/src/core/metrics.h b/src/core/metrics.h index 57ec7c31..f9ae1fbd 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -277,7 +277,7 @@ class MetricsDict { std::cout << name << "=" << expr_obj->String() << "\n" << std::endl; #endif counters_vec_t counters_vec; - for (const std::string var : expr_obj->GetVars()) { + for (const auto& var : expr_obj->GetVars()) { auto it = cache_.find(var); if (it == cache_.end()) { EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var << "' is not found"); From 2ef4d5d58b97c34c30dd5b919cd32825e1482677 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 12 Jun 2020 10:36:15 -0500 Subject: [PATCH 119/168] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1e5df698..44589d6e 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ To use the rocProfiler API you need the API header and to link your application sudo pip install CppHeaderParser argparse sqlite3 - To build and install to /opt/rocm/rocprofiler + Please use release branches/tags of 'amd-master' branch for development version. 
+ export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm cd .../rocprofiler From 1debae51dcd72244791eb26853c5b808cf6830c2 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Fri, 26 Jun 2020 23:32:22 -0500 Subject: [PATCH 120/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 44589d6e..73e67716 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ To use the rocProfiler API you need the API header and to link your application ## To build with the current installed ROCM: ``` - - ROCm is required. + - Python is required. The required modules: CppHeaderParser, argparse, sqlite3 From 13fa7df93562798b69d1de0efd35a3e90f29f4c9 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 26 Jun 2020 23:38:29 -0500 Subject: [PATCH 121/168] Revert "Update README.md" This reverts commit 1debae51dcd72244791eb26853c5b808cf6830c2. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 73e67716..44589d6e 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ To use the rocProfiler API you need the API header and to link your application ## To build with the current installed ROCM: ``` - + - ROCm is required. - Python is required. The required modules: CppHeaderParser, argparse, sqlite3 From 4aa416fc83baecae7ce02c60ba17353c1df669c4 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 26 Jun 2020 23:44:11 -0500 Subject: [PATCH 122/168] adding ROCm requirements --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 44589d6e..9108409f 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ To use the rocProfiler API you need the API header and to link your application ## To build with the current installed ROCM: ``` - ROCm is required. + ROCr-runtime and roctracer are needed - Python is required. 
The required modules: CppHeaderParser, argparse, sqlite3 From c988f7f327ffb7dff09e7faf56f38d138e28d14f Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 16 Jul 2020 02:58:45 -0500 Subject: [PATCH 123/168] 3.6 update --- CMakeLists.txt | 76 ++++-- bin/build_kernel.sh | 2 +- bin/dform.py | 23 +- bin/rpl_run.sh | 116 +++++---- bin/run_tool.sh | 38 --- bin/sqlitedb.py | 67 ++++-- bin/tblextr.py | 194 ++++++++++----- bin/txt2params.py | 47 ++-- cmake_modules/env.cmake | 32 ++- inc/rocprofiler.h | 90 ++++++- src/CMakeLists.txt | 3 +- src/core/activity.cpp | 171 +++++++++++++ src/core/context.h | 25 +- src/core/hsa_interceptor.h | 385 ++++++++++++++++++++++++++++++ src/core/hsa_proxy_queue.h | 1 + src/core/intercept_queue.cpp | 8 +- src/core/intercept_queue.h | 135 +++++++---- src/core/metrics.h | 2 +- src/core/proxy_queue.h | 2 +- src/core/rocprofiler.cpp | 111 ++++++++- src/core/tracker.h | 16 +- src/util/hsa_rsrc_factory.cpp | 65 +++-- src/util/hsa_rsrc_factory.h | 39 ++- test/CMakeLists.txt | 51 ++-- test/app/intercept_test.cpp | 2 +- test/app/stand_intercept_test.cpp | 190 +++++++++++++++ test/app/standalone_test.cpp | 56 ++++- test/run.sh | 64 ++++- test/tool/tool.cpp | 103 +++++++- test/util/hsa_rsrc_factory.cpp | 32 ++- test/util/hsa_rsrc_factory.h | 105 +++++++- 31 files changed, 1879 insertions(+), 372 deletions(-) delete mode 100755 bin/run_tool.sh create mode 100644 src/core/activity.cpp create mode 100644 src/core/hsa_interceptor.h create mode 100644 test/app/stand_intercept_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index edc30d1a..8aac5175 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,31 @@ set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) set ( LIB_DIR "${ROOT_DIR}/src" ) set ( TEST_DIR "${ROOT_DIR}/test" ) +## Enable tracing API +if (NOT USE_PROF_API) + set(USE_PROF_API 1) +endif() + +# Protocol header lookup +set(PROF_API_HEADER_NAME prof_protocol.h) +if(USE_PROF_API EQUAL 1) + find_path(PROF_API_HEADER_DIR ${PROF_API_HEADER_NAME} 
+ HINTS + ${PROF_API_HEADER_PATH} + PATHS + /opt/rocm/roctracer + PATH_SUFFIXES + include/ext + ) + if(NOT PROF_API_HEADER_DIR) + MESSAGE(FATAL_ERROR "Profiling API header not found. Tracer integration disabled. Use -DPROF_API_HEADER_PATH=") + else() + add_definitions(-DUSE_PROF_API=1) + include_directories(${PROF_API_HEADER_DIR}) + MESSAGE(STATUS "Profiling API: ${PROF_API_HEADER_DIR}/${PROF_API_HEADER_NAME}") + endif() +endif() + ## Build library include ( ${LIB_DIR}/CMakeLists.txt ) @@ -85,41 +110,58 @@ endif () ## Build tests add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) +## Installation and packaging +set ( DEST_NAME ${ROCPROFILER_NAME} ) +if ( DEFINED CPACK_PACKAGING_INSTALL_PREFIX ) + get_filename_component ( pkg_name ${CPACK_PACKAGING_INSTALL_PREFIX} NAME ) + get_filename_component ( pkg_dir ${CPACK_PACKAGING_INSTALL_PREFIX} DIRECTORY ) + if ( pkg_name STREQUAL ${DEST_NAME} ) + set ( CPACK_PACKAGING_INSTALL_PREFIX ${pkg_dir} ) + endif () +else () + set ( CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} ) +endif () +message ( "CMake-install-prefix: ${CMAKE_INSTALL_PREFIX}" ) +message ( "CPack-install-prefix: ${CPACK_PACKAGING_INSTALL_PREFIX}" ) +message ( "-----------Dest-name: ${DEST_NAME}" ) + ## Create symlinks for packaging and install add_custom_target ( rocprof-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/bin/rpl_run.sh rocprof-link ) -add_custom_target ( inc-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E create_symlink ../${ROCPROFILER_NAME}/include inc-link ) + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${DEST_NAME}/bin/rpl_run.sh rocprof-link ) +#add_custom_target ( inc-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} +# COMMAND ${CMAKE_COMMAND} -E create_symlink ../${DEST_NAME}/include inc-link ) add_custom_target ( so-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E create_symlink 
../${ROCPROFILER_NAME}/lib/${ROCPROFILER_LIBRARY}.so so-link ) + COMMAND ${CMAKE_COMMAND} -E create_symlink ../${DEST_NAME}/lib/${ROCPROFILER_LIBRARY}.so so-link ) -set ( CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}/${ROCPROFILER_NAME}" ) -message ( "---------Install-Dir: ${CMAKE_INSTALL_PREFIX}" ) -install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION lib ) -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION include ) +# Install header and library +install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${DEST_NAME}/lib ) +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${DEST_NAME}/include ) +install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION include/${DEST_NAME} ) # rpl_run.sh tblextr.py txt2xml.sh install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh + ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2params.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/dform.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/sqlitedb.py - DESTINATION bin + DESTINATION ${DEST_NAME}/bin PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) -install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION ../include RENAME ${ROCPROFILER_NAME} ) -install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION ../lib RENAME ${ROCPROFILER_LIBRARY}.so ) -install ( FILES ${PROJECT_BINARY_DIR}/rocprof-link DESTINATION ../bin - PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE - RENAME rocprof ) # gfx_metrics.xml metrics.xml install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/gfx_metrics.xml - DESTINATION lib ) + DESTINATION ${DEST_NAME}/lib ) # libtool.so -install ( FILES ${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION tool ) -install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION tool +install ( FILES 
${PROJECT_BINARY_DIR}/test/libtool.so DESTINATION ${DEST_NAME}/tool ) +install ( FILES ${PROJECT_BINARY_DIR}/test/ctrl DESTINATION ${DEST_NAME}/tool PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) +# links +install ( FILES ${PROJECT_BINARY_DIR}/so-link DESTINATION lib RENAME ${ROCPROFILER_LIBRARY}.so ) +#install ( FILES ${PROJECT_BINARY_DIR}/inc-link DESTINATION include RENAME ${DEST_NAME} ) +install ( FILES ${PROJECT_BINARY_DIR}/rocprof-link DESTINATION bin + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE + RENAME rocprof ) ## Packaging directives set ( CPACK_GENERATOR "DEB" "RPM" "TGZ" ) diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh index 9412a68e..8ed0f168 100755 --- a/bin/build_kernel.sh +++ b/bin/build_kernel.sh @@ -22,7 +22,7 @@ if [ -z "$ROCM_DIR" ] ; then fi if [ -z "$TGT_LIST" ] ; then - TGT_LIST=$(/opt/rocm/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p") + TGT_LIST=`$ROCM_DIR/bin/rocminfo | grep "amdgcn-amd-amdhsa--" | head -n 1 | sed -n "s/^.*amdgcn-amd-amdhsa--\(\w*\).*$/\1/p"` fi if [ -z "$TGT_LIST" ] ; then diff --git a/bin/dform.py b/bin/dform.py index 93194608..82a81d08 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -1,4 +1,25 @@ -#!/usr/bin/python +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + from sqlitedb import SQLiteDB def gen_message(outfile): diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index d34888cd..0c3d83d4 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -35,13 +35,8 @@ RPL_PATH=$PKG_DIR/lib TLIB_PATH=$PKG_DIR/tool TTLIB_PATH=$TT_DIR/tool -# Default HIP path -if [ -z "$HIP_PATH" ] ; then - export HIP_PATH=/opt/rocm/hip -fi -# Default HCC path -if [ -z "$HCC_HOME" ] ; then - export HCC_HOME=/opt/rocm/hcc +if [ -z "$ROCP_PYTHON_VERSION" ] ; then + ROCP_PYTHON_VERSION=python3 fi # runtime API trace @@ -65,9 +60,9 @@ export HSA_VEN_AMD_AQLPROFILE_LOG=1 export ROCPROFILER_LOG=1 unset ROCPROFILER_SESS -# ROC Profiler environment -# Loading of ROC Profiler by HSA runtime -export HSA_TOOLS_LIB=$RPL_PATH/librocprofiler64.so +# Profiler environment +# Loading of profiler library by HSA runtime +MY_HSA_TOOLS_LIB="$RPL_PATH/librocprofiler64.so" # Loading of the test tool by ROC Profiler export ROCP_TOOL_LIB=$TLIB_PATH/libtool.so # Enabling HSA dispatches intercepting by ROC PRofiler @@ -162,14 +157,18 @@ usage() { echo " --ctx-wait - to wait for outstanding contexts on profiler exit [on]" echo " --ctx-limit - maximum number of outstanding contexts [0 - unlimited]" echo " --heartbeat - to print progress heartbeats [0 - disabled]" + echo " --obj-tracking - to turn on/off kernels code objects tracking [on]" + echo " To support V3 code object" echo "" echo " --stats - generating kernel execution stats, file .stats.csv" - echo " --roctx-trace - to enable rocTX trace" - echo " --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible" + echo "" + echo " --roctx-trace - to enable rocTX application code annotation trace, \"Markers and Ranges\" JSON trace section." 
+ echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" echo " --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible" echo " --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible" - echo " --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible" - echo " Generated files: .hsa_stats.txt .json" + echo " '--hsa-trace' can be used in addition to select activity tracing from HSA (ROCr runtime) level" + echo " --kfd-trace - to trace KFD, generates KFD Thunk API execution stats and JSON file chrome-tracing compatible" + echo " Generated files: ._stats.txt .json" echo " Traced API list can be set by input .txt or .xml files." echo " Input .txt:" echo " hsa: hsa_queue_create hsa_amd_memory_pool_allocate" @@ -182,23 +181,29 @@ usage() { echo " --trace-start - to enable tracing on start [on]" echo " --trace-period - to enable trace with initial delay, with periodic sample length and rate" echo " Supported time formats: " - echo " --obj-tracking - to turn on/off kernels code objects tracking [off]" + echo " --flush-rate - to enable trace flush rate (time period)" + echo " Supported time formats: " echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:${HOME}:" echo " First the configuration file is looking in the current directory, then in your home, and then in the package directory." - echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat'." + echo " Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'." echo " An example of 'rpl_rc.xml':" echo " " echo "" exit 1 } +# checking for availability of rocminfo utility +`which rocminfo >/dev/null 2>&1` +if [ $? 
!= 0 ]; then fatal "'rocminfo' utility is not found: please add ROCM bin path to PATH env var."; fi + # profiling run method OUTPUT_LIST="" run() { @@ -233,13 +238,13 @@ run() { fi API_TRACE="" - LD_PRELOAD="" + MY_LD_PRELOAD="" if [ "$ROCTX_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":roctx" fi if [ "$KFD_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":kfd" - export LD_PRELOAD="$TT_DIR/lib/libkfdwrapper64.so libhsakmt.so.1 $LD_PRELOAD" + MY_LD_PRELOAD="$TT_DIR/lib/libkfdwrapper64.so libhsakmt.so.1 $MY_LD_PRELOAD" fi if [ "$HIP_TRACE" = 1 ] ; then API_TRACE=${API_TRACE}":hip" @@ -250,34 +255,44 @@ run() { if [ "$HSA_TRACE" = 1 ] ; then export ROCTRACER_DOMAIN=$API_TRACE":hsa" - export HSA_TOOLS_LIB="$HSA_TOOLS_LIB $TTLIB_PATH/libtracer_tool.so" + MY_HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB $TTLIB_PATH/libtracer_tool.so" elif [ -n "$API_TRACE" ] ; then export ROCTRACER_DOMAIN=$API_TRACE OUTPUT_LIST="$ROCP_OUTPUT_DIR/" - export HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" + MY_HSA_TOOLS_LIB="$TTLIB_PATH/libtracer_tool.so" fi - redirection_cmd="" + retval=1 if [ -n "$ROCP_OUTPUT_DIR" ] ; then - redirection_cmd="2>&1 | tee $ROCP_OUTPUT_DIR/log.txt" + log_file="$ROCP_OUTPUT_DIR/log.txt" + exit_file="$ROCP_OUTPUT_DIR/exit.txt" + { + HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB" LD_PRELOAD="$MY_LD_PRELOAD" eval "$APP_CMD" + retval=$? + echo "exit($retval)" > $exit_file + } 2>&1 | tee "$log_file" + exitval=`cat "$exit_file" | sed -n "s/^.*exit(\([0-9]*\)).*$/\1/p"` + if [ -n "$exitval" ] ; then retval=$exitval; fi + else + HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB" LD_PRELOAD="$MY_LD_PRELOAD" eval "$APP_CMD" + retval=$? 
fi - - CMD_LINE="$APP_CMD $redirection_cmd" - eval "$CMD_LINE" - - unset LD_PRELOAD + return $retval } merge_output() { - output_dir=$(echo "$1" | sed "s/\/[^\/]*$//") - for file_name in `ls $output_dir` ; do - output_name=$(echo $file_name | sed -n "/\.txt$/ s/^[0-9]*_//p") - if [ -n "$output_name" ] ; then - trace_file=$output_dir/$file_name - output_file=$output_dir/$output_name - touch $output_file - cat $trace_file >> $output_file - fi + while [ -n "$1" ] ; do + output_dir=$(echo "$1" | sed "s/\/[^\/]*$//") + for file_name in `ls $output_dir` ; do + output_name=$(echo $file_name | sed -n "/\.txt$/ s/^[0-9]*_//p") + if [ -n "$output_name" ] ; then + trace_file=$output_dir/$file_name + output_file=$output_dir/$output_name + touch $output_file + cat $trace_file >> $output_file + fi + done + shift done } @@ -339,11 +354,11 @@ while [ 1 ] ; do export ROCP_METRICS="$2" elif [ "$1" = "--list-basic" ] ; then export ROCP_INFO=b - eval "$PKG_DIR/tool/ctrl" + HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB" eval "$PKG_DIR/tool/ctrl" exit 1 elif [ "$1" = "--list-derived" ] ; then export ROCP_INFO=d - eval "$PKG_DIR/tool/ctrl" + HSA_TOOLS_LIB="$MY_HSA_TOOLS_LIB" eval "$PKG_DIR/tool/ctrl" exit 1 elif [ "$1" = "--basenames" ] ; then if [ "$2" = "on" ] ; then @@ -373,6 +388,7 @@ while [ 1 ] ; do GEN_STATS=1 elif [ "$1" = "--roctx-trace" ] ; then ARG_VAL=0 + GEN_STATS=1 ROCTX_TRACE=1 elif [ "$1" = "--kfd-trace" ] ; then ARG_VAL=0 @@ -414,9 +430,14 @@ while [ 1 ] ; do convert_time_val period_rate errck "Option '$ARG_IN', rate value" export ROCP_CTRL_RATE="$period_delay:$period_len:$period_rate" + elif [ "$1" = "--flush-rate" ] ; then + period_rate=$2 + convert_time_val period_rate + errck "Option '$ARG_IN', rate value" + export ROCP_FLUSH_RATE="$period_rate" elif [ "$1" = "--obj-tracking" ] ; then - if [ "$2" = "on" ] ; then - export ROCP_OBJ_TRACKING=1 + if [ "$2" = "off" ] ; then + export ROCP_OBJ_TRACKING=0 fi elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 @@ -506,11 +527,13 @@ if [ -n 
"$csv_output" ] ; then rm -f $csv_output fi -RET=0 +RET=1 for name in $input_list; do run $name $OUTPUT_DIR $APP_CMD + RET=$? if [ -n "$ROCPROFILER_SESS" -a -e "$ROCPROFILER_SESS/error" ] ; then - echo "Error found, profiling aborted." + error_string=`cat $ROCPROFILER_SESS/error` + echo "Profiling error found: '$error_string'" csv_output="" RET=1 break @@ -518,15 +541,16 @@ for name in $input_list; do done if [ -n "$csv_output" ] ; then + merge_output $OUTPUT_LIST if [ "$GEN_STATS" = "1" ] ; then db_output=$(echo $csv_output | sed "s/\.csv/.db/") - merge_output $OUTPUT_LIST - python $BIN_DIR/tblextr.py $db_output $OUTPUT_LIST + $ROCP_PYTHON_VERSION $BIN_DIR/tblextr.py $db_output $OUTPUT_LIST else - python $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST + $ROCP_PYTHON_VERSION $BIN_DIR/tblextr.py $csv_output $OUTPUT_LIST fi if [ "$?" -ne 0 ] ; then - echo "Data extracting error: $OUTPUT_LIST'" + echo "Profiling data corrupted: '$OUTPUT_LIST'" | tee "$ROCPROFILER_SESS/error" + RET=1 fi fi diff --git a/bin/run_tool.sh b/bin/run_tool.sh deleted file mode 100755 index ed1609fa..00000000 --- a/bin/run_tool.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/sh -BIN_DIR=`dirname $0` -BIN_DIR=`realpath $BIN_DIR` -PKG_DIR=${BIN_DIR%/bin} - -# PATH to custom HSA libs -HSA_PATH=$PKG_DIR/lib/hsa - -if [ -z "$1" ] ; then - echo "Usage: $0 " - exit 1 -fi - -# profiler plugin library -test_app=$* - -# paths to ROC profiler and oher libraries -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PKG_DIR/lib:$PKG_DIR/tool:$HSA_PATH -export PATH=.:$PATH - -# ROC profiler library loaded by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so.1 -# tool library loaded by ROC profiler -if [ -z "$ROCP_TOOL_LIB" ] ; then - export ROCP_TOOL_LIB=libintercept_test.so -fi -# enable error messages -export HSA_TOOLS_REPORT_LOAD_FAILURE=1 -export HSA_VEN_AMD_AQLPROFILE_LOG=1 -export ROCPROFILER_LOG=1 -# ROC profiler metrics config file -unset ROCP_PROXY_QUEUE -# ROC profiler metrics config file -if [ -z "$ROCP_METRICS" 
] ; then - export ROCP_METRICS=$PKG_DIR/lib/metrics.xml -fi - -LD_PRELOAD=$ROCP_TOOL_LIB $test_app diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 805c954c..eb584503 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -1,3 +1,25 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + import csv, sqlite3, re, sys from functools import reduce from txt2params import gen_params @@ -48,6 +70,13 @@ def add_data_column(self, table_name, data_label, data_type, data_expr): cursor.execute('ALTER TABLE %s ADD COLUMN "%s" %s' % (table_name, data_label, data_type)) cursor.execute('UPDATE %s SET %s = (%s);' % (table_name, data_label, data_expr)) + def change_rec_name(self, table_name, rec_id, rec_name): + self.connection.execute('UPDATE ' + table_name + ' SET Name = ? WHERE "Index" = ?', (rec_name, rec_id)) + def change_rec_tid(self, table_name, rec_id, tid): + self.connection.execute('UPDATE ' + table_name + ' SET tid = ? WHERE "Index" = ?', (tid, rec_id)) + def change_rec_fld(self, table_name, fld_expr, rec_pat): + self.connection.execute('UPDATE ' + table_name + ' SET ' + fld_expr + ' WHERE ' + rec_pat) + # populate DB table entry def insert_entry(self, table, val_list): (cursor, stm) = table @@ -77,8 +106,8 @@ def _get_raws(self, table_name): def _get_raws_indexed(self, table_name): cursor = self.connection.execute('SELECT * FROM ' + table_name + ' order by "Index" asc;') return cursor.fetchall() - def _get_raw_by_id(self, table_name, req_id): - cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (req_id,)) + def _get_raw_by_id(self, table_name, rec_id): + cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (rec_id,)) raws = cursor.fetchall() if len(raws) != 1: raise Exception('Index is not unique, table "' + table_name + '"') @@ -97,7 +126,7 @@ def dump_csv(self, table_name, file_name): fd.write(','.join(fields) + '\n') for raw in self._get_raws(table_name): fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') - + # dump JSON trace def open_json(self, file_name): if not re.search(r'\.json$', file_name): @@ -115,7 +144,7 @@ def label_json(self, pid, label, file_name): if not 
re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: - fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)); + fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)) self.section_index += 1 def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): @@ -134,6 +163,21 @@ def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dic fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) dep_id += 1 + def metadata_json(self, jsonfile, sysinfo_file): + params = gen_params(sysinfo_file); + with open(jsonfile, mode='a') as fd: + cnt = 0 + fd.write('],\n') + fd.write('"otherData": {\n') + for nkey in sorted(params.keys()): + key = nkey[1] + cnt = cnt + 1 + if cnt == len(params): + fd.write(' "' + key + '": "' + params[nkey] + '"\n') + else: + fd.write(' "' + key + '": "' + params[nkey] + '",\n') + fd.write(' }\n') + def dump_json(self, table_name, data_name, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) @@ -231,19 +275,4 @@ def add_csv_table(self, table_name, file_name, extra = ()): table = self.add_table(table_name, descr, extra) self.insert_table(table, reader) - def metadata_json(self, jsonfile, sysinfo_file): - params = gen_params(sysinfo_file); - with open(jsonfile, mode='a') as fd: - cnt = 0 - fd.write('],\n') - fd.write('"otherData": {\n') - for key in params: - cnt = cnt + 1 - if cnt == len(params): - fd.write(' "' + key + '": "' + params[key] + '"\n') - else: - fd.write(' "' + key + '": "' + params[key] + '",\n') - fd.write(' }\n') - ############################################################################################## 
- diff --git a/bin/tblextr.py b/bin/tblextr.py index 3f47c65d..0fe46336 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # @@ -22,16 +20,10 @@ # THE SOFTWARE. ################################################################################ -import os, sys, re +import os, sys, re, subprocess from sqlitedb import SQLiteDB import dform -# Parsing results in the format: -#dispatch[0], queue_index(0), kernel_name("SimpleConvolution"), time(1048928000311041,1048928006154674,1048928006168274,1048928006170503): -# GRBM_GUI_ACTIVE (74332) -# SQ_WAVES (4096) -# SQ_INSTS_VMEM_RD (36864) - EXT_PID = 0 COPY_PID = 1 HIP_PID = 2 @@ -59,17 +51,6 @@ var_table = {} ############################################################# -def metadata_gen(sysinfo_file, sysinfo_cmd): - if not re.search(r'\.txt$', sysinfo_file): - raise Exception('wrong output file type: "' + sysinfo_file + '"' ) - if re.search(r'rocminfo', sysinfo_cmd): - direct_str = " > " - else: - direct_str = " >> " - status, output = commands.getstatusoutput(sysinfo_cmd + direct_str + sysinfo_file) - if status != 0 : - raise Exception('Could not run command: ' + sysinfo_cmd) - def fatal(msg): sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); sys.exit(1) @@ -82,6 +63,22 @@ def dbglog(msg): fatal("error") ############################################################# +# Dumping sysinfo +sysinfo_begin = 1 +def metadata_gen(sysinfo_file, sysinfo_cmd): + global sysinfo_begin + if not re.search(r'\.txt$', sysinfo_file): + raise Exception('wrong output file type: "' + sysinfo_file + '"' ) + if sysinfo_begin == 1: + sysinfo_begin = 0 + with open(sysinfo_file, mode='w') as fd: fd.write('') + with open(sysinfo_file, mode='a') as fd: fd.write('CMD: ' + sysinfo_cmd + '\n') + status = subprocess.call(sysinfo_cmd + ' >> ' + sysinfo_file, + 
stderr=subprocess.STDOUT, + shell=True) + if status != 0: + raise Exception('Could not run command: "' + sysinfo_cmd + '"') + # parse results method def parse_res(infile): global max_gpu_id @@ -102,7 +99,7 @@ def parse_res(infile): if not dispatch_number in var_table: fatal("Error: dispatch number not found '" + str(dispatch_number) + "'") var = m.group(1) val = m.group(2) - var_table[dispatch_number][m.group(1)] = m.group(2) + var_table[dispatch_number][var] = val if not var in var_list: var_list.append(var) m = beg_pattern.match(record) @@ -256,18 +253,35 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): rec_stack = pid_stack[tid] rec_vals = rec_stack.pop() rec_vals[1] = tms - + db.insert_entry(table_handle, rec_vals) record_id += 1 return 1 ############################################################# +def extract_field(rec_args, field): # returns (field value, n_subs); n_subs == 0 means the 'field(...)) ' pattern was not matched + ptrn1_field = re.compile(r'^.*'+field+r'\('); + ptrn2_field = re.compile(r'\)\) .*$'); + (field_name, n_subs) = ptrn1_field.subn('', rec_args, count=1); + if n_subs != 0: + (field_name, n_subs) = ptrn2_field.subn(')', field_name, count=1) + return (field_name, n_subs) + # Fill API DB api_table_descr = [ ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index'], {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER'} ] +# Filling API records DB table +# table_name - created DB table name +# db - DB handle +# indir - input directory +# api_name - traced API name +# api_pid - assigned JSON PID +# dep_pid - PID of dependent domain +# dep_list - list of dependent dispatch events +# dep_filtr - registered dependencies by record ID def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): global hsa_activity_found copy_raws = [] @@ -278,6 +292,10 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep file_name = indir + '/' + api_name + '_api_trace.txt' ptrn_val = 
re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') ptrn_ac = re.compile(r'hsa_amd_memory_async_copy') + ptrn1_kernel = re.compile(r'^.*kernel\(') + ptrn2_kernel = re.compile(r'\)\) .*$') + ptrn_fixformat = re.compile(r'(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$') + ptrn_fixkernel = re.compile(r'\s+kernel=(.*)$') if not os.path.isfile(file_name): return 0 @@ -285,36 +303,45 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_from_us_list = [] dep_id_list = [] - global START_US - with open(file_name, mode='r') as fd: - line = fd.readline() - record = line[:-1] - m = ptrn_val.match(record) - if m: START_US = int(m.group(1)) / 1000 - START_US = 0 - + # parsing an input trace file and creating a DB table record_id = 0 table_handle = db.add_table(table_name, api_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): record = line[:-1] + + kernel_arg = '' + m = ptrn_fixkernel.search(record) + if m: + kernel_arg = 'kernel(' + m.group(1) + ') ' + record = ptrn_fixkernel.sub('', record) + + mfixformat = ptrn_fixformat.match(record) + if mfixformat: #replace '=' in args with parentheses + reformated_args = kernel_arg + mfixformat.group(2).replace('=','(').replace(',',')')+')' + record = mfixformat.group(1) + '(' + reformated_args + ')' + m = ptrn_val.match(record) if m: rec_vals = [] rec_len = len(api_table_descr[0]) for ind in range(1,rec_len): rec_vals.append(m.group(ind)) + proc_id = rec_vals[2] rec_vals[2] = api_pid rec_vals.append(record_id) db.insert_entry(table_handle, rec_vals) + + # dependencies filling if ptrn_ac.search(rec_vals[4]) or record_id in dep_filtr: beg_ns = int(rec_vals[0]) end_ns = int(rec_vals[1]) from_us = (beg_ns / 1000) + ((end_ns - beg_ns) / 1000) dep_from_us_list.append(from_us) dep_tid_list.append(int(rec_vals[3])) - dep_id_list.append(record_id) + dep_id_list.append(record_id) + # memcopy data if len(copy_raws) != 0: copy_data = list(copy_raws[copy_index]) args_str = rec_vals[5] @@ -324,13 
+351,30 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep copy_csv += str(copy_index) + ', ' + copy_line + '\n' copy_index += 1 + # patching activity properties: kernel name, stream-id + corr_id = record_id + if (corr_id, proc_id) in dep_filtr: + record_args = rec_vals[rec_len - 2] + select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + proc_id + # extract kernel name + (kernel_name, n_subs) = extract_field(record_args, 'kernel') + if n_subs != 0: + db.change_rec_fld('OPS', 'Name = "' + kernel_name + '"', select_expr) + # extract stream-id; 'nil'/'NIL' maps to stream '0', kept as a string because it is concatenated into the SQL expression below + (stream_id, n_subs) = extract_field(record_args, 'stream') + if n_subs != 0: + if stream_id == 'nil' or stream_id == 'NIL': stream_id = '0' + db.change_rec_fld('OPS', 'tid = ' + stream_id, select_expr) + record_id += 1 else: fatal(api_name + " bad record: '" + record + "'") + # inserting of dispatch events correlated to the dependent dispatches for (tid, from_ns) in dep_list: db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) record_id += 1 + # registering dependencies information if dep_pid != NONE_PID: if not dep_pid in dep_dict: dep_dict[dep_pid] = {} dep_dict[dep_pid]['pid'] = api_pid @@ -338,6 +382,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_dict[dep_pid]['from'] = dep_from_us_list if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + # generating memcopy CSV if copy_csv != '': file_name = os.environ['PWD'] + '/results_mcopy.csv' with open(file_name, mode='w') as fd: @@ -386,46 +431,72 @@ def fill_copy_db(table_name, db, indir): # fill HCC ops DB ops_table_descr = [ - ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index'], - {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} + ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'proc-id'], 
+ {'Index':'INTEGER', 'proc-id':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} ] -def fill_ops_db(table_name, db, indir): +def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): global max_gpu_id file_name = indir + '/' + 'hcc_ops_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) (.*)$') - ptrn_id = re.compile(r'^[^:]+:(\d+)$') + ptrn_id = re.compile(r'^([^:]+):(\d+):(\d+)$') + ptrn_mcopy = re.compile(r'(Memcpy|Copy|Fill)') + ptrn_barrier = re.compile(r'Marker') if not os.path.isfile(file_name): return {} filtr = {} record_id = 0 - table_handle = db.add_table(table_name, ops_table_descr) + kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr) + mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): record = line[:-1] m = ptrn_val.match(record) if m: + # parsing trace record rec_vals = [] for ind in range(1,6): rec_vals.append(m.group(ind)) - gpu_id = int(rec_vals[2]); - if (gpu_id > max_gpu_id): max_gpu_id = gpu_id - gpu_pid = GPU_BASE_PID + int(gpu_id) - rec_vals.append(gpu_pid) - rec_vals.append(0) - m = ptrn_id.match(rec_vals[4]) + label = rec_vals[4] # record name + m = ptrn_id.match(label) if not m: fatal("bad hcc ops entry '" + record + "'") - corr_id = int(m.group(1)) - 1 - rec_vals.append(corr_id) + name = m.group(1) + corr_id = int(m.group(2)) - 1 + proc_id = m.group(3) + + # checking name for memcopy pattern + if ptrn_mcopy.search(name): + table_handle = mcopy_table_handle + pid = COPY_PID; + else: + table_handle = kernel_table_handle + + gpu_id = int(rec_vals[2]); + if (gpu_id > max_gpu_id): max_gpu_id = gpu_id + pid = GPU_BASE_PID + int(gpu_id) + + if ptrn_barrier.search(name): + name = '""' + + # insert DB record + rec_vals[4] = name # Name + rec_vals.append(pid) # pid + rec_vals.append(0) # tid + rec_vals.append(corr_id) # Index 
+ rec_vals.append(proc_id) # proc-id db.insert_entry(table_handle, rec_vals) - filtr[corr_id] = 1 - if not gpu_pid in dep_dict: - dep_dict[gpu_pid] = {} - dep_dict[gpu_pid]['to'] = {} - dep_dict[gpu_pid]['to'][corr_id] = int(rec_vals[0]) / 1000 - dep_dict[gpu_pid]['bsp'] = OPS_PID - else: fatal("hcc ops bad record: '" + record + "'") + # registering a dependency filtr + filtr[(corr_id, proc_id)] = 1 + + # filling a dependency + if not pid in dep_dict: dep_dict[pid] = {} + if not 'to' in dep_dict[pid]: dep_dict[pid]['to'] = {} + dep_dict[pid]['to'][corr_id] = int(rec_vals[0]) / 1000 + dep_dict[pid]['bsp'] = OPS_PID + + else: + fatal("hcc ops bad record: '" + record + "'") return filtr ############################################################# @@ -462,8 +533,10 @@ def fill_ops_db(table_name, db, indir): hsa_statfile = re.sub(r'\.stats\.csv$', r'.hsa_stats.csv', statfile) hip_statfile = re.sub(r'\.stats\.csv$', r'.hip_stats.csv', statfile) kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) - sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo_stats.txt', statfile) - metadata_gen(sysinfo_file, '/opt/rocm/bin/rocminfo') + ops_statfile = statfile + copy_statfile = re.sub(r'\.stats\.csv$', r'.copy_stats.csv', statfile) + sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo.txt', statfile) + metadata_gen(sysinfo_file, 'rocminfo') with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) @@ -475,12 +548,15 @@ def fill_ops_db(table_name, db, indir): hsa_activity_found = fill_copy_db('COPY', db, indir) hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, COPY_PID, kern_dep_list, {}, 0) - ops_filtr = fill_ops_db('OPS', db, indir) + ops_filtr = fill_ops_db('OPS', 'COPY', db, indir) hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) fill_kernel_db('A', db) any_trace_found = ext_trace_found | kfd_trace_found | hsa_trace_found | hip_trace_found + copy_trace_found = 0 + if hsa_activity_found or 
len(ops_filtr): copy_trace_found = 1 + if any_trace_found: db.open_json(jsonfile) @@ -496,8 +572,7 @@ def fill_ops_db(table_name, db, indir): if kfd_trace_found: db.label_json(KFD_PID, "CPU KFD API", jsonfile) - if hsa_activity_found: - db.label_json(COPY_PID, "COPY", jsonfile) + db.label_json(COPY_PID, "COPY", jsonfile) if any_trace_found and max_gpu_id >= 0: for ind in range(0, int(max_gpu_id) + 1): @@ -517,8 +592,9 @@ def fill_ops_db(table_name, db, indir): dform.gen_table_bins(db, 'HSA', hsa_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) - if hsa_activity_found: + if copy_trace_found: dform.post_process_data(db, 'COPY') + dform.gen_table_bins(db, 'COPY', copy_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) if hip_trace_found: @@ -526,11 +602,11 @@ def fill_ops_db(table_name, db, indir): dform.gen_table_bins(db, 'HIP', hip_statfile, 'Name', 'DurationNs') dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) + if ops_filtr: dform.post_process_data(db, 'OPS') + dform.gen_table_bins(db, 'OPS', ops_statfile, 'Name', 'DurationNs') dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) - metadata_gen(sysinfo_file, '/opt/rocm/bin/hipcc --version') - if kfd_trace_found: dform.post_process_data(db, 'KFD') dform.gen_table_bins(db, 'KFD', kfd_statfile, 'Name', 'DurationNs') diff --git a/bin/txt2params.py b/bin/txt2params.py index 358acc3a..7944029f 100644 --- a/bin/txt2params.py +++ b/bin/txt2params.py @@ -1,5 +1,3 @@ -#!/usr/bin/python - ################################################################################ # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
# @@ -30,13 +28,14 @@ # it also processes encompasing sections to generate a full param name such as (section names separated by '_'): # "Agent2_PoolInfo_ISAInfo_ISA1_WorkgroupMaxSizeperDimension_x": "1024(0x400)", def gen_params(txtfile): - fields = {} + fields = {} + counter = 0 parent_field = '' nbr_indent = 0 nbr_indent_prev = 0 check_for_dims = False - with open(txtfile) as fp: - for line in fp: + with open(txtfile) as fp: + for line in fp: me = re.match(r'\*\*\* Done \*\*\*',line) #Marks the end of cmd if me: parent_field = '' @@ -48,7 +47,8 @@ def gen_params(txtfile): if mv: key = 'HCCclangversion' val = mv.group(1) - fields[key] = val + counter = counter + 1 + fields[(counter,key)] = val continue # Variable 'check_for_dims' is True for text like this: # Workgroup Max Size per Dimension: @@ -56,34 +56,35 @@ def gen_params(txtfile): # y 1024(0x400) # z 1024(0x400) if check_for_dims == True: - mc = re.match(r'\s*([x|y|z])\s+(.*)',line) + mc = re.match(r'\s*([x|y|z])\s+(.*)',line) if mc: key_sav = mc.group(1) if parent_field != '': - key = parent_field + '_' + mc.group(1) + key = parent_field + '.' 
+ mc.group(1) else: key = mc.group(1) val = re.sub(r"\s+", "", mc.group(2)) - fields[key] = val + counter = counter + 1 + fields[(counter,key)] = val if key_sav == 'z': check_for_dims = False nbr_indent_prev = nbr_indent - mi = re.search(r'^(\s+)\w+', line) + mi = re.search(r'^(\s+)\w+.*', line) md = re.search(':', line) if mi: - nbr_indent = len(mi.group(1)) / 2 #indentation cnt + nbr_indent = int(len(mi.group(1)) / 2) #indentation cnt else: if not md: tmp = re.sub(r"\s+", "", line) if tmp.isalnum(): parent_field = tmp - continue - - if nbr_indent < nbr_indent_prev: - pos = parent_field.rfind('_') - if pos != -1: - parent_field = parent_field[:pos] + if nbr_indent < nbr_indent_prev: + go_back_parent = (nbr_indent_prev - nbr_indent) + for i in range(go_back_parent): #decrease as many levels up as needed + pos = parent_field.rfind('.') + if pos != -1: + parent_field = parent_field[:pos] # Process lines such as : # Segment: GLOBAL; FLAGS: KERNARG, FINE GRAINED # Size: 131897644(0x7dc992c) KB @@ -93,17 +94,17 @@ def gen_params(txtfile): if m: key, val = m.group(1), m.group(2) if parent_field != '': - key = parent_field + '_' + key + key = parent_field + '.' + key if val == '': mk = re.match(r'.*Dimension',key) if mk: # expect x,y,z on next 3 lines check_for_dims = True - parent_field = key + parent_field = key else: - fields[key] = val + counter = counter + 1 + fields[(counter,key)] = val else: if nbr_indent != nbr_indent_prev and not check_for_dims : - parent_field = parent_field + '_' + lin.replace(':','') - - return fields + parent_field = parent_field + '.' 
+ lin.replace(':','') + return fields diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 44fb0cd0..30e86c13 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -31,7 +31,6 @@ add_definitions ( -DUNIX_OS ) add_definitions ( -DLINUX ) add_definitions ( -D__AMD64__ ) add_definitions ( -D__x86_64__ ) -add_definitions ( -DAMD_INTERNAL_BUILD ) add_definitions ( -DLITTLEENDIAN_CPU=1 ) add_definitions ( -DHSA_LARGE_MODEL= ) add_definitions ( -DHSA_DEPRECATED= ) @@ -109,16 +108,14 @@ elseif ( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86" ) endif () ## Find hsa-runtime headers/lib -find_file ( HSA_RUNTIME_INC "hsa.h" ) -if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) - find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) -endif() +find_file ( HSA_RUNTIME_INC "hsa/hsa.h" ) find_library ( HSA_RUNTIME_LIB "libhsa-runtime${NBIT}.so" ) -get_filename_component ( HSA_RUNTIME_INC_PATH ${HSA_RUNTIME_INC} DIRECTORY ) -get_filename_component ( HSA_RUNTIME_LIB_PATH ${HSA_RUNTIME_LIB} DIRECTORY ) +get_filename_component ( HSA_RUNTIME_INC_PATH "${HSA_RUNTIME_INC}" DIRECTORY ) +get_filename_component ( HSA_RUNTIME_LIB_PATH "${HSA_RUNTIME_LIB}" DIRECTORY ) find_library ( HSA_KMT_LIB "libhsakmt.so" ) -get_filename_component ( HSA_KMT_LIB_PATH ${HSA_KMT_LIB} DIRECTORY ) +get_filename_component ( HSA_KMT_LIB_PATH "${HSA_KMT_LIB}" DIRECTORY ) +get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) ## Basic Tool Chain Information message ( "----------------NBit: ${NBIT}" ) @@ -127,5 +124,22 @@ message ( "------------Compiler: ${CMAKE_CXX_COMPILER}" ) message ( "----Compiler-Version: ${CMAKE_CXX_COMPILER_VERSION}" ) message ( "-----HSA-Runtime-Inc: ${HSA_RUNTIME_INC_PATH}" ) message ( "-----HSA-Runtime-Lib: ${HSA_RUNTIME_LIB_PATH}" ) -message ( "-----------CXX-Flags: ${CMAKE_CXX_FLAGS}" ) +message ( "----HSA_KMT_LIB_PATH: ${HSA_KMT_LIB_PATH}" ) +message ( "-------ROCM_ROOT_DIR: ${ROCM_ROOT_DIR}" ) +message ( "-----CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}" ) message ( 
"---CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" ) +message ( "---------GPU_TARGETS: ${GPU_TARGETS}" ) + +## Check the ROCm pathes +if ( "${HSA_RUNTIME_INC_PATH}" STREQUAL "" ) + message ( FATAL_ERROR "HSA_RUNTIME_INC_PATH is not found." ) +endif () +if ( "${HSA_RUNTIME_LIB_PATH}" STREQUAL "" ) + message ( FATAL_ERROR "HSA_RUNTIME_LIB_PATH is not found." ) +endif () +if ( "${HSA_KMT_LIB_PATH}" STREQUAL "" ) + message ( FATAL_ERROR "HSA_KMT_LIB_PATH is not found." ) +endif () +if ( "${ROCM_ROOT_DIR}" STREQUAL "" ) + message ( FATAL_ERROR "ROCM_ROOT_DIR is not found." ) +endif () diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 31082cf4..24925cae 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -41,12 +41,13 @@ THE SOFTWARE. #ifndef INC_ROCPROFILER_H_ #define INC_ROCPROFILER_H_ -#include #include +#include +#include #include #include -#define ROCPROFILER_VERSION_MAJOR 7 +#define ROCPROFILER_VERSION_MAJOR 8 #define ROCPROFILER_VERSION_MINOR 0 #ifdef __cplusplus @@ -70,6 +71,7 @@ typedef struct { uint32_t trace_local; uint64_t timeout; uint32_t timestamp_on; + uint32_t hsa_intercepting; } rocprofiler_settings_t; //////////////////////////////////////////////////////////////////////////////// @@ -87,7 +89,9 @@ hsa_status_t rocprofiler_error_string( // Profiling feature kind typedef enum { ROCPROFILER_FEATURE_KIND_METRIC = 0, - ROCPROFILER_FEATURE_KIND_TRACE = 1 + ROCPROFILER_FEATURE_KIND_TRACE = 1, + ROCPROFILER_FEATURE_KIND_SPM_MOD = 2, + ROCPROFILER_FEATURE_KIND_PCSMP_MOD = 4 } rocprofiler_feature_kind_t; // Profiling feture parameter @@ -199,17 +203,25 @@ hsa_status_t rocprofiler_close(rocprofiler_t* context); // [in] profiling conte hsa_status_t rocprofiler_reset(rocprofiler_t* context, // [in] profiling context uint32_t group_index); // group index +// Return context agent +hsa_status_t rocprofiler_get_agent(rocprofiler_t* context, // [in] profiling context + hsa_agent_t* agent); // [out] GPU handle + // Supported time value ID typedef enum { 
ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time - ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time + ROCPROFILER_TIME_ID_CLOCK_REALTIME_COARSE = 1, // Linux realtime-coarse clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 2, // Linux monotonic clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC_COARSE = 3, // Linux monotonic-coarse clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC_RAW = 4, // Linux monotonic-raw clock time } rocprofiler_time_id_t; // Return time value for a given time ID and profiling timestamp hsa_status_t rocprofiler_get_time( rocprofiler_time_id_t time_id, // identifier of the particular time to convert the timesatmp uint64_t timestamp, // profiling timestamp - uint64_t* value_ns); // [out] returned time 'ns' value + uint64_t* value_ns, // [out] returned time 'ns' value, ignored if NULL + uint64_t* error_ns); // [out] returned time error 'ns' value, ignored if NULL //////////////////////////////////////////////////////////////////////////////// // Queue callbacks @@ -237,7 +249,7 @@ typedef struct { const char* kernel_name; // Kernel name uint64_t kernel_object; // Kernel object address const amd_kernel_code_t* kernel_code; // Kernel code pointer - int64_t thread_id; // Thread id + uint32_t thread_id; // Thread id const rocprofiler_dispatch_record_t* record; // Dispatch record } rocprofiler_callback_data_t; @@ -262,6 +274,10 @@ hsa_status_t rocprofiler_set_queue_callbacks( // Remove queue callbacks hsa_status_t rocprofiler_remove_queue_callbacks(); +// Start/stop queue callbacks +hsa_status_t rocprofiler_start_queue_callbacks(); +hsa_status_t rocprofiler_stop_queue_callbacks(); + //////////////////////////////////////////////////////////////////////////////// // Start/stop profiling // @@ -455,6 +471,68 @@ hsa_status_t rocprofiler_pool_flush( rocprofiler_pool_t* pool); // profiling pool handle //////////////////////////////////////////////////////////////////////////////// +// HSA intercepting API + +// 
HSA callbacks ID enumeration +typedef enum { + ROCPROFILER_HSA_CB_ID_ALLOCATE = 0, // Memory allocate callback + ROCPROFILER_HSA_CB_ID_DEVICE = 1, // Device assign callback + ROCPROFILER_HSA_CB_ID_MEMCOPY = 2, // Memcopy callback + ROCPROFILER_HSA_CB_ID_SUBMIT = 3 // Packet submit callback +} rocprofiler_hsa_cb_id_t; + +// HSA callback data type +typedef struct { + union { + struct { + const void* ptr; // allocated area ptr + size_t size; // allocated area size, zero size means 'free' callback + hsa_amd_segment_t segment; // allocated area's memory segment type + hsa_amd_memory_pool_global_flag_t global_flag; // allocated area's memory global flag + int is_code; // equal to 1 if code is allocated + } allocate; + struct { + hsa_device_type_t type; // type of assigned device + uint32_t id; // id of assigned device + hsa_agent_t agent; // device HSA agent handle + const void* ptr; // ptr the device is assigned to + } device; + struct { + const void* dst; // memcopy dst ptr + const void* src; // memcopy src ptr + size_t size; // memcopy size bytes + } memcopy; + struct { + const void* packet; // submitted to GPU packet + const char* kernel_name; // kernel name, not NULL if dispatch + hsa_queue_t* queue; // HSA queue the kernel was submitted to + uint32_t device_type; // type of device the packet is submitted to + uint32_t device_id; // id of device the packet is submitted to + } submit; + }; +} rocprofiler_hsa_callback_data_t; + +// HSA callback function type +typedef hsa_status_t (*rocprofiler_hsa_callback_fun_t)( + rocprofiler_hsa_cb_id_t id, // callback id + const rocprofiler_hsa_callback_data_t* data, // [in] callback data + void* arg); // [in/out] user passed data + +// HSA callbacks structure +typedef struct { + rocprofiler_hsa_callback_fun_t allocate; // memory allocate callback + rocprofiler_hsa_callback_fun_t device; // agent assign callback + rocprofiler_hsa_callback_fun_t memcopy; // memory copy callback + rocprofiler_hsa_callback_fun_t submit; // packet 
submit callback +} rocprofiler_hsa_callbacks_t; + +// Set callbacks. If the callback is NULL then it is disabled. +// If callback returns a value that is not HSA_STATUS_SUCCESS the callback +// will be unregistered. +hsa_status_t rocprofiler_set_hsa_callbacks( + const rocprofiler_hsa_callbacks_t callbacks, // HSA callback function + void* arg); // callback user data + #ifdef __cplusplus } // extern "C" block #endif // __cplusplus diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9a398411..4c97ea6f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,8 +30,9 @@ set ( LIB_SRC ${LIB_DIR}/core/simple_proxy_queue.cpp ${LIB_DIR}/core/intercept_queue.cpp ${LIB_DIR}/core/metrics.cpp + ${LIB_DIR}/core/activity.cpp ${LIB_DIR}/util/hsa_rsrc_factory.cpp ) add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} ) -target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HSA_KMT_LIB_PATH}/.. ) target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++) diff --git a/src/core/activity.cpp b/src/core/activity.cpp new file mode 100644 index 00000000..c72977e1 --- /dev/null +++ b/src/core/activity.cpp @@ -0,0 +1,171 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include + +#include +#include + +// Tracer messages protocol +#include + +#include "core/context.h" +#include "inc/rocprofiler.h" +#include "util/hsa_rsrc_factory.h" + +#define PUBLIC_API __attribute__((visibility("default"))) + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// Activity primitives +namespace activity_prim { +// PC sampling callback data +struct pcsmp_callback_data_t { + const char* kernel_name; // sampled kernel name + void* data_buffer; // host buffer for tracing data + uint64_t id; // sample id + uint64_t cycle; // sample cycle + uint64_t pc; // sample PC +}; + +uint32_t activity_op = UINT32_MAX; +void* activity_arg = NULL; +std::atomic activity_callback{NULL}; +rocprofiler_t* context = NULL; + +hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* data) { + const pcsmp_callback_data_t* pcsmp_data = (pcsmp_callback_data_t*) data; + + activity_record_t record{}; + record.op = activity_op; + record.pc_sample.se = pcsmp_data->id; + record.pc_sample.cycle = pcsmp_data->cycle; + record.pc_sample.pc = pcsmp_data->pc; + activity_async_callback_t fun = activity_callback.load(std::memory_order_acquire); + if (fun) { + (fun)(activity_op, &record, activity_arg); + } else { + free((void*)(pcsmp_data->kernel_name)); + } + return HSA_STATUS_SUCCESS; +} + +bool context_handler(rocprofiler_group_t group, void* arg) { + hsa_agent_t agent{}; + hsa_status_t status = rocprofiler_get_agent(group.context, &agent); + 
check_status(status); + const rocprofiler::util::AgentInfo* agent_info = rocprofiler::util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + + pcsmp_callback_data_t pcsmp_data{}; + pcsmp_data.kernel_name = (const char*)arg; + pcsmp_data.data_buffer = rocprofiler::util::HsaRsrcFactory::Instance().AllocateSysMemory(agent_info, rocprofiler::TraceProfile::GetSize()); + status = rocprofiler_iterate_trace_data(group.context, trace_data_cb, &pcsmp_data); + check_status(status); + return false; +} + +// Kernel dispatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + // context features + const rocprofiler_feature_kind_t trace_kind = + (rocprofiler_feature_kind_t)(ROCPROFILER_FEATURE_KIND_TRACE | ROCPROFILER_FEATURE_KIND_PCSMP_MOD); + const uint32_t feature_count = 1; + const uint32_t parameter_count = 1; + rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; + memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); + rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; + memset(parameters, 0, parameter_count * sizeof(rocprofiler_parameter_t)); // clear the freshly allocated parameter array + parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; + parameters[0].value = 0; + + features[0].kind = trace_kind; + features[0].parameters = parameters; + features[0].parameter_count = parameter_count; + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = (void*)strdup(callback_data->kernel_name); + + // Open profiling context + hsa_status_t status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + return status; +} +} // namespace activity_prim + +extern "C" { 
+PUBLIC_API const char* GetOpName(uint32_t op) { return strdup("PCSAMPLE"); } + +PUBLIC_API bool RegisterApiCallback(uint32_t op, void* callback, void* arg) { return true; } + +PUBLIC_API bool RemoveApiCallback(uint32_t op) { return true; } + +PUBLIC_API bool InitActivityCallback(void* callback, void* arg) { + activity_prim::activity_arg = arg; + activity_prim::activity_callback.store((activity_async_callback_t)callback, std::memory_order_release); + + rocprofiler_queue_callbacks_t queue_callbacks{}; + queue_callbacks.dispatch = activity_prim::dispatch_callback; + rocprofiler_set_queue_callbacks(queue_callbacks, NULL); + + return true; +} + +PUBLIC_API bool EnableActivityCallback(uint32_t op, bool enable) { + if (enable) { + activity_prim::activity_op = op; + rocprofiler_start_queue_callbacks(); + } else { + rocprofiler_stop_queue_callbacks(); + } + return true; +} +} // extern "C" diff --git a/src/core/context.h b/src/core/context.h index f3ab1294..7131d338 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -257,7 +257,7 @@ class Context { void GetMetricsData() const { const MetricArgs args(info_map_); - for (const auto &v : metrics_map_) { + for (const auto& v : metrics_map_) { const std::string& name = v.first; const Metric* metric = v.second; const xml::Expr* expr = metric->GetExpr(); @@ -276,6 +276,7 @@ class Context { profile_vector_t profile_vector; set_[0].GetTraceProfiles(profile_vector); for (auto& tuple : profile_vector) { + if (pcsmp_mode_) const_cast(tuple.profile)->event_count = UINT32_MAX; const hsa_status_t status = api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); @@ -293,6 +294,7 @@ class Context { return false; } + hsa_agent_t GetAgent() const { return agent_; } Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } @@ -306,7 
+308,8 @@ class Context { api_(hsa_rsrc_->AqlProfileApi()), metrics_(NULL), handler_(handler), - handler_arg_(handler_arg) + handler_arg_(handler_arg), + pcsmp_mode_(false) {} ~Context() { Destruct(); } @@ -434,10 +437,13 @@ class Context { const uint32_t group_index = block_status.group_index; set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); } - } else if (kind == ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features - if (info->parameters != NULL) { - set_[0].Insert(profile_info_t{NULL, info->parameters, info->parameter_count, info}); - } else { + } else if (kind & ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features + info->kind = ROCPROFILER_FEATURE_KIND_TRACE; + + const event_t* event = NULL; + if (kind & ROCPROFILER_FEATURE_KIND_PCSMP_MOD) { // PC sampling + pcsmp_mode_ = true; + } else if (kind & ROCPROFILER_FEATURE_KIND_SPM_MOD) { // SPM trace const Metric* metric = metrics_->Get(name); if (metric == NULL) EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); @@ -445,9 +451,9 @@ class Context { if (counters_vec.size() != 1) EXC_RAISING(HSA_STATUS_ERROR, "trace bad metric '" << name << "' is not base counter"); const counter_t* counter = counters_vec[0]; - const event_t* event = &(counter->event); - set_[0].Insert(profile_info_t{event, NULL, 0, info}); + event = &(counter->event); } + set_[0].Insert(profile_info_t{event, info->parameters, info->parameter_count, info}); } else { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -584,6 +590,9 @@ class Context { // Context completion handler rocprofiler_handler_t handler_; void* handler_arg_; + + // PC sampling mode + bool pcsmp_mode_; }; } // namespace rocprofiler diff --git a/src/core/hsa_interceptor.h b/src/core/hsa_interceptor.h new file mode 100644 index 00000000..f1d8a0d8 --- /dev/null +++ b/src/core/hsa_interceptor.h @@ -0,0 +1,385 @@ 
+/****************************************************************************** +MIT License + +Copyright (c) 2018 ROCm Core Technology + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*******************************************************************************/ + +#ifndef _SRC_CORE_HSA_INTERCEPTOR_H +#define _SRC_CORE_HSA_INTERCEPTOR_H + +#include +#include +#include + +#include +#include + +#include "inc/rocprofiler.h" +#include "util/exception.h" +#include "util/hsa_rsrc_factory.h" + +#define HSA_RT(call) \ + do { \ + const hsa_status_t status = call; \ + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, #call); \ + } while(0) + +#define IS_HSA_CALLBACK(ID) \ + const auto __id = ID; (void)__id; \ + void *__arg = arg_.load(); (void)__arg; \ + rocprofiler_hsa_callback_fun_t __callback = \ + (ID == ROCPROFILER_HSA_CB_ID_ALLOCATE) ? callbacks_.allocate: \ + (ID == ROCPROFILER_HSA_CB_ID_DEVICE) ? callbacks_.device: \ + (ID == ROCPROFILER_HSA_CB_ID_MEMCOPY) ? 
callbacks_.memcopy: \ + callbacks_.submit; \ + if ((__callback != NULL) && (recursion_ == false)) + +#define DO_HSA_CALLBACK \ + do { \ + recursion_ = true; \ + __callback(__id, &data, __arg); \ + recursion_ = false; \ + } while (0) + +#define ISSUE_HSA_CALLBACK(ID) \ + do { IS_HSA_CALLBACK(ID) { DO_HSA_CALLBACK; } } while(0) + +namespace rocprofiler { +extern decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; +extern decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; +extern decltype(hsa_memory_copy)* hsa_memory_copy_fn; +extern decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn; +extern decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn; +extern decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn; +extern decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; +extern decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; +extern decltype(hsa_executable_destroy)* hsa_executable_destroy_fn; + +class HsaInterceptor { + public: + typedef std::atomic arg_t; + typedef std::mutex mutex_t; + + static void Enable(const bool& enable) { enable_ = enable; } + + static void HsaIntercept(HsaApiTable* table) { + if (enable_) { + // Fetching AMD Loader HSA extension API + HSA_RT(hsa_system_get_major_extension_table( + HSA_EXTENSION_AMD_LOADER, + 1, + sizeof(hsa_ven_amd_loader_1_01_pfn_t), + &LoaderApiTable)); + + // Saving original API functions + hsa_memory_allocate_fn = table->core_->hsa_memory_allocate_fn; + hsa_memory_assign_agent_fn = table->core_->hsa_memory_assign_agent_fn; + hsa_memory_copy_fn = table->core_->hsa_memory_copy_fn; + hsa_amd_memory_pool_allocate_fn = table->amd_ext_->hsa_amd_memory_pool_allocate_fn; + hsa_amd_memory_pool_free_fn = table->amd_ext_->hsa_amd_memory_pool_free_fn; + hsa_amd_agents_allow_access_fn = table->amd_ext_->hsa_amd_agents_allow_access_fn; + hsa_amd_memory_async_copy_fn = table->amd_ext_->hsa_amd_memory_async_copy_fn; + hsa_executable_freeze_fn = 
table->core_->hsa_executable_freeze_fn; + hsa_executable_destroy_fn = table->core_->hsa_executable_destroy_fn; + + // Intercepting HSA API + table->core_->hsa_memory_allocate_fn = MemoryAllocate; + table->core_->hsa_memory_assign_agent_fn = MemoryAssignAgent; + table->core_->hsa_memory_copy_fn = MemoryCopy; + table->amd_ext_->hsa_amd_memory_pool_allocate_fn = MemoryPoolAllocate; + table->amd_ext_->hsa_amd_memory_pool_free_fn = MemoryPoolFree; + table->amd_ext_->hsa_amd_agents_allow_access_fn = AgentsAllowAccess; + table->amd_ext_->hsa_amd_memory_async_copy_fn = MemoryAsyncCopy; + table->core_->hsa_executable_freeze_fn = ExecutableFreeze; + table->core_->hsa_executable_destroy_fn = ExecutableDestroy; + } + } + + static void SetCallbacks(rocprofiler_hsa_callbacks_t callbacks, void* arg) { + std::lock_guard lck(mutex_); + callbacks_ = callbacks; + arg_.store(arg); + } + + private: + static hsa_status_t HSA_API MemoryAllocate(hsa_region_t region, + size_t size, + void** ptr) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_memory_allocate_fn(region, size, ptr)); + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + rocprofiler_hsa_callback_data_t data{}; + data.allocate.ptr = *ptr; + data.allocate.size = size; + + HSA_RT(hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &data.allocate.segment)); + HSA_RT(hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS, &data.allocate.global_flag)); + + DO_HSA_CALLBACK; + } + return status; + } + + static hsa_status_t MemoryAssignAgent( + void *ptr, + hsa_agent_t agent, + hsa_access_permission_t access) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_memory_assign_agent_fn(ptr, agent, access)); + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { + rocprofiler_hsa_callback_data_t data{}; + data.device.ptr = ptr; + + HSA_RT(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &data.device.type)); + + DO_HSA_CALLBACK; + } + return status; + } + + // Spawn device allow access callback + static void 
DeviceCallback( + uint32_t num_agents, + const hsa_agent_t* agents, + const void* ptr) + { + for (const hsa_agent_t* agent_p = agents; agent_p < (agents + num_agents); ++agent_p) { + hsa_agent_t agent = *agent_p; + rocprofiler_hsa_callback_data_t data{}; + data.device.id = util::HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; + data.device.agent = agent; + data.device.ptr = ptr; + + HSA_RT(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &data.device.type)); + + ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE); + } + } + + // Agent allow access callback 'hsa_amd_agents_allow_access' + static hsa_status_t AgentsAllowAccess( + uint32_t num_agents, + const hsa_agent_t* agents, + const uint32_t* flags, + const void* ptr) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_amd_agents_allow_access_fn(num_agents, agents, flags, ptr)); + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { + DeviceCallback(num_agents, agents, ptr); + } + return status; + } + + // Callback function to get available in the system agents + struct agent_callback_data_t { + hsa_amd_memory_pool_t pool; + void* ptr; + }; + static hsa_status_t AgentCallback(hsa_agent_t agent, void* data) { + agent_callback_data_t* callback_data = reinterpret_cast(data); + hsa_amd_agent_memory_pool_info_t attribute = HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS; + hsa_amd_memory_pool_access_t value; + HSA_RT(hsa_amd_agent_memory_pool_get_info(agent, callback_data->pool, attribute, &value)); + if (value == HSA_AMD_MEMORY_POOL_ACCESS_ALLOWED_BY_DEFAULT) { + DeviceCallback(1, &agent, callback_data->ptr); + } + return HSA_STATUS_SUCCESS; + } + + static hsa_status_t MemoryPoolAllocate( + hsa_amd_memory_pool_t pool, + size_t size, + uint32_t flags, + void** ptr) + { + hsa_status_t status = HSA_STATUS_SUCCESS; + HSA_RT(hsa_amd_memory_pool_allocate_fn(pool, size, flags, ptr)); + if (size != 0) { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + rocprofiler_hsa_callback_data_t data{}; + data.allocate.ptr = *ptr; 
+        data.allocate.size = size;
+
+        HSA_RT(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &data.allocate.segment));
+        HSA_RT(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &data.allocate.global_flag));
+
+        DO_HSA_CALLBACK;
+
+        IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) {
+          // Scan the pool assigned devices
+          agent_callback_data_t callback_data{pool, *ptr};
+          hsa_iterate_agents(AgentCallback, &callback_data);
+        }
+      }
+    }
+    return status;
+  }
+  static hsa_status_t MemoryPoolFree(
+    void* ptr)
+  {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+    IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) {
+      rocprofiler_hsa_callback_data_t data{};
+      data.allocate.ptr = ptr;
+      data.allocate.size = 0;
+      DO_HSA_CALLBACK;
+    }
+    HSA_RT(hsa_amd_memory_pool_free_fn(ptr));
+    return status;
+  }
+
+  static hsa_status_t MemoryCopy(
+    void *dst,
+    const void *src,
+    size_t size)
+  {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+    HSA_RT(hsa_memory_copy_fn(dst, src, size));
+    IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_MEMCOPY) {
+      rocprofiler_hsa_callback_data_t data{};
+      data.memcopy.dst = dst;
+      data.memcopy.src = src;
+      data.memcopy.size = size;
+      DO_HSA_CALLBACK;
+    }
+    return status;
+  }
+
+  static hsa_status_t MemoryAsyncCopy(
+    void* dst, hsa_agent_t dst_agent, const void* src,
+    hsa_agent_t src_agent, size_t size,
+    uint32_t num_dep_signals,
+    const hsa_signal_t* dep_signals,
+    hsa_signal_t completion_signal)
+  {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+    HSA_RT(hsa_amd_memory_async_copy_fn(
+      dst, dst_agent, src, src_agent, size,
+      num_dep_signals, dep_signals, completion_signal));
+    IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_MEMCOPY) {
+      rocprofiler_hsa_callback_data_t data{};
+      data.memcopy.dst = dst;
+      data.memcopy.src = src;
+      data.memcopy.size = size;
+      DO_HSA_CALLBACK;
+    }
+    return status;
+  }
+
+  // Per loaded-code-object callback, used with
+  // hsa_ven_amd_loader_executable_iterate_loaded_code_objects().
+  //
+  // 'arg' carries a flag instead of a pointer: 0 is passed by ExecutableFreeze
+  // (code object is loaded), 1 by ExecutableDestroy (code object goes away).
+  // The code object is reported through the ALLOCATE callback with 'is_code'
+  // set; its load size is queried on load and reported as 0 on destroy. On load
+  // the agents that can access the load base are reported through the DEVICE
+  // callback as well.
+  //
+  // Always returns HSA_STATUS_SUCCESS; failing HSA calls go through HSA_RT.
+  static hsa_status_t CodeObjectCallback(
+    hsa_executable_t executable,
+    hsa_loaded_code_object_t loaded_code_object,
+    void* arg)
+  {
+    const int free_flag = reinterpret_cast(arg);
+    rocprofiler_hsa_callback_data_t data{};
+
+    HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info(
+      loaded_code_object,
+      HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE,
+      &data.allocate.ptr));
+
+    if (free_flag == 0) {
+      HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info(
+        loaded_code_object,
+        HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE,
+        &data.allocate.size));
+    } else {
+      data.allocate.size = 0;
+    }
+
+    // Local GPU memory
+    // GLOBAL; FLAGS: COARSE GRAINED
+    data.allocate.segment = HSA_AMD_SEGMENT_GLOBAL;
+    data.allocate.global_flag = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED;
+    data.allocate.is_code = 1;
+
+    ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE);
+
+    if (free_flag == 0) {
+      IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) {
+        hsa_amd_pointer_info_t pointer_info{};
+        uint32_t num_agents = 0;
+        hsa_agent_t* agents = NULL;
+        pointer_info.size = sizeof(hsa_amd_pointer_info_t);
+        HSA_RT(hsa_amd_pointer_info(
+          const_cast(data.allocate.ptr),
+          &pointer_info,
+          malloc,
+          &num_agents,
+          &agents));
+
+        DeviceCallback(num_agents, agents, data.allocate.ptr);
+
+        // The agents array was allocated by the runtime through the 'malloc'
+        // passed above and is owned by the caller - release it
+        free(agents);
+      }
+    }
+
+    return HSA_STATUS_SUCCESS;
+  }
+
+  static hsa_status_t ExecutableFreeze(
+    hsa_executable_t executable,
+    const char *options)
+  {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+
+    HSA_RT(hsa_executable_freeze_fn(executable, options));
+
+    IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) {
+      LoaderApiTable.hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
+        executable,
+        CodeObjectCallback,
+        reinterpret_cast(0));
+    }
+
+    return status;
+  }
+
+  static hsa_status_t ExecutableDestroy(
+    hsa_executable_t executable)
+  {
+    hsa_status_t status = HSA_STATUS_SUCCESS;
+
+    IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) {
+      LoaderApiTable.hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
+        executable,
+        CodeObjectCallback,
+        reinterpret_cast(1));
+    }
+
+
HSA_RT(hsa_executable_destroy_fn(executable)); + + return status; + } + + static bool enable_; + static thread_local bool recursion_; + static hsa_ven_amd_loader_1_01_pfn_t LoaderApiTable; + static rocprofiler_hsa_callbacks_t callbacks_; + static arg_t arg_; + static mutex_t mutex_; +}; + +} // namespace rocprofiler + +#endif // _SRC_CORE_HSA_INTERCEPTOR_H diff --git a/src/core/hsa_proxy_queue.h b/src/core/hsa_proxy_queue.h index dd4999b9..3713bfac 100644 --- a/src/core/hsa_proxy_queue.h +++ b/src/core/hsa_proxy_queue.h @@ -30,6 +30,7 @@ THE SOFTWARE. #include "core/proxy_queue.h" #include "util/exception.h" +#include "util/hsa_rsrc_factory.h" namespace rocprofiler { extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; diff --git a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index 91028f73..0b309d63 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -29,10 +29,9 @@ void InterceptQueue::HsaIntercept(HsaApiTable* table) { } InterceptQueue::mutex_t InterceptQueue::mutex_; -rocprofiler_callback_t InterceptQueue::dispatch_callback_ = NULL; -InterceptQueue::queue_callback_t InterceptQueue::create_callback_ = NULL; -InterceptQueue::queue_callback_t InterceptQueue::destroy_callback_ = NULL; +rocprofiler_queue_callbacks_t InterceptQueue::callbacks_ = {}; void* InterceptQueue::callback_data_ = NULL; +std::atomic InterceptQueue::dispatch_callback_{NULL}; InterceptQueue::obj_map_t* InterceptQueue::obj_map_ = NULL; const char* InterceptQueue::kernel_none_ = ""; Tracker* InterceptQueue::tracker_ = NULL; @@ -40,4 +39,7 @@ bool InterceptQueue::tracker_on_ = false; bool InterceptQueue::in_create_call_ = false; InterceptQueue::queue_id_t InterceptQueue::current_queue_id = 0; +rocprofiler_hsa_callback_fun_t InterceptQueue::submit_callback_fun_ = NULL; +void* InterceptQueue::submit_callback_arg_ = NULL; + } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index f639b3e5..a52d8c1d 100644 --- 
a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -84,8 +84,8 @@ class InterceptQueue { obj->queue_id = current_queue_id; ++current_queue_id; - if (create_callback_ != NULL) { - status = create_callback_(*queue, callback_data_); + if (callbacks_.create != NULL) { + status = callbacks_.create(*queue, callback_data_); } in_create_call_ = false; @@ -112,8 +112,8 @@ class InterceptQueue { std::lock_guard lck(mutex_); hsa_status_t status = HSA_STATUS_SUCCESS; - if (destroy_callback_ != NULL) { - status = destroy_callback_(queue, callback_data_); + if (callbacks_.destroy != NULL) { + status = callbacks_.destroy(queue, callback_data_); } if (status == HSA_STATUS_SUCCESS) { @@ -129,13 +129,47 @@ class InterceptQueue { InterceptQueue* obj = reinterpret_cast(data); Queue* proxy = obj->proxy_; + if (submit_callback_fun_) { + mutex_.lock(); + auto* callback_fun = submit_callback_fun_; + void* callback_arg = submit_callback_arg_; + mutex_.unlock(); + + if (callback_fun) { + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + + const char* kernel_name = NULL; + if (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) { + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + kernel_name = (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) ? 
+ QueryKernelName(kernel_object, kernel_code) : NULL; + } + + // Prepareing submit callback data + rocprofiler_hsa_callback_data_t data{}; + data.submit.packet = (void*)packet; + data.submit.kernel_name = kernel_name; + data.submit.queue = obj->queue_; + data.submit.device_type = obj->agent_info_->dev_type; + data.submit.device_id = obj->agent_info_->dev_index; + + callback_fun(ROCPROFILER_HSA_CB_ID_SUBMIT, &data, callback_arg); + } + } + } + // Travers input packets for (uint64_t j = 0; j < count; ++j) { const packet_t* packet = &packets_arr[j]; bool to_submit = true; // Checking for dispatch packet type - if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && (dispatch_callback_ != NULL)) { + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && + (dispatch_callback_.load(std::memory_order_acquire) != NULL)) { const hsa_kernel_dispatch_packet_t* dispatch_packet = reinterpret_cast(packet); const hsa_signal_t completion_signal = dispatch_packet->completion_signal; @@ -150,17 +184,7 @@ class InterceptQueue { // Prepareing dispatch callback data uint64_t kernel_object = dispatch_packet->kernel_object; const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); - - const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1); - if (kernel_object_flag == 0) { - if (!util::HsaRsrcFactory::IsExecutableTracking()) { - fprintf(stderr, "Error: V3 code object detected - code objects tracking should be enabled\n"); - abort(); - } - } - const char* kernel_name = (util::HsaRsrcFactory::IsExecutableTracking()) ? 
- util::HsaRsrcFactory::GetKernelName(kernel_object) : - GetKernelName(kernel_code->runtime_loader_kernel_symbol); + const char* kernel_name = QueryKernelName(kernel_object, kernel_code); rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, obj->agent_info_->dev_index, @@ -172,12 +196,12 @@ class InterceptQueue { kernel_name, kernel_object, kernel_code, - syscall(__NR_gettid), + (uint32_t)syscall(__NR_gettid), (tracker_entry) ? tracker_entry->record : NULL}; // Calling dispatch callback rocprofiler_group_t group = {}; - hsa_status_t status = dispatch_callback_(&data, callback_data_, &group); + hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); free(const_cast(kernel_name)); // Injecting profiling start/stop packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { @@ -227,16 +251,29 @@ class InterceptQueue { } } - static void SetCallbacks(rocprofiler_callback_t dispatch_callback, - queue_callback_t create_callback, - queue_callback_t destroy_callback, - void* data) - { + static void SetCallbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { std::lock_guard lck(mutex_); + if (callback_data_ != NULL) { + EXC_ABORT(HSA_STATUS_ERROR, "reassigning queue callbacks - not supported"); + } + callbacks_ = callbacks; callback_data_ = data; - dispatch_callback_ = dispatch_callback; - create_callback_ = create_callback; - destroy_callback_ = destroy_callback; + Start(); + } + + static void RemoveCallbacks() { + std::lock_guard lck(mutex_); + callbacks_ = {}; + Stop(); + } + + static inline void Start() { dispatch_callback_.store(callbacks_.dispatch, std::memory_order_release); } + static inline void Stop() { dispatch_callback_.store(NULL, std::memory_order_relaxed); } + + static void SetSubmitCallback(rocprofiler_hsa_callback_fun_t fun, void* arg) { + std::lock_guard lck(mutex_); + submit_callback_fun_ = fun; + submit_callback_arg_ = arg; } static void TrackerOn(bool on) { tracker_on_ = on; } @@ -269,20 +306,28 @@ 
class InterceptQueue {
   static const char* GetKernelName(const uint64_t kernel_symbol) {
     amd_runtime_loader_debug_info_t* dbg_info =
         reinterpret_cast(kernel_symbol);
-    const char* kernel_name = (dbg_info != NULL) ? dbg_info->kernel_name : NULL;
-
-    // Kernel name is mangled name
-    // apply __cxa_demangle() to demangle it
-    const char* funcname = NULL;
-    if (kernel_name != NULL) {
-      size_t funcnamesize = 0;
-      int status;
-      const char* ret = abi::__cxa_demangle(kernel_name, NULL, &funcnamesize, &status);
-      funcname = (ret != 0) ? ret : strdup(kernel_name);
-    }
-    if (funcname == NULL) funcname = strdup(kernel_none_);
+    return (dbg_info != NULL) ? dbg_info->kernel_name : NULL;
+  }
 
-    return funcname;
+  // Demangle C++ symbol name
+  // Returns a heap-allocated string: the demangled name, or a copy of
+  // 'symname' when __cxa_demangle() does not produce a result.
+  // A NULL 'symname' (no kernel name available) yields a copy of
+  // 'kernel_none_' instead of passing NULL to strdup().
+  static const char* cpp_demangle(const char* symname) {
+    if (symname == NULL) return strdup(kernel_none_);
+    size_t size = 0;
+    int status;
+    const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status);
+    return (ret != 0) ? ret : strdup(symname);
+  }
+
+  // Returns a heap-allocated, demangled name of the kernel.
+  // The raw symbol name comes from the code objects tracking when it is
+  // enabled, otherwise from the kernel code debug info. A zero flag word in
+  // the kernel code with the tracking disabled is reported through EXC_ABORT.
+  static const char* QueryKernelName(uint64_t kernel_object, const amd_kernel_code_t* kernel_code) {
+    const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1);
+    if (kernel_object_flag == 0) {
+      if (!util::HsaRsrcFactory::IsExecutableTracking()) {
+        EXC_ABORT(HSA_STATUS_ERROR, "Error: V3 code object detected - code objects tracking should be enabled\n");
+      }
+    }
+    const char* kernel_symname = (util::HsaRsrcFactory::IsExecutableTracking()) ?
+      util::HsaRsrcFactory::GetKernelNameRef(kernel_object) :
+      GetKernelName(kernel_code->runtime_loader_kernel_symbol);
+    return cpp_demangle(kernel_symname);
   }
 
   // method to get an intercept queue object
@@ -324,12 +369,13 @@ class InterceptQueue {
     ProxyQueue::Destroy(proxy_);
   }
 
-  static mutex_t mutex_;
   static const packet_word_t header_type_mask = (1ul << HSA_PACKET_HEADER_WIDTH_TYPE) - 1;
-  static rocprofiler_callback_t dispatch_callback_;
-  static queue_callback_t create_callback_;
-  static queue_callback_t destroy_callback_;
+
+  static mutex_t mutex_;
+  static rocprofiler_queue_callbacks_t callbacks_;
   static void* callback_data_;
+  static std::atomic dispatch_callback_;
+
   static obj_map_t* obj_map_;
   static const char* kernel_none_;
   static Tracker* tracker_;
@@ -337,6 +383,9 @@ class InterceptQueue {
   static bool in_create_call_;
   static queue_id_t current_queue_id;
 
+  static rocprofiler_hsa_callback_fun_t submit_callback_fun_;
+  static void* submit_callback_arg_;
+
   hsa_queue_t* const queue_;
   ProxyQueue* const proxy_;
   const util::AgentInfo* agent_info_;
diff --git a/src/core/metrics.h b/src/core/metrics.h
index f9ae1fbd..a221168a 100644
--- a/src/core/metrics.h
+++ b/src/core/metrics.h
@@ -277,7 +277,7 @@ class MetricsDict {
       std::cout << name << "=" << expr_obj->String() << "\n" << std::endl;
 #endif
       counters_vec_t counters_vec;
-      for (const auto& var : expr_obj->GetVars()) {
+      for (const std::string& var : expr_obj->GetVars()) {
         auto it = cache_.find(var);
         if (it == cache_.end()) {
           EXC_RAISING(HSA_STATUS_ERROR, "Bad metric '" << name << "', var '" << var << "' is not found");
diff --git a/src/core/proxy_queue.h b/src/core/proxy_queue.h
index 42e6c63b..e719fed4 100644
--- a/src/core/proxy_queue.h
+++ b/src/core/proxy_queue.h
@@ -24,7 +24,7 @@ THE SOFTWARE.
#define _SRC_CORE_PROXY_QUEUE_H #include -#include +#include #include #include #include diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 3f1362a7..618edf23 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -23,13 +23,15 @@ THE SOFTWARE. #include "inc/rocprofiler.h" #include -#include #include + +#include #include #include "core/context.h" #include "core/context_pool.h" #include "core/hsa_queue.h" +#include "core/hsa_interceptor.h" #include "core/intercept_queue.h" #include "core/proxy_queue.h" #include "core/simple_proxy_queue.h" @@ -53,6 +55,15 @@ THE SOFTWARE. } \ return status; +#define ONLOAD_TRACE(str) \ + if (getenv("ROCP_ONLOAD_TRACE")) do { \ + std::cout << "PID(" << GetPid() << "): PROF_LIB::" << __FUNCTION__ << " " << str << std::endl << std::flush; \ + } while(0); +#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") +#define ONLOAD_TRACE_END() ONLOAD_TRACE("end") + +static inline uint32_t GetPid() { return syscall(__NR_getpid); } + /////////////////////////////////////////////////////////////////////////////////////////////////// // Internal library methods // @@ -84,8 +95,16 @@ decltype(hsa_queue_load_read_index_scacquire)* hsa_queue_load_read_index_scacqui decltype(hsa_amd_queue_intercept_create)* hsa_amd_queue_intercept_create_fn; decltype(hsa_amd_queue_intercept_register)* hsa_amd_queue_intercept_register_fn; +decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; +decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; +decltype(hsa_memory_copy)* hsa_memory_copy_fn; +decltype(hsa_amd_memory_pool_allocate)* hsa_amd_memory_pool_allocate_fn; +decltype(hsa_amd_memory_pool_free)* hsa_amd_memory_pool_free_fn; +decltype(hsa_amd_agents_allow_access)* hsa_amd_agents_allow_access_fn; decltype(hsa_amd_memory_async_copy)* hsa_amd_memory_async_copy_fn; decltype(hsa_amd_memory_async_copy_rect)* hsa_amd_memory_async_copy_rect_fn; +decltype(hsa_executable_freeze)* hsa_executable_freeze_fn; 
+decltype(hsa_executable_destroy)* hsa_executable_destroy_fn; ::HsaApiTable* kHsaApiTable; @@ -146,10 +165,14 @@ enum { DISPATCH_INTERCEPT_MODE = 0x1, CODE_OBJ_TRACKING_MODE = 0x2, MEMCOPY_INTERCEPT_MODE = 0x4, + HSA_INTERCEPT_MODE = 0x8, }; uint32_t LoadTool() { uint32_t intercept_mode = 0; const char* tool_lib = getenv("ROCP_TOOL_LIB"); + std::ostringstream oss; + if (tool_lib) oss << "load tool library(" << tool_lib << ")"; + ONLOAD_TRACE(oss.str()); if (tool_lib) { intercept_mode = DISPATCH_INTERCEPT_MODE; @@ -180,6 +203,7 @@ uint32_t LoadTool() { settings.trace_local = TraceProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 1 : 0; + settings.code_obj_tracking = 1; if (handler) handler(); else if (handler_prop) handler_prop(&settings); @@ -191,13 +215,16 @@ uint32_t LoadTool() { if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE; if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; + if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE; } + ONLOAD_TRACE("end intercept_mode(" << intercept_mode << ")"); return intercept_mode; } // Unload profiling tool librray void UnloadTool() { + ONLOAD_TRACE("tool handle(" << tool_handle << ")"); if (tool_handle) { tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); if (handler == NULL) { @@ -208,16 +235,21 @@ void UnloadTool() { handler(); dlclose(tool_handle); } + ONLOAD_TRACE_END(); } CONSTRUCTOR_API void constructor() { + ONLOAD_TRACE_BEG(); util::Logger::Create(); + ONLOAD_TRACE_END(); } DESTRUCTOR_API void destructor() { + ONLOAD_TRACE_BEG(); rocprofiler::MetricsDict::Destroy(); util::HsaRsrcFactory::Destroy(); util::Logger::Destroy(); + ONLOAD_TRACE_END(); } const MetricsDict* GetMetrics(const hsa_agent_t& agent) { @@ -403,6 +435,7 @@ extern "C" { // HSA-runtime tool on-load 
method PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t failed_tool_count, const char* const* failed_tool_names) { + ONLOAD_TRACE_BEG(); rocprofiler::SaveHsaApi(table); rocprofiler::ProxyQueue::InitFactory(); bool intercept_mode = false; @@ -449,6 +482,13 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa table->amd_ext_->hsa_amd_memory_async_copy_fn = rocprofiler::hsa_amd_memory_async_copy_interceptor; table->amd_ext_->hsa_amd_memory_async_copy_rect_fn = rocprofiler::hsa_amd_memory_async_copy_rect_interceptor; } + if (intercept_mode_mask & rocprofiler::HSA_INTERCEPT_MODE) { + if (intercept_mode_mask & rocprofiler::MEMCOPY_INTERCEPT_MODE) { + EXC_ABORT(HSA_STATUS_ERROR, "HSA_INTERCEPT and MEMCOPY_INTERCEPT conflict"); + } + rocprofiler::HsaInterceptor::Enable(true); + rocprofiler::HsaInterceptor::HsaIntercept(table); + } // HSA intercepting if (intercept_mode) { @@ -458,14 +498,16 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa rocprofiler::StandaloneIntercept(); } + ONLOAD_TRACE_END(); return true; } // HSA-runtime tool on-unload method PUBLIC_API void OnUnload() { - rocprofiler::Tracker::Destroy(); + ONLOAD_TRACE_BEG(); rocprofiler::UnloadTool(); rocprofiler::RestoreHsaApi(); + ONLOAD_TRACE_END(); } // Returns library vesrion @@ -527,6 +569,14 @@ PUBLIC_API hsa_status_t rocprofiler_reset(rocprofiler_t* handle, uint32_t group_ API_METHOD_SUFFIX } +// Return context agent +PUBLIC_API hsa_status_t rocprofiler_get_agent(rocprofiler_t* handle, hsa_agent_t* agent) { + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + *agent = context->GetAgent(); + API_METHOD_SUFFIX +} + // Get profiling group count PUBLIC_API hsa_status_t rocprofiler_group_count(const rocprofiler_t* handle, uint32_t* group_count) { @@ -617,14 +667,26 @@ PUBLIC_API hsa_status_t rocprofiler_get_metrics(const rocprofiler_t* handle) { // Set/remove queue callbacks PUBLIC_API 
hsa_status_t rocprofiler_set_queue_callbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { API_METHOD_PREFIX - rocprofiler::InterceptQueue::SetCallbacks(callbacks.dispatch, callbacks.create, callbacks.destroy, data); + rocprofiler::InterceptQueue::SetCallbacks(callbacks, data); API_METHOD_SUFFIX } // Remove queue callbacks PUBLIC_API hsa_status_t rocprofiler_remove_queue_callbacks() { API_METHOD_PREFIX - rocprofiler::InterceptQueue::SetCallbacks(NULL, NULL, NULL, NULL); + rocprofiler::InterceptQueue::RemoveCallbacks(); + API_METHOD_SUFFIX +} + +// Start/stop queue callbacks +PUBLIC_API hsa_status_t rocprofiler_start_queue_callbacks() { + API_METHOD_PREFIX + rocprofiler::InterceptQueue::Start(); + API_METHOD_SUFFIX +} +PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() { + API_METHOD_PREFIX + rocprofiler::InterceptQueue::Stop(); API_METHOD_SUFFIX } @@ -785,7 +847,7 @@ PUBLIC_API hsa_status_t rocprofiler_iterate_info( uint32_t block_counters; profile.events = &(counters_vec[0]->event); status = rocprofiler::util::HsaRsrcFactory::Instance().AqlProfileApi()->hsa_ven_amd_aqlprofile_get_info( - &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); + &profile, HSA_VEN_AMD_AQLPROFILE_INFO_BLOCK_COUNTERS, &block_counters); if (status != HSA_STATUS_SUCCESS) continue; info.metric.instances = query.instance_count; @@ -840,16 +902,47 @@ PUBLIC_API hsa_status_t rocprofiler_queue_create_profiled( void* data, uint32_t private_segment_size, uint32_t group_segment_size, hsa_queue_t** queue) { - return rocprofiler::InterceptQueue::QueueCreateTracked(agent, size, type, callback, data, private_segment_size, group_segment_size, queue); + API_METHOD_PREFIX + status = rocprofiler::InterceptQueue::QueueCreateTracked( + agent, size, type, callback, data, private_segment_size, group_segment_size, queue); + API_METHOD_SUFFIX } // Return time value for a given time ID and profiling timestamp -hsa_status_t rocprofiler_get_time( +PUBLIC_API hsa_status_t 
rocprofiler_get_time( rocprofiler_time_id_t time_id, uint64_t timestamp, - uint64_t* value_ns) + uint64_t* value_ns, + uint64_t* error_ns) { - return rocprofiler::util::HsaRsrcFactory::Instance().GetTime(time_id, timestamp, value_ns); + API_METHOD_PREFIX + if (error_ns != NULL) { + *error_ns = 0; + status = rocprofiler::util::HsaRsrcFactory::Instance().GetTimeErr(time_id, error_ns); + } + if ((status == HSA_STATUS_SUCCESS) && (value_ns != NULL)) { + *value_ns = 0; + status = rocprofiler::util::HsaRsrcFactory::Instance().GetTimeVal(time_id, timestamp, value_ns); + } + API_METHOD_SUFFIX } } // extern "C" + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// HSA API callbacks routines +// +bool rocprofiler::HsaInterceptor::enable_ = false; +thread_local bool rocprofiler::HsaInterceptor::recursion_ = false;; +rocprofiler_hsa_callbacks_t rocprofiler::HsaInterceptor::callbacks_{}; +rocprofiler::HsaInterceptor::arg_t rocprofiler::HsaInterceptor::arg_{}; +hsa_ven_amd_loader_1_01_pfn_t rocprofiler::HsaInterceptor::LoaderApiTable{}; +rocprofiler::HsaInterceptor::mutex_t rocprofiler::HsaInterceptor::mutex_; + +// Set HSA callbacks. 
If a callback is NULL then it is disabled +extern "C" PUBLIC_API hsa_status_t rocprofiler_set_hsa_callbacks(const rocprofiler_hsa_callbacks_t callbacks, void* arg) { + API_METHOD_PREFIX + rocprofiler::HsaInterceptor::SetCallbacks(callbacks, arg); + rocprofiler::InterceptQueue::SetSubmitCallback(callbacks.submit, arg); + API_METHOD_SUFFIX +} diff --git a/src/core/tracker.h b/src/core/tracker.h index e366c761..823dc17d 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -138,7 +138,7 @@ class Tracker { // Debug trace if (trace_on_) { auto outstanding = outstanding_.fetch_add(1); - fprintf(stdout, "Tracker::Add: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); + fprintf(stdout, "Tracker::Enable: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); fflush(stdout); } } @@ -163,11 +163,21 @@ class Tracker { {} ~Tracker() { + if (trace_on_) { + fprintf(stdout, "Tracker::DESTR: sig list %d, outst %lu\n", (int)(sig_list_.size()), outstanding_.load()); + fflush(stdout); + } + auto it = sig_list_.begin(); auto end = sig_list_.end(); while (it != end) { auto cur = it++; - hsa_rsrc_->SignalWait((*cur)->signal); +// The wait should be optiona as there possible some inter kernel dependencies and it possible to wait for +// the kernels will never be lunched as the application was finished by some reason. 
+#if 0 + // FIXME: currently the signal value for tracking signals are taken from original application signal + hsa_rsrc_->SignalWait((*cur)->signal, 1); +#endif Erase(cur); } } @@ -182,7 +192,7 @@ class Tracker { // Debug trace if (trace_on_) { auto outstanding = outstanding_.fetch_sub(1); - fprintf(stdout, "Tracker::Handler: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); + fprintf(stdout, "Tracker::Complete: entry %p, record %p, outst %lu\n", entry, entry->record, outstanding); fflush(stdout); } diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 9ce362d4..78833284 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -44,9 +44,6 @@ POSSIBILITY OF SUCH DAMAGE. #include #include -#include "util/exception.h" -#include "util/logger.h" - namespace rocprofiler { namespace util { @@ -152,11 +149,15 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize // Time correlation const uint32_t corr_iters = 1000; - CorrelateTime(HsaTimer::TIME_ID_CLOCK_REALTIME, corr_iters); - CorrelateTime(HsaTimer::TIME_ID_CLOCK_MONOTONIC, corr_iters); + for (unsigned time_id = 0; time_id < HsaTimer::TIME_ID_NUMBER; time_id += 1) { + CorrelateTime((HsaTimer::time_id_t)time_id, corr_iters); + } // System timeout timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? 
timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); + + // To dump code objects + to_dump_code_obj_ = getenv("ROCP_DUMP_CODEOBJ"); } // Destructor of the class @@ -197,6 +198,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_destroy = table->core_->hsa_executable_destroy_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; @@ -237,6 +239,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_destroy = hsa_executable_destroy; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; @@ -523,22 +526,25 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s } // Wait signal -void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { +hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + const hsa_signal_value_t exp_value = signal_value - 1; + hsa_signal_value_t ret_value = signal_value; while (1) { - const hsa_signal_value_t signal_value = - hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); - if (signal_value == 0) 
{ - break; - } else { - if (signal_value == 1) WARN_LOGGING("signal waiting..."); - else EXC_RAISING(HSA_STATUS_ERROR, "hsa_signal_wait_scacquire (" << signal_value << ")"); + ret_value = + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED); + if (ret_value == exp_value) break; + if (ret_value != signal_value) { + std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value + << "), ret_value(" << ret_value << ")" << std::endl << std::flush; + abort(); } } + return ret_value; } // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { - SignalWait(signal); + SignalWait(signal, signal_value); hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } @@ -551,7 +557,7 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src CHECK_STATUS("hsa_signal_create()", status); status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); CHECK_STATUS("hsa_amd_memory_async_copy()", status); - SignalWait(s); + SignalWait(s, 1); status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } @@ -695,20 +701,21 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { +const char* HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { std::lock_guard lck(mutex_); const auto it = symbols_map_->find(addr); if (it == symbols_map_->end()) { fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); abort(); } - return strdup(it->second); + return it->second; } void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { std::lock_guard lck(mutex_); executable_tracking_on_ = true; table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + 
table->core_->hsa_executable_destroy_fn = hsa_executable_destroy_interceptor; } hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { @@ -726,10 +733,14 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_ex status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); CHECK_STATUS("Error in getting kernel name", status); name[len] = 0; - auto ret = symbols_map_->insert({addr, name}); - if (ret.second == false) { - delete[] ret.first->second; - ret.first->second = name; + if (data == NULL) { + auto ret = symbols_map_->insert({addr, name}); + if (ret.second == false) { + delete[] ret.first->second; + ret.first->second = name; + } + } else { + symbols_map_->erase(addr); } } return HSA_STATUS_SUCCESS; @@ -740,7 +751,16 @@ hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); CHECK_STATUS("Error in iterating executable symbols", status); - return hsa_api_.hsa_executable_freeze(executable, options);; + return hsa_api_.hsa_executable_freeze(executable, options); +} + +hsa_status_t HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) { + std::lock_guard lck(mutex_); + if (symbols_map_ != NULL) { + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)1); + CHECK_STATUS("Error in iterating executable symbols", status); + } + return hsa_api_.hsa_executable_destroy(executable); } std::atomic HsaRsrcFactory::instance_{}; @@ -749,6 +769,7 @@ HsaRsrcFactory::timestamp_t HsaRsrcFactory::timeout_ns_ = HsaTimer::TIMESTAMP_MA hsa_pfn_t HsaRsrcFactory::hsa_api_{}; bool HsaRsrcFactory::executable_tracking_on_ = false; HsaRsrcFactory::symbols_map_t* HsaRsrcFactory::symbols_map_ = NULL; +void* 
HsaRsrcFactory::to_dump_code_obj_ = NULL; } // namespace util } // namespace rocprofiler diff --git a/src/util/hsa_rsrc_factory.h b/src/util/hsa_rsrc_factory.h index 0362bc2c..a8e392aa 100644 --- a/src/util/hsa_rsrc_factory.h +++ b/src/util/hsa_rsrc_factory.h @@ -25,6 +25,8 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef SRC_UTIL_HSA_RSRC_FACTORY_H_ #define SRC_UTIL_HSA_RSRC_FACTORY_H_ +#define AMD_INTERNAL_BUILD + #include #include #include @@ -95,6 +97,7 @@ struct hsa_pfn_t { decltype(hsa_executable_create_alt)* hsa_executable_create_alt; decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_destroy)* hsa_executable_destroy; decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; @@ -164,10 +167,11 @@ struct AgentInfo { // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; - // SGPR/VGPR block sizes + // SGPR/VGPR/LDS block sizes uint32_t sgpr_block_dflt; uint32_t sgpr_block_size; uint32_t vgpr_block_size; + static const uint32_t lds_block_size = 128 * 4; }; // HSA timer class @@ -180,7 +184,10 @@ class HsaTimer { enum time_id_t { TIME_ID_CLOCK_REALTIME = 0, - TIME_ID_CLOCK_MONOTONIC = 1, + TIME_ID_CLOCK_REALTIME_COARSE = 1, + TIME_ID_CLOCK_MONOTONIC = 2, + TIME_ID_CLOCK_MONOTONIC_COARSE = 3, + TIME_ID_CLOCK_MONOTONIC_RAW = 4, TIME_ID_NUMBER }; @@ -200,7 +207,7 @@ class HsaTimer { } // Method for timespec/ns conversion - timestamp_t timespec_to_ns(const timespec& time) const { + static timestamp_t timespec_to_ns(const timespec& time) { return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; } @@ -224,13 +231,22 @@ class HsaTimer { void correlated_pair_ns(time_id_t time_id, uint32_t iters, timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) { clockid_t 
clock_id = 0; - switch (clock_id) { + switch (time_id) { case TIME_ID_CLOCK_REALTIME: clock_id = CLOCK_REALTIME; break; + case TIME_ID_CLOCK_REALTIME_COARSE: + clock_id = CLOCK_REALTIME_COARSE; + break; case TIME_ID_CLOCK_MONOTONIC: clock_id = CLOCK_MONOTONIC; break; + case TIME_ID_CLOCK_MONOTONIC_COARSE: + clock_id = CLOCK_MONOTONIC_COARSE; + break; + case TIME_ID_CLOCK_MONOTONIC_RAW: + clock_id = CLOCK_MONOTONIC_RAW; + break; default: CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); } @@ -361,7 +377,7 @@ class HsaRsrcFactory { uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); // Wait signal - void SignalWait(const hsa_signal_t& signal) const; + hsa_signal_value_t SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; // Wait signal with signal value restore void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; @@ -393,7 +409,7 @@ class HsaRsrcFactory { // Enable executables loading tracking static bool IsExecutableTracking() { return executable_tracking_on_; } static void EnableExecutableTracking(HsaApiTable* table); - static const char* GetKernelName(uint64_t addr); + static const char* GetKernelNameRef(uint64_t addr); // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); @@ -428,9 +444,14 @@ class HsaRsrcFactory { time_error_[time_id] = error_v; } - hsa_status_t GetTime(uint32_t time_id, uint64_t value, uint64_t* time) { + hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value) { if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; - *time = value + time_shift_[time_id]; + *time_value = time_stamp + time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + + hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err) { + *err = time_error_[time_id]; return HSA_STATUS_SUCCESS; } @@ -478,7 +499,9 @@ class HsaRsrcFactory { typedef std::map symbols_map_t; static symbols_map_t* symbols_map_; 
static bool executable_tracking_on_; + static void* to_dump_code_obj_; static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable); static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); // HSA runtime API table diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1ae8a554..4b3aec02 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -40,15 +40,22 @@ target_include_directories ( "c_test" PRIVATE ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH file( GLOB UTIL_SRC "${TEST_DIR}/util/*.cpp" ) ## Standalone test sources -set ( STEXE_NAME "standalone_test" ) -set ( STTST_SRC +set ( ST_EXE_NAME "standalone_test" ) +set ( ST_TST_SRC ${TEST_DIR}/app/standalone_test.cpp ${TEST_DIR}/ctrl/test_hsa.cpp ) +## Standalone intercept test sources +set ( STIN_EXE_NAME "stand_intercept_test" ) +set ( STIN_TST_SRC + ${TEST_DIR}/app/stand_intercept_test.cpp + ${TEST_DIR}/ctrl/test_hsa.cpp +) + ## Intercept test sources -set ( INEXE_NAME "intercept_test" ) -set ( INTST_SRC +set ( IN_EXE_NAME "intercept_test" ) +set ( IN_TST_SRC ${TEST_DIR}/app/intercept_test.cpp ${TEST_DIR}/ctrl/test_hsa.cpp ) @@ -61,26 +68,34 @@ set ( CTRL_SRC ## Dummy kernel set ( DUMMY_NAME dummy_kernel ) -execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME} ${PROJECT_BINARY_DIR}" ) - ## Test kernel set ( TEST_NAME simple_convolution ) set ( KERN_SRC ${TEST_DIR}/${TEST_NAME}/${TEST_NAME}.cpp ) -execute_process ( COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${TEST_NAME}/${TEST_NAME} ${PROJECT_BINARY_DIR}" ) + +## Building test kernels +add_custom_target( mytest + COMMAND sh -xc "${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${DUMMY_NAME}/${DUMMY_NAME} ${PROJECT_BINARY_DIR} '${ROCM_ROOT_DIR}' '${GPU_TARGETS}'" + COMMAND sh -xc 
"${TEST_DIR}/../bin/build_kernel.sh ${TEST_DIR}/${TEST_NAME}/${TEST_NAME} ${PROJECT_BINARY_DIR} '${ROCM_ROOT_DIR}' '${GPU_TARGETS}'" +) ## Building standalone test executable -add_executable ( ${STEXE_NAME} ${STTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) -target_include_directories ( ${STEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries ( ${STEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +add_executable ( ${ST_EXE_NAME} ${ST_TST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${ST_EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${ST_EXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) + +## Building standalone intercept test executable +add_executable ( ${STIN_EXE_NAME} ${STIN_TST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${STIN_EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${STIN_EXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) ## Building intercept test executable -add_library ( ${INEXE_NAME} SHARED ${INTST_SRC} ${UTIL_SRC} ${KERN_SRC} ) -target_include_directories ( ${INEXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) -target_link_libraries ( ${INEXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) +add_library ( ${IN_EXE_NAME} SHARED ${IN_TST_SRC} ${UTIL_SRC} ${KERN_SRC} ) +target_include_directories ( ${IN_EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_link_libraries ( ${IN_EXE_NAME} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) ## Building ctrl test executable add_executable ( ${EXE_NAME} ${CTRL_SRC} ${UTIL_SRC} ${KERN_SRC} ) -target_include_directories ( ${EXE_NAME} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) +target_include_directories ( ${EXE_NAME} 
PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HSA_RUNTIME_LIB_PATH}/../include ) target_link_libraries ( ${EXE_NAME} ${HSA_RUNTIME_LIB} ${HSA_KMT_LIB} c stdc++ dl pthread rt ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/run.sh ${PROJECT_BINARY_DIR}" ) execute_process ( COMMAND sh -xc "cp ${TEST_DIR}/tool/*.xml ${PROJECT_BINARY_DIR}" ) @@ -93,5 +108,11 @@ add_library ( ${TEST_LIB} SHARED ${TEST_LIB_SRC} ) target_include_directories ( ${TEST_LIB} PRIVATE ${TEST_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ) target_link_libraries ( ${TEST_LIB} ${ROCPROFILER_TARGET} ${HSA_RUNTIME_LIB} c stdc++ dl pthread rt ) +## Build memory test bench +add_custom_target( mbench + COMMAND sh -xc "cp -r ${TEST_DIR}/memory_validation ${PROJECT_BINARY_DIR}/test/." + COMMAND make -C "${PROJECT_BINARY_DIR}/test/memory_validation" +) + ## Copy OCL test -execute_process ( COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test" ) +execute_process ( COMMAND sh -xc "cp -r ${TEST_DIR}/ocl ${PROJECT_BINARY_DIR}/test/." ) diff --git a/test/app/intercept_test.cpp b/test/app/intercept_test.cpp index c2905d1e..e62bf6ce 100644 --- a/test/app/intercept_test.cpp +++ b/test/app/intercept_test.cpp @@ -91,7 +91,7 @@ void dump_context_entry(context_entry_t* entry, rocprofiler_feature_t* features, const rocprofiler_dispatch_record_t* record = entry->data.record; fflush(stdout); - fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\") tid(%ld) queue-id(%u) gpu-id(%u) ", + fprintf(stdout, "kernel symbol(0x%lx) name(\"%s\") tid(%u) queue-id(%u) gpu-id(%u) ", entry->data.kernel_object, kernel_name.c_str(), entry->data.thread_id, diff --git a/test/app/stand_intercept_test.cpp b/test/app/stand_intercept_test.cpp new file mode 100644 index 00000000..97642557 --- /dev/null +++ b/test/app/stand_intercept_test.cpp @@ -0,0 +1,190 @@ +/****************************************************************************** +Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include "ctrl/run_kernel.h" +#include "ctrl/test_aql.h" +#include "ctrl/test_hsa.h" +#include "inc/rocprofiler.h" +#include "dummy_kernel/dummy_kernel.h" +#include "simple_convolution/simple_convolution.h" +#include "util/test_assert.h" + +// Dispatch callbacks and context handlers synchronization +pthread_mutex_t mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP; + +// Error handler +void fatal(const std::string msg) { + fflush(stdout); + fprintf(stderr, "%s\n\n", msg.c_str()); + fflush(stderr); + abort(); +} + +// Check returned HSA API status +void check_status(hsa_status_t status) { + if (status != HSA_STATUS_SUCCESS) { + const char* error_string = NULL; + rocprofiler_error_string(&error_string); + fprintf(stderr, "ERROR: %s\n", error_string); + abort(); + } +} + +// Context stored entry type +struct context_entry_t { + bool valid; + hsa_agent_t agent; + rocprofiler_group_t group; + rocprofiler_callback_data_t data; +}; + +// Dump stored context entry +void dump_context_entry(context_entry_t* entry) { + volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); + while (valid->load() == false) sched_yield(); + + const std::string kernel_name = entry->data.kernel_name; + const rocprofiler_dispatch_record_t* record = entry->data.record; + + fflush(stdout); + fprintf(stdout, "kernel-object(0x%lx) name(\"%s\")", entry->data.kernel_object, kernel_name.c_str()); + if (record) fprintf(stdout, ", gpu-id(%u), time(%lu,%lu,%lu,%lu)", + HsaRsrcFactory::Instance().GetAgentInfo(entry->agent)->dev_index, + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(stdout, "\n"); + fflush(stdout); + + rocprofiler_group_t& group = entry->group; + if (group.context == NULL) { + fprintf(stderr, "tool error: context is NULL\n"); + abort(); + } + + rocprofiler_close(group.context); +} + +// Profiling completion 
handler +// Dump and delete the context entry +// Return true if the context was dumped successfully +bool context_handler(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(entry); + delete entry; + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + +// Kernel disoatch callback +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* /*user_data*/, + rocprofiler_group_t* group) { + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + + // Profiling context + rocprofiler_t* context = NULL; + + // Context entry + context_entry_t* entry = new context_entry_t(); + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = (void*)entry; + + // Open profiling context + status = rocprofiler_open(callback_data->agent, NULL, 0, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->agent = callback_data->agent; + entry->group = *group; + entry->data = *callback_data; + entry->data.kernel_name = strdup(callback_data->kernel_name); + reinterpret_cast*>(&entry->valid)->store(true); + + return HSA_STATUS_SUCCESS; +} + +int main() { + bool ret_val = true; + const char* kiter_s = getenv("ROCP_KITER"); + const char* diter_s = getenv("ROCP_DITER"); + const unsigned kiter = (kiter_s != NULL) ? atol(kiter_s) : 1; + const unsigned diter = (diter_s != NULL) ? 
atol(diter_s) : 1; + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{}; + callbacks_ptrs.dispatch = dispatch_callback; + rocprofiler_set_queue_callbacks(callbacks_ptrs, NULL); + + // Instantiate HSA resources + HsaRsrcFactory::Create(); + + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(0, &agent_info) == false) abort(); + + // Creating the queue + hsa_queue_t* queue = NULL; + if (HsaRsrcFactory::Instance().CreateQueue(agent_info, 128, &queue) == false) abort(); + + // Test initialization + TestHsa::HsaInstantiate(); + + for (unsigned ind = 0; ind < kiter; ++ind) { + printf("Iteration %u:\n", ind); + if ((ind & 1) == 0) rocprofiler_start_queue_callbacks(); + else rocprofiler_stop_queue_callbacks(); + ret_val = RunKernel(0, NULL, agent_info, queue, diter); + if (ret_val) ret_val = RunKernel(0, NULL, agent_info, queue, diter); + } + + TestHsa::HsaShutdown(); + + return (ret_val) ? 0 : 1; +} diff --git a/test/app/standalone_test.cpp b/test/app/standalone_test.cpp index b173c4d3..34bc05ea 100644 --- a/test/app/standalone_test.cpp +++ b/test/app/standalone_test.cpp @@ -31,8 +31,46 @@ THE SOFTWARE. 
#include "inc/rocprofiler.h" #include "dummy_kernel/dummy_kernel.h" #include "simple_convolution/simple_convolution.h" +#include "util/hsa_rsrc_factory.h" #include "util/test_assert.h" +// print time +void print_sys_time(clockid_t clock_id, rocprofiler_time_id_t time_id) { + HsaTimer::timestamp_t value_ns = 0; + HsaTimer::timestamp_t error_ns = 0; + HsaTimer::timestamp_t timestamp = 0; + + timespec tm_val; + clock_gettime(clock_id, &tm_val); + HsaTimer::timestamp_t tm_val_ns = HsaTimer::timespec_to_ns(tm_val); + + timestamp = HsaRsrcFactory::Instance().TimestampNs(); + hsa_status_t status = rocprofiler_get_time(time_id, timestamp, &value_ns, &error_ns); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + HsaTimer::timestamp_t timestamp1 = timestamp; + HsaTimer::timestamp_t value_ns1 = value_ns; + + printf("time-id(%d) ts_ns(%lu) orig_ns(%lu) time_ns(%lu) err_ns(%lu)\n", (int)time_id, timestamp, tm_val_ns, value_ns, error_ns); + + sleep(1); + + timestamp = HsaRsrcFactory::Instance().TimestampNs(); + status = rocprofiler_get_time(time_id, timestamp, &value_ns, NULL); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + status = rocprofiler_get_time(time_id, timestamp, NULL, &error_ns); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + status = rocprofiler_get_time(time_id, timestamp, NULL, NULL); + TEST_STATUS(status == HSA_STATUS_SUCCESS); + + HsaTimer::timestamp_t timestamp2 = timestamp; + HsaTimer::timestamp_t value_ns2 = value_ns; + + printf("time-id(%d) ts_ns(%lu) orig_ns(%lu) time_ns(%lu) err_ns(%lu)\n", (int)time_id, timestamp, tm_val_ns, value_ns, error_ns); + printf("ts-diff(%lu) tm-diff(%lu)\n", timestamp2 - timestamp1, value_ns2 - value_ns1); +} + +// print profiler features void print_features(rocprofiler_feature_t* feature, uint32_t feature_count) { for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) { std::cout << (p - feature) << ": " << p->name; @@ -82,7 +120,7 @@ int main() { rocprofiler_properties_t properties; // Profiling feature 
objects - const unsigned feature_count = 9; + const unsigned feature_count = 6; rocprofiler_feature_t feature[feature_count]; // PMC events memset(feature, 0, sizeof(feature)); @@ -98,12 +136,12 @@ int main() { feature[4].name = "SQ_INSTS_VALU"; feature[5].kind = ROCPROFILER_FEATURE_KIND_METRIC; feature[5].name = "VALUInsts"; - feature[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; - feature[6].name = "TCC_HIT_sum"; - feature[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; - feature[7].name = "TCC_MISS_sum"; - feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; - feature[8].name = "WRITE_SIZE"; +// feature[6].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[6].name = "TCC_HIT_sum"; +// feature[7].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[7].name = "TCC_MISS_sum"; +// feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; +// feature[8].name = "WRITE_SIZE"; // feature[8].kind = ROCPROFILER_FEATURE_KIND_METRIC; // feature[8].name = "TCC_EA_WRREQ_sum"; // feature[9].kind = ROCPROFILER_FEATURE_KIND_METRIC; @@ -176,5 +214,9 @@ int main() { status = rocprofiler_close(context); TEST_STATUS(status == HSA_STATUS_SUCCESS); + print_sys_time(CLOCK_REALTIME, ROCPROFILER_TIME_ID_CLOCK_REALTIME); + sleep(1); + print_sys_time(CLOCK_MONOTONIC, ROCPROFILER_TIME_ID_CLOCK_MONOTONIC); + return (ret_val) ? 0 : 1; } diff --git a/test/run.sh b/test/run.sh index 4612fa1c..4c985d3e 100755 --- a/test/run.sh +++ b/test/run.sh @@ -32,9 +32,12 @@ fi test_status=0 test_runnum=0 test_number=0 +failed_tests="Failed tests:" + xeval_test() { test_number=$test_number } + eval_test() { label=$1 cmdline=$2 @@ -44,6 +47,7 @@ eval_test() { eval "$cmdline" if [ $? 
!= 0 ] ; then echo "$label: FAILED" + failed_tests="$failed_tests\n $test_number: \"$label\"" test_status=$(($test_status + 1)) else echo "$label: PASSED" @@ -52,18 +56,22 @@ eval_test() { test_number=$((test_number + 1)) } -# enable tools load failure reporting -export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # paths to ROC profiler and oher libraries export LD_LIBRARY_PATH=$PWD + +# enable tools load failure reporting +export HSA_TOOLS_REPORT_LOAD_FAILURE=1 # enable error messages logging to '/tmp/rocprofiler_log.txt' export ROCPROFILER_LOG=1 -# ROC profiler metrics config file +# enable error messages logging to '/tmp/aql_profile_log.txt' +export HSA_VEN_AMD_AQLPROFILE_LOG=1 +# test trace +export ROC_TEST_TRACE=1 + +# Disabple profiler own proxy queue unset ROCP_PROXY_QUEUE # ROC profiler metrics config file export ROCP_METRICS=metrics.xml -# test trace -export ROC_TEST_TRACE=1 ## C test eval_test "C test" ./test/c_test @@ -72,17 +80,26 @@ eval_test "C test" ./test/c_test unset HSA_TOOLS_LIB unset ROCP_TOOL_LIB eval_test "Standalone sampling usage model test" ./test/standalone_test +# Standalone intercepting test +# ROC profiler library loaded by HSA runtime +export HSA_TOOLS_LIB=librocprofiler64.so.1 +# enable intercepting mode in rocprofiler +export ROCP_HSA_INTERCEPT=2 +# test macro for kernel iterations number +export ROCP_KITER=100 +# test macro for per-kernel dispatching number +export ROCP_DITER=10 +eval_test "Standalone intercepting test" ./test/stand_intercept_test +unset ROCP_HSA_INTERCEPT ## Intercepting usage model test -# ROC profiler library loaded by HSA runtime -export HSA_TOOLS_LIB=librocprofiler64.so # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so export ROCP_KITER=50 export ROCP_DITER=50 export ROCP_AGENTS=1 export ROCP_THRS=3 -eval_test "Intercepting usage model test" "../bin/run_tool.sh ./test/ctrl" +eval_test "Intercepting usage model test" ./test/ctrl ## Libtool test # tool library loaded by ROC profiler @@ 
-111,6 +128,16 @@ export ROCP_THRS=10 export ROCP_INPUT=input1.xml eval_test "'rocprof' libtool test n-threads" ./test/ctrl +## SPM test +# export ROCP_KITER=3 +# export ROCP_DITER=3 +# export ROCP_AGENTS=1 +# export ROCP_THRS=1 +# export ROCP_INPUT=spm_input.xml +# export ROCP_SPM=1 +# eval_test "libtool test, SPM trace test" ./test/ctrl +# unset ROCP_SPM + ## Libtool test, counter sets # Memcopies tracking export ROCP_MCOPY_TRACKING=1 @@ -121,13 +148,30 @@ export ROCP_INPUT=input2.xml eval_test "libtool test, counter sets" ./test/ctrl ## OpenCL test -export ROCP_OBJ_TRACKING=1 +#export ROCP_OBJ_TRACKING=1 +#export ROCP_INPUT=input1.xml +#eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution + +# Memcopies tracking +unset ROCP_MCOPY_TRACKING +# enable HSA intercepting +export ROCP_HSA_INTERC=1 + +export ROCP_KITER=10 +export ROCP_DITER=10 export ROCP_INPUT=input1.xml -eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution +eval_test "libtool test, counter sets" ./test/ctrl + +## OpenCL test +#export ROCP_OBJ_TRACKING=1 +#eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution #valgrind --leak-check=full $tbin #valgrind --tool=massif $tbin #ms_print massif.out. echo "$test_number tests total / $test_runnum tests run / $test_status tests failed" +if [ $test_status != 0 ] ; then + echo $failed_tests +fi exit $test_status diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 81626a2a..e216b7fd 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -56,6 +56,13 @@ THE SOFTWARE. 
#define DESTRUCTOR_API __attribute__((destructor)) #define KERNEL_NAME_LEN_MAX 128 +#define ONLOAD_TRACE(str) \ + if (getenv("ROCP_ONLOAD_TRACE")) do { \ + std::cout << "PID(" << GetPid() << "): PROF_TOOL_LIB::" << __FUNCTION__ << " " << str << std::endl << std::flush; \ + } while(0); +#define ONLOAD_TRACE_BEG() ONLOAD_TRACE("begin") +#define ONLOAD_TRACE_END() ONLOAD_TRACE("end") + // Disoatch callback data type struct callbacks_data_t { rocprofiler_feature_t* features; @@ -139,8 +146,11 @@ bool is_trace_local = true; // SPM trace enabled bool is_spm_trace = false; +static inline uint32_t GetPid() { return syscall(__NR_getpid); } static inline uint32_t GetTid() { return syscall(__NR_gettid); } +uint32_t my_pid = GetPid(); + // Error handler void fatal(const std::string msg) { fflush(stdout); @@ -475,15 +485,16 @@ bool dump_context_entry(context_entry_t* entry) { const std::string nik_name = (to_truncate_names == 0) ? entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), tid(%lu), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", index, agent_info->dev_index, entry->data.queue_id, entry->data.queue_index, + my_pid, entry->data.thread_id, entry->kernel_properties.grid_size, entry->kernel_properties.workgroup_size, - (entry->kernel_properties.lds_size * (128 * 4)), + (entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) & ~(AgentInfo::lds_block_size - 1), entry->kernel_properties.scratch_size, (entry->kernel_properties.vgpr_count + 1) * agent_info->vgpr_block_size, (entry->kernel_properties.sgpr_count + 
agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, @@ -659,7 +670,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; if (workgroup_size > UINT32_MAX) abort(); kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; - kernel_properties_ptr->lds_size = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc2, AMD_COMPUTE_PGM_RSRC_TWO_GRANULATED_LDS_SIZE); // packet->group_segment_size; + kernel_properties_ptr->lds_size = packet->group_segment_size; kernel_properties_ptr->scratch_size = packet->private_segment_size; kernel_properties_ptr->vgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); @@ -826,9 +837,66 @@ static inline void check_env_var(const char* var_name, uint64_t& val) { if (str != NULL ) val = atoll(str); } +// HSA intercepting routines + +// HSA unified callback function +hsa_status_t hsa_unified_callback( + rocprofiler_hsa_cb_id_t id, + const rocprofiler_hsa_callback_data_t* data, + void* arg) +{ + printf("hsa_unified_callback(%d, %p, %p):\n", (int)id, data, arg); + if (data == NULL) abort(); + + switch (id) { + case ROCPROFILER_HSA_CB_ID_ALLOCATE: + printf(" alloc ptr = %p\n", data->allocate.ptr); + printf(" alloc size = %zu\n", data->allocate.size); + printf(" segment type = 0x%x\n", data->allocate.segment); + printf(" global flag = 0x%x\n", data->allocate.global_flag); + printf(" is_code = %x\n", data->allocate.is_code); + break; + case ROCPROFILER_HSA_CB_ID_DEVICE: + printf(" device type = 0x%x\n", data->device.type); + printf(" device id = %u\n", data->device.id); + printf(" device agent = 0x%lx\n", data->device.agent.handle); + printf(" assigned ptr = %p\n", data->device.ptr); + break; + case 
ROCPROFILER_HSA_CB_ID_MEMCOPY: + printf(" memcopy dst = %p\n", data->memcopy.dst); + printf(" memcopy src = %p\n", data->memcopy.src); + printf(" memcopy size = %zu\n", data->memcopy.size); + break; + case ROCPROFILER_HSA_CB_ID_SUBMIT: + printf(" packet %p\n", data->submit.packet); + if (data->submit.kernel_name != NULL) { + printf(" submit kernel \"%s\"\n", data->submit.kernel_name); + printf(" device type = %u\n", data->submit.device_type); + printf(" device id = %u\n", data->submit.device_id); + } + break; + default: + printf("Unknown callback id(%u)\n", id); + abort(); + } + + fflush(stdout); + return HSA_STATUS_SUCCESS; +} + +// HSA callbacks structure +rocprofiler_hsa_callbacks_t hsa_callbacks { + hsa_unified_callback, + hsa_unified_callback, + hsa_unified_callback, + hsa_unified_callback +}; + // Tool constructor extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { + ONLOAD_TRACE_BEG(); + if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); @@ -855,7 +923,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) } if (rcfile != NULL) { // Getting defaults - printf("ROCProfiler: rc-file '%s'\n", rcpath.c_str()); + printf("ROCProfiler pid(%u): rc-file '%s'\n", GetPid(), rcpath.c_str()); auto defaults_list = rcfile->GetNodes("top.defaults"); for (auto* entry : defaults_list) { const auto& opts = entry->opts; @@ -908,6 +976,9 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) check_env_var("ROCP_OBJ_TRACKING", settings->code_obj_tracking); // Set memcopies tracking check_env_var("ROCP_MCOPY_TRACKING", settings->memcopy_tracking); + // Set HSA intercepting + check_env_var("ROCP_HSA_INTERC", settings->hsa_intercepting); + if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); is_trace_local = settings->trace_local; @@ -936,7 +1007,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) abort(); } 
std::ostringstream oss; - oss << result_prefix << "/results.txt"; + oss << result_prefix << "/" << GetPid() << "_results.txt"; result_file_handle = fopen(oss.str().c_str(), "w"); if (result_file_handle == NULL) { std::ostringstream errmsg; @@ -1046,10 +1117,14 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) if (err) { errno = err; perror("pthread_attr_init"); abort(); } err = pthread_create(&thread, &attr, monitor_thr_fun, NULL); } + + ONLOAD_TRACE_END(); } // Tool destructor -extern "C" PUBLIC_API void OnUnloadTool() { +void rocprofiler_unload(bool is_destr) { + ONLOAD_TRACE("begin loaded(" << is_loaded << ") destr(" << is_destr << ")"); + if (pthread_mutex_lock(&mutex) != 0) { perror("pthread_mutex_lock"); abort(); @@ -1061,6 +1136,8 @@ extern "C" PUBLIC_API void OnUnloadTool() { abort(); } + if (is_destr) CTX_OUTSTANDING_WAIT = 0; + // Unregister dispatch callback rocprofiler_remove_queue_callbacks(); @@ -1080,6 +1157,7 @@ extern "C" PUBLIC_API void OnUnloadTool() { } fflush(stdout); +#if 0 // Cleanup if (callbacks_data != NULL) { delete[] callbacks_data->features; @@ -1096,8 +1174,19 @@ extern "C" PUBLIC_API void OnUnloadTool() { range_vec = NULL; delete context_array; context_array = NULL; +#endif + + ONLOAD_TRACE_END(); +} + +extern "C" PUBLIC_API void OnUnloadTool() { + ONLOAD_TRACE("begin loaded(" << is_loaded << ")"); + if (is_loaded == true) rocprofiler_unload(false); + ONLOAD_TRACE_END(); } extern "C" DESTRUCTOR_API void destructor() { - if (is_loaded == true) OnUnloadTool(); + ONLOAD_TRACE("begin loaded(" << is_loaded << ")"); + if (is_loaded == true) rocprofiler_unload(true); + ONLOAD_TRACE_END(); } diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index d23a445d..10f9fbc1 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -144,6 +144,12 @@ HsaRsrcFactory::HsaRsrcFactory(bool initialize_hsa) : initialize_hsa_(initialize CHECK_STATUS("HSA timer allocation failed", 
(timer_ == NULL) ? HSA_STATUS_ERROR : HSA_STATUS_SUCCESS); + // Time correlation + const uint32_t corr_iters = 1000; + for (unsigned time_id = 0; time_id < HsaTimer::TIME_ID_NUMBER; time_id += 1) { + CorrelateTime((HsaTimer::time_id_t)time_id, corr_iters); + } + // System timeout timeout_ = (timeout_ns_ == HsaTimer::TIMESTAMP_MAX) ? timeout_ns_ : timer_->ns_to_sysclock(timeout_ns_); } @@ -512,21 +518,25 @@ uint8_t* HsaRsrcFactory::AllocateCmdMemory(const AgentInfo* agent_info, size_t s } // Wait signal -void HsaRsrcFactory::SignalWait(const hsa_signal_t& signal) const { +hsa_signal_value_t HsaRsrcFactory::SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { + const hsa_signal_value_t exp_value = signal_value - 1; + hsa_signal_value_t ret_value = signal_value; while (1) { - const hsa_signal_value_t signal_value = - hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1, timeout_, HSA_WAIT_STATE_BLOCKED); - if (signal_value == 0) { - break; - } else { - CHECK_STATUS("hsa_signal_wait_scacquire()", HSA_STATUS_ERROR); + ret_value = + hsa_api_.hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, signal_value, timeout_, HSA_WAIT_STATE_BLOCKED); + if (ret_value == exp_value) break; + if (ret_value != signal_value) { + std::cerr << "Error: HsaRsrcFactory::SignalWait: signal_value(" << signal_value + << "), ret_value(" << ret_value << ")" << std::endl << std::flush; + abort(); } } + return ret_value; } // Wait signal with signal value restore void HsaRsrcFactory::SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const { - SignalWait(signal); + SignalWait(signal, signal_value); hsa_api_.hsa_signal_store_relaxed(const_cast(signal), signal_value); } @@ -539,7 +549,7 @@ bool HsaRsrcFactory::Memcpy(const hsa_agent_t& agent, void* dst, const void* src CHECK_STATUS("hsa_signal_create()", status); status = hsa_api_.hsa_amd_memory_async_copy(dst, cpu_agents_[0], src, agent, size, 0, NULL, s); 
CHECK_STATUS("hsa_amd_memory_async_copy()", status); - SignalWait(s); + SignalWait(s, 1); status = hsa_api_.hsa_signal_destroy(s); CHECK_STATUS("hsa_signal_destroy()", status); } @@ -683,14 +693,14 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -const char* HsaRsrcFactory::GetKernelName(uint64_t addr) { +const char* HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { std::lock_guard lck(mutex_); const auto it = symbols_map_->find(addr); if (it == symbols_map_->end()) { fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); abort(); } - return strdup(it->second); + return it->second; } void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index 151dab8e..e857813b 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -25,6 +25,8 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef TEST_UTIL_HSA_RSRC_FACTORY_H_ #define TEST_UTIL_HSA_RSRC_FACTORY_H_ +#define AMD_INTERNAL_BUILD + #include #include #include @@ -35,6 +37,7 @@ POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include @@ -161,10 +164,11 @@ struct AgentInfo { // Number of Shader Arrays Per Shader Engines in Gpu uint32_t shader_arrays_per_se; - // SGPR/VGPR block sizes + // SGPR/VGPR/LDS block sizes uint32_t sgpr_block_dflt; uint32_t sgpr_block_size; uint32_t vgpr_block_size; + static const uint32_t lds_block_size = 128 * 4; }; // HSA timer class @@ -175,6 +179,15 @@ class HsaTimer { static const timestamp_t TIMESTAMP_MAX = UINT64_MAX; typedef long double freq_t; + enum time_id_t { + TIME_ID_CLOCK_REALTIME = 0, + TIME_ID_CLOCK_REALTIME_COARSE = 1, + TIME_ID_CLOCK_MONOTONIC = 2, + TIME_ID_CLOCK_MONOTONIC_COARSE = 3, + TIME_ID_CLOCK_MONOTONIC_RAW = 4, + TIME_ID_NUMBER + }; + HsaTimer(const hsa_pfn_t* hsa_api) : hsa_api_(hsa_api) { timestamp_t sysclock_hz = 0; hsa_status_t status = hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &sysclock_hz); @@ -190,6 +203,11 @@ class HsaTimer { return timestamp_t((freq_t)time / sysclock_factor_); } + // Method for timespec/ns conversion + static timestamp_t timespec_to_ns(const timespec& time) { + return ((timestamp_t)time.tv_sec * 1000000000) + time.tv_nsec; + } + // Return timestamp in 'ns' timestamp_t timestamp_ns() const { timestamp_t sysclock; @@ -198,6 +216,63 @@ class HsaTimer { return sysclock_to_ns(sysclock); } + // Return time in 'ns' + timestamp_t clocktime_ns(clockid_t clock_id) const { + timespec time; + clock_gettime(clock_id, &time); + return timespec_to_ns(time); + } + + // Return pair of correlated values of profiling timestamp and time with + // correlation error for a given time ID and number of iterations + void correlated_pair_ns(time_id_t time_id, uint32_t iters, + timestamp_t* timestamp_v, timestamp_t* time_v, timestamp_t* error_v) { + clockid_t clock_id = 0; + switch (time_id) { + case TIME_ID_CLOCK_REALTIME: + clock_id = CLOCK_REALTIME; + break; + case TIME_ID_CLOCK_REALTIME_COARSE: + clock_id = CLOCK_REALTIME_COARSE; + break; + case 
TIME_ID_CLOCK_MONOTONIC: + clock_id = CLOCK_MONOTONIC; + break; + case TIME_ID_CLOCK_MONOTONIC_COARSE: + clock_id = CLOCK_MONOTONIC_COARSE; + break; + case TIME_ID_CLOCK_MONOTONIC_RAW: + clock_id = CLOCK_MONOTONIC_RAW; + break; + default: + CHECK_STATUS("internal error: invalid time_id", HSA_STATUS_ERROR); + } + + std::vector ts_vec(iters); + std::vector tm_vec(iters); + const uint32_t steps = iters - 1; + + for (uint32_t i = 0; i < iters; ++i) { + hsa_api_->hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &ts_vec[i]); + clock_gettime(clock_id, &tm_vec[i]); + } + + const timestamp_t ts_base = sysclock_to_ns(ts_vec.front()); + const timestamp_t tm_base = timespec_to_ns(tm_vec.front()); + const timestamp_t error = (ts_vec.back() - ts_vec.front()) / (2 * steps); + + timestamp_t ts_accum = 0; + timestamp_t tm_accum = 0; + for (uint32_t i = 0; i < iters; ++i) { + ts_accum += (ts_vec[i] - ts_base); + tm_accum += (timespec_to_ns(tm_vec[i]) - tm_base); + } + + *timestamp_v = (ts_accum / iters) + ts_base + error; + *time_v = (tm_accum / iters) + tm_base; + *error_v = error; + } + private: // Timestamp frequency factor freq_t sysclock_factor_; @@ -299,7 +374,7 @@ class HsaRsrcFactory { uint8_t* AllocateCmdMemory(const AgentInfo* agent_info, size_t size); // Wait signal - void SignalWait(const hsa_signal_t& signal) const; + hsa_signal_value_t SignalWait(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; // Wait signal with signal value restore void SignalWaitRestore(const hsa_signal_t& signal, const hsa_signal_value_t& signal_value) const; @@ -331,7 +406,7 @@ class HsaRsrcFactory { // Enable executables loading tracking static bool IsExecutableTracking() { return executable_tracking_on_; } static void EnableExecutableTracking(HsaApiTable* table); - static const char* GetKernelName(uint64_t addr); + static const char* GetKernelNameRef(uint64_t addr); // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); @@ -357,6 +432,26 @@ class 
HsaRsrcFactory { if (instance_ != NULL) Instance().timeout_ = Instance().timer_->ns_to_sysclock(time); } + void CorrelateTime(HsaTimer::time_id_t time_id, uint32_t iters) { + timestamp_t timestamp_v = 0; + timestamp_t time_v = 0; + timestamp_t error_v = 0; + timer_->correlated_pair_ns(time_id, iters, ×tamp_v, &time_v, &error_v); + time_shift_[time_id] = time_v - timestamp_v; + time_error_[time_id] = error_v; + } + + hsa_status_t GetTimeVal(uint32_t time_id, uint64_t time_stamp, uint64_t* time_value) { + if (time_id >= HsaTimer::TIME_ID_NUMBER) return HSA_STATUS_ERROR; + *time_value = time_stamp + time_shift_[time_id]; + return HSA_STATUS_SUCCESS; + } + + hsa_status_t GetTimeErr(uint32_t time_id, uint64_t* err) { + *err = time_error_[time_id]; + return HSA_STATUS_SUCCESS; + } + private: // System agents iterating callback static hsa_status_t GetHsaAgentsCallback(hsa_agent_t agent, void* data); @@ -421,6 +516,10 @@ class HsaRsrcFactory { // HSA timer HsaTimer* timer_; + // Time shift array to support time conversion + timestamp_t time_shift_[HsaTimer::TIME_ID_NUMBER]; + timestamp_t time_error_[HsaTimer::TIME_ID_NUMBER]; + // CPU/kern-arg memory pools hsa_amd_memory_pool_t *cpu_pool_; hsa_amd_memory_pool_t *kern_arg_pool_; From 08d86aaa3a2ac9a4717a050a74d0f49c2cdfc048 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 17 Aug 2020 02:04:08 -0500 Subject: [PATCH 124/168] 3.7 update --- CMakeLists.txt | 1 + bin/build_kernel.sh | 25 +- bin/mem_manager.py | 216 ++++++++++++++++ bin/rpl_run.sh | 2 +- bin/sqlitedb.py | 22 +- bin/tblextr.py | 298 +++++++++++++-------- bin/txt2xml.sh | 9 +- inc/rocprofiler.h | 14 +- src/core/activity.cpp | 99 ------- src/core/context.h | 165 +++++------- src/core/hsa_interceptor.h | 63 ++++- src/core/intercept_queue.cpp | 3 + src/core/intercept_queue.h | 243 ++++++++++++++++-- src/core/profile.h | 101 ++++---- src/core/rocprofiler.cpp | 39 +-- src/core/tracker.h | 43 ++++ src/util/hsa_rsrc_factory.cpp | 5 +- test/app/standalone_test.cpp | 13 - 
test/run.sh | 69 +++-- test/tool/pmc_input.xml | 4 + test/tool/pmc_input1.xml | 14 + test/tool/tool.cpp | 456 ++++++++++++++++++--------------- test/util/hsa_rsrc_factory.cpp | 79 +++--- test/util/hsa_rsrc_factory.h | 94 ++++++- 24 files changed, 1390 insertions(+), 687 deletions(-) create mode 100755 bin/mem_manager.py create mode 100644 test/tool/pmc_input.xml create mode 100644 test/tool/pmc_input1.xml diff --git a/CMakeLists.txt b/CMakeLists.txt index 8aac5175..e6765e47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2params.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/dform.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/mem_manager.py ${CMAKE_CURRENT_SOURCE_DIR}/bin/sqlitedb.py DESTINATION ${DEST_NAME}/bin PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE ) diff --git a/bin/build_kernel.sh b/bin/build_kernel.sh index 8ed0f168..4b2f87dd 100755 --- a/bin/build_kernel.sh +++ b/bin/build_kernel.sh @@ -40,8 +40,17 @@ else LIB_DIR=$LLVM_DIR/lib fi -BC_DIR=$LIB_DIR/bitcode -if [ ! 
-d "$BC_DIR" ] ; then BC_DIR=$LIB_DIR; fi +# Determine whether using new or old device-libs layout +if [ -e $LIB_DIR/bitcode/opencl.amdgcn.bc ]; then + BC_DIR=$LIB_DIR/bitcode +elif [ -e $LIB_DIR/opencl.amdgcn.bc ]; then + BC_DIR=$LIB_DIR +elif [ -e $ROCM_DIR/amdgcn/bitcode/opencl.bc ]; then + BC_DIR=$ROCM_DIR/amdgcn/bitcode +else + echo "Error: Cannot find amdgcn bitcode directory" + exit 1 +fi CLANG_ROOT=$LLVM_DIR/lib/clang CLANG_DIR=`ls -d $CLANG_ROOT/* | head -n 1` @@ -52,10 +61,14 @@ fi BIN_DIR=$LLVM_DIR/bin INC_DIR=$CLANG_DIR/include -BITCODE_OPTS="\ - -Xclang -mlink-bitcode-file -Xclang $BC_DIR/opencl.amdgcn.bc \ - -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ockl.amdgcn.bc \ - -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ocml.amdgcn.bc" +if [ -e $BC_DIR/opencl.amdgcn.bc ]; then + BITCODE_OPTS="-nogpulib \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/opencl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ockl.amdgcn.bc \ + -Xclang -mlink-bitcode-file -Xclang $BC_DIR/ocml.amdgcn.bc" +else + BITCODE_OPTS="--hip-device-lib-path=$BC_DIR" +fi for GFXIP in $TGT_LIST ; do OBJ_PREF=$GFXIP diff --git a/bin/mem_manager.py b/bin/mem_manager.py new file mode 100755 index 00000000..8b616cc6 --- /dev/null +++ b/bin/mem_manager.py @@ -0,0 +1,216 @@ +################################################################################ +# Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. 
+################################################################################ + +import sys, os, re +from sqlitedb import SQLiteDB + +pinned = ['hipMallocHost', 'hipHostMalloc', 'hipHostAlloc'] +ondevice = ['hipMalloc', 'hipMallocPitch', 'hipMallocArray', 'hipMalloc3DArray'] + +mm_table_descr = [ + ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'Direction', 'SrcType', 'DstType', 'Size', 'BW', 'Async'], + {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'Direction':'TEXT', 'SrcType':'TEXT', 'DstType':'TEXT', 'Size':'INTEGER', 'BW':'TEXT', 'Async':'TEXT'} +] + +def fatal(msg): + sys.stderr.write(sys.argv[0] + ": " + msg + "\n"); + sys.exit(1) + +DELIM = ',' + +# Mem copy manager class +class MemManager: + + def __init__(self, db): + self.db = db + self.allocations = {} + self.memcopies = {} + self.filename = '' + self.fd = 0 + + def __del__(self): + if self.fd != 0: self.fd.close() + + # register allo and memcpy API calls + def register_api(self, rec_vals): + res = '' + malloc_ptrn = re.compile(r'hip.*Malloc') + mcopy_ptrn = re.compile(r'hipMemcpy') + record_name = rec_vals[4] + record_args = rec_vals[5] + if malloc_ptrn.match(record_name): + self.add_allocation(record_name, record_args) + elif mcopy_ptrn.match(record_name): + res = self.add_memcpy(rec_vals) + + return res + + # add allocation to map + def add_allocation(self, event, args): + choice = 0 + if event == "hipMallocPitch": + malloc_args_ptrn = re.compile(r'\(ptr\((.*)\) width\((.*)\) height\((.*)\)\)') + choice = 1 + elif event == "hipMallocArray": + malloc_args_ptrn = re.compile(r'\(array\((.*)\) width\((.*)\) height\((.*)\)\)') + choice = 1 + elif event == "hipMalloc3DArray": + malloc_args_ptrn = re.compile(r'\(array\((.*)\) width\((.*)\) height\((.*)\) depth\((.*)\)\)') + choice = 2 + else: + #(ptr(0x7f3407000000) size(800000000) flags(0)) + malloc_args_ptrn = re.compile(r'\(ptr\((.*)\) size\((.*)\) .*\)') + choice = 3 + m = malloc_args_ptrn.match(args) + if m: 
+ ptr = int(m.group(1), 16) + if choice == 3: + size = int(m.group(2)) + elif choice == 1: + size = int(m.group(2)) * int(m.group(3)) + else: + size = int(m.group(2)) * int(m.group(3)) * int(m.group(4)) + self.allocations[ptr] = (size, event) + + #get type of ptr + def get_ptr_type(self, ptr): + addr = int(ptr, 16) + addr_type = 'unknown' + found = 0 + for base, (size, event) in self.allocations.items(): + if addr >= base and addr < base + size: + found = 1 + break + if not found: + addr_type = 'pageable' + elif event in pinned: + addr_type = 'pinned' + elif event in ondevice: + addr_type = 'device' + else: + fatal('internal error: ptr(' + ptr + ') cannot be identified') + return addr_type + + # add memcpy to map + def add_memcpy(self, recvals): + recordid = recvals[6] #same as corrid + event = recvals[4] + start_time = recvals[0] # sync time stamp + end_time = recvals[1] # sync time stamp + args = recvals[5] + procid = recvals[2] # used to query async entries + pid = recvals[2] + tid = recvals[3] + + select_expr = '"Index" = ' + str(recordid) + ' AND "proc-id" = ' + str(procid) + + # hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) + hipMemcpy_ptrn = re.compile(r'\(dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') + # hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, + # size_t height, hipMemcpyKind kind); + hipMemcpy_ptrn2 = re.compile(r'\(dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)') + # hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, + # size_t count, hipMemcpyKind kind); + hipMemcpy_ptrn3 = re.compile(r'\(dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)') + # memcopy with kind argument + hipMemcpy_ptrn_kind = re.compile(r'.* kind\((\d+)\)\s*.*') + # aysnc memcopy + async_event_ptrn = re.compile(r'Async') + + m_basic = hipMemcpy_ptrn.match(args) + m_2d = hipMemcpy_ptrn2.match(args) + m_array = hipMemcpy_ptrn3.match(args) + + is_async = 1 if 
async_event_ptrn.search(event) else 0 + if is_async: + async_copy_recvals = self.db.table_get_record('COPY', select_expr) #List of async copy record fields + async_copy_start_time = async_copy_recvals[0] + async_copy_end_time = async_copy_recvals[1] + tid = async_copy_recvals[4] + + copy_line = '' + size = 0 + dstptr_type = 'unknown' + srcptr_type = 'unknown' + direction = 'unknown' + bandwidth = 0 + duration = 0 + + switcher = { + '0': "HtoH", + '1': "HtoD", + '2': "DtoH", + '3': "DtoD", + '4': "auto", + } + + if m_basic or m_2d or m_array: + if m_basic: + dstptr = m_basic.group(1) + dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_basic.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = int(m_basic.group(3)) + if m_array: + dstptr = m_array.group(1) + dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_array.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = m_array.group(3) + if m_2d: + dstptr = m_2d.group(1) + dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_2d.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = m_2d.group(3)*m_2d.group(4) + + duration = (int(end_time) - int(start_time)) if not is_async else (int(async_copy_end_time) - int(async_copy_start_time)) + bandwidth = float(size) * 1000 / duration + + m = hipMemcpy_ptrn_kind.match(args) + if m: + direction = switcher.get(m.group(1), "unknown") + + copy_line = str(start_time) + DELIM + str(end_time) + DELIM + pid + DELIM + tid + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + "BW=" + str(round(bandwidth, 2)) + DELIM + 'Async=' + str(is_async) + + self.memcopies[recordid] = copy_line + return copy_line; + + def dump_data(self): + # To create “MM†table in DB on the finish + table_name = "MM" + file_name = os.environ['PWD'] + '/results.memcopy_info.csv' + print("File '" + file_name + "' is generating") + table_handle = self.db.add_table(table_name, 
mm_table_descr) + + fld_ptrn = re.compile(r'(.*)=(.*)') + for (key, record) in self.memcopies.items(): + rec_vals_array = [] + for rec in record.split(DELIM): + fld_ptrnm = fld_ptrn.match(rec) + if fld_ptrnm: + rec_vals_array.append(fld_ptrnm.group(2)) + else: + rec_vals_array.append(rec) + self.db.insert_entry(table_handle, rec_vals_array) + # To dump the MM table as CSV + self.db.dump_csv(table_name, file_name) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index 0c3d83d4..e98561b4 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -202,7 +202,7 @@ usage() { # checking for availability of rocminfo utility `which rocminfo >/dev/null 2>&1` -if [ $? != 0 ]; then fatal "'rocminfo' utility is not found: please add ROCM bin path to PATH env var."; fi +if [ $? != 0 ]; then error "'rocminfo' utility is not found: please add ROCM bin path to PATH env var."; fi # profiling run method OUTPUT_LIST="" diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index eb584503..62553a81 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -76,6 +76,11 @@ def change_rec_tid(self, table_name, rec_id, tid): self.connection.execute('UPDATE ' + table_name + ' SET tid = ? 
WHERE "Index" = ?', (tid, rec_id)) def change_rec_fld(self, table_name, fld_expr, rec_pat): self.connection.execute('UPDATE ' + table_name + ' SET ' + fld_expr + ' WHERE ' + rec_pat) + def table_get_record(self, table_name, rec_pat): + cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE ' + rec_pat) + raws = cursor.fetchall() + if len(raws) != 1: raise Exception('Record (' + rec_pat + ') is not unique, table "' + table_name + '"') + return list(raws[0]) # populate DB table entry def insert_entry(self, table, val_list): @@ -109,8 +114,7 @@ def _get_raws_indexed(self, table_name): def _get_raw_by_id(self, table_name, rec_id): cursor = self.connection.execute('SELECT * FROM ' + table_name + ' WHERE "Index"=?', (rec_id,)) raws = cursor.fetchall() - if len(raws) != 1: - raise Exception('Index is not unique, table "' + table_name + '"') + if len(raws) != 1: raise Exception('Index is not unique, table "' + table_name + '"') return list(raws[0]) def table_get_raws(self, table_name): @@ -147,20 +151,20 @@ def label_json(self, pid, label, file_name): fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)) self.section_index += 1 - def flow_json(self, base_id, from_pid, from_tid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): + def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: dep_id = base_id - for ind in range(len(from_tid)): - if (len(corr_id_list) != 0): corr_id = corr_id_list[ind] - else: corr_id = ind + for ind in range(len(from_us_list)): + corr_id = corr_id_list[ind] if (len(corr_id_list) != 0) else ind if corr_id in to_us_dict: - from_ts = from_us_list[ind] - start_us + (from_ts, from_tid, to_tid) = from_us_list[ind] + from_ts -= start_us to_ts = 
to_us_dict[corr_id] - start_us if from_ts > to_ts: from_ts = to_ts - fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%s,"tid":%s,"name":"dep"}\n' % (from_ts, dep_id, str(from_pid), from_tid[ind])) - fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%s,"tid":0,"name":"dep"}\n' % (to_ts, dep_id, str(to_pid))) + fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, from_tid)) + fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, to_tid)) dep_id += 1 def metadata_json(self, jsonfile, sysinfo_file): diff --git a/bin/tblextr.py b/bin/tblextr.py index 0fe46336..60d99db3 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -22,8 +22,11 @@ import os, sys, re, subprocess from sqlitedb import SQLiteDB +from mem_manager import MemManager import dform +mcopy_data_enabled = 1 if 'ROCP_MCOPY_DATA' in os.environ else 0 + EXT_PID = 0 COPY_PID = 1 HIP_PID = 2 @@ -42,6 +45,28 @@ dep_dict = {} kern_dep_list = [] +# stream ID map +stream_counter = 0 +stream_id_map = {} +def get_stream_index(stream_id): + global stream_counter + stream_ind = 0 + if stream_id.lower() != 'nil': + if not stream_id in stream_id_map: + stream_counter += 1 + stream_ind = stream_counter + stream_id_map[stream_id] = stream_ind + else: + stream_ind = stream_id_map[stream_id] + return stream_ind + +# patching activity records +def activity_record_patching(db, ops_table_name, kernel_found, kernel_name, stream_found, stream_ind, select_expr): + if kernel_found != 0: + db.change_rec_fld(ops_table_name, 'Name = "' + kernel_name + '"', select_expr) + if stream_found != 0: + db.change_rec_fld(ops_table_name, 'tid = ' + str(stream_ind), select_expr) + # global vars table_descr = [ ['Index', 'KernelName'], @@ -112,6 +137,8 @@ def parse_res(infile): } gpu_id = 0 + queue_id = 0 + disp_pid = 0 disp_tid = 0 kernel_properties = m.group(2) @@ -125,7 +152,9 @@ def 
parse_res(infile): if var == 'gpu-id': gpu_id = int(val) if (gpu_id > max_gpu_id): max_gpu_id = gpu_id - if var == 'tid': disp_tid = val + if var == 'queue-id': queue_id = int(val) + if var == 'pid': disp_pid = int(val) + if var == 'tid': disp_tid = int(val) else: fatal('wrong kernel property "' + prop + '" in "'+ kernel_properties + '"') m = ts_pattern.search(record) if m: @@ -134,20 +163,22 @@ def parse_res(infile): var_table[dispatch_number]['EndNs'] = m.group(3) var_table[dispatch_number]['CompleteNs'] = m.group(4) - gpu_pid = GPU_BASE_PID + int(gpu_id) - if not gpu_pid in dep_dict: dep_dict[gpu_pid] = {} - dep_str = dep_dict[gpu_pid] - if not 'tid' in dep_str: dep_str['tid'] = [] - if not 'from' in dep_str: dep_str['from'] = [] - if not 'to' in dep_str: dep_str['to'] = {} - to_id = len(dep_str['tid']) - from_us = int(m.group(1)) / 1000 + ## filling dependenciws + from_ns = m.group(1) + from_us = int(from_ns) / 1000 to_us = int(m.group(2)) / 1000 + + kern_dep_list.append((from_ns, disp_pid, disp_tid)) + + gpu_pid = GPU_BASE_PID + int(gpu_id) + if not disp_pid in dep_dict: dep_dict[disp_pid] = {} + dep_proc = dep_dict[disp_pid] + if not gpu_pid in dep_proc: dep_proc[gpu_pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] } + dep_str = dep_proc[gpu_pid] + to_id = len(dep_str['from']) + dep_str['from'].append((from_us, disp_tid, queue_id)) dep_str['to'][to_id] = to_us - dep_str['from'].append(from_us) - dep_str['tid'].append(disp_tid) - dep_str['pid'] = HSA_PID - kern_dep_list.append((disp_tid, m.group(1))) + ## inp.close() ############################################################# @@ -259,19 +290,25 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): return 1 ############################################################# - -def extract_field(rec_args, field): - ptrn1_field = re.compile(r'^.*'+field+'\('); - ptrn2_field = re.compile(r'\)\) .*$'); - (field_name, n_subs) = ptrn1_field.subn('', rec_args, count=1); - if n_subs != 0: - (field_name, 
n_subs) = ptrn2_field.subn(')', field_name, count=1) - return (field_name, n_subs) +# arguments manipulation routines +def get_field(args, field): + ptrn1_field = re.compile(r'^.* ' + field + '\('); + ptrn2_field = re.compile(r'\) .*$'); + ptrn3_field = re.compile(r'\)\)$'); + (field_name, n) = ptrn1_field.subn('', args, count=1); + if n != 0: + (field_name, n) = ptrn2_field.subn('', field_name, count=1) + if n == 0: + (field_name, n) = ptrn3_field.subn('', field_name, count=1) + return (field_name, n) + +def set_field(args, field, val): + return re.subn(field + '\(\w+\)([ \)])', field + '(' + str(val) + ')\\1', args, count=1) # Fill API DB api_table_descr = [ - ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index'], - {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER'} + ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data'], + {'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'Index':'INTEGER', 'Data':'TEXT'} ] # Filling API records DB table # table_name - created DB table name @@ -284,6 +321,7 @@ def extract_field(rec_args, field): # dep_filtr - registered dependencies by record ID def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep_filtr, expl_id): global hsa_activity_found + global memory_manager copy_raws = [] if (hsa_activity_found): copy_raws = db.table_get_raws('COPY') copy_csv = '' @@ -296,6 +334,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep ptrn2_kernel = re.compile(r'\)\) .*$') ptrn_fixformat = re.compile(r'(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$') ptrn_fixkernel = re.compile(r'\s+kernel=(.*)$') + ptrn_multi_kernel = re.compile(r'(.*):(\d+)$') if not os.path.isfile(file_name): return 0 @@ -304,7 +343,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_id_list = [] # parsing an input trace file and creating a DB 
table - record_id = 0 + record_id_dict = {} table_handle = db.add_table(table_name, api_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): @@ -319,27 +358,49 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep mfixformat = ptrn_fixformat.match(record) if mfixformat: #replace '=' in args with parentheses reformated_args = kernel_arg + mfixformat.group(2).replace('=','(').replace(',',')')+')' - record = mfixformat.group(1) + '(' + reformated_args + ')' + record = mfixformat.group(1) + '( ' + reformated_args + ')' m = ptrn_val.match(record) if m: rec_vals = [] - rec_len = len(api_table_descr[0]) - for ind in range(1,rec_len): + rec_len = len(api_table_descr[0]) - 1 + for ind in range(1, rec_len): rec_vals.append(m.group(ind)) - proc_id = rec_vals[2] - rec_vals[2] = api_pid - rec_vals.append(record_id) - db.insert_entry(table_handle, rec_vals) + proc_id = int(rec_vals[2]) + thrd_id = int(rec_vals[3]) + record_name = rec_vals[4] + record_args = rec_vals[5] + + # incrementing per-process record id/correlation id + if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 + corr_id = record_id_dict[proc_id] + record_id_dict[proc_id] += 1 + rec_vals.append(corr_id) + + # extracting/converting stream id + (stream_id, stream_found) = get_field(record_args, 'stream') + if stream_found != 0: + stream_id = get_stream_index(stream_id) + (rec_vals[5], found) = set_field(record_args, 'stream', stream_id) + if found == 0: fatal('set_field() failed for "stream", args: "' + record_args + '"') + else: stream_id = 0 # dependencies filling - if ptrn_ac.search(rec_vals[4]) or record_id in dep_filtr: + if ptrn_ac.search(record_name) or (corr_id, proc_id) in dep_filtr: beg_ns = int(rec_vals[0]) end_ns = int(rec_vals[1]) from_us = (beg_ns / 1000) + ((end_ns - beg_ns) / 1000) - dep_from_us_list.append(from_us) - dep_tid_list.append(int(rec_vals[3])) - dep_id_list.append(record_id) + + if not proc_id in dep_dict: 
dep_dict[proc_id] = {} + dep_proc = dep_dict[proc_id] + found = 1 if dep_pid in dep_proc else 0 + if found == 0 and dep_pid == OPS_PID: + dep_proc[dep_pid] = { 'pid': api_pid, 'from': [], 'id': [] } + found = 1 + if found == 1: + dep_str = dep_proc[dep_pid] + dep_str['from'].append((from_us, thrd_id, stream_id)) + if expl_id: dep_str['id'].append(corr_id) # memcopy data if len(copy_raws) != 0: @@ -347,40 +408,50 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep args_str = rec_vals[5] args_str = re.sub(r'\(', r'', args_str) args_str = re.sub(r'\).*$', r'', args_str) - copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + rec_vals[4] + ', ' + args_str + copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + record_name + ', ' + args_str copy_csv += str(copy_index) + ', ' + copy_line + '\n' copy_index += 1 # patching activity properties: kernel name, stream-id - corr_id = record_id if (corr_id, proc_id) in dep_filtr: + ops_table_name = dep_filtr[(corr_id, proc_id)] + + select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + str(proc_id) record_args = rec_vals[rec_len - 2] - select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + proc_id - # extract kernel name - (kernel_name, n_subs) = extract_field(record_args, 'kernel') - if n_subs != 0: - db.change_rec_fld('OPS', 'Name = "' + kernel_name + '"', select_expr) - # extract stream-id - (stream_id, n_subs) = extract_field(record_args, 'stream') - if n_subs != 0: - if stream_id == 'nil' or stream_id == 'NIL': stream_id = 0 - db.change_rec_fld('OPS', 'tid = ' + stream_id, select_expr) - record_id += 1 + # extract kernel name string + (kernel_str, kernel_found) = get_field(record_args, 'kernel') + is_kernel_list = 1 if kernel_found != 0 and kernel_str[-1] == ';' else 0 + + if is_kernel_list != 0: + for kernel_item in kernel_str[:-1].split(';'): + m = ptrn_multi_kernel.match(kernel_item) + if m: + kernel_name = m.group(1) + dev_id = m.group(2) + 
select_expr += ' AND "dev-id" = ' + dev_id + activity_record_patching(db, ops_table_name, 1, kernel_name, stream_found, stream_id, select_expr) + else: + fatal('Bad multi-kernel format: "' + kernel_item + '" in "' + kernel_str + '"') + else: + activity_record_patching(db, ops_table_name, kernel_found, kernel_str, stream_found, stream_id, select_expr) + + api_data = '' + if mcopy_data_enabled: + api_data = memory_manager.register_api(rec_vals) if len(dep_filtr) else '' + rec_vals.append(api_data) + + rec_vals[2] = api_pid + + db.insert_entry(table_handle, rec_vals) else: fatal(api_name + " bad record: '" + record + "'") # inserting of dispatch events correlated to the dependent dispatches - for (tid, from_ns) in dep_list: - db.insert_entry(table_handle, [from_ns, from_ns, api_pid, tid, 'hsa_dispatch', '', record_id]) - record_id += 1 - - # registering dependencies informatino - if dep_pid != NONE_PID: - if not dep_pid in dep_dict: dep_dict[dep_pid] = {} - dep_dict[dep_pid]['pid'] = api_pid - dep_dict[dep_pid]['tid'] = dep_tid_list - dep_dict[dep_pid]['from'] = dep_from_us_list - if expl_id: dep_dict[dep_pid]['id'] = dep_id_list + for (from_ns, proc_id, thrd_id) in dep_list: + if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 + corr_id = record_id_dict[proc_id] + record_id_dict[proc_id] += 1 + db.insert_entry(table_handle, [from_ns, from_ns, api_pid, thrd_id, 'hsa_dispatch', '', corr_id, '']) # generating memcopy CSV if copy_csv != '': @@ -394,19 +465,17 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep # fill COPY DB copy_table_descr = [ - ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index'], - {'Index':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} + ['BeginNs', 'EndNs', 'Name', 'pid', 'tid', 'Index', 'proc-id'], + {'Index':'INTEGER', 'proc-id':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'pid':'INTEGER', 
'tid':'INTEGER'} ] def fill_copy_db(table_name, db, indir): + pid = COPY_PID file_name = indir + '/' + 'async_copy_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (.*)$') - ptrn_id = re.compile(r'^async-copy(\d+)$') + ptrn_id = re.compile(r'^async-copy:(\d+):(\d+)$') if not os.path.isfile(file_name): return 0 - if not COPY_PID in dep_dict: dep_dict[COPY_PID] = {} - dep_to_us_dict = {} - table_handle = db.add_table(table_name, copy_table_descr) with open(file_name, mode='r') as fd: for line in fd.readlines(): @@ -417,22 +486,32 @@ def fill_copy_db(table_name, db, indir): for ind in range(1,4): rec_vals.append(m.group(ind)) rec_vals.append(COPY_PID) rec_vals.append(0) + m = ptrn_id.match(rec_vals[2]) - if m: dep_to_us_dict[int(m.group(1))] = int(rec_vals[0]) / 1000 - else: fatal("bad async-copy entry") - rec_vals.append(m.group(1)) + if not m: fatal("bad async-copy entry '" + record + "'") + corr_id = int(m.group(1)) + proc_id = int(m.group(2)) + rec_vals.append(corr_id) + rec_vals.append(proc_id) + db.insert_entry(table_handle, rec_vals) - else: fatal("async-copy bad record: '" + record + "'") - dep_dict[COPY_PID]['to'] = dep_to_us_dict + # filling dependencies + if not proc_id in dep_dict: dep_dict[proc_id] = {} + dep_proc = dep_dict[proc_id] + if not pid in dep_proc: dep_proc[pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] } + dep_str = dep_proc[pid] + dep_str['to'][corr_id] = int(rec_vals[0]) / 1000 + + else: fatal("async-copy bad record: '" + record + "'") return 1 ############################################################# # fill HCC ops DB ops_table_descr = [ - ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'proc-id'], - {'Index':'INTEGER', 'proc-id':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER'} + ['BeginNs', 'EndNs', 'dev-id', 'queue-id', 'Name', 'pid', 'tid', 'Index', 'proc-id', 'Data'], + {'Index':'INTEGER', 
'proc-id':'INTEGER', 'Name':'TEXT', 'args':'TEXT', 'BeginNs':'INTEGER', 'EndNs':'INTEGER', 'dev-id':'INTEGER', 'queue-id':'INTEGER', 'pid':'INTEGER', 'tid':'INTEGER', 'Data':'TEXT'} ] def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): global max_gpu_id @@ -446,7 +525,6 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): filtr = {} - record_id = 0 kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr) mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr) with open(file_name, mode='r') as fd: @@ -462,13 +540,15 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): if not m: fatal("bad hcc ops entry '" + record + "'") name = m.group(1) corr_id = int(m.group(2)) - 1 - proc_id = m.group(3) + proc_id = int(m.group(3)) # checking name for memcopy pattern if ptrn_mcopy.search(name): + rec_table_name = mcopy_table_name table_handle = mcopy_table_handle pid = COPY_PID; else: + rec_table_name = kernel_table_name table_handle = kernel_table_handle gpu_id = int(rec_vals[2]); @@ -484,16 +564,18 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): rec_vals.append(0) # tid rec_vals.append(corr_id) # Index rec_vals.append(proc_id) # proc-id + rec_vals.append('') # Data db.insert_entry(table_handle, rec_vals) # registering a dependency filtr - filtr[(corr_id, proc_id)] = 1 + filtr[(corr_id, proc_id)] = rec_table_name - # filling a dependency - if not pid in dep_dict: dep_dict[pid] = {} - if not 'to' in dep_dict[pid]: dep_dict[pid]['to'] = {} - dep_dict[pid]['to'][corr_id] = int(rec_vals[0]) / 1000 - dep_dict[pid]['bsp'] = OPS_PID + # filling a dependencies + if not proc_id in dep_dict: dep_dict[proc_id] = {} + dep_proc = dep_dict[proc_id] + if not pid in dep_proc: dep_proc[pid] = { 'bsp': OPS_PID, 'to': {} } + dep_str = dep_proc[pid] + dep_str['to'][corr_id] = int(rec_vals[0]) / 1000 else: fatal("hcc ops bad record: '" + record + "'") @@ -512,6 +594,16 @@ def fill_ops_db(kernel_table_name, 
mcopy_table_name, db, indir): dbfile = '' csvfile = '' +begin_ts_file = indir + '/begin_ts_file.txt' +if os.path.isfile(begin_ts_file): + with open(begin_ts_file, mode='r') as fd: + ind = 0 + for line in fd.readlines(): + val = int(line) / 1000 + if ind == 0 or val < START_US: START_US = val + ind += 1 + print('START timestamp found (' + str(START_US) + 'us)') + if re.search(r'\.csv$', outfile): csvfile = outfile elif re.search(r'\.db$', outfile): @@ -540,6 +632,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): with open(dbfile, mode='w') as fd: fd.truncate() db = SQLiteDB(dbfile) + memory_manager = MemManager(db) ext_trace_found = fill_ext_db('rocTX', db, indir, 'roctx', EXT_PID) @@ -613,32 +706,35 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): dform.gen_api_json_trace(db, 'KFD', START_US, jsonfile) if any_trace_found: - for (to_pid, dep_str) in dep_dict.items(): - if 'bsp' in dep_str: - bspid = dep_str['bsp'] - base_str = dep_dict[bspid] - for v in ('pid', 'tid', 'from', 'id'): - dep_str[v] = base_str[v] - base_str['inv'] = 1 - dep_id = 0 - for (to_pid, dep_str) in dep_dict.items(): - if 'inv' in dep_str: continue - if not 'to' in dep_str: continue - - to_us_dict = dep_str['to'] - from_us_list = dep_str['from'] - from_pid = dep_str['pid'] - tid_list = dep_str['tid'] - corr_id_list = [] - if 'id' in dep_str: corr_id_list = dep_str['id'] - - db.flow_json(dep_id, from_pid, tid_list, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) - dep_id += len(tid_list) + for (proc_id, dep_proc) in dep_dict.items(): + for (to_pid, dep_str) in dep_proc.items(): + if 'bsp' in dep_str: + bspid = dep_str['bsp'] + base_str = dep_proc[bspid] + for v in ('pid', 'from', 'id'): + dep_str[v] = base_str[v] + base_str['inv'] = 1 + + for (to_pid, dep_str) in dep_proc.items(): + if 'inv' in dep_str: continue + if not 'to' in dep_str: continue + + from_pid = dep_str['pid'] + from_us_list = dep_str['from'] + to_us_dict = dep_str['to'] + 
corr_id_list = dep_str['id'] + + db.flow_json(dep_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) + dep_id += len(from_us_list) if any_trace_found: db.metadata_json(jsonfile, sysinfo_file) db.close_json(jsonfile); + + if mcopy_data_enabled: + memory_manager.dump_data() + db.close() sys.exit(0) diff --git a/bin/txt2xml.sh b/bin/txt2xml.sh index 126337ed..e5bc3e3d 100755 --- a/bin/txt2xml.sh +++ b/bin/txt2xml.sh @@ -64,7 +64,7 @@ parse() { gpu_index=$line fi else - found=$(echo $feature | sed -n "/^\(pmc\|sqtt\|hsa\)$/ p") + found=$(echo $feature | sed -n "/^\(pmc\|hsa\)$/ p") if [ -n "$found" ] ; then output=$outdir/input${index}.xml header="# $timestamp '$output' generated with '$0 $*'" @@ -78,13 +78,6 @@ parse() { EOF fi - if [ "$feature" == "sqtt" ] ; then - cat >> $output < - -EOF - fi - if [ "$feature" == "hsa" ] ; then cat >> $output < diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 24925cae..3f295a15 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -72,6 +72,8 @@ typedef struct { uint64_t timeout; uint32_t timestamp_on; uint32_t hsa_intercepting; + uint32_t k_concurrent; + uint32_t opt_mode; } rocprofiler_settings_t; //////////////////////////////////////////////////////////////////////////////// @@ -90,8 +92,6 @@ hsa_status_t rocprofiler_error_string( typedef enum { ROCPROFILER_FEATURE_KIND_METRIC = 0, ROCPROFILER_FEATURE_KIND_TRACE = 1, - ROCPROFILER_FEATURE_KIND_SPM_MOD = 2, - ROCPROFILER_FEATURE_KIND_PCSMP_MOD = 4 } rocprofiler_feature_kind_t; // Profiling feture parameter @@ -478,7 +478,8 @@ typedef enum { ROCPROFILER_HSA_CB_ID_ALLOCATE = 0, // Memory allocate callback ROCPROFILER_HSA_CB_ID_DEVICE = 1, // Device assign callback ROCPROFILER_HSA_CB_ID_MEMCOPY = 2, // Memcopy callback - ROCPROFILER_HSA_CB_ID_SUBMIT = 3 // Packet submit callback + ROCPROFILER_HSA_CB_ID_SUBMIT = 3, // Packet submit callback + ROCPROFILER_HSA_CB_ID_KSYMBOL = 4 // Loading/unloading of kernel symbol } rocprofiler_hsa_cb_id_t; 
// HSA callback data type @@ -509,6 +510,12 @@ typedef struct { uint32_t device_type; // type of device the packed is submitted to uint32_t device_id; // id of device the packed is submitted to } submit; + struct { + uint64_t object; // kernel symbol object + const char* name; // kernel symbol name + uint32_t name_length; // kernel symbol name length + int destroy; // symbol executable destroy + } ksymbol; }; } rocprofiler_hsa_callback_data_t; @@ -524,6 +531,7 @@ typedef struct { rocprofiler_hsa_callback_fun_t device; // agent assign callback rocprofiler_hsa_callback_fun_t memcopy; // memory copy callback rocprofiler_hsa_callback_fun_t submit; // packet submit callback + rocprofiler_hsa_callback_fun_t ksymbol; // kernel symbol callback } rocprofiler_hsa_callbacks_t; // Set callbacks. If the callback is NULL then it is disabled. diff --git a/src/core/activity.cpp b/src/core/activity.cpp index c72977e1..19f6bea3 100644 --- a/src/core/activity.cpp +++ b/src/core/activity.cpp @@ -55,92 +55,6 @@ void check_status(hsa_status_t status) { } } -// Activity primitives -namespace activity_prim { -// PC sampling callback data -struct pcsmp_callback_data_t { - const char* kernel_name; // sampled kernel name - void* data_buffer; // host buffer for tracing data - uint64_t id; // sample id - uint64_t cycle; // sample cycle - uint64_t pc; // sample PC -}; - -uint32_t activity_op = UINT32_MAX; -void* activity_arg = NULL; -std::atomic activity_callback{NULL}; -rocprofiler_t* context = NULL; - -hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, - hsa_ven_amd_aqlprofile_info_data_t* info_data, - void* data) { - const pcsmp_callback_data_t* pcsmp_data = (pcsmp_callback_data_t*) data; - - activity_record_t record{}; - record.op = activity_op; - record.pc_sample.se = pcsmp_data->id; - record.pc_sample.cycle = pcsmp_data->cycle; - record.pc_sample.pc = pcsmp_data->pc; - activity_async_callback_t fun = activity_callback.load(std::memory_order_acquire); - if (fun) { - 
(fun)(activity_op, &record, activity_arg); - } else { - free((void*)(pcsmp_data->kernel_name)); - } - return HSA_STATUS_SUCCESS; -} - -bool context_handler(rocprofiler_group_t group, void* arg) { - hsa_agent_t agent{}; - hsa_status_t status = rocprofiler_get_agent(group.context, &agent); - check_status(status); - const rocprofiler::util::AgentInfo* agent_info = rocprofiler::util::HsaRsrcFactory::Instance().GetAgentInfo(agent); - - pcsmp_callback_data_t pcsmp_data{}; - pcsmp_data.kernel_name = (const char*)arg; - pcsmp_data.data_buffer = rocprofiler::util::HsaRsrcFactory::Instance().AllocateSysMemory(agent_info, rocprofiler::TraceProfile::GetSize()); - status = rocprofiler_iterate_trace_data(group.context, trace_data_cb, &pcsmp_data); - check_status(status); - return false; -} - -// Kernel disoatch callback -hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, - rocprofiler_group_t* group) { - // context features - const rocprofiler_feature_kind_t trace_kind = - (rocprofiler_feature_kind_t)(ROCPROFILER_FEATURE_KIND_TRACE | ROCPROFILER_FEATURE_KIND_PCSMP_MOD); - const uint32_t feature_count = 1; - const uint32_t parameter_count = 1; - rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; - memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); - rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; - memset(features, 0, parameter_count * sizeof(rocprofiler_parameter_t)); - parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; - parameters[0].value = 0; - - features[0].kind = trace_kind; - features[0].parameters = parameters; - features[0].parameter_count = parameter_count; - - // context properties - rocprofiler_properties_t properties{}; - properties.handler = context_handler; - properties.handler_arg = (void*)strdup(callback_data->kernel_name); - - // Open profiling context - hsa_status_t status = 
rocprofiler_open(callback_data->agent, features, feature_count, - &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); - check_status(status); - - // Get group[0] - status = rocprofiler_get_group(context, 0, group); - check_status(status); - - return status; -} -} // namespace activity_prim - extern "C" { PUBLIC_API const char* GetOpName(uint32_t op) { return strdup("PCSAMPLE"); } @@ -149,23 +63,10 @@ PUBLIC_API bool RegisterApiCallback(uint32_t op, void* callback, void* arg) { re PUBLIC_API bool RemoveApiCallback(uint32_t op) { return true; } PUBLIC_API bool InitActivityCallback(void* callback, void* arg) { - activity_prim::activity_arg = arg; - activity_prim::activity_callback.store((activity_async_callback_t)callback, std::memory_order_release); - - rocprofiler_queue_callbacks_t queue_callbacks{}; - queue_callbacks.dispatch = activity_prim::dispatch_callback; - rocprofiler_set_queue_callbacks(queue_callbacks, NULL); - return true; } PUBLIC_API bool EnableActivityCallback(uint32_t op, bool enable) { - if (enable) { - activity_prim::activity_op = op; - rocprofiler_start_queue_callbacks(); - } else { - rocprofiler_stop_queue_callbacks(); - } return true; } } // extern "C" diff --git a/src/core/context.h b/src/core/context.h index 7131d338..8be3a9e8 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -83,7 +83,6 @@ class Group { Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), - trace_profile_(agent_info), n_profiles_(0), refs_(1), context_(context), @@ -96,33 +95,24 @@ class Group { case ROCPROFILER_FEATURE_KIND_METRIC: pmc_profile_.Insert(info); break; - case ROCPROFILER_FEATURE_KIND_TRACE: - trace_profile_.Insert(info); - break; default: EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } } - hsa_status_t Finalize() { - hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_); - if (status == HSA_STATUS_SUCCESS) { - status = 
trace_profile_.Finalize(start_vector_, stop_vector_, read_vector_); - } + hsa_status_t Finalize(const bool is_concurrent = false) { + hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, + read_vector_, is_concurrent); if (status == HSA_STATUS_SUCCESS) { if (!pmc_profile_.Empty()) ++n_profiles_; - if (!trace_profile_.Empty()) ++n_profiles_; } return status; } void GetProfiles(profile_vector_t& vec) { pmc_profile_.GetProfiles(vec); - trace_profile_.GetProfiles(vec); } - void GetTraceProfiles(profile_vector_t& vec) { trace_profile_.GetProfiles(vec); } - info_vector_t& GetInfoVector() { return info_vector_; } const pkt_vector_t& GetStartVector() const { return start_vector_; } const pkt_vector_t& GetStopVector() const { return stop_vector_; } @@ -137,7 +127,6 @@ class Group { private: PmcProfile pmc_profile_; - TraceProfile trace_profile_; info_vector_t info_vector_; pkt_vector_t start_vector_; pkt_vector_t stop_vector_; @@ -183,7 +172,7 @@ class Context { uint32_t GetGroupCount() const { return set_.size(); } - inline rocprofiler_group_t GetGroupInfo(Group* g) { + inline rocprofiler_group_t GetGroupDescr(Group* g) { rocprofiler::info_vector_t& info_vector = g->GetInfoVector(); rocprofiler_group_t group = {}; group.index = g->GetIndex(); @@ -192,12 +181,12 @@ class Context { group.feature_count = info_vector.size(); return group; } - inline rocprofiler_group_t GetGroupInfo(const uint32_t& index) { + inline rocprofiler_group_t GetGroupDescr(const uint32_t& index) { rocprofiler_group_t group = {}; if (set_.empty()) { group.context = reinterpret_cast(this); } else { - group = GetGroupInfo(&set_[index]); + group = GetGroupDescr(&set_[index]); } return group; } @@ -272,15 +261,28 @@ class Context { } } - void IterateTraceData(rocprofiler_trace_data_callback_t callback, void* data) { - profile_vector_t profile_vector; - set_[0].GetTraceProfiles(profile_vector); + /* Handle the completion of kernel-begin 'read' packet */ + static bool 
HandlerRead(hsa_signal_value_t value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + + // Handle the completion signal of read packet at kernel begin + const profile_vector_t profile_vector = context->GetProfiles(group->GetIndex()); for (auto& tuple : profile_vector) { - if (pcsmp_mode_) const_cast(tuple.profile)->event_count = UINT32_MAX; - const hsa_status_t status = - api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data); - if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); + // Wait for read packet to complete + util::HsaRsrcFactory::Instance().SignalWaitRestore(tuple.completion_signal, 1); + const profile_t* profile = tuple.profile; + // Copy the counter values, read at kernel begin, to the right half of + // the buffer, so that the next kernel-end read can reuse the left half + char* data = reinterpret_cast(profile->output_buffer.ptr); + const uint32_t num = profile->output_buffer.size / 2; + for(uint32_t i = 0; i < num; ++i) { + data[i+num] = data[i]; // left --> right + data[i] = 0; // reset left + } } + + return false; } static bool Handler(hsa_signal_value_t value, void* arg) { @@ -288,8 +290,8 @@ class Context { Context* context = group->GetContext(); auto r = group->FetchDecrRefsCount(); if (r == 1) { - const rocprofiler_group_t group_info = context->GetGroupInfo(group); - context->handler_(group_info, context->handler_arg_); + const rocprofiler_group_t group_descr = context->GetGroupDescr(group); + context->handler_(group_descr, context->handler_arg_); } return false; } @@ -298,6 +300,25 @@ class Context { Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } + void SetDispatchSignal(const hsa_signal_t &signal) { + dispatch_signal_ = signal; + } + hsa_signal_t& GetDispatchSignal() { + return dispatch_signal_; + } + void SetOrigSignal(const 
hsa_signal_t &signal) { + orig_signal_ = signal; + } + const hsa_signal_t& GetOrigSignal() const { + return orig_signal_; + } + rocprofiler_dispatch_record_t* GetRecord() { + return &record_; + } + + // Concurrent profiling mode + static bool k_concurrent_; + private: Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, const uint32_t info_count, rocprofiler_handler_t handler, void* handler_arg) @@ -309,12 +330,16 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - pcsmp_mode_(false) + pcsmp_mode_(false), + dispatch_signal_{}, + orig_signal_{}, + record_{} {} ~Context() { Destruct(); } void Destruct() { + hsa_signal_destroy(dispatch_signal_); for (const auto& v : info_map_) { const std::string& name = v.first; const rocprofiler_feature_t* info = v.second; @@ -349,12 +374,20 @@ class Context { set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { + // Handler for read packet completion + if (k_concurrent_) { + hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead, + &set_[group_index]); + } // Handler for stop packet completion hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, &set_[group_index]); } } } + + hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed"); } // Initialize rocprofiler context @@ -437,23 +470,6 @@ class Context { const uint32_t group_index = block_status.group_index; set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); } - } else if (kind & ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features - info->kind = ROCPROFILER_FEATURE_KIND_TRACE; - - const event_t* event = NULL; - if (kind & ROCPROFILER_FEATURE_KIND_PCSMP_MOD) { // PC sampling - pcsmp_mode_ = true; - } else if (kind & 
ROCPROFILER_FEATURE_KIND_SPM_MOD) { // SPM trace - const Metric* metric = metrics_->Get(name); - if (metric == NULL) - EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); - counters_vec_t counters_vec = metric->GetCounters(); - if (counters_vec.size() != 1) - EXC_RAISING(HSA_STATUS_ERROR, "trace bad metric '" << name << "' is not base counter"); - const counter_t* counter = counters_vec[0]; - event = &(counter->event); - } - set_[0].Insert(profile_info_t{event, info->parameters, info->parameter_count, info}); } else { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -464,7 +480,7 @@ class Context { void Finalize() { for (unsigned index = 0; index < set_.size(); ++index) { - const hsa_status_t status = set_[index].Finalize(); + const hsa_status_t status = set_[index].Finalize(k_concurrent_); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "context finalize failed"); } } @@ -502,56 +518,6 @@ class Context { if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0; rinfo->data.result_int64 += ainfo_data->pmc_data.result; rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64; - } else if (ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { - if (rinfo->data.result_bytes.copy) { - const bool trace_local = TraceProfile::IsLocal(); - util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); - if (sample_id == 0) { - const uint32_t output_buffer_size = profile->output_buffer.size; - const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t); - const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); - void* ptr = (trace_local) ? 
hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : - calloc(output_buffer_size64, sizeof(uint64_t)); - rinfo->data.result_bytes.size = output_buffer_size; - rinfo->data.result_bytes.ptr = ptr; - callback_data->ptr = reinterpret_cast(ptr); - } - char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr); - const char* end = result_bytes_ptr + rinfo->data.result_bytes.size; - const char* src = reinterpret_cast(ainfo_data->trace_data.ptr); - uint32_t size = ainfo_data->trace_data.size; - char* ptr = callback_data->ptr; - uint32_t* header = reinterpret_cast(ptr); - char* dest = ptr + sizeof(*header); - - if ((dest + size) >= end) { - if (dest < end) size = end - dest; - else EXC_RAISING(HSA_STATUS_ERROR, "Trace data out of output buffer"); - } - - bool suc = true; - if (trace_local) { - suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size); - } else { - memcpy(dest, src, size); - } - if (suc) { - *header = size; - callback_data->ptr = dest + align_size(size, sizeof(uint32_t)); - rinfo->data.result_bytes.instance_count = sample_id + 1; - rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; - } else - EXC_RAISING(HSA_STATUS_ERROR, "Agent Memcpy failed, dst(" << (void*)dest << ") src(" << (void*)src << ") size(" << size << ")"); - } else { - if (sample_id == 0) { - rinfo->data.result_bytes.ptr = profile->output_buffer.ptr; - rinfo->data.result_bytes.size = profile->output_buffer.size; - rinfo->data.result_bytes.instance_count = UINT32_MAX; - } - - rinfo->data.result_bytes.instance_count += 1; - rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; - } } else { EXC_RAISING(HSA_STATUS_ERROR, "unknown data type = " << ainfo_type); } @@ -593,8 +559,17 @@ class Context { // PC sampling mode bool pcsmp_mode_; + + // kernel packet dispatch copmletion signal + hsa_signal_t dispatch_signal_; + hsa_signal_t orig_signal_; + rocprofiler_dispatch_record_t record_; + }; +#define CONTEXT_INSTANTIATE() \ + bool rocprofiler::Context::k_concurrent_ = false; + } // namespace 
rocprofiler #endif // SRC_CORE_CONTEXT_H_ diff --git a/src/core/hsa_interceptor.h b/src/core/hsa_interceptor.h index f1d8a0d8..9207730b 100644 --- a/src/core/hsa_interceptor.h +++ b/src/core/hsa_interceptor.h @@ -25,6 +25,7 @@ SOFTWARE. #ifndef _SRC_CORE_HSA_INTERCEPTOR_H #define _SRC_CORE_HSA_INTERCEPTOR_H +#include #include #include #include @@ -49,7 +50,8 @@ SOFTWARE. (ID == ROCPROFILER_HSA_CB_ID_ALLOCATE) ? callbacks_.allocate: \ (ID == ROCPROFILER_HSA_CB_ID_DEVICE) ? callbacks_.device: \ (ID == ROCPROFILER_HSA_CB_ID_MEMCOPY) ? callbacks_.memcopy: \ - callbacks_.submit; \ + (ID == ROCPROFILER_HSA_CB_ID_SUBMIT) ? callbacks_.submit: \ + callbacks_.ksymbol; \ if ((__callback != NULL) && (recursion_ == false)) #define DO_HSA_CALLBACK \ @@ -62,6 +64,14 @@ SOFTWARE. #define ISSUE_HSA_CALLBACK(ID) \ do { IS_HSA_CALLBACK(ID) { DO_HSA_CALLBACK; } } while(0) +// Demangle C++ symbol name +static const char* cpp_demangle(const char* symname) { + size_t size = 0; + int status; + const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); + return (ret != 0) ? 
ret : strdup(symname); +} + namespace rocprofiler { extern decltype(hsa_memory_allocate)* hsa_memory_allocate_fn; extern decltype(hsa_memory_assign_agent)* hsa_memory_assign_agent_fn; @@ -337,6 +347,39 @@ class HsaInterceptor { return HSA_STATUS_SUCCESS; } + static hsa_status_t KernelSymbolCallback( + hsa_executable_t executable, + hsa_executable_symbol_t symbol, + void *arg) + { + const int free_flag = reinterpret_cast(arg); + hsa_symbol_kind_t kind = (hsa_symbol_kind_t)0; + HSA_RT(hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &kind)); + + if (kind == HSA_SYMBOL_KIND_KERNEL) { + const char* name = NULL; + uint32_t len = 0; + uint64_t obj = 0; + HSA_RT(hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &obj)); + if (free_flag == 0) { + HSA_RT(hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len)); + char sym_name[len + 1]; + HSA_RT(hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, sym_name)); + name = cpp_demangle(sym_name); + } + + rocprofiler_hsa_callback_data_t data{}; + data.ksymbol.object = obj; + data.ksymbol.name = name; + data.ksymbol.name_length = len; + data.ksymbol.destroy = free_flag; + + ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL); + } + + return HSA_STATUS_SUCCESS; + } + static hsa_status_t ExecutableFreeze( hsa_executable_t executable, const char *options) @@ -352,6 +395,15 @@ class HsaInterceptor { reinterpret_cast(0)); } + { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL) { + HSA_RT(hsa_executable_iterate_symbols( + executable, + KernelSymbolCallback, + reinterpret_cast(0))); + } + } + return status; } @@ -367,6 +419,15 @@ class HsaInterceptor { reinterpret_cast(1)); } + { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL) { + HSA_RT(hsa_executable_iterate_symbols( + executable, + KernelSymbolCallback, + reinterpret_cast(1))); + } + } + HSA_RT(hsa_executable_destroy_fn(executable)); return status; diff --git 
a/src/core/intercept_queue.cpp b/src/core/intercept_queue.cpp index 0b309d63..705fff29 100644 --- a/src/core/intercept_queue.cpp +++ b/src/core/intercept_queue.cpp @@ -42,4 +42,7 @@ InterceptQueue::queue_id_t InterceptQueue::current_queue_id = 0; rocprofiler_hsa_callback_fun_t InterceptQueue::submit_callback_fun_ = NULL; void* InterceptQueue::submit_callback_arg_ = NULL; +bool InterceptQueue::opt_mode_ = false; +uint32_t InterceptQueue::k_concurrent_ = K_CONC_OFF; +std::once_flag InterceptQueue::once_flag_; } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index a52d8c1d..5cd09b10 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -24,7 +24,6 @@ THE SOFTWARE. #define _SRC_CORE_INTERCEPT_QUEUE_H #include -#include #include #include @@ -41,9 +40,28 @@ THE SOFTWARE. #include "util/hsa_rsrc_factory.h" namespace rocprofiler { +enum { + K_CONC_OFF = 0, + K_CONC_PMC = 1, + K_CONC_TRACE = 2 +}; + extern decltype(hsa_queue_create)* hsa_queue_create_fn; extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; +void PmcStarter(Context* context); + +static std::mutex ctx_a_mutex; +typedef std::map ctx_a_map_t; +static ctx_a_map_t* ctx_a_map = NULL; +static bool ck_ctx_inactive(Context* context) { + std::lock_guard lock(ctx_a_mutex); + if (ctx_a_map == NULL) ctx_a_map = new ctx_a_map_t; + auto ret = ctx_a_map->insert({context, true}); + if (ret.second == false) ctx_a_map->erase(context); + return ret.second; +} + class InterceptQueue { public: typedef std::recursive_mutex mutex_t; @@ -79,7 +97,13 @@ class InterceptQueue { if (!obj_map_) obj_map_ = new obj_map_t; InterceptQueue* obj = new InterceptQueue(agent, *queue, proxy); (*obj_map_)[(uint64_t)(*queue)] = obj; - status = proxy->SetInterceptCB(OnSubmitCB, obj); + if (k_concurrent_ == K_CONC_TRACE) { + status = proxy->SetInterceptCB(OnSubmitCB_ctrace, obj); + } else if (opt_mode_) { + status = proxy->SetInterceptCB(OnSubmitCB_opt, obj); + } else { + status 
= proxy->SetInterceptCB(OnSubmitCB, obj); + } obj->queue_event_callback_ = callback; obj->queue_id = current_queue_id; ++current_queue_id; @@ -123,6 +147,77 @@ class InterceptQueue { return status; } + static void OnSubmitCB_opt(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, + hsa_amd_queue_intercept_packet_writer writer) { + const packet_t* packets_arr = reinterpret_cast(in_packets); + InterceptQueue* obj = reinterpret_cast(data); + Queue* proxy = obj->proxy_; + + // Travers input packets + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + bool to_submit = true; + + // Checking for dispatch packet type + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && + (dispatch_callback_.load(std::memory_order_acquire) != NULL)) { + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + const hsa_signal_t completion_signal = dispatch_packet->completion_signal; + + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, + obj->agent_info_->dev_index, + obj->queue_, + user_que_idx, + obj->queue_id, + completion_signal, + dispatch_packet, + NULL, // kernel_name + 0, // kernel_object + NULL, // kernel_code + 0, // (uint32_t)syscall(__NR_gettid), + NULL}; // record + + // Calling dispatch callback + rocprofiler_group_t group = {}; + hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); + Context* context = reinterpret_cast(group.context); + // Injecting profiling start/stop packets + if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) { + if (group.feature_count != 0) { + if (tracker_ != NULL) { + const_cast(dispatch_packet)->completion_signal = context->GetDispatchSignal(); + Group* context_group = context->GetGroup(group.index); + Tracker::Enable_opt(context_group, completion_signal); + context_group->IncrRefsCount(); + } + + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& stop_vector = 
context->StopPackets(group.index); + pkt_vector_t packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; + } + } + } + + // Submitting the original packets if profiling was not enabled + if (to_submit) { + if (writer != NULL) { + writer(packet, 1); + } else { + proxy->Submit(packet, 1); + } + } + } + } + static void OnSubmitCB(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, hsa_amd_queue_intercept_packet_writer writer) { const packet_t* packets_arr = reinterpret_cast(in_packets); @@ -202,7 +297,6 @@ class InterceptQueue { // Calling dispatch callback rocprofiler_group_t group = {}; hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); - free(const_cast(kernel_name)); // Injecting profiling start/stop packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { if (tracker_entry != NULL) { @@ -221,9 +315,27 @@ class InterceptQueue { const pkt_vector_t& start_vector = context->StartPackets(group.index); const pkt_vector_t& stop_vector = context->StopPackets(group.index); - pkt_vector_t packets = start_vector; - packets.insert(packets.end(), *packet); - packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + const pkt_vector_t& read_vector = context->ReadPackets(group.index); + pkt_vector_t packets; + + if (k_concurrent_ == K_CONC_OFF) { // serial + packets = start_vector; + packets.insert(packets.end(), *packet); + packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + } else { // concurrent + // Atrt PMC once + std::call_once(once_flag_, PmcStarter, context); + // Reads at both kernel start and end + assert(read_vector.size() == 2 * start_vector.size()); + auto mid = read_vector.begin() + read_vector.size()/2; + // Read at kernel start + 
packets.insert(packets.end(), read_vector.begin(), mid); + // Kernel dispatch packet + packets.insert(packets.end(), *packet); + // Read at kernel end + packets.insert(packets.end(), mid, read_vector.end()); + } + if (writer != NULL) { writer(&packets[0], packets.size()); } else { @@ -251,6 +363,110 @@ class InterceptQueue { } } + static void OnSubmitCB_ctrace(const void* in_packets, uint64_t count, uint64_t user_que_idx, void* data, + hsa_amd_queue_intercept_packet_writer writer) { + const packet_t* packets_arr = reinterpret_cast(in_packets); + InterceptQueue* obj = reinterpret_cast(data); + Queue* proxy = obj->proxy_; + + if (submit_callback_fun_) { + mutex_.lock(); + auto* callback_fun = submit_callback_fun_; + void* callback_arg = submit_callback_arg_; + mutex_.unlock(); + + if (callback_fun) { + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + + const char* kernel_name = NULL; + if (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) { + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + kernel_name = (GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) ? 
+ QueryKernelName(kernel_object, kernel_code) : NULL; + } + + // Prepareing submit callback data + rocprofiler_hsa_callback_data_t data{}; + data.submit.packet = (void*)packet; + data.submit.kernel_name = kernel_name; + data.submit.queue = obj->queue_; + data.submit.device_type = obj->agent_info_->dev_type; + data.submit.device_id = obj->agent_info_->dev_index; + + callback_fun(ROCPROFILER_HSA_CB_ID_SUBMIT, &data, callback_arg); + } + } + } + + // Travers input packets + for (uint64_t j = 0; j < count; ++j) { + const packet_t* packet = &packets_arr[j]; + bool to_submit = true; + + // Checking for dispatch packet type + if ((GetHeaderType(packet) == HSA_PACKET_TYPE_KERNEL_DISPATCH) && + (dispatch_callback_.load(std::memory_order_acquire) != NULL)) { + const hsa_kernel_dispatch_packet_t* dispatch_packet = + reinterpret_cast(packet); + const hsa_signal_t completion_signal = dispatch_packet->completion_signal; + + // Prepareing dispatch callback data + uint64_t kernel_object = dispatch_packet->kernel_object; + const amd_kernel_code_t* kernel_code = GetKernelCode(kernel_object); + const char* kernel_name = QueryKernelName(kernel_object, kernel_code); + + rocprofiler_callback_data_t data = {obj->agent_info_->dev_id, + obj->agent_info_->dev_index, + obj->queue_, + user_que_idx, + obj->queue_id, + completion_signal, + dispatch_packet, + kernel_name, + kernel_object, + kernel_code, + (uint32_t)syscall(__NR_gettid), + NULL}; + + // Calling dispatch callback + rocprofiler_group_t group = {}; + hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); + + // Injecting profiling start/stop packets + if ((status == HSA_STATUS_SUCCESS) && (group.context != NULL)) { + Context* context = reinterpret_cast(group.context); + const bool ctx_inactive = ck_ctx_inactive(context); + + const pkt_vector_t& start_vector = context->StartPackets(group.index); + const pkt_vector_t& stop_vector = context->StopPackets(group.index); + pkt_vector_t packets; + if 
(ctx_inactive) packets = start_vector; + packets.insert(packets.end(), *packet); + if (!ctx_inactive) packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); + if (writer != NULL) { + writer(&packets[0], packets.size()); + } else { + proxy->Submit(&packets[0], packets.size()); + } + to_submit = false; + } + } + + // Submitting the original packets if profiling was not enabled + if (to_submit) { + if (writer != NULL) { + writer(packet, 1); + } else { + proxy->Submit(packet, 1); + } + } + } + } + static void SetCallbacks(rocprofiler_queue_callbacks_t callbacks, void* data) { std::lock_guard lck(mutex_); if (callback_data_ != NULL) { @@ -279,6 +495,9 @@ class InterceptQueue { static void TrackerOn(bool on) { tracker_on_ = on; } static bool IsTrackerOn() { return tracker_on_; } + static bool opt_mode_; + static uint32_t k_concurrent_; + private: static void queue_event_callback(hsa_status_t status, hsa_queue_t *queue, void *arg) { if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "queue error handling is not supported"); @@ -309,14 +528,6 @@ class InterceptQueue { return (dbg_info != NULL) ? dbg_info->kernel_name : NULL; } - // Demangle C++ symbol name - static const char* cpp_demangle(const char* symname) { - size_t size = 0; - int status; - const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); - return (ret != 0) ? ret : strdup(symname); - } - static const char* QueryKernelName(uint64_t kernel_object, const amd_kernel_code_t* kernel_code) { const uint16_t kernel_object_flag = *((uint64_t*)kernel_code + 1); if (kernel_object_flag == 0) { @@ -327,7 +538,7 @@ class InterceptQueue { const char* kernel_symname = (util::HsaRsrcFactory::IsExecutableTracking()) ? 
util::HsaRsrcFactory::GetKernelNameRef(kernel_object) : GetKernelName(kernel_code->runtime_loader_kernel_symbol); - return cpp_demangle(kernel_symname); + return kernel_symname; } // method to get an intercept queue object @@ -391,6 +602,8 @@ class InterceptQueue { const util::AgentInfo* agent_info_; queue_event_callback_t queue_event_callback_; queue_id_t queue_id; + + static std::once_flag once_flag_; }; } // namespace rocprofiler diff --git a/src/core/profile.h b/src/core/profile.h index 9ed03375..f6165d07 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -119,7 +119,34 @@ class Profile { virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); } - hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector) { + void SetConcurrent(profile_t* profile) { + // Check whether conconcurrent has been set + for (const parameter_t* p = profile->parameters; + p < (profile->parameters + profile->parameter_count); ++p) { + // If yes, stop here + if (p->parameter_name == HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT) { + return; + } + } + + // Otherwise, try to set + parameter_t* parameters = new parameter_t[profile->parameter_count+1]; + for (unsigned i = 0; i < profile->parameter_count; ++i) { + parameters[i].parameter_name = profile->parameters[i].parameter_name; + parameters[i].value = profile->parameters[i].value; + } + if (profile->parameters) free(const_cast(profile->parameters)); + parameters[profile->parameter_count].parameter_name = + HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT; + parameters[profile->parameter_count].value = 1; + profile->parameters = parameters; + profile->parameter_count += 1; + } + + hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, + pkt_vector_t& read_vector, bool is_concurrent = false) { + if (is_concurrent) SetConcurrent(&profile_); + hsa_status_t status = HSA_STATUS_SUCCESS; if (!info_vector_.empty()) { @@ -127,11 +154,14 
@@ class Profile { const pfn_t* api = rsrc->AqlProfileApi(); packet_t start{}; packet_t stop{}; - packet_t read{}; + packet_t read{}; // read at kernel start + packet_t read2{}; // read at kernel end // Check the profile buffer sizes status = api->hsa_ven_amd_aqlprofile_start(&profile_, NULL); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_start(NULL)"); + // Double output buffer size if concurrent + if (is_concurrent) profile_.output_buffer.size *= 2; status = Allocate(rsrc); if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "Allocate()"); @@ -144,21 +174,28 @@ class Profile { #ifdef AQLPROF_NEW_API if (profile_.type == HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_PMC) { rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read); + if (is_concurrent){ // concurrent: one more read + if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); + rd_status = api->hsa_ven_amd_aqlprofile_read(&profile_, &read2); + } } #if 0 // Read API returns error if disabled if (rd_status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "aqlprofile_read"); #endif #endif - // Set completion signal + // Set completion signal of start hsa_signal_t dummy_signal{}; dummy_signal.handle = 0; start.completion_signal = dummy_signal; + + // Set completion signal of read/stop hsa_signal_t post_signal; status = hsa_signal_create(1, 0, NULL, &post_signal); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); stop.completion_signal = post_signal; read.completion_signal = post_signal; + read2.completion_signal = post_signal; completion_signal_ = post_signal; // Fill packet vectors @@ -180,18 +217,24 @@ class Profile { AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); if (rd_status == HSA_STATUS_SUCCESS) { - const uint32_t read_index = read_vector.size(); - read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); - status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4( - &read, 
reinterpret_cast(&read_vector[read_index])); - if (status != HSA_STATUS_SUCCESS) - AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + pkt_vector_t reads = {read}; + if (is_concurrent) reads.push_back(read2); + for (auto rd : reads) { + const uint32_t read_index = read_vector.size(); + read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); + status = api->hsa_ven_amd_aqlprofile_legacy_get_pm4( + &rd, reinterpret_cast(&read_vector[read_index])); + if (status != HSA_STATUS_SUCCESS) + AQL_EXC_RAISING(status, "hsa_ven_amd_aqlprofile_legacy_get_pm4"); + } } } else { start_vector.push_back(start); stop_vector.push_back(stop); if (rd_status == HSA_STATUS_SUCCESS) { read_vector.push_back(read); + if (is_concurrent) + read_vector.push_back(read2); } } } @@ -237,46 +280,6 @@ class PmcProfile : public Profile { } }; -class TraceProfile : public Profile { - public: - static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } - static inline uint32_t GetSize() { return output_buffer_size_; } - static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } - static inline bool IsLocal() { return output_buffer_local_; } - - TraceProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { - profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE; - } - - void Insert(const profile_info_t& info) { - if (info.parameters != NULL) { - Profile::Insert(info); - for (unsigned j = 0; j < info.parameter_count; ++j) { - Config(&profile_).Insert(info.parameters[j]); - } - } else if (info.event != NULL) { - Config(&profile_).Insert(*(info.event)); - } else { - EXC_ABORT(HSA_STATUS_ERROR, "invalid trace info inserted"); - } - } - - hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { - profile_.command_buffer.ptr = - rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); - profile_.output_buffer.size = output_buffer_size_; - profile_.output_buffer.ptr = (output_buffer_local_) ? 
- rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size) : - rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); - return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? HSA_STATUS_SUCCESS - : HSA_STATUS_ERROR; - } - - private: - static uint32_t output_buffer_size_; - static bool output_buffer_local_; -}; - } // namespace rocprofiler #endif // SRC_CORE_PROFILE_H_ diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 618edf23..e53d7257 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -150,6 +150,20 @@ void RestoreHsaApi() { table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; } +void PmcStarter(Context* context) { + hsa_agent_t agent = context->GetAgent(); + // Create queue + hsa_queue_t* queue; + hsa_status_t status = rocprofiler::CreateQueuePro(agent, 1, + HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro"); + HsaQueue hsa_queue(NULL, queue); + context->Start(0, &hsa_queue); + context->Read(0, &hsa_queue); + context->GetData(0); + hsa_queue_destroy(queue); +} + void StandaloneIntercept() { ::HsaApiTable* table = kHsaApiTable; table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro; @@ -199,8 +213,6 @@ uint32_t LoadTool() { rocprofiler_settings_t settings{}; settings.intercept_mode = (intercept_mode != 0) ? 1 : 0; - settings.trace_size = TraceProfile::GetSize(); - settings.trace_local = TraceProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; settings.code_obj_tracking = 1; @@ -208,14 +220,17 @@ uint32_t LoadTool() { if (handler) handler(); else if (handler_prop) handler_prop(&settings); - TraceProfile::SetSize(settings.trace_size); - TraceProfile::SetLocal(settings.trace_local != 0); util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; if (settings.code_obj_tracking) intercept_mode |= CODE_OBJ_TRACKING_MODE; if (settings.memcopy_tracking) intercept_mode |= MEMCOPY_INTERCEPT_MODE; if (settings.hsa_intercepting) intercept_mode |= HSA_INTERCEPT_MODE; + if (settings.k_concurrent) { + Context::k_concurrent_ = settings.k_concurrent; + InterceptQueue::k_concurrent_ = settings.k_concurrent; + } + if (settings.opt_mode) InterceptQueue::opt_mode_ = true; } ONLOAD_TRACE("end intercept_mode(" << intercept_mode << ")"); @@ -418,8 +433,6 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( } rocprofiler_properties_t rocprofiler_properties; -uint32_t TraceProfile::output_buffer_size_ = 0x2000000; // 32M -bool TraceProfile::output_buffer_local_ = true; std::atomic Tracker::instance_{}; Tracker::mutex_t Tracker::glob_mutex_; Tracker::counter_t Tracker::counter_ = 0; @@ -427,6 +440,8 @@ util::Logger::mutex_t util::Logger::mutex_; std::atomic util::Logger::instance_{}; } +CONTEXT_INSTANTIATE(); + /////////////////////////////////////////////////////////////////////////////////////////////////// // Public library methods // @@ -536,8 +551,7 @@ PUBLIC_API hsa_status_t rocprofiler_open(hsa_agent_t agent, rocprofiler_feature_ if (mode != 0) { if (mode & ROCPROFILER_MODE_STANDALONE) { if (mode & ROCPROFILER_MODE_CREATEQUEUE) { - if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == - false) { + if (hsa_rsrc->CreateQueue(agent_info, properties->queue_depth, &(properties->queue)) == false) { EXC_RAISING(HSA_STATUS_ERROR, "CreateQueue() failed"); } } 
@@ -591,7 +605,7 @@ PUBLIC_API hsa_status_t rocprofiler_get_group(rocprofiler_t* handle, uint32_t gr rocprofiler_group_t* group) { API_METHOD_PREFIX rocprofiler::Context* context = reinterpret_cast(handle); - *group = context->GetGroupInfo(group_index); + *group = context->GetGroupDescr(group_index); API_METHOD_SUFFIX } @@ -692,12 +706,7 @@ PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() { // Method for iterating the events output data PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data( - rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { - API_METHOD_PREFIX - rocprofiler::Context* context = reinterpret_cast(handle); - context->IterateTraceData(callback, data); - API_METHOD_SUFFIX -} + rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {} //////////////////////////////////////////////////////////////////////////////// // Open profiling pool diff --git a/src/core/tracker.h b/src/core/tracker.h index 823dc17d..d538aff7 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -155,6 +155,49 @@ class Tracker { Enable(entry, reinterpret_cast(handler), arg); } + // Enable tracking + static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) { + Context* context = group->GetContext(); + context->SetOrigSignal(orig_signal); + context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); + + // Creating a proxy signal + const hsa_signal_value_t signal_value = (orig_signal.handle) ? 
+ util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1; + hsa_signal_t& dispatch_signal = context->GetDispatchSignal(); + util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value); + hsa_status_t status = + util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } + + // Tracker handler + static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) { + Group* group = reinterpret_cast(arg); + Context* context = group->GetContext(); + hsa_signal_t dispatch_signal = context->GetDispatchSignal(); + record_t* record = context->GetRecord(); + hsa_amd_profiling_dispatch_time_t dispatch_time{}; + hsa_status_t status = + util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_profiling_get_dispatch_time"); + record->begin = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.start); + record->end = util::HsaRsrcFactory::Instance().SysclockToNs(dispatch_time.end); + record->complete = util::HsaRsrcFactory::Instance().TimestampNs(); + + // Original intercepted signal completion + const hsa_signal_t& orig_signal = context->GetOrigSignal(); + if (orig_signal.handle) { + amd_signal_t* orig_signal_ptr = reinterpret_cast(orig_signal.handle); + amd_signal_t* prof_signal_ptr = reinterpret_cast(dispatch_signal.handle); + orig_signal_ptr->start_ts = prof_signal_ptr->start_ts; + orig_signal_ptr->end_ts = prof_signal_ptr->end_ts; + util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(orig_signal, signal_value); + } + + return Context::Handler(signal_value, arg); + } + private: Tracker() : outstanding_(0), diff --git a/src/util/hsa_rsrc_factory.cpp 
b/src/util/hsa_rsrc_factory.cpp index 78833284..e2f97ce9 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -36,6 +36,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -626,6 +627,8 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); + close(file_handle); + // Update output parameter *code_desc = kernelSymbol; return true; @@ -705,7 +708,7 @@ const char* HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { std::lock_guard lck(mutex_); const auto it = symbols_map_->find(addr); if (it == symbols_map_->end()) { - fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n", addr); abort(); } return it->second; diff --git a/test/app/standalone_test.cpp b/test/app/standalone_test.cpp index 34bc05ea..7758daf2 100644 --- a/test/app/standalone_test.cpp +++ b/test/app/standalone_test.cpp @@ -146,19 +146,6 @@ int main() { // feature[8].name = "TCC_EA_WRREQ_sum"; // feature[9].kind = ROCPROFILER_FEATURE_KIND_METRIC; // feature[9].name = "TCC_EA_WRREQ_64B_sum"; -#if 0 - // Tracing parameters - const unsigned parameter_count = 2; - rocprofiler_parameter_t parameters[parameter_count]; - feature[2].name = "THREAD_TRACE"; - feature[2].kind = ROCPROFILER_FEATURE_KIND_TRACE; - feature[2].parameters = parameters; - feature[2].parameter_count = parameter_count; - parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK; - parameters[0].value = 0; - parameters[1].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK; - parameters[1].value = 0; -#endif // Instantiate HSA resources HsaRsrcFactory::Create(); diff --git a/test/run.sh b/test/run.sh index 4c985d3e..8611d7ef 100755 --- a/test/run.sh +++ b/test/run.sh @@ -86,7 +86,7 @@ export HSA_TOOLS_LIB=librocprofiler64.so.1 # enable 
intercepting mode in rocprofiler export ROCP_HSA_INTERCEPT=2 # test macro for kernel iterations number -export ROCP_KITER=100 +export ROCP_KITER=20 # test macro for per-kernel dispatching number export ROCP_DITER=10 eval_test "Standalone intercepting test" ./test/stand_intercept_test @@ -95,8 +95,8 @@ unset ROCP_HSA_INTERCEPT ## Intercepting usage model test # tool library loaded by ROC profiler export ROCP_TOOL_LIB=./test/libintercept_test.so -export ROCP_KITER=50 -export ROCP_DITER=50 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=3 eval_test "Intercepting usage model test" ./test/ctrl @@ -107,59 +107,56 @@ export ROCP_TOOL_LIB=libtool.so # ROC profiler kernels timing export ROCP_TIMESTAMP_ON=1 # output directory for the tool library, for metrics results file 'results.txt' -# and SQTT trace files 'thread_trace.se.out' export ROCP_OUTPUT_DIR=./RESULTS if [ ! -e $ROCP_TOOL_LIB ] ; then export ROCP_TOOL_LIB=test/libtool.so fi -export ROCP_KITER=50 -export ROCP_DITER=50 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=1 -export ROCP_INPUT=input.xml -eval_test "'rocprof' libtool test" ./test/ctrl +export ROCP_INPUT=pmc_input.xml +eval_test "'rocprof' libtool PMC test" ./test/ctrl -export ROCP_KITER=10 -export ROCP_DITER=10 +export ROCP_KITER=20 +export ROCP_DITER=20 export ROCP_AGENTS=1 export ROCP_THRS=10 -export ROCP_INPUT=input1.xml -eval_test "'rocprof' libtool test n-threads" ./test/ctrl - -## SPM test -# export ROCP_KITER=3 -# export ROCP_DITER=3 -# export ROCP_AGENTS=1 -# export ROCP_THRS=1 -# export ROCP_INPUT=spm_input.xml -# export ROCP_SPM=1 -# eval_test "libtool test, SPM trace test" ./test/ctrl -# unset ROCP_SPM +export ROCP_INPUT=pmc_input.xml +eval_test "'rocprof' libtool PMC n-thread test" ./test/ctrl -## Libtool test, counter sets -# Memcopies tracking -export ROCP_MCOPY_TRACKING=1 +export ROCP_OPT_MODE=1 +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export 
ROCP_THRS=10 +export ROCP_INPUT=pmc_input.xml +eval_test "'rocprof' libtool PMC n-thread opt test" ./test/ctrl +unset ROCP_OPT_MODE -export ROCP_KITER=1 -export ROCP_DITER=4 -export ROCP_INPUT=input2.xml -eval_test "libtool test, counter sets" ./test/ctrl +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export ROCP_THRS=1 +export ROCP_INPUT=pmc_input1.xml +eval_test "'rocprof' libtool PMC test1" ./test/ctrl -## OpenCL test -#export ROCP_OBJ_TRACKING=1 -#export ROCP_INPUT=input1.xml -#eval_test "libtool test, OpenCL sample" ./test/ocl/SimpleConvolution +export ROCP_KITER=20 +export ROCP_DITER=20 +export ROCP_AGENTS=1 +export ROCP_THRS=10 +export ROCP_INPUT=pmc_input1.xml +eval_test "'rocprof' libtool PMC n-thread test1" ./test/ctrl -# Memcopies tracking -unset ROCP_MCOPY_TRACKING +## Libtool test, counter sets # enable HSA intercepting export ROCP_HSA_INTERC=1 export ROCP_KITER=10 export ROCP_DITER=10 -export ROCP_INPUT=input1.xml +#export ROCP_INPUT=input1.xml eval_test "libtool test, counter sets" ./test/ctrl ## OpenCL test diff --git a/test/tool/pmc_input.xml b/test/tool/pmc_input.xml new file mode 100644 index 00000000..6b9e3d6a --- /dev/null +++ b/test/tool/pmc_input.xml @@ -0,0 +1,4 @@ +# List of metrics + diff --git a/test/tool/pmc_input1.xml b/test/tool/pmc_input1.xml new file mode 100644 index 00000000..6863fa29 --- /dev/null +++ b/test/tool/pmc_input1.xml @@ -0,0 +1,14 @@ +# Filter by dispatches range, GPU index and kernel names + + +# List of metrics + diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index e216b7fd..6b2adf8a 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -27,6 +27,7 @@ THE SOFTWARE. 
/////////////////////////////////////////////////////////////////////////////// #include +#include #include #include #include @@ -99,6 +100,7 @@ struct context_entry_t { unsigned feature_count; rocprofiler_callback_data_t data; kernel_properties_t kernel_properties; + HsaRsrcFactory::symbols_map_it_t kernel_name_it; FILE* file_handle; }; @@ -143,8 +145,6 @@ static uint32_t CTX_OUTSTANDING_MON = 0; uint32_t to_truncate_names = 0; // local trace buffer bool is_trace_local = true; -// SPM trace enabled -bool is_spm_trace = false; static inline uint32_t GetPid() { return syscall(__NR_getpid); } static inline uint32_t GetTid() { return syscall(__NR_gettid); } @@ -169,6 +169,21 @@ void check_status(hsa_status_t status) { } } +////////////////////////////////////////////////////////////////////////////////////// +// Dispatch opt code ///////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////////////////////////////// +// Context callback arg +struct callbacks_arg_t { + rocprofiler_pool_t** pools; +}; + +// Handler callback arg +struct handler_arg_t { + rocprofiler_feature_t* features; + unsigned feature_count; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////// // Print profiling results output break if terminal output is enabled void results_output_break() { const bool is_terminal_output = (result_file_opened == false); @@ -289,54 +304,18 @@ void dealloc_context_entry(context_entry_t* entry) { } } -// Dump trace data to file -void dump_sqtt_trace(const char* label, const uint32_t chunk, const void* data, const uint32_t& size) { - if (result_prefix != NULL) { - // Open file - std::ostringstream oss; - oss << result_prefix << "/thread_trace_" << label << "_se" << chunk << ".out"; - FILE* file = fopen(oss.str().c_str(), "w"); - if (file == NULL) { - std::ostringstream errmsg; - errmsg << "fopen error, file '" << oss.str().c_str() << "'"; - 
perror(errmsg.str().c_str()); - abort(); - } - - // Write the buffer in terms of shorts (16 bits) - const unsigned short* ptr = reinterpret_cast(data); - for (uint32_t i = 0; i < (size / sizeof(short)); ++i) { - fprintf(file, "%04x\n", ptr[i]); - } - - // Close file - fclose(file); - } -} - -// Dump trace data to file -void dump_spm_trace(const char* label, const void* data, const uint32_t& size) { - if (result_prefix != NULL) { - // Open trace file - std::ostringstream oss; - oss << result_prefix << "/spm_trace_" << label << ".out"; - const int fd = open(oss.str().c_str(), O_CREAT|O_WRONLY|O_TRUNC, 0666); - if (fd == -1) { - std::ostringstream errmsg; - errmsg << "open error, file '" << oss.str().c_str() << "'"; - perror(errmsg.str().c_str()); - abort(); - } - // write trace binary data - if (write(fd, data, size) == -1) { - std::ostringstream errmsg; - errmsg << "write error, file '" << oss.str().c_str() << "'"; - perror(errmsg.str().c_str()); - abort(); - } - // Close file - close(fd); - } +// Global context map +static std::mutex ctx_a_mutex; +typedef std::map ctx_a_map_t; +ctx_a_map_t* ctx_a_map = NULL; +context_entry_t* ck_ctx_entry(hsa_agent_t agent, bool& found) { + std::lock_guard lock(ctx_a_mutex); + if (ctx_a_map == NULL) ctx_a_map = new ctx_a_map_t; + auto ret = ctx_a_map->insert({agent.handle, NULL}); + found = !ret.second; + if (found) ctx_a_map->erase(agent.handle); + else ret.first->second = new context_entry_t{}; + return ret.first->second; } struct trace_data_arg_t { @@ -345,54 +324,6 @@ struct trace_data_arg_t { hsa_agent_t agent; }; -// Trace data callback for getting trace data from GPU local memory -hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, - hsa_ven_amd_aqlprofile_info_data_t* info_data, void* data) { - hsa_status_t status = HSA_STATUS_SUCCESS; - trace_data_arg_t* arg = reinterpret_cast(data); - if (info_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { - if (is_spm_trace) { - if (info_data->sample_id != 0) { - 
fatal("Only one SPM sample expected"); - } - const void* data_ptr = info_data->trace_data.ptr; - const uint32_t data_size = info_data->trace_data.size; - fprintf(arg->file, " size(%u)\n", data_size); - - if (is_trace_local == false) fatal("SPM trace supports only local trace allocation"); - HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); - const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); - const uint32_t mem_size = data_size; - void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); - if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { - fatal("Trace data memcopy to host failed"); - } - dump_spm_trace(arg->label, buffer, data_size); - HsaRsrcFactory::FreeMemory(buffer); - } else { - const void* data_ptr = info_data->trace_data.ptr; - const uint32_t data_size = info_data->trace_data.size; - fprintf(arg->file, " SE(%u) size(%u)\n", info_data->sample_id, data_size); - - if (is_trace_local) { - HsaRsrcFactory* hsa_rsrc = &HsaRsrcFactory::Instance(); - const AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(arg->agent); - const uint32_t mem_size = data_size; - void* buffer = hsa_rsrc->AllocateSysMemory(agent_info, mem_size); - if(!hsa_rsrc->Memcpy(agent_info, buffer, data_ptr, mem_size)) { - fatal("Trace data memcopy to host failed"); - } - dump_sqtt_trace(arg->label, info_data->sample_id, buffer, data_size); - HsaRsrcFactory::FreeMemory(buffer); - } else { - dump_sqtt_trace(arg->label, info_data->sample_id, data_ptr, data_size); - } - } - } else - status = HSA_STATUS_ERROR; - return status; -} - // Align to specified alignment unsigned align_size(unsigned size, unsigned alignment) { return ((size + alignment - 1) & ~(alignment - 1)); @@ -413,38 +344,7 @@ void output_results(const context_entry_t* entry, const char* label) { case ROCPROFILER_DATA_KIND_INT64: fprintf(file, "(%lu)\n", p->data.result_int64); break; - // Output trace results - case ROCPROFILER_DATA_KIND_BYTES: { - if (p->data.result_bytes.copy) { - uint64_t size = 
0; - - const char* ptr = reinterpret_cast(p->data.result_bytes.ptr); - const char* end = reinterpret_cast(ptr + p->data.result_bytes.size); - for (unsigned i = 0; i < p->data.result_bytes.instance_count; ++i) { - const uint32_t chunk_size = *reinterpret_cast(ptr); - const char* chunk_data = ptr + sizeof(uint32_t); - if (chunk_data >= end) fatal("Trace data is out of the result buffer size"); - - dump_sqtt_trace(label, i, chunk_data, chunk_size); - const uint32_t off = align_size(chunk_size, sizeof(uint32_t)); - ptr = chunk_data + off; - if (chunk_data >= end) fatal("Trace data ptr is out of the result buffer size"); - size += chunk_size; - } - fprintf(file, "size(%lu)\n", size); - HsaRsrcFactory::FreeMemory(p->data.result_bytes.ptr); - const_cast(p)->data.result_bytes.size = 0; - } else { - fprintf(file, "(\n"); - trace_data_arg_t trace_data_arg{file, label, entry->agent}; - hsa_status_t status = rocprofiler_iterate_trace_data(context, trace_data_cb, reinterpret_cast(&trace_data_arg)); - check_status(status); - fprintf(file, " )\n"); - } - break; - } default: - if (is_spm_trace) continue; fprintf(stderr, "RPL-tool: undefined data kind(%u)\n", p->data.kind); abort(); } @@ -465,7 +365,7 @@ void output_group(const context_entry_t* entry, const char* label) { } // Dump stored context entry -bool dump_context_entry(context_entry_t* entry) { +bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { hsa_status_t status = HSA_STATUS_ERROR; volatile std::atomic* valid = reinterpret_cast*>(&entry->valid); @@ -481,35 +381,36 @@ bool dump_context_entry(context_entry_t* entry) { ++context_collected; const uint32_t index = entry->index; - FILE* file_handle = entry->file_handle; - const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); - const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", - index, - agent_info->dev_index, - entry->data.queue_id, - entry->data.queue_index, - my_pid, - entry->data.thread_id, - entry->kernel_properties.grid_size, - entry->kernel_properties.workgroup_size, - (entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) & ~(AgentInfo::lds_block_size - 1), - entry->kernel_properties.scratch_size, - (entry->kernel_properties.vgpr_count + 1) * agent_info->vgpr_block_size, - (entry->kernel_properties.sgpr_count + agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, - entry->kernel_properties.fbarrier_count, - entry->kernel_properties.signal.handle, - nik_name.c_str()); - if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", - record->dispatch, - record->begin, - record->end, - record->complete); - fprintf(file_handle, "\n"); - fflush(file_handle); - - if (record) { + if (index != UINT32_MAX) { + FILE* file_handle = entry->file_handle; + const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); + const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); + + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", + index, + agent_info->dev_index, + entry->data.queue_id, + entry->data.queue_index, + my_pid, + entry->data.thread_id, + entry->kernel_properties.grid_size, + entry->kernel_properties.workgroup_size, + (entry->kernel_properties.lds_size + (AgentInfo::lds_block_size - 1)) & ~(AgentInfo::lds_block_size - 1), + entry->kernel_properties.scratch_size, + (entry->kernel_properties.vgpr_count + 1) * agent_info->vgpr_block_size, + (entry->kernel_properties.sgpr_count + agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, + entry->kernel_properties.fbarrier_count, + entry->kernel_properties.signal.handle, + nik_name.c_str()); + if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", + record->dispatch, + record->begin, + record->end, + record->complete); + fprintf(file_handle, "\n"); + fflush(file_handle); + } + if (record && to_clean) { delete record; entry->data.record = NULL; } @@ -527,11 +428,11 @@ bool dump_context_entry(context_entry_t* entry) { std::ostringstream oss; oss << index << "__" << filtr_kernel_name(entry->data.kernel_name); output_results(entry, oss.str().substr(0, KERNEL_NAME_LEN_MAX).c_str()); - free(const_cast(entry->data.kernel_name)); + if (to_clean) free(const_cast(entry->data.kernel_name)); // Finishing cleanup // Deleting profiling context will delete all allocated resources - rocprofiler_close(group.context); + if (to_clean) rocprofiler_close(group.context); } return true; @@ -574,7 +475,6 @@ void dump_context_array(hsa_queue_t* queue) { // Profiling completion handler // Dump and delete the context entry -// Return true if the context was dumped successfully bool 
context_handler(rocprofiler_group_t group, void* arg) { context_entry_t* entry = reinterpret_cast(arg); @@ -606,6 +506,34 @@ bool context_handler(rocprofiler_group_t group, void* arg) { return false; } +// Profiling completion handler +// Dump context entry +bool context_pool_handler(const rocprofiler_pool_entry_t* entry, void* arg) { + // Context entry + context_entry_t* ctx_entry = reinterpret_cast(entry->payload); + handler_arg_t* handler_arg = reinterpret_cast(arg); + ctx_entry->features = handler_arg->features; + ctx_entry->feature_count = handler_arg->feature_count; + ctx_entry->data.kernel_name = ctx_entry->kernel_name_it->second.name; + ctx_entry->file_handle = result_file_handle; + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + dump_context_entry(ctx_entry, false); + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + HsaRsrcFactory::ReleaseKernelNameRef(ctx_entry->kernel_name_it); + + return false; +} + bool check_filter(const rocprofiler_callback_data_t* callback_data, const callbacks_data_t* tool_data) { bool found = true; @@ -641,12 +569,54 @@ bool check_filter(const rocprofiler_callback_data_t* callback_data, const callba return found; } +static const amd_kernel_code_t* GetKernelCode(uint64_t kernel_object) { + const amd_kernel_code_t* kernel_code = NULL; + hsa_status_t status = + HsaRsrcFactory::Instance().LoaderApi()->hsa_ven_amd_loader_query_host_address( + reinterpret_cast(kernel_object), + reinterpret_cast(&kernel_code)); + if (HSA_STATUS_SUCCESS != status) { + kernel_code = reinterpret_cast(kernel_object); + } + return kernel_code; +} + +// Setting kernel properties +void set_kernel_properties(const rocprofiler_callback_data_t* callback_data, + context_entry_t* entry) +{ + const hsa_kernel_dispatch_packet_t* packet = callback_data->packet; + kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties); + const amd_kernel_code_t* kernel_code = 
callback_data->kernel_code; + + entry->data = *callback_data; + + if (kernel_code == NULL) { + const uint64_t kernel_object = callback_data->packet->kernel_object; + kernel_code = GetKernelCode(kernel_object); + entry->kernel_name_it = HsaRsrcFactory::AcquireKernelNameRef(kernel_object); + } else { + entry->data.kernel_name = strdup(callback_data->kernel_name); + } + + uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z; + if (grid_size > UINT32_MAX) abort(); + kernel_properties_ptr->grid_size = (uint32_t)grid_size; + uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; + if (workgroup_size > UINT32_MAX) abort(); + kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; + kernel_properties_ptr->lds_size = packet->group_segment_size; + kernel_properties_ptr->scratch_size = packet->private_segment_size; + kernel_properties_ptr->vgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); + kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); + kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; + kernel_properties_ptr->signal = callback_data->completion_signal; +} + // Kernel disoatch callback hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) { // Passed tool data - const hsa_kernel_dispatch_packet_t* packet = callback_data->packet; - const amd_kernel_code_t* kernel_code = callback_data->kernel_code; callbacks_data_t* tool_data = reinterpret_cast(user_data); // HSA status hsa_status_t status = HSA_STATUS_ERROR; @@ -659,23 +629,10 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, } } // Profiling context - rocprofiler_t* context = NULL; // Context entry context_entry_t* entry = 
alloc_context_entry(); - // kernel properties - kernel_properties_t* kernel_properties_ptr = &(entry->kernel_properties); - uint64_t grid_size = packet->grid_size_x * packet->grid_size_y * packet->grid_size_z; - if (grid_size > UINT32_MAX) abort(); - kernel_properties_ptr->grid_size = (uint32_t)grid_size; - uint64_t workgroup_size = packet->workgroup_size_x * packet->workgroup_size_y * packet->workgroup_size_z; - if (workgroup_size > UINT32_MAX) abort(); - kernel_properties_ptr->workgroup_size = (uint32_t)workgroup_size; - kernel_properties_ptr->lds_size = packet->group_segment_size; - kernel_properties_ptr->scratch_size = packet->private_segment_size; - kernel_properties_ptr->vgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WORKITEM_VGPR_COUNT); - kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); - kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; - kernel_properties_ptr->signal = callback_data->completion_signal; + // Setting kernel properties + set_kernel_properties(callback_data, entry); // context properties rocprofiler_properties_t properties{}; @@ -701,6 +658,7 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, } // Open profiling context + rocprofiler_t* context = NULL; status = rocprofiler_open(callback_data->agent, features, feature_count, &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); check_status(status); @@ -720,8 +678,6 @@ hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, entry->group = *group; entry->features = features; entry->feature_count = feature_count; - entry->data = *callback_data; - entry->data.kernel_name = strdup(callback_data->kernel_name); entry->file_handle = tool_data->file_handle; entry->active = true; reinterpret_cast*>(&entry->valid)->store(true); @@ -734,6 +690,35 @@ hsa_status_t 
dispatch_callback(const rocprofiler_callback_data_t* callback_data, return status; } +// Kernel disoatch callback +hsa_status_t dispatch_callback_opt(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + hsa_status_t status = HSA_STATUS_ERROR; + hsa_agent_t agent = callback_data->agent; + const unsigned gpu_id = HsaRsrcFactory::Instance().GetAgentInfo(agent)->dev_index; + callbacks_arg_t* callbacks_arg = reinterpret_cast(user_data); + rocprofiler_pool_t* pool = callbacks_arg->pools[gpu_id]; + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + check_status(status); + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast(pool_entry.payload); + // Setting kernel properties + set_kernel_properties(callback_data, entry); + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + // Fill profiling context entry + entry->index = UINT32_MAX; + entry->agent = agent; + entry->group = *group; + + reinterpret_cast*>(&entry->valid)->store(true); + return status; +} + hsa_status_t destroy_callback(hsa_queue_t* queue, void*) { results_output_break(); dump_context_array(queue); @@ -889,9 +874,19 @@ rocprofiler_hsa_callbacks_t hsa_callbacks { hsa_unified_callback, hsa_unified_callback, hsa_unified_callback, - hsa_unified_callback + hsa_unified_callback, + NULL }; +// HSA kernel symbol callback +hsa_status_t hsa_ksymbol_cb(rocprofiler_hsa_cb_id_t id, + const rocprofiler_hsa_callback_data_t* data, + void* arg) +{ + HsaRsrcFactory::SetKernelNameRef(data->ksymbol.object, data->ksymbol.name, data->ksymbol.destroy); + return HSA_STATUS_SUCCESS; +} + // Tool constructor extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) { @@ -979,6 +974,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Set HSA intercepting check_env_var("ROCP_HSA_INTERC", 
settings->hsa_intercepting); if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); + // Enable optmized mode + check_env_var("ROCP_OPT_MODE", settings->opt_mode); is_trace_local = settings->trace_local; @@ -1064,6 +1061,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) range_vec->push_back(*(range_vec->begin()) + 1); } + const bool filter_disabled = (gpu_index_vec->empty() && kernel_string_vec->empty() && range_vec->empty()); + // Getting traces const auto traces_list = xml->GetNodes("top.trace"); if (traces_list.size() > 1) fatal("ROCProfiler: only one trace supported at a time"); @@ -1087,26 +1086,79 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Context array aloocation context_array = new context_array_t; - // Adding dispatch observer - rocprofiler_queue_callbacks_t callbacks_ptrs{0}; - callbacks_ptrs.dispatch = dispatch_callback; - callbacks_ptrs.destroy = destroy_callback; - - callbacks_data = new callbacks_data_t{}; - callbacks_data->features = features; - callbacks_data->feature_count = features_found; - callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; - callbacks_data->group_index = 0; - callbacks_data->file_handle = result_file_handle; - callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; - callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; - callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; - callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || - (callbacks_data->kernel_string != NULL) || - (callbacks_data->range != NULL) - ? 
1 : 0; - - rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); + bool opt_mode_cond = ((features_found != 0) && + (metrics_set->empty()) && + (traces_found == 0) && + (filter_disabled == true)); + if (settings->opt_mode == 0) opt_mode_cond = false; + if (!opt_mode_cond) settings->opt_mode = 0; + if (opt_mode_cond) { + // Handler arg + handler_arg_t* handler_arg = new handler_arg_t{}; + handler_arg->features = features; + handler_arg->feature_count = feature_count; + + // Context properties + rocprofiler_pool_properties_t properties{}; + properties.num_entries = (CTX_OUTSTANDING_MAX != 0) ? CTX_OUTSTANDING_MAX : 1000; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_pool_handler; + properties.handler_arg = handler_arg; + + // Available GPU agents + const unsigned gpu_count = HsaRsrcFactory::Instance().GetCountOfGpuAgents(); + callbacks_arg_t* callbacks_arg = new callbacks_arg_t{}; + callbacks_arg->pools = new rocprofiler_pool_t* [gpu_count]; + for (unsigned gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Getting GPU device info + const AgentInfo* agent_info = NULL; + if (HsaRsrcFactory::Instance().GetGpuAgentInfo(gpu_id, &agent_info) == false) { + fprintf(stderr, "GetGpuAgentInfo failed\n"); + abort(); + } + + // Open profiling pool + rocprofiler_pool_t* pool = NULL; + hsa_status_t status = rocprofiler_pool_open(agent_info->dev_id, features, features_found, + &pool, 0, &properties); + check_status(status); + callbacks_arg->pools[gpu_id] = pool; + } + + // Adding dispatch observer + rocprofiler_queue_callbacks_t callbacks_ptrs{0}; + callbacks_ptrs.dispatch = dispatch_callback_opt; + callbacks_ptrs.destroy = destroy_callback; + + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_arg); + + rocprofiler_hsa_callbacks_t cs{}; + cs.ksymbol = hsa_ksymbol_cb; + rocprofiler_set_hsa_callbacks(cs, NULL); + settings->code_obj_tracking = 0; + settings->hsa_intercepting = 1; + } else { + // Adding dispatch observer + 
rocprofiler_queue_callbacks_t callbacks_ptrs{0}; + callbacks_ptrs.dispatch = dispatch_callback; + callbacks_ptrs.destroy = destroy_callback; + + callbacks_data = new callbacks_data_t{}; + callbacks_data->features = features; + callbacks_data->feature_count = features_found; + callbacks_data->set = (metrics_set->empty()) ? NULL : metrics_set; + callbacks_data->group_index = 0; + callbacks_data->file_handle = result_file_handle; + callbacks_data->gpu_index = (gpu_index_vec->empty()) ? NULL : gpu_index_vec; + callbacks_data->kernel_string = (kernel_string_vec->empty()) ? NULL : kernel_string_vec; + callbacks_data->range = (range_vec->empty()) ? NULL : range_vec;; + callbacks_data->filter_on = (callbacks_data->gpu_index != NULL) || + (callbacks_data->kernel_string != NULL) || + (callbacks_data->range != NULL) + ? 1 : 0; + + rocprofiler_set_queue_callbacks(callbacks_ptrs, callbacks_data); + } xml::Xml::Destroy(xml); diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index 10f9fbc1..7d3301a3 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -24,6 +24,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "util/hsa_rsrc_factory.h" +#include #include #include #include @@ -36,6 +37,7 @@ POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include @@ -44,6 +46,14 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +// Demangle C++ symbol name +static const char* cpp_demangle(const char* symname) { + size_t size = 0; + int status; + const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); + return (ret != 0) ? 
ret : strdup(symname); +} + // Callback function to get available in the system agents hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { hsa_status_t status = HSA_STATUS_ERROR; @@ -192,6 +202,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_create_alt = table->core_->hsa_executable_create_alt_fn; hsa_api_.hsa_executable_load_agent_code_object = table->core_->hsa_executable_load_agent_code_object_fn; hsa_api_.hsa_executable_freeze = table->core_->hsa_executable_freeze_fn; + hsa_api_.hsa_executable_destroy = table->core_->hsa_executable_destroy_fn; hsa_api_.hsa_executable_get_symbol = table->core_->hsa_executable_get_symbol_fn; hsa_api_.hsa_executable_symbol_get_info = table->core_->hsa_executable_symbol_get_info_fn; hsa_api_.hsa_executable_iterate_symbols = table->core_->hsa_executable_iterate_symbols_fn; @@ -232,6 +243,7 @@ void HsaRsrcFactory::InitHsaApiTable(HsaApiTable* table) { hsa_api_.hsa_executable_create_alt = hsa_executable_create_alt; hsa_api_.hsa_executable_load_agent_code_object = hsa_executable_load_agent_code_object; hsa_api_.hsa_executable_freeze = hsa_executable_freeze; + hsa_api_.hsa_executable_destroy = hsa_executable_destroy; hsa_api_.hsa_executable_get_symbol = hsa_executable_get_symbol; hsa_api_.hsa_executable_symbol_get_info = hsa_executable_symbol_get_info; hsa_api_.hsa_executable_iterate_symbols = hsa_executable_iterate_symbols; @@ -618,6 +630,8 @@ bool HsaRsrcFactory::LoadAndFinalize(const AgentInfo* agent_info, const char* br &kernelSymbol); CHECK_STATUS("Error in looking up kernel symbol", status); + close(file_handle); + // Update output parameter *code_desc = kernelSymbol; return true; @@ -693,52 +707,57 @@ uint64_t HsaRsrcFactory::Submit(hsa_queue_t* queue, const void* packet, size_t s return write_idx; } -const char* HsaRsrcFactory::GetKernelNameRef(uint64_t addr) { - std::lock_guard lck(mutex_); - const auto it = symbols_map_->find(addr); - if (it == 
symbols_map_->end()) { - fprintf(stderr, "HsaRsrcFactory::kernel addr (0x%lx) is not found\n", addr); - abort(); - } - return it->second; -} - -void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { - std::lock_guard lck(mutex_); - executable_tracking_on_ = true; - table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; -} - -hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data) { +hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *arg) { hsa_symbol_kind_t value = (hsa_symbol_kind_t)0; hsa_status_t status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &value); CHECK_STATUS("Error in getting symbol info", status); + if (value == HSA_SYMBOL_KIND_KERNEL) { uint64_t addr = 0; - uint32_t len = 0; status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &addr); CHECK_STATUS("Error in getting kernel object", status); - status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); - CHECK_STATUS("Error in getting name len", status); - char *name = new char[len + 1]; - status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); - CHECK_STATUS("Error in getting kernel name", status); - name[len] = 0; - auto ret = symbols_map_->insert({addr, name}); - if (ret.second == false) { - delete[] ret.first->second; - ret.first->second = name; + + const int to_free = reinterpret_cast(arg); + const char* name = NULL; + if (to_free == 0) { + uint32_t len = 0; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); + CHECK_STATUS("Error in getting name len", status); + char sym_name[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, sym_name); + CHECK_STATUS("Error in getting 
kernel name", status); + sym_name[len] = 0; + name = cpp_demangle(sym_name); } + + SetKernelNameRef(addr, name, to_free); } + return HSA_STATUS_SUCCESS; } hsa_status_t HsaRsrcFactory::hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options) { std::lock_guard lck(mutex_); if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; - hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, NULL); + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)0); CHECK_STATUS("Error in iterating executable symbols", status); - return hsa_api_.hsa_executable_freeze(executable, options);; + return hsa_api_.hsa_executable_freeze(executable, options); +} + +hsa_status_t HsaRsrcFactory::hsa_executable_destroy_interceptor(hsa_executable_t executable) { + std::lock_guard lck(mutex_); + if (symbols_map_ != NULL) { + hsa_status_t status = hsa_api_.hsa_executable_iterate_symbols(executable, executable_symbols_cb, (void*)1); + CHECK_STATUS("Error in iterating executable symbols", status); + } + return hsa_api_.hsa_executable_destroy(executable); +} + +void HsaRsrcFactory::EnableExecutableTracking(HsaApiTable* table) { + std::lock_guard lck(mutex_); + executable_tracking_on_ = true; + table->core_->hsa_executable_freeze_fn = hsa_executable_freeze_interceptor; + table->core_->hsa_executable_destroy_fn = hsa_executable_destroy_interceptor; } std::atomic HsaRsrcFactory::instance_{}; diff --git a/test/util/hsa_rsrc_factory.h b/test/util/hsa_rsrc_factory.h index e857813b..ca5a6e7a 100644 --- a/test/util/hsa_rsrc_factory.h +++ b/test/util/hsa_rsrc_factory.h @@ -95,6 +95,7 @@ struct hsa_pfn_t { decltype(hsa_executable_create_alt)* hsa_executable_create_alt; decltype(hsa_executable_load_agent_code_object)* hsa_executable_load_agent_code_object; decltype(hsa_executable_freeze)* hsa_executable_freeze; + decltype(hsa_executable_destroy)* hsa_executable_destroy; 
decltype(hsa_executable_get_symbol)* hsa_executable_get_symbol; decltype(hsa_executable_symbol_get_info)* hsa_executable_symbol_get_info; decltype(hsa_executable_iterate_symbols)* hsa_executable_iterate_symbols; @@ -286,6 +287,13 @@ class HsaRsrcFactory { typedef std::recursive_mutex mutex_t; typedef HsaTimer::timestamp_t timestamp_t; + // Executables loading tracking + struct symbols_map_data_t { + const char* name; + uint64_t refs_count; + }; + typedef std::map symbols_map_t; + static HsaRsrcFactory* Create(bool initialize_hsa = true) { std::lock_guard lck(mutex_); HsaRsrcFactory* obj = instance_.load(std::memory_order_relaxed); @@ -406,7 +414,88 @@ class HsaRsrcFactory { // Enable executables loading tracking static bool IsExecutableTracking() { return executable_tracking_on_; } static void EnableExecutableTracking(HsaApiTable* table); - static const char* GetKernelNameRef(uint64_t addr); + + typedef symbols_map_t::iterator symbols_map_it_t; + + static inline const char* GetKernelNameRef(const uint64_t& addr) { + if (symbols_map_ == NULL) { + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx), error\n", addr); + abort(); + } + + std::lock_guard lck(mutex_); + + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n", addr); + abort(); + } + + return it->second.name; + } + + static inline symbols_map_it_t AcquireKernelNameRef(const uint64_t& addr) { + if (symbols_map_ == NULL) { + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx), error\n", addr); + abort(); + } + + std::lock_guard lck(mutex_); + + const auto it = symbols_map_->find(addr); + if (it == symbols_map_->end()) { + fprintf(stderr, "HsaRsrcFactory::GetKernelNameRef: kernel addr (0x%lx) is not found\n", addr); + abort(); + } + + std::atomic* atomic_ptr = + reinterpret_cast*>(&(it->second.refs_count)); + atomic_ptr->fetch_add(1, std::memory_order_relaxed); 
+ + return it; + } + + static inline void ReleaseKernelNameRef(const symbols_map_it_t& it) { + std::atomic* atomic_ptr = + reinterpret_cast*>(&(it->second.refs_count)); + atomic_ptr->fetch_sub(1, std::memory_order_relaxed); + } + + static inline void SetKernelNameRef(const uint64_t& addr, const char* name, const int& free) { + if (symbols_map_ == NULL) { + std::lock_guard lck(mutex_); + if (symbols_map_ == NULL) symbols_map_ = new symbols_map_t; + } + + auto it = symbols_map_->find(addr); + if (it != symbols_map_->end()) { + while (1) { + while(it->second.refs_count != 0) sched_yield(); + mutex_.lock(); + if (it->second.refs_count == 0) break; + mutex_.unlock(); + } + } + + if (it != symbols_map_->end()) { + delete[] it->second.name; + if (free == 1) { + symbols_map_->erase(it); + } else { + fprintf(stderr, "HsaRsrcFactory::SetKernelNameRef: to set kernel addr (0x%lx) conflict\n", addr); + abort(); + } + } else { + if (free == 0) { + symbols_map_->insert({addr, symbols_map_data_t{name, 0}}); + } else { + fprintf(stderr, "HsaRsrcFactory::SetKernelNameRef: to free kernel addr (0x%lx) not found\n", addr); + abort(); + } + } + + mutex_.unlock(); + } // Initialize HSA API table void static InitHsaApiTable(HsaApiTable* table); @@ -492,11 +581,10 @@ class HsaRsrcFactory { // System agents map std::map agent_map_; - // Executables loading tracking - typedef std::map symbols_map_t; static symbols_map_t* symbols_map_; static bool executable_tracking_on_; static hsa_status_t hsa_executable_freeze_interceptor(hsa_executable_t executable, const char *options); + static hsa_status_t hsa_executable_destroy_interceptor(hsa_executable_t executable); static hsa_status_t executable_symbols_cb(hsa_executable_t exec, hsa_executable_symbol_t symbol, void *data); // HSA runtime API table From b9e5f11509988e7edf13485beea6b9b1dd785700 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 21 Aug 2020 10:51:18 -0500 Subject: [PATCH 125/168] merge fix --- src/core/context.h | 1 - 
src/core/rocprofiler.cpp | 4 +++- test/tool/tool.cpp | 2 -- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/context.h b/src/core/context.h index 8be3a9e8..3116e036 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -499,7 +499,6 @@ class Context { hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; callback_data_t* callback_data = reinterpret_cast(data); - const profile_t* profile = callback_data->profile; info_vector_t& info_vector = *(callback_data->info_vector); uint32_t index = callback_data->index; const uint32_t sample_id = ainfo_data->sample_id; diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index e53d7257..d5af91c2 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -706,7 +706,9 @@ PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() { // Method for iterating the events output data PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data( - rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) {} + rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { + return HSA_STATUS_ERROR; +} //////////////////////////////////////////////////////////////////////////////// // Open profiling pool diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 6b2adf8a..30e35504 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -334,7 +334,6 @@ void output_results(const context_entry_t* entry, const char* label) { FILE* file = entry->file_handle; const rocprofiler_feature_t* features = entry->features; const unsigned feature_count = entry->feature_count; - rocprofiler_t* context = entry->group.context; for (unsigned i = 0; i < feature_count; ++i) { const rocprofiler_feature_t* p = &features[i]; @@ -1088,7 +1087,6 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) bool opt_mode_cond = ((features_found != 0) && (metrics_set->empty()) && - (traces_found == 
0) && (filter_disabled == true)); if (settings->opt_mode == 0) opt_mode_cond = false; if (!opt_mode_cond) settings->opt_mode = 0; From fea1fd598c092ec88163f054aa8924df950be351 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Fri, 21 Aug 2020 13:30:16 -0500 Subject: [PATCH 126/168] clang warinig fix --- src/core/context.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/core/context.h b/src/core/context.h index 3116e036..d23c93ba 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -330,7 +330,6 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - pcsmp_mode_(false), dispatch_signal_{}, orig_signal_{}, record_{} @@ -556,9 +555,6 @@ class Context { rocprofiler_handler_t handler_; void* handler_arg_; - // PC sampling mode - bool pcsmp_mode_; - // kernel packet dispatch copmletion signal hsa_signal_t dispatch_signal_; hsa_signal_t orig_signal_; From a2e7bfad92755f55d7a53d5b1f31718e784ad924 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Sun, 20 Sep 2020 22:30:02 -0500 Subject: [PATCH 127/168] 3.8 update --- bin/rpl_run.sh | 4 + bin/tblextr.py | 119 ++++++++++++++------- inc/rocprofiler.h | 2 + src/core/context.h | 196 ++++++++++++++++++++++++---------- src/core/intercept_queue.h | 48 +++++---- src/core/profile.h | 115 ++++++++++++++++++-- src/core/rocprofiler.cpp | 56 +++++++--- src/core/tracker.h | 39 ++++--- src/util/hsa_rsrc_factory.cpp | 15 ++- test/tool/tool.cpp | 13 ++- 10 files changed, 454 insertions(+), 153 deletions(-) diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index e98561b4..f45b8312 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -183,6 +183,7 @@ usage() { echo " Supported time formats: " echo " --flush-rate - to enable trace flush rate (time period)" echo " Supported time formats: " + echo " --parallel-kernels - to enable cnocurrent kernels" echo "" echo "Configuration file:" echo " You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:${HOME}:" @@ -439,6 +440,9 @@ while [ 1 ] ; do if [ "$2" = "off" ] ; then export ROCP_OBJ_TRACKING=0 fi + elif [ "$1" = "--parallel-kernels" ] ; then + ARG_VAL=0 + export ROCP_K_CONCURRENT=1 elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 diff --git a/bin/tblextr.py b/bin/tblextr.py index 60d99db3..1b39a415 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -240,11 +240,13 @@ def fill_kernel_db(table_name, db): ] def fill_ext_db(table_name, db, indir, trace_name, api_pid): file_name = indir + '/' + trace_name + '_trace.txt' - ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(.*)$') + # tms pid:tid cid:rid:'.....' + ptrn_val = re.compile(r'(\d+) (\d+):(\d+) (\d+):(\d+):(.*)$') if not os.path.isfile(file_name): return 0 range_stack = {} + range_map = {} record_id = 0 table_handle = db.add_table(table_name, ext_table_descr) @@ -257,7 +259,8 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): pid = m.group(2) tid = m.group(3) cid = int(m.group(4)) - msg = m.group(5) + rid = int(m.group(5)) + msg = m.group(6) rec_vals = [] @@ -285,6 +288,21 @@ def fill_ext_db(table_name, db, indir, trace_name, api_pid): rec_vals = rec_stack.pop() rec_vals[1] = tms + # range start + if cid == 3: + range_map[rid] = (tms, msg) + continue + + # range stop + if cid == 4: + if rid in range_map: + (tms, msg) = range_map[rid] # querying start timestamp if rid exists + del range_map[rid] + else: fatal("range id(" + str(rid) + ") is not found") + rec_vals[0] = tms # begin timestamp + rec_vals[3] = 0 # 0 lane for ranges + rec_vals[4] = msg # range message + db.insert_entry(table_handle, rec_vals) record_id += 1 @@ -305,6 +323,8 @@ def get_field(args, field): def set_field(args, field, val): return re.subn(field + '\(\w+\)([ \)])', field + '(' + str(val) + ')\\1', args, count=1) +ops_patch_data = {} + # Fill API DB api_table_descr = [ ['BeginNs', 'EndNs', 'pid', 'tid', 'Name', 'args', 'Index', 'Data'], @@ -329,6 +349,7 @@ def 
fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep file_name = indir + '/' + api_name + '_api_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') + hip_mcopy_ptrn = re.compile(r'hipMemcpy') ptrn_ac = re.compile(r'hsa_amd_memory_async_copy') ptrn1_kernel = re.compile(r'^.*kernel\(') ptrn2_kernel = re.compile(r'\)\) .*$') @@ -346,7 +367,16 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep record_id_dict = {} table_handle = db.add_table(table_name, api_table_descr) with open(file_name, mode='r') as fd: - for line in fd.readlines(): + file_lines = fd.readlines() + total_lines = len(file_lines) + line_index = 0 + for line in file_lines: + if (line_index == total_lines - 1) or (line_index % 100 == 0): + sys.stdout.write( \ + "\rscan " + api_name + " API data " + str(line_index) + ":" + str(total_lines) + " "*100 \ + ) + line_index += 1 + record = line[:-1] kernel_arg = '' @@ -361,7 +391,8 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep record = mfixformat.group(1) + '( ' + reformated_args + ')' m = ptrn_val.match(record) - if m: + if not m: fatal(api_name + " bad record: '" + record + "'") + else: rec_vals = [] rec_len = len(api_table_descr[0]) - 1 for ind in range(1, rec_len): @@ -385,11 +416,17 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep if found == 0: fatal('set_field() failed for "stream", args: "' + record_args + '"') else: stream_id = 0 + # extract kernel name string + (kernel_str, kernel_found) = get_field(record_args, 'kernel') + + if stream_found != 0 or kernel_found != 0: + ops_patch_data[(corr_id, proc_id)] = (stream_id if stream_found else 0, kernel_str if kernel_found else '') + # dependencies filling - if ptrn_ac.search(record_name) or (corr_id, proc_id) in dep_filtr: + if ptrn_ac.match(record_name) or hip_mcopy_ptrn.match(record_name): beg_ns = int(rec_vals[0]) end_ns = int(rec_vals[1]) - from_us = 
(beg_ns / 1000) + ((end_ns - beg_ns) / 1000) + from_us = end_ns / 1000 if not proc_id in dep_dict: dep_dict[proc_id] = {} dep_proc = dep_dict[proc_id] @@ -412,39 +449,31 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep copy_csv += str(copy_index) + ', ' + copy_line + '\n' copy_index += 1 - # patching activity properties: kernel name, stream-id - if (corr_id, proc_id) in dep_filtr: - ops_table_name = dep_filtr[(corr_id, proc_id)] - - select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + str(proc_id) - record_args = rec_vals[rec_len - 2] - - # extract kernel name string - (kernel_str, kernel_found) = get_field(record_args, 'kernel') - is_kernel_list = 1 if kernel_found != 0 and kernel_str[-1] == ';' else 0 - - if is_kernel_list != 0: - for kernel_item in kernel_str[:-1].split(';'): - m = ptrn_multi_kernel.match(kernel_item) - if m: - kernel_name = m.group(1) - dev_id = m.group(2) - select_expr += ' AND "dev-id" = ' + dev_id - activity_record_patching(db, ops_table_name, 1, kernel_name, stream_found, stream_id, select_expr) - else: - fatal('Bad multi-kernel format: "' + kernel_item + '" in "' + kernel_str + '"') - else: - activity_record_patching(db, ops_table_name, kernel_found, kernel_str, stream_found, stream_id, select_expr) - - api_data = '' - if mcopy_data_enabled: - api_data = memory_manager.register_api(rec_vals) if len(dep_filtr) else '' + if False: + # patching activity properties: kernel name, stream-id + if (corr_id, proc_id) in dep_filtr: + ops_table_name = dep_filtr[(corr_id, proc_id)] + select_expr = '"Index" = ' + str(corr_id) + ' AND "proc-id" = ' + str(proc_id) + is_kernel_list = 1 if kernel_found != 0 and kernel_str[-1] == ';' else 0 + if is_kernel_list != 0: + for kernel_item in kernel_str[:-1].split(';'): + m = ptrn_multi_kernel.match(kernel_item) + if m: + kernel_name = m.group(1) + dev_id = m.group(2) + select_expr += ' AND "dev-id" = ' + dev_id + activity_record_patching(db, ops_table_name, 1, 
kernel_name, stream_found, stream_id, select_expr) + else: + fatal('Bad multi-kernel format: "' + kernel_item + '" in "' + kernel_str + '"') + else: + activity_record_patching(db, ops_table_name, kernel_found, kernel_str, stream_found, stream_id, select_expr) + + api_data = memory_manager.register_api(rec_vals) if mcopy_data_enabled and api_name == 'hip' else '' rec_vals.append(api_data) rec_vals[2] = api_pid db.insert_entry(table_handle, rec_vals) - else: fatal(api_name + " bad record: '" + record + "'") # inserting of dispatch events correlated to the dependent dispatches for (from_ns, proc_id, thrd_id) in dep_list: @@ -528,7 +557,16 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): kernel_table_handle = db.add_table(kernel_table_name, ops_table_descr) mcopy_table_handle = db.add_table(mcopy_table_name, ops_table_descr) with open(file_name, mode='r') as fd: - for line in fd.readlines(): + file_lines = fd.readlines() + total_lines = len(file_lines) + line_index = 0 + for line in file_lines: + if (line_index == total_lines - 1) or (line_index % 100 == 0): + sys.stdout.write( \ + "\rscan ops data " + str(line_index) + ":" + str(total_lines) + " "*100 \ + ) + line_index += 1 + record = line[:-1] m = ptrn_val.match(record) if m: @@ -558,10 +596,17 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): if ptrn_barrier.search(name): name = '""' + tid = 0 + if (corr_id, proc_id) in ops_patch_data: + vals = ops_patch_data[(corr_id, proc_id)] + tid = vals[0] + name_patch = vals[1] + if name_patch != '': name = name_patch + # insert DB record rec_vals[4] = name # Name rec_vals.append(pid) # pid - rec_vals.append(0) # tid + rec_vals.append(tid) # tid rec_vals.append(corr_id) # Index rec_vals.append(proc_id) # proc-id rec_vals.append('') # Data @@ -641,8 +686,8 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): hsa_activity_found = fill_copy_db('COPY', db, indir) hsa_trace_found = fill_api_db('HSA', db, indir, 'hsa', HSA_PID, 
COPY_PID, kern_dep_list, {}, 0) + hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], {}, 1) ops_filtr = fill_ops_db('OPS', 'COPY', db, indir) - hip_trace_found = fill_api_db('HIP', db, indir, 'hip', HIP_PID, OPS_PID, [], ops_filtr, 1) fill_kernel_db('A', db) diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 3f295a15..b176cadf 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -92,6 +92,8 @@ hsa_status_t rocprofiler_error_string( typedef enum { ROCPROFILER_FEATURE_KIND_METRIC = 0, ROCPROFILER_FEATURE_KIND_TRACE = 1, + ROCPROFILER_FEATURE_KIND_SPM_MOD = 2, + ROCPROFILER_FEATURE_KIND_PCSMP_MOD = 4 } rocprofiler_feature_kind_t; // Profiling feture parameter diff --git a/src/core/context.h b/src/core/context.h index d23c93ba..c368d42c 100644 --- a/src/core/context.h +++ b/src/core/context.h @@ -83,10 +83,16 @@ class Group { Group(const util::AgentInfo* agent_info, Context* context, const uint32_t& index) : pmc_profile_(agent_info), + trace_profile_(agent_info), n_profiles_(0), refs_(1), context_(context), - index_(index) {} + index_(index), + barrier_signal_{}, + dispatch_signal_{}, + orig_signal_{}, + record_{} + {} void Insert(const profile_info_t& info) { const rocprofiler_feature_kind_t kind = info.rinfo->kind; @@ -95,6 +101,9 @@ class Group { case ROCPROFILER_FEATURE_KIND_METRIC: pmc_profile_.Insert(info); break; + case ROCPROFILER_FEATURE_KIND_TRACE: + trace_profile_.Insert(info); + break; default: EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -103,16 +112,24 @@ class Group { hsa_status_t Finalize(const bool is_concurrent = false) { hsa_status_t status = pmc_profile_.Finalize(start_vector_, stop_vector_, read_vector_, is_concurrent); + if (status == HSA_STATUS_SUCCESS) { + status = trace_profile_.Finalize(start_vector_, stop_vector_, + read_vector_, is_concurrent); + } if (status == HSA_STATUS_SUCCESS) { if (!pmc_profile_.Empty()) ++n_profiles_; + if (!trace_profile_.Empty()) 
++n_profiles_; } return status; } void GetProfiles(profile_vector_t& vec) { pmc_profile_.GetProfiles(vec); + trace_profile_.GetProfiles(vec); } + void GetTraceProfiles(profile_vector_t& vec) { trace_profile_.GetProfiles(vec); } + info_vector_t& GetInfoVector() { return info_vector_; } const pkt_vector_t& GetStartVector() const { return start_vector_; } const pkt_vector_t& GetStopVector() const { return stop_vector_; } @@ -120,6 +137,28 @@ class Group { Context* GetContext() { return context_; } uint32_t GetIndex() const { return index_; } + void SetBarrierSignal(const hsa_signal_t &signal) { + barrier_signal_ = signal; + } + hsa_signal_t& GetBarrierSignal() { + return barrier_signal_; + } + void SetDispatchSignal(const hsa_signal_t &signal) { + dispatch_signal_ = signal; + } + hsa_signal_t& GetDispatchSignal() { + return dispatch_signal_; + } + void SetOrigSignal(const hsa_signal_t &signal) { + orig_signal_ = signal; + } + const hsa_signal_t& GetOrigSignal() const { + return orig_signal_; + } + rocprofiler_dispatch_record_t* GetRecord() { + return &record_; + } + atomic_refs_t* AtomicRefsCount() { return reinterpret_cast(&refs_); } void ResetRefsCount() { AtomicRefsCount()->store(n_profiles_, std::memory_order_release); } void IncrRefsCount() { AtomicRefsCount()->fetch_add(1, std::memory_order_acq_rel); } @@ -127,6 +166,7 @@ class Group { private: PmcProfile pmc_profile_; + TraceProfile trace_profile_; info_vector_t info_vector_; pkt_vector_t start_vector_; pkt_vector_t stop_vector_; @@ -135,6 +175,12 @@ class Group { refs_t refs_; Context* const context_; const uint32_t index_; + // completion signal of after-dispatch barrier + hsa_signal_t barrier_signal_; + // completion signal kernel packet dispatch + hsa_signal_t dispatch_signal_; + hsa_signal_t orig_signal_; + rocprofiler_dispatch_record_t record_; }; // Profiling context @@ -231,11 +277,21 @@ class Context { char* ptr; }; + void RestoreSignals(const profile_tuple_t& tuple) { + 
hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.dispatch_signal, 1); + if (k_concurrent_) { + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.read_signal, 1); + hsa_rsrc_->HsaApi()->hsa_signal_store_screlease(tuple.barrier_signal, 1); + } + } + void GetData(const uint32_t& group_index) { const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { // Wait for stop packet to complete hsa_rsrc_->SignalWaitRestore(tuple.completion_signal, 1); + // Restore other signals + RestoreSignals(tuple); for (rocprofiler_feature_t* rinfo : *(tuple.info_vector)) rinfo->data.kind = ROCPROFILER_DATA_KIND_UNINIT; callback_data_t callback_data{tuple.profile, tuple.info_vector, tuple.info_vector->size(), NULL}; const hsa_status_t status = @@ -261,28 +317,15 @@ class Context { } } - /* Handle the completion of kernel-begin 'read' packet */ - static bool HandlerRead(hsa_signal_value_t value, void* arg) { - Group* group = reinterpret_cast(arg); - Context* context = group->GetContext(); - - // Handle the completion signal of read packet at kernel begin - const profile_vector_t profile_vector = context->GetProfiles(group->GetIndex()); + void IterateTraceData(rocprofiler_trace_data_callback_t callback, void* data) { + profile_vector_t profile_vector; + set_[0].GetTraceProfiles(profile_vector); for (auto& tuple : profile_vector) { - // Wait for read packet to complete - util::HsaRsrcFactory::Instance().SignalWaitRestore(tuple.completion_signal, 1); - const profile_t* profile = tuple.profile; - // Copy the counter values, read at kernel begin, to the right half of - // the buffer, so that the next kernel-end read can reuse the left half - char* data = reinterpret_cast(profile->output_buffer.ptr); - const uint32_t num = profile->output_buffer.size / 2; - for(uint32_t i = 0; i < num; ++i) { - data[i+num] = data[i]; // left --> right - data[i] = 0; // reset left - } + if (pcsmp_mode_) const_cast(tuple.profile)->event_count = UINT32_MAX; + 
const hsa_status_t status = + api_->hsa_ven_amd_aqlprofile_iterate_data(tuple.profile, callback, data); + if (status != HSA_STATUS_SUCCESS) AQL_EXC_RAISING(status, "context iterate data failed"); } - - return false; } static bool Handler(hsa_signal_value_t value, void* arg) { @@ -300,24 +343,10 @@ class Context { Group* GetGroup(const uint32_t& index) { return &set_[index]; } rocprofiler_handler_t GetHandler(void** arg) const { *arg = handler_arg_; return handler_; } - void SetDispatchSignal(const hsa_signal_t &signal) { - dispatch_signal_ = signal; - } - hsa_signal_t& GetDispatchSignal() { - return dispatch_signal_; - } - void SetOrigSignal(const hsa_signal_t &signal) { - orig_signal_ = signal; - } - const hsa_signal_t& GetOrigSignal() const { - return orig_signal_; - } - rocprofiler_dispatch_record_t* GetRecord() { - return &record_; - } - // Concurrent profiling mode static bool k_concurrent_; + // Packets to stop the profiling + static pkt_vector_t stop_packets_; private: Context(const util::AgentInfo* agent_info, Queue* queue, rocprofiler_feature_t* info, @@ -330,15 +359,12 @@ class Context { metrics_(NULL), handler_(handler), handler_arg_(handler_arg), - dispatch_signal_{}, - orig_signal_{}, - record_{} + pcsmp_mode_(false) {} ~Context() { Destruct(); } void Destruct() { - hsa_signal_destroy(dispatch_signal_); for (const auto& v : info_map_) { const std::string& name = v.first; const rocprofiler_feature_t* info = v.second; @@ -373,20 +399,14 @@ class Context { set_[group_index].ResetRefsCount(); const profile_vector_t profile_vector = GetProfiles(group_index); for (auto& tuple : profile_vector) { - // Handler for read packet completion - if (k_concurrent_) { - hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, HandlerRead, - &set_[group_index]); - } + set_[group_index].SetDispatchSignal(tuple.dispatch_signal); + set_[group_index].SetBarrierSignal(tuple.barrier_signal); // Handler for stop packet completion 
hsa_amd_signal_async_handler(tuple.completion_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, &set_[group_index]); } } } - - hsa_status_t status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "MetricsDict create failed"); } // Initialize rocprofiler context @@ -469,6 +489,23 @@ class Context { const uint32_t group_index = block_status.group_index; set_[group_index].Insert(profile_info_t{event, NULL, 0, info}); } + } else if (kind & ROCPROFILER_FEATURE_KIND_TRACE) { // Processing traces features + info->kind = ROCPROFILER_FEATURE_KIND_TRACE; + + const event_t* event = NULL; + if (kind & ROCPROFILER_FEATURE_KIND_PCSMP_MOD) { // PC sampling + pcsmp_mode_ = true; + } else if (kind & ROCPROFILER_FEATURE_KIND_SPM_MOD) { // SPM trace + const Metric* metric = metrics_->Get(name); + if (metric == NULL) + EXC_RAISING(HSA_STATUS_ERROR, "input metric '" << name << "' is not found"); + counters_vec_t counters_vec = metric->GetCounters(); + if (counters_vec.size() != 1) + EXC_RAISING(HSA_STATUS_ERROR, "trace bad metric '" << name << "' is not base counter"); + const counter_t* counter = counters_vec[0]; + event = &(counter->event); + } + set_[0].Insert(profile_info_t{event, info->parameters, info->parameter_count, info}); } else { EXC_RAISING(HSA_STATUS_ERROR, "bad rocprofiler feature kind (" << kind << ")"); } @@ -498,6 +535,7 @@ class Context { hsa_ven_amd_aqlprofile_info_data_t* ainfo_data, void* data) { hsa_status_t status = HSA_STATUS_SUCCESS; callback_data_t* callback_data = reinterpret_cast(data); + const profile_t* profile = callback_data->profile; info_vector_t& info_vector = *(callback_data->info_vector); uint32_t index = callback_data->index; const uint32_t sample_id = ainfo_data->sample_id; @@ -516,6 +554,56 @@ class Context { if (ainfo_data->sample_id == 0) rinfo->data.result_int64 = 0; rinfo->data.result_int64 += ainfo_data->pmc_data.result; rinfo->data.kind = ROCPROFILER_DATA_KIND_INT64; + } else if 
(ainfo_type == HSA_VEN_AMD_AQLPROFILE_INFO_TRACE_DATA) { + if (rinfo->data.result_bytes.copy) { + const bool trace_local = TraceProfile::IsLocal(); + util::HsaRsrcFactory* hsa_rsrc = &util::HsaRsrcFactory::Instance(); + if (sample_id == 0) { + const uint32_t output_buffer_size = profile->output_buffer.size; + const uint32_t output_buffer_size64 = profile->output_buffer.size / sizeof(uint64_t); + const util::AgentInfo* agent_info = hsa_rsrc->GetAgentInfo(profile->agent); + void* ptr = (trace_local) ? hsa_rsrc->AllocateSysMemory(agent_info, output_buffer_size) : + calloc(output_buffer_size64, sizeof(uint64_t)); + rinfo->data.result_bytes.size = output_buffer_size; + rinfo->data.result_bytes.ptr = ptr; + callback_data->ptr = reinterpret_cast(ptr); + } + char* result_bytes_ptr = reinterpret_cast(rinfo->data.result_bytes.ptr); + const char* end = result_bytes_ptr + rinfo->data.result_bytes.size; + const char* src = reinterpret_cast(ainfo_data->trace_data.ptr); + uint32_t size = ainfo_data->trace_data.size; + char* ptr = callback_data->ptr; + uint32_t* header = reinterpret_cast(ptr); + char* dest = ptr + sizeof(*header); + + if ((dest + size) >= end) { + if (dest < end) size = end - dest; + else EXC_RAISING(HSA_STATUS_ERROR, "Trace data out of output buffer"); + } + + bool suc = true; + if (trace_local) { + suc = hsa_rsrc->Memcpy(profile->agent, dest, src, size); + } else { + memcpy(dest, src, size); + } + if (suc) { + *header = size; + callback_data->ptr = dest + align_size(size, sizeof(uint32_t)); + rinfo->data.result_bytes.instance_count = sample_id + 1; + rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; + } else + EXC_RAISING(HSA_STATUS_ERROR, "Agent Memcpy failed, dst(" << (void*)dest << ") src(" << (void*)src << ") size(" << size << ")"); + } else { + if (sample_id == 0) { + rinfo->data.result_bytes.ptr = profile->output_buffer.ptr; + rinfo->data.result_bytes.size = profile->output_buffer.size; + rinfo->data.result_bytes.instance_count = UINT32_MAX; + } + + 
rinfo->data.result_bytes.instance_count += 1; + rinfo->data.kind = ROCPROFILER_DATA_KIND_BYTES; + } } else { EXC_RAISING(HSA_STATUS_ERROR, "unknown data type = " << ainfo_type); } @@ -555,15 +643,13 @@ class Context { rocprofiler_handler_t handler_; void* handler_arg_; - // kernel packet dispatch copmletion signal - hsa_signal_t dispatch_signal_; - hsa_signal_t orig_signal_; - rocprofiler_dispatch_record_t record_; - + // PC sampling mode + bool pcsmp_mode_; }; #define CONTEXT_INSTANTIATE() \ - bool rocprofiler::Context::k_concurrent_ = false; + bool rocprofiler::Context::k_concurrent_ = false; \ + std::vector rocprofiler::Context::stop_packets_{}; } // namespace rocprofiler diff --git a/src/core/intercept_queue.h b/src/core/intercept_queue.h index 5cd09b10..946ba424 100644 --- a/src/core/intercept_queue.h +++ b/src/core/intercept_queue.h @@ -49,8 +49,6 @@ enum { extern decltype(hsa_queue_create)* hsa_queue_create_fn; extern decltype(hsa_queue_destroy)* hsa_queue_destroy_fn; -void PmcStarter(Context* context); - static std::mutex ctx_a_mutex; typedef std::map ctx_a_map_t; static ctx_a_map_t* ctx_a_map = NULL; @@ -186,8 +184,8 @@ class InterceptQueue { if ((status == HSA_STATUS_SUCCESS) && (context != NULL)) { if (group.feature_count != 0) { if (tracker_ != NULL) { - const_cast(dispatch_packet)->completion_signal = context->GetDispatchSignal(); Group* context_group = context->GetGroup(group.index); + const_cast(dispatch_packet)->completion_signal = context_group->GetDispatchSignal(); Tracker::Enable_opt(context_group, completion_signal); context_group->IncrRefsCount(); } @@ -271,9 +269,11 @@ class InterceptQueue { // Adding kernel timing tracker Tracker::entry_t* tracker_entry = NULL; + + const bool is_serial = (k_concurrent_ == K_CONC_OFF); if (tracker_ != NULL) { - tracker_entry = tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal); - const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; + tracker_entry = 
tracker_->Alloc(obj->agent_info_->dev_id, dispatch_packet->completion_signal, is_serial); + if (is_serial) const_cast(dispatch_packet)->completion_signal = tracker_entry->signal; } // Prepareing dispatch callback data @@ -297,43 +297,55 @@ class InterceptQueue { // Calling dispatch callback rocprofiler_group_t group = {}; hsa_status_t status = (dispatch_callback_.load())(&data, callback_data_, &group); - // Injecting profiling start/stop packets + // Injecting profiling start/stop/read packets if ((status != HSA_STATUS_SUCCESS) || (group.context == NULL)) { if (tracker_entry != NULL) { - const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; + if (is_serial) const_cast(dispatch_packet)->completion_signal = tracker_entry->orig; tracker_->Delete(tracker_entry); } } else { Context* context = reinterpret_cast(group.context); if (group.feature_count != 0) { - if (tracker_entry != NULL) { - Group* context_group = context->GetGroup(group.index); - context_group->IncrRefsCount(); - tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); - } - const pkt_vector_t& start_vector = context->StartPackets(group.index); const pkt_vector_t& stop_vector = context->StopPackets(group.index); const pkt_vector_t& read_vector = context->ReadPackets(group.index); pkt_vector_t packets; - if (k_concurrent_ == K_CONC_OFF) { // serial + if (is_serial) { // serial packets = start_vector; packets.insert(packets.end(), *packet); packets.insert(packets.end(), stop_vector.begin(), stop_vector.end()); } else { // concurrent - // Atrt PMC once - std::call_once(once_flag_, PmcStarter, context); - // Reads at both kernel start and end - assert(read_vector.size() == 2 * start_vector.size()); + // Insert start packets once + auto inject_start = [&packets](const pkt_vector_t& starts) mutable { + packets = starts; + }; + std::call_once(once_flag_, inject_start, start_vector); + // Reads at both kernel start and end (also with barriers) + 
assert(read_vector.size() >= 2 * start_vector.size()); auto mid = read_vector.begin() + read_vector.size()/2; // Read at kernel start packets.insert(packets.end(), read_vector.begin(), mid); // Kernel dispatch packet + assert(tracker_entry != NULL); + // Bind dispatch and barrier signals with tracker entry + tracker_->SetHandler(tracker_entry, context->GetGroup(group.index)); + const_cast(dispatch_packet)->completion_signal = context->GetGroup(group.index)->GetDispatchSignal(); packets.insert(packets.end(), *packet); // Read at kernel end packets.insert(packets.end(), mid, read_vector.end()); + + // Save the stop packets for eventual PmcStopper + if (Context::stop_packets_.empty()) { + Context::stop_packets_.insert(Context::stop_packets_.end(), stop_vector.begin(), stop_vector.end()); + } + } + + if (tracker_entry != NULL) { + Group* context_group = context->GetGroup(group.index); + context_group->IncrRefsCount(); + tracker_->EnableContext(tracker_entry, Context::Handler, reinterpret_cast(context_group)); } if (writer != NULL) { diff --git a/src/core/profile.h b/src/core/profile.h index f6165d07..09ad2644 100644 --- a/src/core/profile.h +++ b/src/core/profile.h @@ -45,6 +45,9 @@ struct profile_tuple_t { const profile_t* profile; info_vector_t* info_vector; hsa_signal_t completion_signal; + hsa_signal_t dispatch_signal; + hsa_signal_t barrier_signal; + hsa_signal_t read_signal; }; typedef std::vector profile_vector_t; @@ -102,6 +105,9 @@ class Profile { profile_ = {}; profile_.agent = agent_info->dev_id; completion_signal_ = {}; + dispatch_signal_ = {}; + barrier_signal_ = {}; + read_signal_ = {}; is_legacy_ = (strncmp(agent_info->name, "gfx8", 4) == 0); } @@ -115,6 +121,18 @@ class Profile { hsa_status_t status = hsa_signal_destroy(completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); } + if (dispatch_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(dispatch_signal_); + if (status != 
HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + if (barrier_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(barrier_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } + if (read_signal_.handle) { + hsa_status_t status = hsa_signal_destroy(read_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_ABORT(status, "signal_destroy " << std::hex << status); + } } virtual void Insert(const profile_info_t& info) { info_vector_.push_back(info.rinfo); } @@ -143,6 +161,14 @@ class Profile { profile->parameter_count += 1; } + void BarrierPacket(packet_t* packet, const hsa_signal_t& prior_signal) { + hsa_barrier_and_packet_t* barrier = + reinterpret_cast(packet); + barrier->header = HSA_PACKET_TYPE_BARRIER_AND; + if (prior_signal.handle) barrier->dep_signal[0] = prior_signal; // set packet dependency + else barrier->header |= 1 << HSA_PACKET_HEADER_BARRIER; // set barrier bit + } + hsa_status_t Finalize(pkt_vector_t& start_vector, pkt_vector_t& stop_vector, pkt_vector_t& read_vector, bool is_concurrent = false) { if (is_concurrent) SetConcurrent(&profile_); @@ -190,13 +216,32 @@ class Profile { start.completion_signal = dummy_signal; // Set completion signal of read/stop - hsa_signal_t post_signal; - status = hsa_signal_create(1, 0, NULL, &post_signal); + status = hsa_signal_create(1, 0, NULL, &completion_signal_); if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); - stop.completion_signal = post_signal; - read.completion_signal = post_signal; - read2.completion_signal = post_signal; - completion_signal_ = post_signal; + if (is_concurrent) { + status = hsa_signal_create(1, 0, NULL, &read_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + read.completion_signal = read_signal_; + read2.completion_signal = completion_signal_; + } else { + read.completion_signal = 
completion_signal_; + } + stop.completion_signal = completion_signal_; + + status = hsa_signal_create(1, 0, NULL, &dispatch_signal_); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + + // Create barrier packets: enforce start to be done first, and further make + // read and read2 finish before and after kernel dispatch, respectively + packet_t barrier_st, barrier_rd{}, barrier_rd2{}; + if (is_concurrent) { + BarrierPacket(&barrier_st, start.completion_signal); + BarrierPacket(&barrier_rd, read.completion_signal); + BarrierPacket(&barrier_rd2, dispatch_signal_); + status = hsa_signal_create(1, 0, NULL, &(barrier_signal_)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "signal_create " << std::hex << status); + barrier_rd2.completion_signal = barrier_signal_; + } // Fill packet vectors if (is_legacy_) { @@ -218,7 +263,11 @@ class Profile { if (rd_status == HSA_STATUS_SUCCESS) { pkt_vector_t reads = {read}; - if (is_concurrent) reads.push_back(read2); + if (is_concurrent) { + reads.push_back(barrier_rd); + reads.push_back(barrier_rd2); + reads.push_back(read2); + } for (auto rd : reads) { const uint32_t read_index = read_vector.size(); read_vector.insert(read_vector.end(), LEGACY_SLOT_SIZE_PKT, packet_t{}); @@ -230,11 +279,15 @@ class Profile { } } else { start_vector.push_back(start); + if (is_concurrent) start_vector.push_back(barrier_st); stop_vector.push_back(stop); if (rd_status == HSA_STATUS_SUCCESS) { read_vector.push_back(read); - if (is_concurrent) + if (is_concurrent) { + read_vector.push_back(barrier_rd); + read_vector.push_back(barrier_rd2); read_vector.push_back(read2); + } } } } @@ -244,7 +297,8 @@ class Profile { void GetProfiles(profile_vector_t& vec) { if (!info_vector_.empty()) { - vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_}); + vec.push_back(profile_tuple_t{&profile_, &info_vector_, completion_signal_, + dispatch_signal_, barrier_signal_, read_signal_}); } } 
@@ -258,6 +312,9 @@ class Profile { profile_t profile_; info_vector_t info_vector_; hsa_signal_t completion_signal_; + hsa_signal_t dispatch_signal_; + hsa_signal_t barrier_signal_; + hsa_signal_t read_signal_; }; class PmcProfile : public Profile { @@ -280,6 +337,46 @@ class PmcProfile : public Profile { } }; +class TraceProfile : public Profile { + public: + static inline void SetSize(const uint32_t& size) { output_buffer_size_ = size; } + static inline uint32_t GetSize() { return output_buffer_size_; } + static inline void SetLocal(const bool& b) { output_buffer_local_ = b; } + static inline bool IsLocal() { return output_buffer_local_; } + + TraceProfile(const util::AgentInfo* agent_info) : Profile(agent_info) { + profile_.type = HSA_VEN_AMD_AQLPROFILE_EVENT_TYPE_TRACE; + } + + void Insert(const profile_info_t& info) { + if (info.parameters != NULL) { + Profile::Insert(info); + for (unsigned j = 0; j < info.parameter_count; ++j) { + Config(&profile_).Insert(info.parameters[j]); + } + } else if (info.event != NULL) { + Config(&profile_).Insert(*(info.event)); + } else { + EXC_ABORT(HSA_STATUS_ERROR, "invalid trace info inserted"); + } + } + + hsa_status_t Allocate(util::HsaRsrcFactory* rsrc) { + profile_.command_buffer.ptr = + rsrc->AllocateSysMemory(agent_info_, profile_.command_buffer.size); + profile_.output_buffer.size = output_buffer_size_; + profile_.output_buffer.ptr = (output_buffer_local_) ? + rsrc->AllocateLocalMemory(agent_info_, profile_.output_buffer.size) : + rsrc->AllocateSysMemory(agent_info_, profile_.output_buffer.size); + return (profile_.command_buffer.ptr && profile_.output_buffer.ptr) ? 
HSA_STATUS_SUCCESS + : HSA_STATUS_ERROR; + } + + private: + static uint32_t output_buffer_size_; + static bool output_buffer_local_; +}; + } // namespace rocprofiler #endif // SRC_CORE_PROFILE_H_ diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index d5af91c2..5d1cd9c7 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -150,20 +150,6 @@ void RestoreHsaApi() { table->amd_ext_->hsa_amd_queue_intercept_register_fn = hsa_amd_queue_intercept_register_fn; } -void PmcStarter(Context* context) { - hsa_agent_t agent = context->GetAgent(); - // Create queue - hsa_queue_t* queue; - hsa_status_t status = rocprofiler::CreateQueuePro(agent, 1, - HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro"); - HsaQueue hsa_queue(NULL, queue); - context->Start(0, &hsa_queue); - context->Read(0, &hsa_queue); - context->GetData(0); - hsa_queue_destroy(queue); -} - void StandaloneIntercept() { ::HsaApiTable* table = kHsaApiTable; table->core_->hsa_queue_create_fn = rocprofiler::CreateQueuePro; @@ -213,6 +199,8 @@ uint32_t LoadTool() { rocprofiler_settings_t settings{}; settings.intercept_mode = (intercept_mode != 0) ? 1 : 0; + settings.trace_size = TraceProfile::GetSize(); + settings.trace_local = TraceProfile::IsLocal() ? 1: 0; settings.timeout = util::HsaRsrcFactory::GetTimeoutNs(); settings.timestamp_on = InterceptQueue::IsTrackerOn() ? 
1 : 0; settings.code_obj_tracking = 1; @@ -220,6 +208,8 @@ uint32_t LoadTool() { if (handler) handler(); else if (handler_prop) handler_prop(&settings); + TraceProfile::SetSize(settings.trace_size); + TraceProfile::SetLocal(settings.trace_local != 0); util::HsaRsrcFactory::SetTimeoutNs(settings.timeout); InterceptQueue::TrackerOn(settings.timestamp_on != 0); if (settings.intercept_mode != 0) intercept_mode = DISPATCH_INTERCEPT_MODE; @@ -237,9 +227,40 @@ uint32_t LoadTool() { return intercept_mode; } +void PmcStopper() { + rocprofiler::util::HsaRsrcFactory* rsrc = &rocprofiler::util::HsaRsrcFactory::Instance(); + + const uint32_t gpu_count = rsrc->GetCountOfGpuAgents(); + for (uint32_t gpu_id = 0; gpu_id < gpu_count; gpu_id++) { + // Get agent info + const rocprofiler::util::AgentInfo* agent_info; + if (rsrc->GetGpuAgentInfo(gpu_id, &agent_info) == false) { + fprintf(stderr, "Error: GetGpuAgentInfo(%u) \n", gpu_id); + abort(); + } + + // Create queue + hsa_queue_t* queue; + hsa_status_t status = rocprofiler::CreateQueuePro(agent_info->dev_id, 1, + HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT32_MAX, UINT32_MAX, &queue); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "CreateQueuePro (" + << gpu_id << ") " << std::hex << status); + + // Submit packets + for (auto& pkt: Context::stop_packets_) { + rsrc->Submit(queue, &pkt); + // Wait for stop packet to complete + rsrc->SignalWaitRestore(pkt.completion_signal, 1); + } + + hsa_queue_destroy(queue); + } +} + // Unload profiling tool librray void UnloadTool() { ONLOAD_TRACE("tool handle(" << tool_handle << ")"); + //if (Context::k_concurrent_) PmcStopper(); if (tool_handle) { tool_handler_t handler = reinterpret_cast(dlsym(tool_handle, "OnUnloadTool")); if (handler == NULL) { @@ -433,6 +454,8 @@ hsa_status_t hsa_amd_memory_async_copy_rect_interceptor( } rocprofiler_properties_t rocprofiler_properties; +uint32_t TraceProfile::output_buffer_size_ = 0x2000000; // 32M +bool TraceProfile::output_buffer_local_ = true; 
std::atomic Tracker::instance_{}; Tracker::mutex_t Tracker::glob_mutex_; Tracker::counter_t Tracker::counter_ = 0; @@ -707,7 +730,10 @@ PUBLIC_API hsa_status_t rocprofiler_stop_queue_callbacks() { // Method for iterating the events output data PUBLIC_API hsa_status_t rocprofiler_iterate_trace_data( rocprofiler_t* handle, hsa_ven_amd_aqlprofile_data_callback_t callback, void* data) { - return HSA_STATUS_ERROR; + API_METHOD_PREFIX + rocprofiler::Context* context = reinterpret_cast(handle); + context->IterateTraceData(callback, data); + API_METHOD_SUFFIX } //////////////////////////////////////////////////////////////////////////////// diff --git a/src/core/tracker.h b/src/core/tracker.h index d538aff7..f98c355e 100644 --- a/src/core/tracker.h +++ b/src/core/tracker.h @@ -62,6 +62,7 @@ class Tracker { void* arg; bool is_context; bool is_memcopy; + bool is_proxy; }; static Tracker* Create() { @@ -88,7 +89,7 @@ class Tracker { } // Add tracker entry - entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig) { + entry_t* Alloc(const hsa_agent_t& agent, const hsa_signal_t& orig, bool proxy=true) { hsa_status_t status = HSA_STATUS_ERROR; // Creating a new tracker entry @@ -105,11 +106,14 @@ class Tracker { entry->record = record; // Creating a proxy signal - const hsa_signal_value_t signal_value = (orig.handle) ? hsa_api_.hsa_signal_load_relaxed(orig) : 1; - status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); - status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); - if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + if (proxy) { + entry->is_proxy = true; + const hsa_signal_value_t signal_value = (orig.handle) ? 
hsa_api_.hsa_signal_load_relaxed(orig) : 1; + status = hsa_api_.hsa_signal_create(signal_value, 0, NULL, &(entry->signal)); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_signal_create"); + status = hsa_api_.hsa_amd_signal_async_handler(entry->signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } // Adding antry to the list mutex_.lock(); @@ -120,9 +124,17 @@ class Tracker { return entry; } + void SetHandler(entry_t* entry, Group* group) { + hsa_signal_t& dispatch_signal = group->GetDispatchSignal(); + hsa_signal_t& handler_signal = group->GetBarrierSignal(); + entry->signal = dispatch_signal; + hsa_status_t status = hsa_api_.hsa_amd_signal_async_handler(handler_signal, HSA_SIGNAL_CONDITION_LT, 1, Handler, entry); + if (status != HSA_STATUS_SUCCESS) EXC_RAISING(status, "hsa_amd_signal_async_handler"); + } + // Delete tracker entry void Delete(entry_t* entry) { - hsa_api_.hsa_signal_destroy(entry->signal); + if (entry->is_proxy && entry->signal.handle) hsa_api_.hsa_signal_destroy(entry->signal); mutex_.lock(); sig_list_.erase(entry->it); mutex_.unlock(); @@ -157,14 +169,13 @@ class Tracker { // Enable tracking static void Enable_opt(Group* group, const hsa_signal_t& orig_signal) { - Context* context = group->GetContext(); - context->SetOrigSignal(orig_signal); - context->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); + group->SetOrigSignal(orig_signal); + group->GetRecord()->dispatch = util::HsaRsrcFactory::Instance().TimestampNs(); // Creating a proxy signal const hsa_signal_value_t signal_value = (orig_signal.handle) ? 
util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_load_relaxed(orig_signal) : 1; - hsa_signal_t& dispatch_signal = context->GetDispatchSignal(); + hsa_signal_t& dispatch_signal = group->GetDispatchSignal(); util::HsaRsrcFactory::Instance().HsaApi()->hsa_signal_store_screlease(dispatch_signal, signal_value); hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_signal_async_handler(dispatch_signal, HSA_SIGNAL_CONDITION_LT, signal_value, Handler_opt, group); @@ -175,8 +186,8 @@ class Tracker { static bool Handler_opt(hsa_signal_value_t signal_value, void* arg) { Group* group = reinterpret_cast(arg); Context* context = group->GetContext(); - hsa_signal_t dispatch_signal = context->GetDispatchSignal(); - record_t* record = context->GetRecord(); + hsa_signal_t dispatch_signal = group->GetDispatchSignal(); + record_t* record = group->GetRecord(); hsa_amd_profiling_dispatch_time_t dispatch_time{}; hsa_status_t status = util::HsaRsrcFactory::Instance().HsaApi()->hsa_amd_profiling_get_dispatch_time(context->GetAgent(), dispatch_signal, &dispatch_time); @@ -186,7 +197,7 @@ class Tracker { record->complete = util::HsaRsrcFactory::Instance().TimestampNs(); // Original intercepted signal completion - const hsa_signal_t& orig_signal = context->GetOrigSignal(); + const hsa_signal_t& orig_signal = group->GetOrigSignal(); if (orig_signal.handle) { amd_signal_t* orig_signal_ptr = reinterpret_cast(orig_signal.handle); amd_signal_t* prof_signal_ptr = reinterpret_cast(dispatch_signal.handle); diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index e2f97ce9..7cbaecc5 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -24,6 +24,7 @@ POSSIBILITY OF SUCH DAMAGE. #include "util/hsa_rsrc_factory.h" +#include #include #include #include @@ -47,6 +48,13 @@ POSSIBILITY OF SUCH DAMAGE. 
namespace rocprofiler { namespace util { +// Demangle C++ symbol name +static const char* cpp_demangle(const char* symname) { + size_t size = 0; + int status; + const char* ret = abi::__cxa_demangle(symname, NULL, &size, &status); + return (ret != 0) ? ret : strdup(symname); +} // Callback function to get available in the system agents hsa_status_t HsaRsrcFactory::GetHsaAgentsCallback(hsa_agent_t agent, void* data) { @@ -732,11 +740,12 @@ hsa_status_t HsaRsrcFactory::executable_symbols_cb(hsa_executable_t exec, hsa_ex CHECK_STATUS("Error in getting kernel object", status); status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len); CHECK_STATUS("Error in getting name len", status); - char *name = new char[len + 1]; - status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name); + char symname[len + 1]; + status = hsa_api_.hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symname); CHECK_STATUS("Error in getting kernel name", status); - name[len] = 0; + symname[len] = 0; if (data == NULL) { + const char* name = cpp_demangle(symname); auto ret = symbols_map_->insert({addr, name}); if (ret.second == false) { delete[] ret.first->second; diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 30e35504..4724b87b 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -210,6 +210,14 @@ std::string filtr_kernel_name(const std::string name) { open_token = '>'; close_token = '<'; break; + case ']': + counter = 1; + open_token = ']'; + close_token = '['; + break; + case ' ': + ++rit; + continue; } if (counter == 0) break; } else { @@ -218,9 +226,8 @@ std::string filtr_kernel_name(const std::string name) { } ++rit; } - while (rit != rend) if ((*rit == ' ') || (*rit == ' ')) rit++; else break; auto rbeg = rit; - while (rit != rend) if ((*rit != ' ') && (*rit != ':')) rit++; else break; + while ((rit != rend) && (*rit != ' ') && (*rit != ':')) rit++; const uint32_t pos = 
rend - rit; const uint32_t length = rit - rbeg; return name.substr(pos, length); @@ -973,6 +980,8 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Set HSA intercepting check_env_var("ROCP_HSA_INTERC", settings->hsa_intercepting); if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); + // Enable concurrent SQTT + check_env_var("ROCP_K_CONCURRENT", settings->k_concurrent); // Enable optmized mode check_env_var("ROCP_OPT_MODE", settings->opt_mode); From 2ae6abd151ebf0a74596715dfc852856f4c1aeb7 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 15 Oct 2020 12:46:08 -0500 Subject: [PATCH 128/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9108409f..4b093e4b 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Options: --ctx-limit - maximum number of outstanding contexts [0 - unlimited] --heartbeat - to print progress heartbeats [0 - disabled] - --obj-tracking - to turn on/off kernels code objects tracking [off] + --obj-tracking - to turn on/off kernels code objects tracking [on] To support V3 code-object. --stats - generating kernel execution stats, file .stats.csv From 0fb8713913b95bd5c51879b6239709748eb583fc Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Thu, 15 Oct 2020 12:48:35 -0500 Subject: [PATCH 129/168] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4b093e4b..91361b56 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,6 @@ Configuration file: timestamp=off ctx-limit=0 heartbeat=0 - obj-tracking=off + obj-tracking=on > ``` From 73a8c80c3196e01f4471c403c515973793e1e922 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 23 Oct 2020 12:16:52 -0500 Subject: [PATCH 130/168] Add rocminfo in the binary package dependency. 
--- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e6765e47..4584e914 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,11 +176,11 @@ set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "ROCPROFILER library for AMD HSA runtime set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) ## Debian package specific variables -set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo" ) set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) ## RPM package specific variables -set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev, rocminfo" ) set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) From 7df95623af3b732b27eddf11696b235b72ce5fee Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 28 Oct 2020 11:22:09 -0500 Subject: [PATCH 131/168] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 25e61df7..75ab58b7 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -635,7 +635,7 @@ example but in SINGLEGROUP mode when only one group is allowed the context handl saved and then direct context method rocprofiler_get_data with default group index equal to 0 can be used. 
-hsa_status_t_dispatch_callback( +hsa_status_t dispatch_callback( const rocprofiler_callback_data_t* callback_data, void* user_data, rocprofiler_group_t* group) From 759f081cf325bc3e1597b97eacb7ed6d7303fe60 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Wed, 28 Oct 2020 17:59:40 -0500 Subject: [PATCH 132/168] 3.9 update --- bin/mem_manager.py | 11 +++++------ bin/sqlitedb.py | 1 + bin/tblextr.py | 3 ++- test/tool/tool.cpp | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bin/mem_manager.py b/bin/mem_manager.py index 8b616cc6..e87c4bca 100755 --- a/bin/mem_manager.py +++ b/bin/mem_manager.py @@ -188,17 +188,15 @@ def add_memcpy(self, recvals): m = hipMemcpy_ptrn_kind.match(args) if m: - direction = switcher.get(m.group(1), "unknown") + direction = switcher.get(m.group(1), "unknown") copy_line = str(start_time) + DELIM + str(end_time) + DELIM + pid + DELIM + tid + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + "BW=" + str(round(bandwidth, 2)) + DELIM + 'Async=' + str(is_async) self.memcopies[recordid] = copy_line return copy_line; - def dump_data(self): - # To create “MM†table in DB on the finish - table_name = "MM" - file_name = os.environ['PWD'] + '/results.memcopy_info.csv' + def dump_data(self, table_name, file_name): + # To create memcopy info table in DB print("File '" + file_name + "' is generating") table_handle = self.db.add_table(table_name, mm_table_descr) @@ -212,5 +210,6 @@ def dump_data(self): else: rec_vals_array.append(rec) self.db.insert_entry(table_handle, rec_vals_array) - # To dump the MM table as CSV + + # To dump the memcopy info table as CSV self.db.dump_csv(table_name, file_name) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 62553a81..7aae7c17 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -233,6 +233,7 @@ def commit(self): # close DB def close(self): + self.connection.commit() self.connection.close() # 
access DB diff --git a/bin/tblextr.py b/bin/tblextr.py index 1b39a415..4ccc92e9 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -672,6 +672,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): kfd_statfile = re.sub(r'\.stats\.csv$', r'.kfd_stats.csv', statfile) ops_statfile = statfile copy_statfile = re.sub(r'\.stats\.csv$', r'.copy_stats.csv', statfile) + memcopy_info_file = re.sub(r'\.stats\.csv$', r'.memcopy_info.csv', statfile) sysinfo_file = re.sub(r'\.stats\.csv$', r'.sysinfo.txt', statfile) metadata_gen(sysinfo_file, 'rocminfo') @@ -778,7 +779,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): db.close_json(jsonfile); if mcopy_data_enabled: - memory_manager.dump_data() + memory_manager.dump_data('MM', memcopy_info_file) db.close() diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 4724b87b..34650a34 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -980,7 +980,7 @@ extern "C" PUBLIC_API void OnLoadToolProp(rocprofiler_settings_t* settings) // Set HSA intercepting check_env_var("ROCP_HSA_INTERC", settings->hsa_intercepting); if (settings->hsa_intercepting) rocprofiler_set_hsa_callbacks(hsa_callbacks, (void*)14); - // Enable concurrent SQTT + // Enable concurrent mode check_env_var("ROCP_K_CONCURRENT", settings->k_concurrent); // Enable optmized mode check_env_var("ROCP_OPT_MODE", settings->opt_mode); From 777925295d996313cdeb704551dbd5cad31f5a61 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Mon, 30 Nov 2020 00:48:16 -0600 Subject: [PATCH 133/168] 3.10 update --- CMakeLists.txt | 10 ++- bin/dform.py | 16 ++-- bin/mem_manager.py | 67 ++++++++------- bin/rpl_run.sh | 1 + bin/sqlitedb.py | 5 +- bin/tblextr.py | 100 ++++++++++++--------- inc/rocprofiler.h | 19 +++- src/CMakeLists.txt | 2 +- src/core/activity.cpp | 172 +++++++++++++++++++++++++++++++++++++ src/core/activity.h | 26 ++++++ src/core/hsa_interceptor.h | 140 +++++++++++++++++++++++------- src/core/rocprofiler.cpp | 4 +- test/tool/tool.cpp | 43 
++++++++-- 13 files changed, 478 insertions(+), 127 deletions(-) create mode 100644 src/core/activity.h diff --git a/CMakeLists.txt b/CMakeLists.txt index e6765e47..c5de434d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,8 +135,14 @@ add_custom_target ( so-link ALL WORKING_DIRECTORY ${PROJECT_BINARY_DIR} # Install header and library install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${DEST_NAME}/lib ) -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION ${DEST_NAME}/include ) -install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h DESTINATION include/${DEST_NAME} ) +install ( FILES + ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/activity.h + DESTINATION ${DEST_NAME}/include ) +install ( FILES + ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/core/activity.h + DESTINATION include/${DEST_NAME} ) # rpl_run.sh tblextr.py txt2xml.sh install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh diff --git a/bin/dform.py b/bin/dform.py index 82a81d08..1ed78d8f 100644 --- a/bin/dform.py +++ b/bin/dform.py @@ -46,26 +46,26 @@ def gen_table_bins(db, table, outfile, name_var, dur_ns_var): db.execute('DROP VIEW B') gen_message(outfile) -def gen_api_json_trace(db, table, start_us, outfile): - db.execute('create view B as select "Index", Name as name, pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (start_us, table)); +def gen_api_json_trace(db, table, start_ns, outfile): + db.execute('create view B as select "Index", Name as name, pid, tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (start_ns, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') gen_message(outfile) -def gen_ext_json_trace(db, table, start_us, outfile): - db.execute('create view B as select Name as name, pid, tid, (BeginNs/1000 - %d) as ts, ((EndNs - BeginNs)/1000) as dur from %s;' % (start_us, table)); +def gen_ext_json_trace(db, 
table, start_ns, outfile): + db.execute('create view B as select Name as name, pid, tid, ((BeginNs - %d)/1000) as ts, ((EndNs - BeginNs)/1000) as dur from %s;' % (start_ns, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') gen_message(outfile) -def gen_ops_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_us, table)); +def gen_ops_json_trace(db, table, base_pid, start_ns, outfile): + db.execute('create view B as select "Index", Name as name, ("dev-id" + %d) as pid, tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_ns, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') gen_message(outfile) -def gen_kernel_json_trace(db, table, base_pid, start_us, outfile): - db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, (BeginNs/1000 - %d) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_us, table)); +def gen_kernel_json_trace(db, table, base_pid, start_ns, outfile): + db.execute('create view B as select "Index", KernelName as name, ("gpu-id" + %d) as pid, (0) as tid, ((BeginNs - %d)/1000) as ts, (DurationNs/1000) as dur from %s;' % (base_pid, start_ns, table)); db.dump_json('B', table, outfile) db.execute('DROP VIEW B') gen_message(outfile) diff --git a/bin/mem_manager.py b/bin/mem_manager.py index e87c4bca..8480063c 100755 --- a/bin/mem_manager.py +++ b/bin/mem_manager.py @@ -124,13 +124,13 @@ def add_memcpy(self, recvals): select_expr = '"Index" = ' + str(recordid) + ' AND "proc-id" = ' + str(procid) # hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) - hipMemcpy_ptrn = re.compile(r'\(dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') + hipMemcpy_ptrn = re.compile(r'\(\s*dst\((.*)\) src\((.*)\) sizeBytes\((\d+)\).*\)') # hipMemcpy2D(void* dst, 
size_t dpitch, const void* src, size_t spitch, size_t width, # size_t height, hipMemcpyKind kind); - hipMemcpy_ptrn2 = re.compile(r'\(dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)') + hipMemcpy_ptrn2 = re.compile(r'\(\s*dst\((.*)\) .* src\((.*)\) .* width\((\d+)\) height\((\d+)\).*\)') # hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, # size_t count, hipMemcpyKind kind); - hipMemcpy_ptrn3 = re.compile(r'\(dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)') + hipMemcpy_ptrn3 = re.compile(r'\(\s*dst\((.*)\) .* src\((.*)\) count\((\d+)\).*\)') # memcopy with kind argument hipMemcpy_ptrn_kind = re.compile(r'.* kind\((\d+)\)\s*.*') # aysnc memcopy @@ -163,34 +163,39 @@ def add_memcpy(self, recvals): '4': "auto", } - if m_basic or m_2d or m_array: - if m_basic: - dstptr = m_basic.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_basic.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = int(m_basic.group(3)) - if m_array: - dstptr = m_array.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_array.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = m_array.group(3) - if m_2d: - dstptr = m_2d.group(1) - dstptr_type = self.get_ptr_type(dstptr) - srcptr = m_2d.group(2) - srcptr_type = self.get_ptr_type(srcptr) - size = m_2d.group(3)*m_2d.group(4) - - duration = (int(end_time) - int(start_time)) if not is_async else (int(async_copy_end_time) - int(async_copy_start_time)) - bandwidth = float(size) * 1000 / duration - - m = hipMemcpy_ptrn_kind.match(args) - if m: - direction = switcher.get(m.group(1), "unknown") - - copy_line = str(start_time) + DELIM + str(end_time) + DELIM + pid + DELIM + tid + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + "BW=" + str(round(bandwidth, 2)) + DELIM + 'Async=' + str(is_async) + condition_matched = False + if m_basic: + dstptr = m_basic.group(1) + 
dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_basic.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = int(m_basic.group(3)) + condition_matched = True + if m_array: + dstptr = m_array.group(1) + dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_array.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = m_array.group(3) + condition_matched = True + if m_2d: + dstptr = m_2d.group(1) + dstptr_type = self.get_ptr_type(dstptr) + srcptr = m_2d.group(2) + srcptr_type = self.get_ptr_type(srcptr) + size = m_2d.group(3)*m_2d.group(4) + condition_matched = True + + if not condition_matched: fatal('Memcpy args \"' + args + '\" cannot be identified') + + duration = (int(end_time) - int(start_time)) if not is_async else (int(async_copy_end_time) - int(async_copy_start_time)) + bandwidth = float(size) * 1000 / duration + + m = hipMemcpy_ptrn_kind.match(args) + if m: + direction = switcher.get(m.group(1), "unknown") + + copy_line = str(start_time) + DELIM + str(end_time) + DELIM + pid + DELIM + tid + DELIM + event + DELIM + 'Direction=' + direction + DELIM + 'SrcType=' + srcptr_type + DELIM + 'DstType=' + dstptr_type + DELIM + "Size=" + str(size) + DELIM + "BW=" + str(round(bandwidth, 2)) + DELIM + 'Async=' + str(is_async) self.memcopies[recordid] = copy_line return copy_line; diff --git a/bin/rpl_run.sh b/bin/rpl_run.sh index f45b8312..6d66405d 100755 --- a/bin/rpl_run.sh +++ b/bin/rpl_run.sh @@ -443,6 +443,7 @@ while [ 1 ] ; do elif [ "$1" = "--parallel-kernels" ] ; then ARG_VAL=0 export ROCP_K_CONCURRENT=1 + export AQLPROFILE_READ_API=1 elif [ "$1" = "--verbose" ] ; then ARG_VAL=0 export ROCP_VERBOSE_MODE=1 diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 7aae7c17..50adb698 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -151,7 +151,7 @@ def label_json(self, pid, label, file_name): fd.write(',{"args":{"name":"%s"},"ph":"M","pid":%s,"name":"process_name","sort_index":%d}\n' %(label, pid, self.section_index)) self.section_index += 1 - def 
flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, start_us, file_name): + def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, file_name): if not re.search(r'\.json$', file_name): raise Exception('wrong output file type: "' + file_name + '"' ) with open(file_name, mode='a') as fd: @@ -160,8 +160,7 @@ def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id corr_id = corr_id_list[ind] if (len(corr_id_list) != 0) else ind if corr_id in to_us_dict: (from_ts, from_tid, to_tid) = from_us_list[ind] - from_ts -= start_us - to_ts = to_us_dict[corr_id] - start_us + to_ts = to_us_dict[corr_id] if from_ts > to_ts: from_ts = to_ts fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, from_tid)) fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, to_tid)) diff --git a/bin/tblextr.py b/bin/tblextr.py index 4ccc92e9..ce1ff348 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -37,7 +37,7 @@ NONE_PID = -1 max_gpu_id = -1 -START_US = 0 +START_NS = 0 hsa_activity_found = 0 @@ -164,9 +164,10 @@ def parse_res(infile): var_table[dispatch_number]['CompleteNs'] = m.group(4) ## filling dependenciws - from_ns = m.group(1) - from_us = int(from_ns) / 1000 - to_us = int(m.group(2)) / 1000 + from_ns = int(m.group(1)) + to_ns = int(m.group(2)) + from_us = int((from_ns - START_NS) / 1000) + to_us = int((to_ns - START_NS) / 1000) kern_dep_list.append((from_ns, disp_pid, disp_tid)) @@ -346,23 +347,19 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep if (hsa_activity_found): copy_raws = db.table_get_raws('COPY') copy_csv = '' copy_index = 0 + op_found = 0 file_name = indir + '/' + api_name + '_api_trace.txt' ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') hip_mcopy_ptrn = re.compile(r'hipMemcpy') ptrn_ac = 
re.compile(r'hsa_amd_memory_async_copy') - ptrn1_kernel = re.compile(r'^.*kernel\(') - ptrn2_kernel = re.compile(r'\)\) .*$') ptrn_fixformat = re.compile(r'(\d+:\d+ \d+:\d+ \w+)\(\s*(.*)\)$') ptrn_fixkernel = re.compile(r'\s+kernel=(.*)$') ptrn_multi_kernel = re.compile(r'(.*):(\d+)$') + ptrn_corr_id = re.compile(r'\ :(\d*)$') if not os.path.isfile(file_name): return 0 - dep_tid_list = [] - dep_from_us_list = [] - dep_id_list = [] - # parsing an input trace file and creating a DB table record_id_dict = {} table_handle = db.add_table(table_name, api_table_descr) @@ -379,6 +376,12 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep record = line[:-1] + corr_id = 0 + m = ptrn_corr_id.search(record) + if m: + corr_id = int(m.group(1)) + record = ptrn_corr_id.sub('', record) + kernel_arg = '' m = ptrn_fixkernel.search(record) if m: @@ -404,29 +407,50 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep # incrementing per-process record id/correlation id if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 - corr_id = record_id_dict[proc_id] record_id_dict[proc_id] += 1 + record_id = record_id_dict[proc_id] + + # setting correlationid to record id if correlation id is not defined + if corr_id == 0: corr_id = record_id + rec_vals.append(corr_id) # extracting/converting stream id (stream_id, stream_found) = get_field(record_args, 'stream') - if stream_found != 0: + if stream_found == 0: + stream_id = 0 + else: stream_id = get_stream_index(stream_id) (rec_vals[5], found) = set_field(record_args, 'stream', stream_id) if found == 0: fatal('set_field() failed for "stream", args: "' + record_args + '"') - else: stream_id = 0 # extract kernel name string (kernel_str, kernel_found) = get_field(record_args, 'kernel') + if kernel_found == 0: kernel_str = '' + else: op_found = 1 if stream_found != 0 or kernel_found != 0: - ops_patch_data[(corr_id, proc_id)] = (stream_id if stream_found else 0, kernel_str if 
kernel_found else '') + ops_patch_data[(corr_id, proc_id)] = (stream_id, kernel_str) # dependencies filling if ptrn_ac.match(record_name) or hip_mcopy_ptrn.match(record_name): + op_found = 1 + + # memcopy data + if len(copy_raws) != 0: + copy_data = list(copy_raws[copy_index]) + args_str = rec_vals[5] + args_str = re.sub(r'\(', r'', args_str) + args_str = re.sub(r'\).*$', r'', args_str) + copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + record_name + ', ' + args_str + copy_csv += str(copy_index) + ', ' + copy_line + '\n' + copy_index += 1 + + if op_found: beg_ns = int(rec_vals[0]) end_ns = int(rec_vals[1]) - from_us = end_ns / 1000 + dur_us = int((end_ns - beg_ns) / 1000) + from_us = int((beg_ns - START_NS) / 1000) + dur_us if not proc_id in dep_dict: dep_dict[proc_id] = {} dep_proc = dep_dict[proc_id] @@ -439,16 +463,6 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep dep_str['from'].append((from_us, thrd_id, stream_id)) if expl_id: dep_str['id'].append(corr_id) - # memcopy data - if len(copy_raws) != 0: - copy_data = list(copy_raws[copy_index]) - args_str = rec_vals[5] - args_str = re.sub(r'\(', r'', args_str) - args_str = re.sub(r'\).*$', r'', args_str) - copy_line = str(copy_data[0]) + ', ' + str(copy_data[1]) + ', ' + record_name + ', ' + args_str - copy_csv += str(copy_index) + ', ' + copy_line + '\n' - copy_index += 1 - if False: # patching activity properties: kernel name, stream-id if (corr_id, proc_id) in dep_filtr: @@ -478,8 +492,8 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep # inserting of dispatch events correlated to the dependent dispatches for (from_ns, proc_id, thrd_id) in dep_list: if not proc_id in record_id_dict: record_id_dict[proc_id] = 0 - corr_id = record_id_dict[proc_id] record_id_dict[proc_id] += 1 + corr_id = record_id_dict[proc_id] db.insert_entry(table_handle, [from_ns, from_ns, api_pid, thrd_id, 'hsa_dispatch', '', corr_id, '']) # generating 
memcopy CSV @@ -526,11 +540,14 @@ def fill_copy_db(table_name, db, indir): db.insert_entry(table_handle, rec_vals) # filling dependencies + to_ns = int(rec_vals[0]) + to_us = int((to_ns - START_NS) / 1000) + if not proc_id in dep_dict: dep_dict[proc_id] = {} dep_proc = dep_dict[proc_id] if not pid in dep_proc: dep_proc[pid] = { 'pid': HSA_PID, 'from': [], 'to': {}, 'id': [] } dep_str = dep_proc[pid] - dep_str['to'][corr_id] = int(rec_vals[0]) / 1000 + dep_str['to'][corr_id] = to_us else: fatal("async-copy bad record: '" + record + "'") @@ -577,7 +594,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): m = ptrn_id.match(label) if not m: fatal("bad hcc ops entry '" + record + "'") name = m.group(1) - corr_id = int(m.group(2)) - 1 + corr_id = int(m.group(2)) proc_id = int(m.group(3)) # checking name for memcopy pattern @@ -616,11 +633,14 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): filtr[(corr_id, proc_id)] = rec_table_name # filling a dependencies + to_ns = int(rec_vals[0]) + to_us = int((to_ns - START_NS) / 1000) + if not proc_id in dep_dict: dep_dict[proc_id] = {} dep_proc = dep_dict[proc_id] if not pid in dep_proc: dep_proc[pid] = { 'bsp': OPS_PID, 'to': {} } dep_str = dep_proc[pid] - dep_str['to'][corr_id] = int(rec_vals[0]) / 1000 + dep_str['to'][corr_id] = to_us else: fatal("hcc ops bad record: '" + record + "'") @@ -644,10 +664,10 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): with open(begin_ts_file, mode='r') as fd: ind = 0 for line in fd.readlines(): - val = int(line) / 1000 - if ind == 0 or val < START_US: START_US = val + val = int(line) + if ind == 0 or val < START_NS: START_NS = val ind += 1 - print('START timestamp found (' + str(START_US) + 'us)') + print('START timestamp found (' + str(START_NS) + 'ns)') if re.search(r'\.csv$', outfile): csvfile = outfile @@ -718,38 +738,38 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): db.label_json(int(ind) + int(GPU_BASE_PID), "GPU" 
+ str(ind), jsonfile) if ext_trace_found: - dform.gen_ext_json_trace(db, 'rocTX', START_US, jsonfile) + dform.gen_ext_json_trace(db, 'rocTX', START_NS, jsonfile) if len(var_table) != 0: dform.post_process_data(db, 'A', csvfile) dform.gen_table_bins(db, 'A', statfile, 'KernelName', 'DurationNs') if hsa_trace_found and 'BeginNs' in var_list: - dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_US, jsonfile) + dform.gen_kernel_json_trace(db, 'A', GPU_BASE_PID, START_NS, jsonfile) if hsa_trace_found: dform.post_process_data(db, 'HSA') dform.gen_table_bins(db, 'HSA', hsa_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'HSA', START_US, jsonfile) + dform.gen_api_json_trace(db, 'HSA', START_NS, jsonfile) if copy_trace_found: dform.post_process_data(db, 'COPY') dform.gen_table_bins(db, 'COPY', copy_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'COPY', START_US, jsonfile) + dform.gen_api_json_trace(db, 'COPY', START_NS, jsonfile) if hip_trace_found: dform.post_process_data(db, 'HIP') dform.gen_table_bins(db, 'HIP', hip_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'HIP', START_US, jsonfile) + dform.gen_api_json_trace(db, 'HIP', START_NS, jsonfile) if ops_filtr: dform.post_process_data(db, 'OPS') dform.gen_table_bins(db, 'OPS', ops_statfile, 'Name', 'DurationNs') - dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_US, jsonfile) + dform.gen_ops_json_trace(db, 'OPS', GPU_BASE_PID, START_NS, jsonfile) if kfd_trace_found: dform.post_process_data(db, 'KFD') dform.gen_table_bins(db, 'KFD', kfd_statfile, 'Name', 'DurationNs') - dform.gen_api_json_trace(db, 'KFD', START_US, jsonfile) + dform.gen_api_json_trace(db, 'KFD', START_NS, jsonfile) if any_trace_found: dep_id = 0 @@ -771,7 +791,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): to_us_dict = dep_str['to'] corr_id_list = dep_str['id'] - db.flow_json(dep_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, START_US, jsonfile) + 
db.flow_json(dep_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id_list, jsonfile) dep_id += len(from_us_list) if any_trace_found: diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index b176cadf..4a966190 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -74,6 +74,7 @@ typedef struct { uint32_t hsa_intercepting; uint32_t k_concurrent; uint32_t opt_mode; + uint32_t obj_dumping; } rocprofiler_settings_t; //////////////////////////////////////////////////////////////////////////////// @@ -481,7 +482,8 @@ typedef enum { ROCPROFILER_HSA_CB_ID_DEVICE = 1, // Device assign callback ROCPROFILER_HSA_CB_ID_MEMCOPY = 2, // Memcopy callback ROCPROFILER_HSA_CB_ID_SUBMIT = 3, // Packet submit callback - ROCPROFILER_HSA_CB_ID_KSYMBOL = 4 // Loading/unloading of kernel symbol + ROCPROFILER_HSA_CB_ID_KSYMBOL = 4, // Loading/unloading of kernel symbol + ROCPROFILER_HSA_CB_ID_CODEOBJ = 5 // Loading/unloading of kernel symbol } rocprofiler_hsa_cb_id_t; // HSA callback data type @@ -516,8 +518,20 @@ typedef struct { uint64_t object; // kernel symbol object const char* name; // kernel symbol name uint32_t name_length; // kernel symbol name length - int destroy; // symbol executable destroy + int unload; // symbol executable destroy } ksymbol; + struct { + uint32_t storage_type; // code object storage type + int storage_file; // origin file descriptor + uint64_t memory_base; // origin memory base + uint64_t memory_size; // origin memory size + uint64_t load_base; // codeobj load base + uint64_t load_size; // codeobj load size + uint64_t load_delta; // codeobj load size + uint32_t uri_length; // URI string length + char* uri; // URI string + int unload; // unload flag + } codeobj; }; } rocprofiler_hsa_callback_data_t; @@ -534,6 +548,7 @@ typedef struct { rocprofiler_hsa_callback_fun_t memcopy; // memory copy callback rocprofiler_hsa_callback_fun_t submit; // packet submit callback rocprofiler_hsa_callback_fun_t ksymbol; // kernel symbol callback + 
rocprofiler_hsa_callback_fun_t codeobj; // codeobject load/unload callback } rocprofiler_hsa_callbacks_t; // Set callbacks. If the callback is NULL then it is disabled. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4c97ea6f..dbe00cd9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,4 +35,4 @@ set ( LIB_SRC ) add_library ( ${TARGET_LIB} SHARED ${LIB_SRC} ) target_include_directories ( ${TARGET_LIB} PRIVATE ${LIB_DIR} ${ROOT_DIR} ${HSA_RUNTIME_INC_PATH} ${HSA_KMT_LIB_PATH}/.. ) -target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++) +target_link_libraries( ${TARGET_LIB} PRIVATE ${HSA_RUNTIME_LIB} c stdc++ ) diff --git a/src/core/activity.cpp b/src/core/activity.cpp index 19f6bea3..2071b5cf 100644 --- a/src/core/activity.cpp +++ b/src/core/activity.cpp @@ -20,6 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *******************************************************************************/ +#define ROCP_INTERNAL_BUILD +#include "activity.h" + #include #include #include @@ -55,6 +58,92 @@ void check_status(hsa_status_t status) { } } +// Activity primitives +namespace activity_prim { +// PC sampling callback data +struct pcsmp_callback_data_t { + const char* kernel_name; // sampled kernel name + void* data_buffer; // host buffer for tracing data + uint64_t id; // sample id + uint64_t cycle; // sample cycle + uint64_t pc; // sample PC +}; + +uint32_t activity_op = UINT32_MAX; +void* activity_arg = NULL; +std::atomic activity_callback{NULL}; +rocprofiler_t* context = NULL; + +hsa_status_t trace_data_cb(hsa_ven_amd_aqlprofile_info_type_t info_type, + hsa_ven_amd_aqlprofile_info_data_t* info_data, + void* data) { + const pcsmp_callback_data_t* pcsmp_data = (pcsmp_callback_data_t*) data; + + activity_record_t record{}; + record.op = activity_op; + record.pc_sample.se = pcsmp_data->id; + record.pc_sample.cycle = pcsmp_data->cycle; + record.pc_sample.pc = pcsmp_data->pc; + 
activity_async_callback_t fun = activity_callback.load(std::memory_order_acquire); + if (fun) { + (fun)(activity_op, &record, activity_arg); + } else { + free((void*)(pcsmp_data->kernel_name)); + } + return HSA_STATUS_SUCCESS; +} + +bool context_handler(rocprofiler_group_t group, void* arg) { + hsa_agent_t agent{}; + hsa_status_t status = rocprofiler_get_agent(group.context, &agent); + check_status(status); + const rocprofiler::util::AgentInfo* agent_info = rocprofiler::util::HsaRsrcFactory::Instance().GetAgentInfo(agent); + + pcsmp_callback_data_t pcsmp_data{}; + pcsmp_data.kernel_name = (const char*)arg; + pcsmp_data.data_buffer = rocprofiler::util::HsaRsrcFactory::Instance().AllocateSysMemory(agent_info, rocprofiler::TraceProfile::GetSize()); + status = rocprofiler_iterate_trace_data(group.context, trace_data_cb, &pcsmp_data); + check_status(status); + return false; +} + +// Kernel dispatch callback: opens a trace/PC-sampling context for the dispatch and returns its group 0 +hsa_status_t dispatch_callback(const rocprofiler_callback_data_t* callback_data, void* user_data, + rocprofiler_group_t* group) { + // context features + const rocprofiler_feature_kind_t trace_kind = + (rocprofiler_feature_kind_t)(ROCPROFILER_FEATURE_KIND_TRACE | ROCPROFILER_FEATURE_KIND_PCSMP_MOD); + const uint32_t feature_count = 1; + const uint32_t parameter_count = 1; + rocprofiler_feature_t* features = new rocprofiler_feature_t[feature_count]; + memset(features, 0, feature_count * sizeof(rocprofiler_feature_t)); + rocprofiler_parameter_t* parameters = new rocprofiler_parameter_t[parameter_count]; + memset(parameters, 0, parameter_count * sizeof(rocprofiler_parameter_t)); + parameters[0].parameter_name = HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET; + parameters[0].value = 0; + + features[0].kind = trace_kind; + features[0].parameters = parameters; + features[0].parameter_count = parameter_count; + + // context properties + rocprofiler_properties_t properties{}; + properties.handler = context_handler; + properties.handler_arg = 
(void*)strdup(callback_data->kernel_name); + + // Open profiling context + hsa_status_t status = rocprofiler_open(callback_data->agent, features, feature_count, + &context, 0 /*ROCPROFILER_MODE_SINGLEGROUP*/, &properties); + check_status(status); + + // Get group[0] + status = rocprofiler_get_group(context, 0, group); + check_status(status); + + return status; +} +} // namespace activity_prim + extern "C" { PUBLIC_API const char* GetOpName(uint32_t op) { return strdup("PCSAMPLE"); } @@ -63,10 +152,93 @@ PUBLIC_API bool RegisterApiCallback(uint32_t op, void* callback, void* arg) { re PUBLIC_API bool RemoveApiCallback(uint32_t op) { return true; } PUBLIC_API bool InitActivityCallback(void* callback, void* arg) { + activity_prim::activity_arg = arg; + activity_prim::activity_callback.store((activity_async_callback_t)callback, std::memory_order_release); + + rocprofiler_queue_callbacks_t queue_callbacks{}; + queue_callbacks.dispatch = activity_prim::dispatch_callback; + rocprofiler_set_queue_callbacks(queue_callbacks, NULL); + return true; } PUBLIC_API bool EnableActivityCallback(uint32_t op, bool enable) { + if (enable) { + activity_prim::activity_op = op; + rocprofiler_start_queue_callbacks(); + } else { + rocprofiler_stop_queue_callbacks(); + } + return true; +} + +struct evt_cb_entry_t { + typedef std::pair data_t; + data_t data; + std::mutex mutex; + + void set(const data_t& in) { + mutex.lock(); + data = in; + mutex.unlock(); + } + data_t get() { + mutex.lock(); + const data_t out = data; + mutex.unlock(); + return out; + } + evt_cb_entry_t() : data{} {} +}; +evt_cb_entry_t evt_cb_table[HSA_EVT_ID_NUMBER]; + +hsa_status_t codeobj_evt_callback( + rocprofiler_hsa_cb_id_t id, + const rocprofiler_hsa_callback_data_t* cb_data, + void* arg) +{ + const auto evt = evt_cb_table[id].get(); + activity_rtapi_callback_t evt_callback = (activity_rtapi_callback_t)evt.first; + if (evt_callback != NULL) evt_callback(ACTIVITY_DOMAIN_HSA_EVT, id, cb_data, evt.second); + return 
HSA_STATUS_SUCCESS; +} + +PUBLIC_API const char* GetEvtName(uint32_t op) { return strdup("CODEOBJ"); } + +PUBLIC_API bool RegisterEvtCallback(uint32_t op, void* callback, void* arg) { + evt_cb_table[op].set({callback, arg}); + + rocprofiler_hsa_callbacks_t ocb{}; + switch (op) { + case HSA_EVT_ID_ALLOCATE: + ocb.allocate = codeobj_evt_callback; + break; + case HSA_EVT_ID_DEVICE: + ocb.device = codeobj_evt_callback; + break; + case HSA_EVT_ID_MEMCOPY: + ocb.memcopy = codeobj_evt_callback; + break; + case HSA_EVT_ID_SUBMIT: + ocb.submit = codeobj_evt_callback; + break; + case HSA_EVT_ID_KSYMBOL: + ocb.ksymbol = codeobj_evt_callback; + break; + case HSA_EVT_ID_CODEOBJ: + ocb.codeobj = codeobj_evt_callback; + break; + default: + fatal("invalid activity opcode"); + } + rocprofiler_set_hsa_callbacks(ocb, NULL); + + return true; +} + +PUBLIC_API bool RemoveEvtCallback(uint32_t op) { + rocprofiler_hsa_callbacks_t ocb{}; + rocprofiler_set_hsa_callbacks(ocb, NULL); return true; } } // extern "C" diff --git a/src/core/activity.h b/src/core/activity.h new file mode 100644 index 00000000..ad64c0fa --- /dev/null +++ b/src/core/activity.h @@ -0,0 +1,26 @@ +#ifndef _SRC_CORE_ACTIVITY_H +#define _SRC_CORE_ACTIVITY_H + +#ifdef ROCP_INTERNAL_BUILD +#include "inc/rocprofiler.h" +#else +#include +#endif + +#include + +// HSA EVT ID enumeration +enum hsa_evt_id_t { + HSA_EVT_ID_ALLOCATE = ROCPROFILER_HSA_CB_ID_ALLOCATE, + HSA_EVT_ID_DEVICE = ROCPROFILER_HSA_CB_ID_DEVICE, + HSA_EVT_ID_MEMCOPY = ROCPROFILER_HSA_CB_ID_MEMCOPY, + HSA_EVT_ID_SUBMIT = ROCPROFILER_HSA_CB_ID_SUBMIT, + HSA_EVT_ID_KSYMBOL = ROCPROFILER_HSA_CB_ID_KSYMBOL, + HSA_EVT_ID_CODEOBJ = ROCPROFILER_HSA_CB_ID_CODEOBJ, + HSA_EVT_ID_NUMBER +}; + +// HSA EVT callback data type +typedef rocprofiler_hsa_callback_data_t hsa_evt_data_t; + +#endif // _SRC_CORE_ACTIVITY_H diff --git a/src/core/hsa_interceptor.h b/src/core/hsa_interceptor.h index 9207730b..ed20da96 100644 --- a/src/core/hsa_interceptor.h +++ 
b/src/core/hsa_interceptor.h @@ -51,7 +51,8 @@ SOFTWARE. (ID == ROCPROFILER_HSA_CB_ID_DEVICE) ? callbacks_.device: \ (ID == ROCPROFILER_HSA_CB_ID_MEMCOPY) ? callbacks_.memcopy: \ (ID == ROCPROFILER_HSA_CB_ID_SUBMIT) ? callbacks_.submit: \ - callbacks_.ksymbol; \ + (ID == ROCPROFILER_HSA_CB_ID_KSYMBOL) ? callbacks_.ksymbol: \ + callbacks_.codeobj; \ if ((__callback != NULL) && (recursion_ == false)) #define DO_HSA_CALLBACK \ @@ -230,12 +231,12 @@ class HsaInterceptor { rocprofiler_hsa_callback_data_t data{}; data.allocate.ptr = *ptr; data.allocate.size = size; - + HSA_RT(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &data.allocate.segment)); HSA_RT(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &data.allocate.global_flag)); - + DO_HSA_CALLBACK; - + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { // Scan the pool assigned devices agent_callback_data_t callback_data{pool, *ptr}; @@ -303,44 +304,116 @@ class HsaInterceptor { void* arg) { const int free_flag = reinterpret_cast(arg); - rocprofiler_hsa_callback_data_t data{}; + hsa_ven_amd_loader_code_object_storage_type_t storage_type = + HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE; + int storage_fd = -1; + uint64_t memory_base = 0; + uint64_t memory_size = 0; + uint64_t load_base = 0; + uint64_t load_size = 0; + uint64_t load_delta = 0; + uint32_t uri_len = 0; + char* uri_str = NULL; HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( loaded_code_object, - HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, - &data.allocate.ptr)); + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_TYPE, + &storage_type)); - if (free_flag == 0) { + if (storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_FILE) { HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( loaded_code_object, - HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, - &data.allocate.size)); - } else { - data.allocate.size = 0; + 
HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_FILE, + &storage_fd)); + if (storage_fd == -1) { + printf("CodeObjectCallback: fd == -1\n"); fflush(stdout); + abort(); + } + } else if (storage_type == HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_MEMORY) { + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_BASE, + &memory_base)); + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_CODE_OBJECT_STORAGE_MEMORY_SIZE, + &memory_size)); + } + + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, + &load_base)); + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE, + &load_size)); + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA, + &load_delta)); + + // Getting URI + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI_LENGTH, + &uri_len)); + + uri_str = (char*)calloc(uri_len + 1, sizeof(char)); + if (!uri_str) EXC_ABORT(HSA_STATUS_ERROR, "URI allocation"); + + HSA_RT(LoaderApiTable.hsa_ven_amd_loader_loaded_code_object_get_info( + loaded_code_object, + HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_URI, + uri_str)); + + if (storage_type != HSA_VEN_AMD_LOADER_CODE_OBJECT_STORAGE_TYPE_NONE) { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_CODEOBJ) { + rocprofiler_hsa_callback_data_t data{}; + data.codeobj.storage_type = storage_type; + data.codeobj.storage_file = storage_fd; + data.codeobj.memory_base = memory_base; + data.codeobj.memory_size = memory_size; + data.codeobj.load_base = load_base; + 
data.codeobj.load_size = load_size; + data.codeobj.load_delta = load_delta; + data.codeobj.uri_length = uri_len; + data.codeobj.uri = uri_str; + data.codeobj.unload = free_flag; + + DO_HSA_CALLBACK; + } } - // Local GPU memory - // GLOBAL; FLAGS: COARSE GRAINED - data.allocate.segment = HSA_AMD_SEGMENT_GLOBAL; - data.allocate.global_flag = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED; - data.allocate.is_code = 1; + { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + // Local GPU memory + // GLOBAL; FLAGS: COARSE GRAINED + rocprofiler_hsa_callback_data_t data{}; + data.allocate.ptr = reinterpret_cast(load_base); + data.allocate.size = (free_flag == 0) ? load_size : 0; + data.allocate.segment = HSA_AMD_SEGMENT_GLOBAL; + data.allocate.global_flag = HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED; + data.allocate.is_code = 1; - ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE); + DO_HSA_CALLBACK; + } + } - if (free_flag == 0) { + if (free_flag != 0) { IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_DEVICE) { hsa_amd_pointer_info_t pointer_info{}; uint32_t num_agents = 0; hsa_agent_t* agents = NULL; pointer_info.size = sizeof(hsa_amd_pointer_info_t); HSA_RT(hsa_amd_pointer_info( - const_cast(data.allocate.ptr), + reinterpret_cast(load_base), &pointer_info, malloc, &num_agents, &agents)); - - DeviceCallback(num_agents, agents, data.allocate.ptr); + + DeviceCallback(num_agents, agents, reinterpret_cast(load_base)); } } @@ -372,7 +445,7 @@ class HsaInterceptor { data.ksymbol.object = obj; data.ksymbol.name = name; data.ksymbol.name_length = len; - data.ksymbol.destroy = free_flag; + data.ksymbol.unload = free_flag; ISSUE_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL); } @@ -388,22 +461,23 @@ class HsaInterceptor { HSA_RT(hsa_executable_freeze_fn(executable, options)); - IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) { + IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL) { + HSA_RT(hsa_executable_iterate_symbols( + executable, + KernelSymbolCallback, + reinterpret_cast(0))); + } + + 
unsigned is_codeobj_cb = 0; + { IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_CODEOBJ) is_codeobj_cb |= 1; } + { IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_ALLOCATE) is_codeobj_cb |= 1; } + if (is_codeobj_cb) { LoaderApiTable.hsa_ven_amd_loader_executable_iterate_loaded_code_objects( executable, CodeObjectCallback, reinterpret_cast(0)); } - { - IS_HSA_CALLBACK(ROCPROFILER_HSA_CB_ID_KSYMBOL) { - HSA_RT(hsa_executable_iterate_symbols( - executable, - KernelSymbolCallback, - reinterpret_cast(0))); - } - } - return status; } diff --git a/src/core/rocprofiler.cpp b/src/core/rocprofiler.cpp index 5d1cd9c7..a544ca58 100644 --- a/src/core/rocprofiler.cpp +++ b/src/core/rocprofiler.cpp @@ -219,6 +219,7 @@ uint32_t LoadTool() { if (settings.k_concurrent) { Context::k_concurrent_ = settings.k_concurrent; InterceptQueue::k_concurrent_ = settings.k_concurrent; + InterceptQueue::TrackerOn(true); } if (settings.opt_mode) InterceptQueue::opt_mode_ = true; } @@ -536,7 +537,8 @@ PUBLIC_API bool OnLoad(HsaApiTable* table, uint64_t runtime_version, uint64_t fa rocprofiler::StandaloneIntercept(); } - ONLOAD_TRACE_END(); + ONLOAD_TRACE("end intercept_mode(" << std::hex << intercept_mode << ")" << + " intercept_mode_mask(" << std::hex << intercept_mode_mask << ")" << std::dec); return true; } diff --git a/test/tool/tool.cpp b/test/tool/tool.cpp index 34650a34..4bdce5dd 100644 --- a/test/tool/tool.cpp +++ b/test/tool/tool.cpp @@ -87,6 +87,7 @@ struct kernel_properties_t { uint32_t sgpr_count; uint32_t fbarrier_count; hsa_signal_t signal; + uint64_t object; }; // Context stored entry type @@ -392,7 +393,7 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { const std::string nik_name = (to_truncate_names == 0) ? 
entry->data.kernel_name : filtr_kernel_name(entry->data.kernel_name); const AgentInfo* agent_info = HsaRsrcFactory::Instance().GetAgentInfo(entry->agent); - fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), kernel-name(\"%s\")", + fprintf(file_handle, "dispatch[%u], gpu-id(%u), queue-id(%u), queue-index(%lu), pid(%u), tid(%u), grd(%u), wgr(%u), lds(%u), scr(%u), vgpr(%u), sgpr(%u), fbar(%u), sig(0x%lx), obj(0x%lx), kernel-name(\"%s\")", index, agent_info->dev_index, entry->data.queue_id, @@ -407,6 +408,7 @@ bool dump_context_entry(context_entry_t* entry, bool to_clean = true) { (entry->kernel_properties.sgpr_count + agent_info->sgpr_block_dflt) * agent_info->sgpr_block_size, entry->kernel_properties.fbarrier_count, entry->kernel_properties.signal.handle, + entry->kernel_properties.object, nik_name.c_str()); if (record) fprintf(file_handle, ", time(%lu,%lu,%lu,%lu)", record->dispatch, @@ -540,6 +542,37 @@ bool context_pool_handler(const rocprofiler_pool_entry_t* entry, void* arg) { return false; } +// Profiling completion handler for concurrent implementation +// Dump the context entry +// Return true if the context was dumped successfully +bool context_handler_con(rocprofiler_group_t group, void* arg) { + context_entry_t* entry = reinterpret_cast(arg); + + if (pthread_mutex_lock(&mutex) != 0) { + perror("pthread_mutex_lock"); + abort(); + } + + bool ret = true; + ret = dump_context_entry(entry); + if (ret == false) { + fprintf(stderr, "tool error: context is not complete\n"); + abort(); + } + + if (trace_on) { + fprintf(stdout, "tool::handler_con: context_map %d tid %u\n", (int)(ctx_a_map->size()), GetTid()); + fflush(stdout); + } + + if (pthread_mutex_unlock(&mutex) != 0) { + perror("pthread_mutex_unlock"); + abort(); + } + + return false; +} + bool check_filter(const rocprofiler_callback_data_t* callback_data, const callbacks_data_t* 
tool_data) { bool found = true; @@ -617,6 +650,7 @@ void set_kernel_properties(const rocprofiler_callback_data_t* callback_data, kernel_properties_ptr->sgpr_count = AMD_HSA_BITS_GET(kernel_code->compute_pgm_rsrc1, AMD_COMPUTE_PGM_RSRC_ONE_GRANULATED_WAVEFRONT_SGPR_COUNT); kernel_properties_ptr->fbarrier_count = kernel_code->workgroup_fbarrier_count; kernel_properties_ptr->signal = callback_data->completion_signal; + kernel_properties_ptr->object = callback_data->packet->kernel_object; } // Kernel disoatch callback @@ -881,6 +915,7 @@ rocprofiler_hsa_callbacks_t hsa_callbacks { hsa_unified_callback, hsa_unified_callback, hsa_unified_callback, + NULL, NULL }; @@ -889,7 +924,7 @@ hsa_status_t hsa_ksymbol_cb(rocprofiler_hsa_cb_id_t id, const rocprofiler_hsa_callback_data_t* data, void* arg) { - HsaRsrcFactory::SetKernelNameRef(data->ksymbol.object, data->ksymbol.name, data->ksymbol.destroy); + HsaRsrcFactory::SetKernelNameRef(data->ksymbol.object, data->ksymbol.name, data->ksymbol.unload); return HSA_STATUS_SUCCESS; } @@ -1195,8 +1230,6 @@ void rocprofiler_unload(bool is_destr) { abort(); } - if (is_destr) CTX_OUTSTANDING_WAIT = 0; - // Unregister dispatch callback rocprofiler_remove_queue_callbacks(); @@ -1216,7 +1249,6 @@ void rocprofiler_unload(bool is_destr) { } fflush(stdout); -#if 0 // Cleanup if (callbacks_data != NULL) { delete[] callbacks_data->features; @@ -1233,7 +1265,6 @@ void rocprofiler_unload(bool is_destr) { range_vec = NULL; delete context_array; context_array = NULL; -#endif ONLOAD_TRACE_END(); } From 939b0e2124f8bb10126cee48496d5692079a4709 Mon Sep 17 00:00:00 2001 From: Bert Wesarg Date: Sat, 12 Dec 2020 09:03:23 +0100 Subject: [PATCH 134/168] Avoid empty-argument lists in C function declarations `()` as a function argument list in C is equivalent to `(...)`, i.e, in C you get at most a warning, if it is called with too many arguments. Clarify this situation by explicitly stating `(void)` as argument list. 
--- inc/rocprofiler.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 4a966190..0adbb585 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -57,8 +57,8 @@ extern "C" { //////////////////////////////////////////////////////////////////////////////// // Returning library version -uint32_t rocprofiler_version_major(); -uint32_t rocprofiler_version_minor(); +uint32_t rocprofiler_version_major(void); +uint32_t rocprofiler_version_minor(void); //////////////////////////////////////////////////////////////////////////////// // Global properties structure @@ -275,11 +275,11 @@ hsa_status_t rocprofiler_set_queue_callbacks( void* data); // [in/out] passed callbacks data // Remove queue callbacks -hsa_status_t rocprofiler_remove_queue_callbacks(); +hsa_status_t rocprofiler_remove_queue_callbacks(void); // Start/stop queue callbacks -hsa_status_t rocprofiler_start_queue_callbacks(); -hsa_status_t rocprofiler_stop_queue_callbacks(); +hsa_status_t rocprofiler_start_queue_callbacks(void); +hsa_status_t rocprofiler_stop_queue_callbacks(void); //////////////////////////////////////////////////////////////////////////////// // Start/stop profiling From 58ae9b67ae070868520118b8e6f4c64d950b6111 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Tue, 19 Jan 2021 12:53:59 -0600 Subject: [PATCH 135/168] Update rocprofiler_spec.md --- doc/rocprofiler_spec.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md index 75ab58b7..37952a56 100644 --- a/doc/rocprofiler_spec.md +++ b/doc/rocprofiler_spec.md @@ -746,7 +746,9 @@ Fetching a context entry: ``` ### 5.5. Standalone Sampling Usage Code Example ``` -The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. To enable the sampling mode, the profiling mode in all user queues should be enabled. 
It can be done by loading ROC-profiler library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. +The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. +To enable the sampling mode, the profiling mode in all user queues should be enabled. It can be done by loading ROC-profiler +library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. // Sampling rate uint32_t sampling_rate = ; // Sampling count From 4f948e5e17d80c789409b06f524923be98129ab5 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Wed, 20 Jan 2021 19:50:21 -0600 Subject: [PATCH 136/168] Update rocprof.md --- doc/rocprof.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/rocprof.md b/doc/rocprof.md index 717653eb..3b4c9f99 100644 --- a/doc/rocprof.md +++ b/doc/rocprof.md @@ -193,7 +193,9 @@ Tracing can be filtered by events names using profiler input file and by enablin #### 3.2.1. Filtering traced APIs A list of traced API names can be specified in profiler input file. An example of input file line for ROCr runtime trace (HAS API): +``` hsa: hsa_queue_create hsa_amd_memory_pool_allocate +``` #### 3.2.2. 
Tracing time period Trace can be dumped periodically with initial delay, dumping period length and rate: ``` From 8cd086e6fd568367056c79dd7d14e07d67c76861 Mon Sep 17 00:00:00 2001 From: eshcherb <33529668+eshcherb@users.noreply.github.com> Date: Mon, 8 Feb 2021 10:01:48 -0600 Subject: [PATCH 137/168] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 91361b56..85276d07 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ To use the rocProfiler API you need the API header and to link your application ## Internal 'simple_convolution' test run script: ``` cd .../rocprofiler/build + make mytest run.sh ``` From 93778bdc4fa5403fedede3afcc1155338c62a6f3 Mon Sep 17 00:00:00 2001 From: Evgeny Date: Thu, 8 Apr 2021 10:13:00 -0500 Subject: [PATCH 138/168] cleanup --- test/tool/input.xml | 14 -------------- test/tool/input1.xml | 14 -------------- test/tool/input2.xml | 5 ----- 3 files changed, 33 deletions(-) delete mode 100644 test/tool/input.xml delete mode 100644 test/tool/input1.xml delete mode 100644 test/tool/input2.xml diff --git a/test/tool/input.xml b/test/tool/input.xml deleted file mode 100644 index f4ecd178..00000000 --- a/test/tool/input.xml +++ /dev/null @@ -1,14 +0,0 @@ -# Filter by dispatches range, GPU index and kernel names - - -# List of metrics - diff --git a/test/tool/input1.xml b/test/tool/input1.xml deleted file mode 100644 index f4ecd178..00000000 --- a/test/tool/input1.xml +++ /dev/null @@ -1,14 +0,0 @@ -# Filter by dispatches range, GPU index and kernel names - - -# List of metrics - diff --git a/test/tool/input2.xml b/test/tool/input2.xml deleted file mode 100644 index 254c83dc..00000000 --- a/test/tool/input2.xml +++ /dev/null @@ -1,5 +0,0 @@ -# List of metrics - From 48cc7855d52e0024071447aa061c65c6fa204fcd Mon Sep 17 00:00:00 2001 From: Christophe Paquot Date: Tue, 22 Jun 2021 21:12:08 -0700 Subject: [PATCH 139/168] SWDEV-282961: dependency arrows missing When building the json data 
flow, from_us_list has (timestamp, stream_id, thread_id). stream_id used to be interpreted as from_tid and tid as to_tid. But that's not correct. stream_id is always a destination and tid is the initiator (source). Change-Id: I2f5bb86a387b4003b17271c90bdf9de4b59a79bf --- bin/sqlitedb.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index dc5358ff..00a7dba2 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -159,11 +159,11 @@ def flow_json(self, base_id, from_pid, from_us_list, to_pid, to_us_dict, corr_id for ind in range(len(from_us_list)): corr_id = corr_id_list[ind] if (len(corr_id_list) != 0) else ind if corr_id in to_us_dict: - (from_ts, from_tid, to_tid) = from_us_list[ind] + (from_ts, stream_id, tid) = from_us_list[ind] to_ts = to_us_dict[corr_id] if from_ts > to_ts: from_ts = to_ts - fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, from_tid)) - fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, to_tid)) + fd.write(',{"ts":%d,"ph":"s","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (from_ts, dep_id, from_pid, tid)) + fd.write(',{"ts":%d,"ph":"t","cat":"DataFlow","id":%d,"pid":%d,"tid":%d,"name":"dep"}\n' % (to_ts, dep_id, to_pid, stream_id)) dep_id += 1 def metadata_json(self, jsonfile, sysinfo_file): From 9f0ca101ec2286827c9b1cd8ccfa2ee3438f6a15 Mon Sep 17 00:00:00 2001 From: AMD Date: Wed, 16 Jun 2021 18:33:58 -0500 Subject: [PATCH 140/168] Add support for gfx90a Merge gfx90a support from the 'amd-npi' branch. 
Change-Id: I9b51711ed4a1d2f1ed42ba9b83cb12136be228b8 (cherry picked from commit 4df3e0bd9ae6e5982b43fd2fc3867cf5f0b87a53) --- src/core/metrics.h | 6 +++--- test/tool/gfx_metrics.xml | 17 +++++++++++++++++ test/tool/metrics.xml | 8 ++++++-- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/core/metrics.h b/src/core/metrics.h index a221168a..6eeebee3 100644 --- a/src/core/metrics.h +++ b/src/core/metrics.h @@ -196,9 +196,9 @@ class MetricsDict { xml_->AddConst("top.const.metric", "SE_NUM", agent_info->se_num); ImportMetrics(agent_info, "const"); agent_name_ = agent_info->name; - if (std::string("gfx906") == agent_info->name) { - ImportMetrics(agent_info, agent_info->name); - } else if (std::string("gfx908") == agent_info->name) { + if (std::string("gfx906") == agent_info->name || + std::string("gfx908") == agent_info->name || + std::string("gfx90a") == agent_info->name) { ImportMetrics(agent_info, agent_info->name); } else { agent_name_ = agent_info->gfxip; diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index 698826c6..c2a79af2 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -101,3 +101,20 @@ + + + + + + + + + + + + + + + + + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index c340a439..a920ff04 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -65,12 +65,16 @@ + + -# VG20 +# Vega20 -# MI100 +# Arcturus +# Aldebaran + # GPUBusy The percentage of time GPU was busy. From a369af3049bc3a6d2faca537f4e47d5a046f01cf Mon Sep 17 00:00:00 2001 From: rachida Date: Tue, 18 May 2021 20:11:48 -0400 Subject: [PATCH 141/168] SWDEV-282961 Skip barrier events. Process hipMemSet events Marker events inside hcc_ops_trace.txt are from barriers so they are not meant to be stored in ops_patch_data map. Added support for hipMemset events which are a kind of memory copy. 
Change-Id: I213fe959bcd35ff0371613ba5bffd95bc53e06b5 (cherry picked from commit caa5f323007734fd0b14b3fa49618a5d7cc7acdd) --- bin/tblextr.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index 61644e2a..deafb199 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -354,7 +354,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep copy_index = 0 ptrn_val = re.compile(r'(\d+):(\d+) (\d+):(\d+) ([^\(]+)(\(.*)$') - hip_mcopy_ptrn = re.compile(r'hipMemcpy') + hip_mcopy_ptrn = re.compile(r'hipMemcpy|hipMemset') hip_wait_event_ptrn = re.compile(r'WaitEvent') hip_sync_event_ptrn = re.compile(r'hipStreamSynchronize') hip_sync_dev_event_ptrn = re.compile(r'hipDeviceSynchronize') @@ -430,7 +430,6 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep if corr_id == 0: corr_id = record_id rec_vals.append(corr_id) - # extracting/converting stream id (stream_id, stream_found) = get_field(record_args, 'stream') if stream_found == 0: @@ -489,9 +488,6 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep mcopy_found = 1 op_found = 1 - if op_found: - ops_patch_data[(corr_id, proc_id)] = (thread_id, stream_id, kernel_str) - # HIP WaitEvent API if wait_event_ptrn.search(record_name): op_found = 1 @@ -505,6 +501,9 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep hsa_patch_data[(copy_index, proc_id)] = thread_id copy_index += 1 + if op_found: + ops_patch_data[(corr_id, proc_id)] = (thread_id, stream_id, kernel_str) + if op_found: op_found = 0 beg_ns = int(rec_vals[0]) @@ -650,6 +649,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): proc_id = int(m.group(3)) # checking name for memcopy pattern + is_barrier = 0 if ptrn_mcopy.search(name): rec_table_name = mcopy_table_name table_handle = mcopy_table_handle @@ -664,6 +664,7 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, 
indir): if ptrn_barrier.search(name): name = '""' + is_barrier = 1 thread_id = 0 stream_id = 0 @@ -671,7 +672,8 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): (thread_id, stream_id, name_patch) = ops_patch_data[(corr_id, proc_id)] if name_patch != '': name = name_patch else: - fatal("hcc ops data not found: '" + record + "', " + str(corr_id) + ", " + str(proc_id)) + if is_barrier: continue + else: fatal("hcc ops data not found: '" + record + "', " + str(corr_id) + ", " + str(proc_id)) # activity record rec_vals[4] = name # Name @@ -878,3 +880,4 @@ def fill_ops_db(kernel_table_name, mcopy_table_name, db, indir): sys.exit(0) ############################################################# + From 2f189791a5cf6a6beb0aea9572d8760f792f0b32 Mon Sep 17 00:00:00 2001 From: Icarus Sparry Date: Wed, 7 Jul 2021 15:49:02 +0000 Subject: [PATCH 142/168] Add dependency on rocm-core Signed-off-by: Icarus Sparry Change-Id: Icb935e9230888fd005d9ca3617e28f6173173cc8 --- CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c34f7cc9..e47f06df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,7 +204,7 @@ else() endif() message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}" ) set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) -set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-core" ) ## Process the Debian install/remove scripts to update the CPACK variables configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY ) configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY ) @@ -229,12 +229,17 @@ if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) endif() set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) -set ( CPACK_RPM_PACKAGE_DEPENDS "hsa-rocr-dev" ) +set ( CPACK_RPM_PACKAGE_REQUIRES "hsa-rocr-dev, rocm-core" 
) ## Process the Rpm install/remove scripts to update the CPACK variables configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY ) configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY ) set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/postun" ) +# Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake +if(NOT ROCM_DEP_ROCMCORE) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PACKAGE_REQUIRES ${CPACK_RPM_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PACKAGE_DEPENDS ${CPACK_DEBIAN_PACKAGE_DEPENDS}) +endif() include ( CPack ) From ae6c093864009c8974c50db3b52d21839663738a Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Wed, 27 Oct 2021 19:06:30 -0600 Subject: [PATCH 143/168] Adding Known Issues --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 85276d07..48033a1f 100644 --- a/README.md +++ b/README.md @@ -195,3 +195,10 @@ Configuration file: obj-tracking=on > ``` + + +## Known Issues: +- For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" + - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. +- When the same kernel is launched back to back multiple times on a GPU, the cache hit rate from rocprofiler is reported as 0% or very low. This also causes FETCH_SIZE to be not usable for repeatable kernel. +- OpenMP applications are not fully supported by the rocprofiler. 
From 83592409edf7e69695abe0d6fd1ae0e8c8dc6e69 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Wed, 27 Oct 2021 19:07:52 -0600 Subject: [PATCH 144/168] Updating Known Issues --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 48033a1f..95a3476f 100644 --- a/README.md +++ b/README.md @@ -201,4 +201,3 @@ Configuration file: - For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. - When the same kernel is launched back to back multiple times on a GPU, the cache hit rate from rocprofiler is reported as 0% or very low. This also causes FETCH_SIZE to be not usable for repeatable kernel. -- OpenMP applications are not fully supported by the rocprofiler. From b71b5414d1e1d52d2dfe30b7216fc809613a0f44 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Fri, 21 Jan 2022 12:05:10 -0600 Subject: [PATCH 145/168] SWDEV-318551: Adding License file for profiler Making the new License file, Adding support in the CMakeLists.txt Change-Id: I785035a780fbfc59951fc27d45f9c1869ffb4fb3 --- CMakeLists.txt | 5 +++++ LICENSE | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 68f9b86a..dfb069be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,6 +191,9 @@ set ( CPACK_PACKAGE_CONTACT "ROCm Profiler Support Date: Wed, 9 Feb 2022 14:18:35 -0600 Subject: [PATCH 146/168] Update Readme --- Readme.txt | 211 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 178 insertions(+), 33 deletions(-) diff --git a/Readme.txt b/Readme.txt index 9008165a..e83b410c 100644 --- a/Readme.txt +++ b/Readme.txt @@ -1,54 +1,199 @@ -ROC Profiler library. -Profiling with metrics and traces based on perfcounters (PMC) and traces (SPM). 
-Implementation is based on AqlProfile HSA extension. -Library supports GFX8/GFX9. +# ROC-profiler +ROC profiler library. Profiling with perf-counters and derived metrics. Library supports GFX8/GFX9. -The library source tree: - - doc - Documentation +HW specific low-level performance analysis interface for profiling of GPU compute applications. The +profiling includes HW performance counters with complex performance metrics. + +To use the rocProfiler API you need the API header and to link your application with the rocprofiler .so library: + - the API header: /opt/rocm/rocprofiler/include/rocprofiler.h + - the .so library: /opt/rocm/lib/librocprofiler64.so + +## Documentation +- ['rocprof' cmdline tool specification](doc/rocprof.md) +- ['rocprofiler' profiling C API specification](doc/rocprofiler_spec.md) + +## Metrics +[The link to profiler default metrics XML specification](test/tool/metrics.xml) + + +## Source tree +``` + - bin + - rocprof - Profiling tool run script + - doc - Documentation - inc/rocprofiler.h - Library public API - src - Library sources - core - Library API sources - util - Library utils sources - xml - XML parser - test - Library test suite + - tool - Profiling tool + - tool.cpp - tool sources + - metrics.xml - metrics config file - ctrl - Test controll - util - Test utils - simple_convolution - Simple convolution test kernel +``` + +## Build environment: +``` + export CMAKE_PREFIX_PATH=: + export CMAKE_BUILD_TYPE= # release by default + export CMAKE_DEBUG_TRACE=1 # to enable debug tracing +``` + +## To build with the current installed ROCM: +``` + - ROCm is required. + ROCr-runtime and roctracer are needed + + - Python is required. + The required modules: CppHeaderParser, argparse, sqlite3 + To install: + sudo pip install CppHeaderParser argparse sqlite3 + + - To build and install to /opt/rocm/rocprofiler + Please use release branches/tags of 'amd-master' branch for development version.
+ + export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm + + cd .../rocprofiler + ./build.sh +``` + +## Internal 'simple_convolution' test run script: +``` + cd .../rocprofiler/build + make mytest + run.sh +``` + +## To enable error messages logging to '/tmp/rocprofiler_log.txt': +``` + export ROCPROFILER_LOG=1 +``` + +## To enable verbose tracing: +``` + export ROCPROFILER_TRACE=1 +``` + +## Profiling utility usage: +``` +rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts FetchSize + # Perf counters group 2 + pmc : VALUUtilization,WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 -Build environment: + Input file .xml format, for single profiling run: -$ export CMAKE_PREFIX_PATH=: -$ export CMAKE_BUILD_TYPE= # release by default -$ export CMAKE_DEBUG_TRACE=1 # 1 to enable debug tracing + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + -To build with the current installed ROCM: + # Filter by dispatches range, GPU index and kernel names + -$ cd .../rocprofiler -$ export CMAKE_PREFIX_PATH=/opt/rocm/include/hsa:/opt/rocm -$ mkdir build -$ cd build -$ cmake .. 
-$ make + -o - output CSV file [.csv] + The output CSV file columns meaning in the columns order: + Index - kernels dispatch order index + KernelName - the dispatched kernel name + gpu-id - GPU id the kernel was submitted to + queue-id - the ROCm queue unique id the kernel was submitted to + queue-index - The ROCm queue write index for the submitted AQL packet + tid - system application thread id which submitted the kernel + grd - the kernel's grid size + wgr - the kernel's work group size + lds - the kernel's LDS memory size + scr - the kernel's scratch memory size + vgpr - the kernel's VGPR size + sgpr - the kernel's SGPR size + fbar - the kernel's barriers limitation + sig - the kernel's completion signal + ... - The columns with the counters values per kernel dispatch + DispatchNs/BeginNs/EndNs/CompleteNs - timestamp columns if time-stamping was enabled + + -d - directory where the profiler stores profiling data including thread traces [/tmp] + The data directory is removed automatically if it matches the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removal of the profiling data from /tmp or enable removal from a directory other than '/tmp'.
-To run the test: + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel dispatches timestamps, dispatch/begin/end/complete [off] + Four kernel timestamps in nanoseconds are reported: + DispatchNs - the time when the kernel AQL dispatch packet was written to the queue + BeginNs - the kernel execution begin time + EndNs - the kernel execution end time + CompleteNs - the time when the completion signal of the AQL dispatch packet was received -$ cd .../rocprofiler/build -$ export LD_LIBRARY_PATH=.: # paths to ROC profiler and oher libraries -$ export HSA_TOOLS_LIB=librocprofiler64.so # ROC profiler library loaded by HSA runtime -$ export ROCP_TOOL_LIB=test/libtool.so # tool library loaded by ROC profiler -$ export ROCP_METRICS=metrics.xml # ROC profiler metrics config file -$ export ROCP_INPUT=input.xml # input file for the tool library -$ export ROCP_OUTPUT_DIR=./ # output directory for the tool library, for metrics results file 'results.txt' and trace files -$ + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [on] + To support V3 code-object. -Internal 'simple_convolution' test run script: -$ cd .../rocprofiler/build -$ run.sh + --stats - generating kernel execution stats, file .stats.csv + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. 
+ --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible + Generated files: ._stats.txt .json + Traced API list can be set by input .txt or .xml files. + Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + -To enabled error messages logging to '/tmp/rocprofiler_log.txt': + --trace-start - to enable tracing on start [on] + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: -$ export ROCPROFILER_LOG=1 +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. The search path sequence: .:$HOME: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. + An example of 'rpl_rc.xml': + +``` -To enable verbose tracing: -$ export ROCPROFILER_TRACE=1 +## Known Issues: +- For workloads where the hip application might make more than 10 million HIP API calls, the application might crash with the error - "Profiling data corrupted" + - Suggested Workaround - Instead of profiling for the complete run, it is suggested to run profiling in parts by using the --trace-period option. +- When the same kernel is launched back to back multiple times on a GPU, the cache hit rate from rocprofiler is reported as 0% or very low. This also causes FETCH_SIZE to be not usable for repeatable kernel. 
From 967d6c27259f0e2b1d0bf254b14e8ec7cd70009b Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Wed, 9 Feb 2022 14:19:02 -0600 Subject: [PATCH 147/168] Rename Readme.txt to README.md --- Readme.txt => README.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Readme.txt => README.md (100%) diff --git a/Readme.txt b/README.md similarity index 100% rename from Readme.txt rename to README.md From b318ef99a9a23bdf827234de59888b4dfddd35e7 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Thu, 24 Feb 2022 10:34:52 -0600 Subject: [PATCH 148/168] Create rocprof.md --- doc/rocprof.md | 393 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) create mode 100644 doc/rocprof.md diff --git a/doc/rocprof.md b/doc/rocprof.md new file mode 100644 index 00000000..3b4c9f99 --- /dev/null +++ b/doc/rocprof.md @@ -0,0 +1,393 @@ +# rocprof +## 1. Overview +The rocProf is a command line tool implemented on the top of rocProfiler and rocTracer APIs. Source code for rocProf may be found here: +GitHub: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/bin/rocprof +This command line tool is implemented as a script which is setting up the environment for attaching the profiler and then run the provided application command line. The tool uses two profiling plugins loaded by ROC runtime and based on rocProfiler and rocTracer for collecting metrics/counters, HW traces and runtime API/activity traces. The tool consumes an input XML or text file with counters list or trace parameters and provides output profiling data and statistics in various formats as text, CSV and JSON traces. Google Chrome tracing can be used to visualize the JSON traces with runtime API/activity timelines and per kernel counters data. +## 2. Profiling Modes +‘rocprof’ can be used for GPU profiling using HW counters and application tracing +### 2.1. 
GPU profiling +GPU profiling is controlled with input file which defines a list of metrics/counters and a profiling scope. An input file is provided using option ‘-i ’. Output CSV file with a line per submitted kernel is generated. Each line has kernel name, kernel parameters and counter values. By option ‘—stats’ the kernel execution stats can be generated in CSV format. Currently profiling has limitation of serializing submitted kernels. +An example of input file: +``` + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts + # Perf counters group 2 + pmc : TCC_HIT[0], TCC_MISS[0] + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 +``` +An example of profiling command line for ‘MatrixTranspose’ application +``` +$ rocprof -i input.txt MatrixTranspose +RPL: on '191018_011134' from '/…./rocprofiler_pkg' in '/…./MatrixTranspose' +RPL: profiling '"./MatrixTranspose"' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_011134_9695' +RPL: result dir '/tmp/rpl_data_191018_011134_9695/input0_results_191018_011134' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_011134_9695/input0.xml" + gpu_index = + kernel = + range = + 4 metrics + L2CacheHit, VFetchInsts, VWriteInsts, MemUnitStalled + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] +PASSED! + +ROCPRofiler: 1 contexts collected, output directory /tmp/rpl_data_191018_011134_9695/input0_results_191018_011134 +RPL: '/…./MatrixTranspose/input.csv' is generated +``` +#### 2.1.1. Counters and metrics +There are two profiling features, metrics and traces. Hardware performance counters are treated as the basic metrics and the formulas can be defined for derived metrics. 
+Counters and metrics can be dynamically configured using XML configuration files with counters and metrics tables: + - Counters table entry, basic metric: counter name, block name, event id + - Derived metrics table entry: metric name, an expression for calculation the metric from the counters + +Metrics XML File Example: +``` + + + + . . . + + + + . . . + + + + + +``` +##### 2.1.1.1. Metrics query +Available counters and metrics can be queried by options ‘—list-basic’ for counters and ‘—list-derived’ for derived metrics. The output for counters indicates number of block instances and number of block counter registers. The output for derived metrics prints the metrics expressions. +Examples: +``` +$ rocprof --list-basic +RPL: on '191018_014450' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCProfiler: rc-file '/…./rpl_rc.xml' +Basic HW counters: + gpu-agent0 : GRBM_COUNT : Tie High - Count Number of Clocks + block GRBM has 2 counters + gpu-agent0 : GRBM_GUI_ACTIVE : The GUI is Active + block GRBM has 2 counters + . . . + gpu-agent0 : TCC_HIT[0-15] : Number of cache hits. + block TCC has 4 counters + gpu-agent0 : TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. + block TCC has 4 counters + . . . + +$ rocprof --list-derived +RPL: on '191018_015911' from '/opt/rocm/rocprofiler' in '/home/evgeny/work/BUILD/0_MatrixTranspose' +ROCProfiler: rc-file '/home/evgeny/rpl_rc.xml' +Derived metrics: + gpu-agent0 : TCC_HIT_sum : Number of cache hits. Sum over TCC instances. + TCC_HIT_sum = sum(TCC_HIT,16) + gpu-agent0 : TCC_MISS_sum : Number of cache misses. Sum over TCC instances. + TCC_MISS_sum = sum(TCC_MISS,16) + gpu-agent0 : TCC_MC_RDREQ_sum : Number of 32-byte reads. Sum over TCC instaces. + TCC_MC_RDREQ_sum = sum(TCC_MC_RDREQ,16) + . . . +``` +##### 2.1.1.2. Metrics collecting +Counters and metrics accumulated per kernel can be collected using input file with a list of metrics, see an example in 2.1. 
+Currently profiling has limitation of serializing submitted kernels. +The number of counters which can be dumped by one run is limited by GPU HW by number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. +###### 2.1.1.2.1. Blocks instancing +GPU blocks are implemented as several identical instances. To dump counters of specific instance square brackets can be used, see an example in 2.1. +The number of block instances can be queried, see 2.1.1.1. +###### 2.1.1.2.2. HW limitations +The number of counters which can be dumped by one run is limited by GPU HW by number of counter registers per block. The number of counters can be different for different blocks and can be queried, see 2.1.1.1. + - Metrics groups + +To dump a list of metrics exceeding HW limitations the metrics list can be split on groups. +The tool supports automatic splitting on optimal metric groups: +``` +$ rocprof -i input.txt ./MatrixTranspose +RPL: on '191018_032645' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +RPL: profiling './MatrixTranspose' +RPL: input file 'input.txt' +RPL: output dir '/tmp/rpl_data_191018_032645_12106' +RPL: result dir '/tmp/rpl_data_191018_032645_12106/input0_results_191018_032645' +ROCProfiler: rc-file '/…./rpl_rc.xml' +ROCProfiler: input from "/tmp/rpl_data_191018_032645_12106/input0.xml" + gpu_index = + kernel = + range = + 20 metrics + Wavefronts, VALUInsts, SALUInsts, SFetchInsts, FlatVMemInsts, LDSInsts, FlatLDSInsts, GDSInsts, VALUUtilization, FetchSize, WriteSize, L2CacheHit, VWriteInsts, GPUBusy, VALUBusy, SALUBusy, MemUnitStalled, WriteUnitStalled, LDSBankConflict, MemUnitBusy + 0 traces +Device name Ellesmere [Radeon RX 470/480/570/570X/580/580X] + +Input metrics out of HW limit. 
Proposed metrics group set: + group1: L2CacheHit VWriteInsts MemUnitStalled WriteUnitStalled MemUnitBusy FetchSize FlatVMemInsts LDSInsts VALUInsts SALUInsts SFetchInsts FlatLDSInsts GPUBusy Wavefronts + group2: WriteSize GDSInsts VALUUtilization VALUBusy SALUBusy LDSBankConflict + +ERROR: rocprofiler_open(), Construct(), Metrics list exceeds HW limits + +Aborted (core dumped) +Error found, profiling aborted. +``` + - Collecting with multiple runs + +To collect several metric groups a full application replay is used by defining several ‘pmc:’ lines in the input file, see 2.1. + +### 2.2. Application tracing +Supported application tracing includes runtime API and GPU activity tracing. +Supported runtimes are: ROCr (HSA API) and HIP +Supported GPU activity: kernel execution, async memory copy, barrier packets. +The trace is generated in JSON format compatible with Chrome tracing. +The trace consists of several sections with timelines for API trace per thread and GPU activity. The timelines events show event name and parameters. +Supported options: ‘--hsa-trace’, ‘--hip-trace’, ‘--sys-trace’, where ‘sys trace’ is for HIP and HSA combined trace. +#### 2.2.1. HIP runtime trace +The trace is generated by option ‘--hip-trace’ and includes HIP API timelines and GPU activity at the runtime level. +#### 2.2.2. ROCr runtime trace +The trace is generated by option ‘--hsa-trace’ and includes ROCr API timelines and GPU activity at AQL queue level. Also, can provide counters per kernel. +#### 2.2.3. KFD driver trace +The trace is generated by option ‘--kfd-trace’ and includes KFD Thunk API timeline. +It is planned to add memory allocations/migration tracing. +#### 2.2.4. Code annotation +Support for application code annotation. +Start/stop API is supported to programmatically control the profiling. +A ‘roctx’ library provides annotation API. Annotation is visualized in JSON trace as a separate "Markers and Ranges" timeline section. +##### 2.2.4.1.
Start/stop API +``` +// Tracing start API +void roctracer_start(); + +// Tracing stop API +void roctracer_stop(); +``` +##### 2.2.4.2. rocTX basic markers API +``` +// A marker created by given ASCII message +void roctxMark(const char* message); + +// Returns the 0 based level of a nested range being started by given message associated to this range. +// A negative value is returned on the error. +int roctxRangePush(const char* message); + +// Marks the end of a nested range. +// Returns the 0 based level of the range. +// A negative value is returned on the error. +int roctxRangePop(); +``` +### 2.3. Multiple GPUs profiling +The profiler supports profiling of multiple GPUs and provides the GPU id for counters and kernels data in the CSV output file. Also, the GPU id is indicated for the respective GPU activity timeline in the JSON trace. +## 3. Profiling control +Profiling can be controlled by specifying a profiling scope, by filtering trace events and specifying interesting time intervals. +### 3.1. Profiling scope +Counters profiling scope can be specified by GPU id list, kernel name substrings list and dispatch range. +Supported range formats examples: "3:9", "3:", "3". You can see an example of input file in 2.1. +### 3.2. Tracing control +Tracing can be filtered by events names using profiler input file and by enabling interesting time intervals by command line option. +#### 3.2.1. Filtering traced APIs +A list of traced API names can be specified in profiler input file. +An example of input file line for ROCr runtime trace (HSA API): +``` +hsa: hsa_queue_create hsa_amd_memory_pool_allocate +``` +#### 3.2.2. Tracing time period +Trace can be dumped periodically with initial delay, dumping period length and rate: +``` +--trace-period +``` +### 3.3. Concurrent kernels +Currently concurrent kernels profiling is not supported which is a planned feature. Kernels are serialized. +### 3.4. Multi-processes profiling +Multi-processes profiling is not currently supported. +### 3.5.
Errors logging +Profiler errors are logged to global logs: +``` +/tmp/aql_profile_log.txt +/tmp/rocprofiler_log.txt +/tmp/roctracer_log.txt +``` +## 4. 3rd party visualization tools +‘rocprof’ is producing JSON trace compatible with Chrome Tracing, which is an internal trace visualization tool in Google Chrome. +### 4.1. Chrome tracing +Good review can be found by the link: https://aras-p.info/blog/2017/01/23/Chrome-Tracing-as-Profiler-Frontend/ +## 5. Command line options +The command line options can be printed with option ‘-h’: +``` +$ rocprof -h +RPL: on '191018_023018' from '/opt/rocm/rocprofiler' in '/…./MatrixTranspose' +ROCm Profiling Library (RPL) run script, a part of ROCprofiler library package. +Full path: /opt/rocm/rocprofiler/bin/rocprof +Metrics definition: /opt/rocm/rocprofiler/lib/metrics.xml + +Usage: + rocprof [-h] [--list-basic] [--list-derived] [-i ] [-o ] + +Options: + -h - this help + --verbose - verbose mode, dumping all base counters used in the input metrics + --list-basic - to print the list of basic HW counters + --list-derived - to print the list of derived metrics with formulas + --cmd-qts - quoting profiled cmd-line [on] + + -i <.txt|.xml file> - input file + Input file .txt format, automatically rerun application for every pmc line: + + # Perf counters group 1 + pmc : Wavefronts VALUInsts SALUInsts SFetchInsts FlatVMemInsts LDSInsts FlatLDSInsts GDSInsts VALUUtilization FetchSize + # Perf counters group 2 + pmc : WriteSize L2CacheHit + # Filter by dispatches range, GPU index and kernel names + # supported range formats: "3:9", "3:", "3" + range: 1 : 4 + gpu: 0 1 2 3 + kernel: simple Pass1 simpleConvolutionPass2 + + Input file .xml format, for single profiling run: + + # Metrics list definition, also the form ":" can be used + # All defined metrics can be found in the 'metrics.xml' + # There are basic metrics for raw HW counters and high-level metrics for derived counters + + + # Filter by dispatches range, GPU index and kernel names 
+ + + -o - output CSV file [.csv] + -d - directory where profiler store profiling data including traces [/tmp] + The data directory is renoving autonatically if the directory is matching the temporary one, which is the default. + -t - to change the temporary directory [/tmp] + By changing the temporary directory you can prevent removing the profiling data from /tmp or enable removing from not '/tmp' directory. + + --basenames - to turn on/off truncating of the kernel full function names till the base ones [off] + --timestamp - to turn on/off the kernel disoatches timestamps, dispatch/begin/end/complete [off] + --ctx-wait - to wait for outstanding contexts on profiler exit [on] + --ctx-limit - maximum number of outstanding contexts [0 - unlimited] + --heartbeat - to print progress heartbeats [0 - disabled] + --obj-tracking - to turn on/off kernels code objects tracking [off] + + --stats - generating kernel execution stats, file .stats.csv + + --roctx-trace - to enable rocTX application code annotation trace, "Markers and Ranges" JSON trace section. + --sys-trace - to trace HIP/HSA APIs and GPU activity, generates stats and JSON trace chrome-tracing compatible + --hip-trace - to trace HIP, generates API execution stats and JSON file chrome-tracing compatible + --hsa-trace - to trace HSA, generates API execution stats and JSON file chrome-tracing compatible + --kfd-trace - to trace KFD, generates API execution stats and JSON file chrome-tracing compatible + Generated files: ._stats.txt .json + Traced API list can be set by input .txt or .xml files. + Input .txt: + hsa: hsa_queue_create hsa_amd_memory_pool_allocate + Input .xml: + + + + + + --trace-start - to enable tracing on start [on] + --trace-period - to enable trace with initial delay, with periodic sample length and rate + Supported time formats: + +Configuration file: + You can set your parameters defaults preferences in the configuration file 'rpl_rc.xml'. 
The search path sequence: .:/home/evgeny: + First the configuration file is looking in the current directory, then in your home, and then in the package directory. + Configurable options: 'basenames', 'timestamp', 'ctx-limit', 'heartbeat', 'obj-tracking'. + An example of 'rpl_rc.xml': + +``` +## 6. Publicly available counters and metrics +The following counters are publicly available for commercially available VEGA10/20 GPUs. + +Counters: +``` +• GRBM_COUNT : Tie High - Count Number of Clocks +• GRBM_GUI_ACTIVE : The GUI is Active +• SQ_WAVES : Count number of waves sent to SQs. (per-simd, emulated, global) +• SQ_INSTS_VALU : Number of VALU instructions issued. (per-simd, emulated) +• SQ_INSTS_VMEM_WR : Number of VMEM write instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_VMEM_RD : Number of VMEM read instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_SALU : Number of SALU instructions issued. (per-simd, emulated) +• SQ_INSTS_SMEM : Number of SMEM instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT : Number of FLAT instructions issued. (per-simd, emulated) +• SQ_INSTS_FLAT_LDS_ONLY : Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated) +• SQ_INSTS_LDS : Number of LDS instructions issued (including FLAT). (per-simd, emulated) +• SQ_INSTS_GDS : Number of GDS instructions issued. (per-simd, emulated) +• SQ_WAIT_INST_LDS : Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic) +• SQ_ACTIVE_INST_VALU : regspec 71? Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, nondeterministic) +• SQ_INST_CYCLES_SALU : Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated) +• SQ_THREAD_CYCLES_VALU : Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). 
(per-simd) +• SQ_LDS_BANK_CONFLICT : Number of cycles LDS is stalled by bank conflicts. (emulated) +• TA_TA_BUSY[0-15] : TA block is busy. Perf_Windowing not supported for this counter. +• TA_FLAT_READ_WAVEFRONTS[0-15] : Number of flat opcode reads processed by the TA. +• TA_FLAT_WRITE_WAVEFRONTS[0-15] : Number of flat opcode writes processed by the TA. +• TCC_HIT[0-15] : Number of cache hits. +• TCC_MISS[0-15] : Number of cache misses. UC reads count as misses. +• TCC_EA_WRREQ[0-15] : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands. +• TCC_EA_WRREQ_64B[0-15] : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. +• TCC_EA_WRREQ_STALL[0-15] : Number of cycles a write request was stalled. +• TCC_EA_RDREQ[0-15] : Number of TCC/EA read requests (either 32-byte or 64-byte) +• TCC_EA_RDREQ_32B[0-15] : Number of 32-byte TCC/EA read requests +• TCP_TCP_TA_DATA_STALL_CYCLES[0-15] : TCP stalls TA data interface. Now Windowed. +``` + +The following derived metrics have been defined and the profiler metrics XML specification can be found at: https://github.com/ROCm-Developer-Tools/rocprofiler/blob/amd-master/test/tool/metrics.xml. + +Metrics: +``` +• TA_BUSY_avr : TA block is busy. Average over TA instances. +• TA_BUSY_max : TA block is busy. Max over TA instances. +• TA_BUSY_min : TA block is busy. Min over TA instances. +• TA_FLAT_READ_WAVEFRONTS_sum : Number of flat opcode reads processed by the TA. Sum over TA instances. +• TA_FLAT_WRITE_WAVEFRONTS_sum : Number of flat opcode writes processed by the TA. Sum over TA instances. +• TCC_HIT_sum : Number of cache hits. Sum over TCC instances. +• TCC_MISS_sum : Number of cache misses. Sum over TCC instances. +• TCC_EA_RDREQ_32B_sum : Number of 32-byte TCC/EA read requests. Sum over TCC instances. 
+• TCC_EA_RDREQ_sum : Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances. +• TCC_EA_WRREQ_sum : Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances. +• TCC_EA_WRREQ_64B_sum : Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances. +• TCC_WRREQ_STALL_max : Number of cycles a write request was stalled. Max over TCC instances. +• TCC_MC_WRREQ_sum : Number of 32-byte effective writes. Sum over TCC instaces. +• FETCH_SIZE : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WRITE_SIZE : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• GPUBusy : The percentage of time GPU was busy. +• Wavefronts : Total wavefronts. +• VALUInsts : The average number of vector ALU instructions executed per work-item (affected by flow control). +• SALUInsts : The average number of scalar ALU instructions executed per work-item (affected by flow control). +• VFetchInsts : The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory. +• SFetchInsts : The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control). +• VWriteInsts : The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory. +• FlatVMemInsts : The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch. 
+• LDSInsts : The average number of LDS read or LDS write instructions executed per work item (affected by flow control). Excludes FLAT instructions that read from or write to LDS. +• FlatLDSInsts : The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control). +• GDSInsts : The average number of GDS read or GDS write instructions executed per work item (affected by flow control). +• VALUUtilization : The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence). +• VALUBusy : The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• SALUBusy : The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal). +• Mem32Bwrites : +• FetchSize : The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• WriteSize : The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account. +• L2CacheHit : The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal). +• MemUnitBusy : The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound). +• MemUnitStalled : The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad). +• WriteUnitStalled : The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad). 
+• ALUStalledByLDS : The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad). +• LDSBankConflict : The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad). +``` From 06b07a5e1dd00d1dbbfefa585d1769d8c30f3e28 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Thu, 24 Feb 2022 10:35:21 -0600 Subject: [PATCH 149/168] Create rocprofiler_spec.md --- doc/rocprofiler_spec.md | 837 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 837 insertions(+) create mode 100644 doc/rocprofiler_spec.md diff --git a/doc/rocprofiler_spec.md b/doc/rocprofiler_spec.md new file mode 100644 index 00000000..975d58ca --- /dev/null +++ b/doc/rocprofiler_spec.md @@ -0,0 +1,837 @@ +# ROC Profiler Library Specification +ROC Profiler API version 7 + +## 1. High level overview +``` +The goal of the implementation is to provide a HW specific low-level performance analysis +interface for profiling of GPU compute applications. The profiling includes HW performance +counters with complex performance metrics and HW traces. The implementation distinguishes +two profiling features, metrics and traces. HW performance counters are treated as the basic +metrics and the formulas can be defined for derived complex metrics. +The library can be loaded by HSA runtime as a tool plugin and it can be loaded by higher +level HW independent performance analysis API like PAPI. +The library has C API and is based on AQLprofile AMD specific HSA extension. + + 1. The library provides methods to query the list of supported HW features. + 2. The library provides profiling APIs to start, stop, read metrics results and tracing + data. + 3. The library provides a intercepting API for collecting per-kernel profiling data for + the kernels + dispatched to HSA AQL queues. 
+ 4. The library provides mechanism to load profiling tool library plugin by env variable + ROCP_TOOL_LIB. + 5. The library is responsible for allocation of the buffers for profiling and notifying + about output data buffer overflow for traces. + 6. The library is implemented based on AMD specific AQLprofile HSA extension. + 7. The library implementation is abstracted from the specific GFXIP. + 8. The library implementation is extensible: + - Easy adding of counters and metrics + - Counters enumeration + - Counters and metrics can be dynamically configured using XML configuration files with + counters and metrics tables: + o Counters table entry, basic metric: counter name, block name, event id + o Complex metrics table entry: metric name, an expression for calculation the metric + from the counters + +Metrics XML file example: + + + + . . . + + + + . . . + + + + + +``` +## 2. Environment +``` +* HSA_TOOLS_LIB - required to be set to the name of rocprofiler library to be loaded by +HSA runtime +* ROCP_METRICS - path to the metrics XML file +* ROCP_TOOL_LIB - path to profiling tool library loaded by ROC Profiler +* ROCP_HSA_INTERCEPT - if set then HSA dispatches intercepting is enabled +``` +## 3. General API +### 3.1. Description +``` +The library supports method for getting the error number and error string of the last +failed library API call. +To check the conformance of used library APi header and the library binary the version +macros and API methods can be used. + +Returning the error and error string methods: +- rocprofiler_error_string - method for returning the error string + +Library version: +- ROCPROFILER_VERSION_MAJOR - API major version macro +- ROCPROFILER_VERSION_MINOR - API minor version macro +- rocprofiler_version_major - library major version +- rocprofiler_version_minor - library minor version +``` +### 3.2. Returning the error and error string methods +``` +const char* rocprofiler_error_string(); +``` +### 3.3. 
Library version +``` +The library provides backward compatibility if the library major version is less than or equal +to the API major version macro. + +API version macros defined in the library API header 'rocprofiler.h': + +ROCPROFILER_VERSION_MAJOR +ROCPROFILER_VERSION_MINOR + +Methods to check library major and minor version: + +uint32_t rocprofiler_major_version(); +uint32_t rocprofiler_minor_version(); +``` +## 4. Backend API +### 4.1. Description +``` +The library provides the methods to open/close profiling context, to start, stop and read +HW performance counters and traces, to intercept kernel dispatches to collect per-kernel +profiling data. Also the library provides methods to calculate complex performance metrics +and to query the list of available metrics. The library distinguishes two profiling features, +metrics and traces, where HW performance counters are treated as the basic metrics. To check +if there was an error the library methods return HSA standard status code. +For a given context the profiling can be started/stopped and counters sampled in standalone +mode or profiling can be initiated by intercepting the kernel dispatches with registering +a dispatch callback. +For counters sampling, which is the usage model of higher level APIs like PAPI, +the start/stop/read APIs should be used. +For collecting per-kernel data for the kernels submitted to HSA queues the dispatch callback +API should be used. +The library provides backward compatibility if the library major version is less than or equal to the API major version macro. 
+ +Returned API status: +- hsa_status_t - HSA status codes are used from hsa.h header + +Loading and Configuring, loadable plugin on-load/unload methods: +- rocprofiler_settings_t – global properties +- OnLoadTool +- OnLoadToolProp +- OnUnloadTool + +Info API: +- rocprofiler_info_kind_t - profiling info kind +- rocprofiler_info_query_t - profiling info query +- rocprofiler_info_data_t - profiling info data +- rocprofiler_get_info - return the info for a given info kind +- rocprofiler_iterate_info - iterate over the info for a given info kind +- rocprofiler_query_info - iterate over the info for a given info query + +Context API: +- rocprofiler_t - profiling context handle +- rocprofiler_feature_kind_t - profiling feature kind +- rocprofiler_feature_parameter_t - profiling feature parameter +- rocprofiler_data_kind_t - profiling data kind +- rocprofiler_data_t - profiling data +- rocprofiler_feature_t - profiling feature +- rocprofiler_mode_t - profiling modes +- rocprofiler_properties_t - profiler properties +- rocprofiler_open - open new profiling context +- rocprofiler_close - close profiling context and release all allocated resources +- rocprofiler_group_count - return profiling groups count +- rocprofiler_get_group - return profiling group for a given index +- rocprofiler_get_metrics - method for calculating the metrics data +- rocprofiler_iterate_trace_data - method for iterating output trace data instances +- rocprofiler_time_id_t - supported time value ID enumeration +- rocprofiler_get_time – return time for a given time ID and profiling timestamp value + +Sampling API: +- rocprofiler_start - start profiling +- rocprofiler_stop - stop profiling +- rocprofiler_read - read profiling data to the profiling features objects +- rocprofiler_get_data - wait for profiling data + Group versions of start/stop/read/get_data methods: + o rocprofiler_group_start + o rocprofiler_group_stop + o rocprofiler_group_read + o rocprofiler_group_get_data + +Intercepting API: +- 
rocprofiler_callback_t - profiling callback type +- rocprofiler_callback_data_t - profiling callback data type +- rocprofiler_dispatch_record_t – dispatch record +- rocprofiler_queue_callbacks_t – queue callbacks, dispatch/destroy +- rocprofiler_set_queue_callbacks - set queue kernel dispatch and queue destroy callbacks +- rocprofiler_remove_queue_callbacks - remove queue callbacks + +Context pool API: +- rocprofiler_pool_t – context pool handle +- rocprofiler_pool_entry_t – context pool entry +- rocprofiler_pool_properties_t – context pool properties +- rocprofiler_pool_handler_t – context pool completion handler +- rocprofiler_pool_open - context pool open +- rocprofiler_pool_close - context pool close +- rocprofiler_pool_fetch – fetch and empty context entry to pool +- rocprofiler_pool_release – release a context entry +- rocprofiler_pool_iterate – iterated fetched context entries +- rocprofiler_pool_flush – flush completed context entries +``` +### 4.2. Loading and Configuring +``` +Loading and Configuring +The profiling properties can be set by profiler plugin on loading by ROC runtime. +The profiler library plugin can be set by ROCP_TOOL_LIB env var. + +Global properties: + +typedef struct { + uint32_t intercept_mode; + uint64_t timeout; + uint32_t timestamp_on; +} rocprofiler_settings_t; + +On load/unload methods defined in profiling tool library loaded by ROCP_TOOL_LIB env var: +extern "C" void OnLoadTool(); +extern "C" void OnLoadToolProp(rocprofiler_settings_t* settings); +extern "C" void OnUnloadTool(); + +``` +### 4.3. Info API +``` +The profiling metrics are defined by name and the traces are defined by name and parameters. +All supported features can be iterated using 'iterate_info/query_info' methods. The counter +names are defined in counters table configuration file, each counter has a unique name and +defined by block name and event id. 
The trace and trace parameter names are the same as in +the hardware documentation and the parameter codes are rocprofiler_feature_parameter_t values, +see below in the "Context API" section. + +Profiling info kind: + +typedef enum { + ROCPROFILER_INFO_KIND_METRIC = 0, // metric info + ROCPROFILER_INFO_KIND_METRIC_COUNT = 1, // metrics count + ROCPROFILER_INFO_KIND_TRACE = 2, // trace info + ROCPROFILER_INFO_KIND_TRACE_COUNT = 3, // traces count +} rocprofiler_info_kind_t; + +Profiling info data: + +typedef struct { + rocprofiler_info_kind_t kind; // info data kind + union { + struct { + const char* name; // metric name + uint32_t instances; // instances number + const char* expr; // metric expression, NULL for basic counters + const char* description; // metric description + const char* block_name; // block name + uint32_t block_counters; // number of block counters + } metric; + struct { + const char* name; // trace name + const char* description; // trace description + uint32_t parameter_count; // number of parameters supported + // by the trace + } trace; + }; +} rocprofiler_info_data_t; + +Return info for a given info kind: + +hsa_status_t rocprofiler_get_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + void *data); // data passed to callback + +Iterate over the info for a given info kind, and invoke an application-defined callback on +every iteration: + +hsa_status_t rocprofiler_iterate_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); + +Iterate over the info for a given info query, and invoke an application-defined callback on +every iteration. 
The query +fields set to NULL define the query wildcard: + +hsa_status_t rocprofiler_query_info( + const hsa_agent_t* agent, // [in] GPU handle, NULL for all + // GPU agents + rocprofiler_info_kind_t kind, // kind of iterated info + rocprofiler_info_data_t query, // info query + hsa_status_t (*callback)(const rocprofiler_info_data_t info, void *data), // callback + void *data); // data passed to callback +``` +### 4.4. Context API +``` +The profiling context accumulates all profiling information including profiling features +which carry profiling data, required buffers for profiling command packets and output data. +The context can be created and deleted by the library open/close methods. Deleting +the context releases all resources that the library has accumulated for this +context. If more than one run is required to collect all requested counter data, then +data for all profiling groups should be collected and then the metrics can be calculated by +loading the saved groups' data to the profiling context. Saving and loading the groups' +data is the responsibility of the tool. The groups are automatically identified on the profiling +context open and there is an API to access them, see the "Profiling groups" section below. 
+ +Profiling context handle: + +typename rocprofiler_t; + +Profiling feature kind: + +typedef enum { + ROCPROFILER_FEATURE_KIND_METRIC = 0, // metric + ROCPROFILER_FEATURE_KIND_TRACE = 1 // trace +} rocprofiler_feature_kind_t; + +Profiling feature parameter: + +typedef hsa_ven_amd_aqlprofile_parameter_t rocprofiler_feature_parameter_t; + +Profiling data kind: + +typedef enum { + ROCPROFILER_DATA_KIND_UNINIT = 0, // data uninitialized + ROCPROFILER_DATA_KIND_INT32 = 1, // 32bit integer + ROCPROFILER_DATA_KIND_INT64 = 2, // 64bit integer + ROCPROFILER_DATA_KIND_FLOAT = 3, // float single-precision result + ROCPROFILER_DATA_KIND_DOUBLE = 4, // float double-precision result + ROCPROFILER_DATA_KIND_BYTES = 5 // trace output as a bytes array +} rocprofiler_data_kind_t; + + +Profiling data: + +typedef struct { + rocprofiler_data_kind_t kind; // result kind + union { + uint32_t result_int32; // 32bit integer result + uint64_t result_int64; // 64bit integer result + float result_float; // float single-precision result + double result_double; // float double-precision result + typedef struct { + void* ptr; // pointer + uint32_t size; // byte size + uint32_t instances; // number of trace instances + } result_bytes; // data by ptr and byte size + }; +} rocprofiler_data_t; + +Profiling feature: + +typedef struct { + rocprofiler_feature_kind_t type; // feature type + const char* name; // feature name + const rocprofiler_feature_parameter_t* parameters; // feature parameters + uint32_t parameter_count; // feature parameter count + rocprofiler_data_t* data; // profiling data +} rocprofiler_feature_t; + +Profiling mode masks: +There are several modes which can be specified for the profiling context. +STANDALONE mode can be used for the counters sampling in another then application context +to support statistical system wide profiling. In this mode the profiling context supports +its own queue which can be created on the context open if the CREATEQUEUE mode also specified. 
+See also "Profiler properties" section below for the standalone mode queue properties. +The profiler supports several profiling groups for collecting profiling data in several +runs and 'SINGLEGROUP' mode allows only one group and the context open will fail if more +groups are needed. + +typedef enum { + ROCPROFILER_MODE_STANDALONE = 1, // standalone mode when ROC profiler + // supports own AQL queue + ROCPROFILER_MODE_CREATEQUEUE = 2, // profiler creates queue in STANDALONE mode + ROCPROFILER_MODE_SINGLEGROUP = 4 // profiler allows one group only and fails + // if more groups are needed +} rocprofiler_mode_t; + +Context data readiness callback: + +typedef void (*rocprofiler_context_callback_t)( + rocprofiler_group_t* group, // profiling group + void* arg); // callback arg + +Profiler properties: +There are several properties which can be specified for the context. A callback can be +registered which will be called when the context data is ready. In standalone profiling mode +'ROCPROFILER_MODE_STANDALONE' the context supports its own queue and the queue can be set by +the property 'queue' or a queue will be created with the specified depth 'queue_depth' if mode +'ROCPROFILER_MODE_CREATEQUEUE' also specified. 
+ +typedef struct { + rocprofiler_context_callback_t callback; // callback on the context data readiness + void* callback_arg; // callback arg + hsa_queue_t* queue; // HSA queue for standalone mode + uint32_t queue_depth; // created queue depth, for create-queue mode +} rocprofiler_properties_t; + +Open/close profiling context: + +hsa_status_t rocprofiler_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in/out] profiling feature array + uint32_t feature_count, // profiling feature count + rocprofiler_t** context, // [out] profiling context handle + uint32_t mode, // profiling mode mask + rocprofiler_properties_t* properties); // profiler properties + +hsa_status_t rocprofiler_close( + rocprofiler_t* context); // [in] profiling context + +Profiling groups: +The profiler on the context open automatically identifies a required number of the application +runs to collect all data needed for all specified metrics and creates a metric group per each +run. Data for all profiling groups should be collected and then the metrics can be calculated +by loading the saved groups' data to the profiling context. Saving and loading of the groups' +data is the responsibility of the tool. + +typedef struct { + uint32_t index; // profiling group index + rocprofiler_feature_t** features; // profiling features array + uint32_t feature_count; // profiling feature count + rocprofiler_t* context; // profiling context handle +} rocprofiler_group_t; + +Return profiling groups count: + +hsa_status_t rocprofiler_group_count( + rocprofiler_t* context, // [in/out] profiling context + uint32_t* count); // [out] profiling groups count + +Return the profiling group for a given index: + +hsa_status_t rocprofiler_get_group( + rocprofiler_t* context, // [in/out] profiling context, + // will be returned as + // a part of the group structure + uint32_t index, // [in] group index + rocprofiler_group_t* group); // [out] profiling group + +Calculate metrics data. 
The data will be stored to the registered profiling features data fields: +After all profiling context data is ready the registered metrics can be calculated. The context +data readiness can be checked by 'get_data' API or using the context callback. + +hsa_status_t rocprofiler_get_metrics( + rocprofiler_t* context); // [in/out] profiling context + +Method for iterating trace data instances: +Trace data can have several instances, for example, one instance per Shader Engine. + +hsa_status_t rocprofiler_iterate_trace_data( + const rocprofiler_t* context, // [in] context object + hsa_ven_amd_aqlprofile_data_callback_t callback, // [in] callback to iterate + // the output data + void* callback_data); // [in/out] data passed to callback + +Converting a profiling timestamp to a time value for a supported time ID. +Supported time value ID enumeration: +typedef enum { + ROCPROFILER_TIME_ID_CLOCK_REALTIME = 0, // Linux realtime clock time + ROCPROFILER_TIME_ID_CLOCK_MONOTONIC = 1, // Linux monotonic clock time +} rocprofiler_time_id_t; + +Method for converting a profiling timestamp to a time value for a given time ID: +hsa_status_t rocprofiler_get_time( + rocprofiler_time_id_t time_id, // identifier of the particular + // time to convert the timestamp + uint64_t timestamp, // profiling timestamp + uint64_t* value_ns); // [out] returned time ‘ns’ value +``` +### 4.5. Sampling API +``` +The API supports the counters sampling usage model with start/read/stop methods and also allows +waiting for the profiling data in the intercepting usage model with get_data method. 
+ +Start/stop/read methods: + +hsa_status_t rocprofiler_start( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_stop( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +hsa_status_t rocprofiler_read( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Wait for profiling data: + +hsa_status_t rocprofiler_get_data( + rocprofiler_t* context, // [in/out] profiling context + uint32_t group_index = 0); // group index + +Group versions of the above start/stop/read/get_data methods: + +hsa_status_t rocprofiler_group_start( + rocprofiler_group_t* group); // [in/out] profiling group + +hsa_status_t rocprofiler_group_stop( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_read( + rocprofiler_group_t* group); // [in/out] profiling group + + +hsa_status_t rocprofiler_group_get_data( + rocprofiler_group_t* group); // [in/out] profiling group +``` +### 4.6. Intercepting API +``` +The library provides a callback API for enabling profiling for the kernels dispatched to +HSA AQL queues. The API enables per-kernel profiling data collection. +Currently implemented the option with serializing the kernels execution. 
+ +ROC profiler callback type: + +hsa_status_t (*rocprofiler_callback_t)( + const rocprofiler_callback_data_t* callback_data, // callback data passed by HSA runtime + void* user_data, // [in/out] user data passed + // to the callback + rocprofiler_group** group); // [out] returned profiling group + +Profiling callback data: + +typedef struct { + uint64_t dispatch; // dispatch timestamp + uint64_t begin; // begin timestamp + uint64_t end; // end timestamp + uint64_t complete; // completion signal timestamp +} rocprofiler_dispatch_record_t; + +typedef struct { + hsa_agent_t agent; // GPU agent handle + uint32_t agent_index; // GPU index + const hsa_queue_t* queue; // HSA queue + uint64_t queue_index; // Index in the queue + const hsa_kernel_dispatch_packet_t* packet; // HSA dispatch packet + const char* kernel_name; // Kernel name + const rocprofiler_dispatch_record_t* record; // Dispatch record +} rocprofiler_callback_data_t; + +Queue callbacks: + +typedef struct { + rocprofiler_callback_t dispatch; // kernel dispatch callback + hsa_status_t (*destroy)(hsa_queue_t* queue, void* data); // queue destroy callback +} rocprofiler_queue_callbacks_t; + +Adding/removing kernel dispatch and queue destroy callbacks + +hsa_status_t rocprofiler_set_intercepting( + rocprofiler_intercepting_t callbacks, // intercepting callbacks + void* data); // [in/out] passed callbacks data + +hsa_status_t rocprofiler_remove_intercepting(); +``` +### 4.7. Profiling Context Pools +``` +The API provide capability to create a context pool for a given agent and a set of features, to fetch/release a context entry, to register a callback for pool’s contexts completion. 
+Profiling pool handle: +typename rocprofiler_pool_t; +Profiling pool entry: +typedef struct { + rocprofiler_t* context; // context object + void* payload; // payload data object +} rocprofiler_pool_entry_t; + +Profiling handler, calling on profiling completion: +typedef bool (*rocprofiler_pool_handler_t)(const rocprofiler_pool_entry_t* entry, void* arg); + +Profiling properties: +typedef struct { + uint32_t num_entries; // pool size entries + uint32_t payload_bytes; // payload size bytes + rocprofiler_pool_handler_t handler; // handler on context completion + void* handler_arg; // the handler arg +} rocprofiler_pool_properties_t; + +Open profiling pool: +hsa_status_t rocprofiler_pool_open( + hsa_agent_t agent, // GPU handle + rocprofiler_feature_t* features, // [in] profiling features array + uint32_t feature_count, // profiling info count + rocprofiler_pool_t** pool, // [out] context object + uint32_t mode, // profiling mode mask + rocprofiler_pool_properties_t*); // pool properties + +Close profiling pool: +hsa_status_t rocprofiler_pool_close( + rocprofiler_pool_t* pool); // profiling pool handle + +Fetch profiling pool entry: +hsa_status_t rocprofiler_pool_fetch( + rocprofiler_pool_t* pool, // profiling pool handle + rocprofiler_pool_entry_t* entry); // [out] empty profiling pool entry + +Release profiling pool entry: +hsa_status_t rocprofiler_pool_release( + rocprofiler_pool_entry_t* entry); // released profiling pool entry + +Iterate fetched profiling pool entries: +hsa_status_t rocprofiler_pool_iterate( + rocprofiler_pool_t* pool, // profiling pool handle + hsa_status_t (*callback)(rocprofiler_pool_entry_t* entry, void* data), + // callback + void *data); // [in/out] data passed to callback + +Flush completed entries in profiling pool: +hsa_status_t rocprofiler_pool_flush( + rocprofiler_pool_t* pool); // profiling pool handle +``` +## 5. Application code examples +### 5.1. 
Querying available metrics +``` +Info data callback: + + hsa_status_t info_data_callback(const rocprofiler_info_data_t info, void *data) { + switch (info.kind) { + case ROCPROFILER_INFO_KIND_METRIC: { + if (info.metric.expr != NULL) { + fprintf(stdout, "Derived counter: gpu-agent%d : %s : %s\n", + info.agent_index, info.metric.name, info.metric.description); + fprintf(stdout, " %s = %s\n", info.metric.name, info.metric.expr); + } else { + fprintf(stdout, "Basic counter: gpu-agent%d : %s", + info.agent_index, info.metric.name); + if (info.metric.instances > 1) { + fprintf(stdout, "[0-%u]", info.metric.instances - 1); + } + fprintf(stdout, " : %s\n", info.metric.description); + fprintf(stdout, " block %s has %u counters\n", + info.metric.block_name, info.metric.block_counters); + } + fflush(stdout); + break; + } + default: + printf("wrong info kind %u\n", kind); + return HSA_STATUS_ERROR; + } + return HSA_STATUS_SUCCESS; + } + +Printing all available metrics: + + hsa_status_t status = rocprofiler_iterate_info( + agent, + ROCPROFILER_INFO_KIND_METRIC, + info_data_callback, + NULL); + +``` +### 5.2. Profiling code example +``` +Profiling of L1 miss ratio, average memory bandwidth. +In the example below rocprofiler_group_get_data group APIs are used for the purpose of a usage +example but in SINGLEGROUP mode when only one group is allowed the context handle itself can be +saved and then direct context method rocprofiler_get_data with default group index equal to 0 +can be used. 
+ +hsa_status_t dispatch_callback( + const rocprofiler_callback_data_t* callback_data, + void* user_data, + rocprofiler_group_t* group) +{ + hsa_status_t status = HSA_STATUS_SUCCESS; + // Profiling context + rocprofiler_t* context; + // Profiling info objects + rocprofiler_feature_t features* = new rocprofiler_feature_t[2]; + // Tracing parameters + rocprofiler_feature_parameter_t* parameters = new rocprofiler_feature_parameter_t[2]; + + // Setting profiling features + features[0].type = ROCPROFILER_METRIC; + features[0].name = "L1_MISS_RATIO"; + features[1].type = ROCPROFILER_METRIC; + features[1].name = "DRAM_BANDWIDTH"; + + // Creating profiling context + status = rocprofiler_open(callback_data->dispatch.agent, features, 2, &context, + ROCPROFILER_MODE_SINGLEGROUP, NULL); + + + // Get the profiling group + // For general case with many groups there is rocprofiler_group_count() API + const uint32_t group_index = 0 + status = rocprofiler_get_group(context, group_index, group); + + + // In SINGLEGROUP mode the context handle itself can be saved, because there is just one group + + + return status; +} + +Profiling tool constructor is adding the dispatch callback: + +void profiling_libary_constructor() { + // Defining callback data, no data in this simple example + void* callback_data = NULL; + + // Adding observers + hsa_sttaus_t status = rocprofiler_add_dispatch_callback(dispatch_callback, callback_data); + + + // Dispatching profiled kernel + +} + +void profiling_libary_destructor() { + > { + // In SINGLEGROUP mode the rocprofiler_get_group() method with default zero group + // index can be used, if context handle would be saved + status = rocprofiler_group_get_data(entry->group); + + status = rocprofiler_get_metrics(entry->group->context); + + status = rocprofiler_close(entry->group->context); + + + dispatch_data, entry->features, entry->features_count)>; + } +} +``` +### 5.3. 
Option to use completion callback +``` +Creating profiling context with completion callback: + . . . + rocprofiler_properties_t properties = {}; + properties.callback = completion_callback; + properties.callback_arg = NULL; // no args defined + status = rocprofiler_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Definition of completion callback: + +void completion_callback(profiler_group_t group, void* arg) { + + hsa_status_t status = rocprofiler_close(group.context); + +} +``` +### 5.4. Option to Use Context Pool +``` +Code example of context pool usage. +Creating profiling contexts pool: + . . . + rocprofiler_pool_properties_t properties{}; + properties.num_entries = 100; + properties.payload_bytes = sizeof(context_entry_t); + properties.handler = context_handler; + properties.handler_arg = handler_arg; + status = rocprofiler_pool_open(agent, features, 3, &context, + ROCPROFILER_MODE_SINGLEGROUP, properties); + + . . . + +Fetching a context entry: + rocprofiler_pool_entry_t pool_entry{}; + status = rocprofiler_pool_fetch(pool, &pool_entry); + + // Profiling context entry + rocprofiler_t* context = pool_entry.context; + context_entry_t* entry = reinterpret_cast + (pool_entry.payload); +``` +### 5.5. Standalone Sampling Usage Code Example +``` +The profiling metrics are being read from separate standalone queue other than the application kernels are submitted to. +To enable the sampling mode, the profiling mode in all user queues should be enabled. It can be done by loading ROC-profiler +library to HSA runtime using the environment variable HSA_TOOLS_LIB for all shell sessions. 
+ // Sampling rate + uint32_t sampling_rate = ; + // Sampling count + uint32_t sampling_count = ; + // HSA status + hsa_status_t status = HSA_STATUS_ERROR; + // HSA agent + hsa_agent_t agent; + // Profiling context + rocprofiler_t* context = NULL; + // Profiling properties + rocprofiler_properties_t properties; + + // Getting HSA agent + + + // Profiling feature objects + const unsigned feature_count = 2; + rocprofiler_feature_t feature[feature_count]; + + // Counters and metrics + feature[0].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[0].name = "GPUBusy"; + feature[1].kind = ROCPROFILER_FEATURE_KIND_METRIC; + feature[1].name = "SQ_WAVES"; + + // Creating profiling context with standalone queue + properties = {}; + properties.queue_depth = 128; + status = rocprofiler_open(agent, feature, feature_count, &context, + ROCPROFILER_MODE_STANDALONE| ROCPROFILER_MODE_CREATEQUEUE| + ROCPROFILER_MODE_SINGLEGROUP, &properties); + + + // Start counters and sample them in the loop with the sampling rate + status = rocprofiler_start(context, 0); + + + for (unsigned ind = 0; ind < sampling_count; ++ind) { + sleep(sampling_rate); + status = rocprofiler_read(context, 0); + + status = rocprofiler_get_data(context, 0); + + status = rocprofiler_get_metrics(context); + + print_results(feature, feature_count); + } + + // Stop counters + status = rocprofiler_stop(context, group_n); + + + // Finishing cleanup + // Deleting profiling context will delete all allocated resources + status = rocprofiler_close(context); + +``` +### 5.6. 
Printing Out Profiling Results +``` +Below is a code example for printing out the profiling results from profiling features array: +void print_results(rocprofiler_feature_t* feature, uint32_t feature_count) { + for (rocprofiler_feature_t* p = feature; p < feature + feature_count; ++p) + { + std::cout << (p - feature) << ": " << p->name; + switch (p->data.kind) { + case ROCPROFILER_DATA_KIND_INT64: + std::cout << " result_int64 (" << p->data.result_int64 << ")" + << std::endl; + break; + + case ROCPROFILER_DATA_KIND_BYTES: { + std::cout << " result_bytes ptr(" << p->data.result_bytes.ptr << + ") " << " size(" << p->data.result_bytes.size << ")" + << " instance_count(" << p->data.result_bytes.instance_count + << ")"; + break; + } + default: + std::cout << "bad result kind (" << p->data.kind << ")" + << std::endl; + + } + } +} +``` From 4650aa69b8bcc22ff4ea0206968e89b5c9f3ddb9 Mon Sep 17 00:00:00 2001 From: Chun Yang Date: Thu, 17 Mar 2022 00:12:12 -0700 Subject: [PATCH 150/168] SWDEV-324379 : Expose FP64 and FP32 performance counters on on AMD profilers for MI200 Change-Id: I2c38ccc297872dfc1896314ceadbed98dc761766 (cherry picked from commit 26c479c72a585e16b9cb34f8d4dd8a9cc2bad8a9) --- test/tool/gfx_metrics.xml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index c2a79af2..8da94414 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -103,6 +103,26 @@ + + + + + + + + + + + + + + + + + + + + From e6b8a3baf2db2ff9f0bdef21cff45056fcf23641 Mon Sep 17 00:00:00 2001 From: Saurabh Verma Date: Mon, 16 May 2022 15:40:44 -0500 Subject: [PATCH 151/168] SWDEV-298750:Approval to make internal profile counters public Added approved HW counters for MI200. 
Also added derived metrics for the same Change-Id: I1c6abfdfde4e4fd4ba8bd5eec0557ad08fd71c77 --- test/tool/gfx_metrics.xml | 235 +++++++++++++++++++++++++++++++++++++- test/tool/metrics.xml | 118 ++++++++++++++++++- 2 files changed, 348 insertions(+), 5 deletions(-) diff --git a/test/tool/gfx_metrics.xml b/test/tool/gfx_metrics.xml index 8da94414..9380eb4f 100644 --- a/test/tool/gfx_metrics.xml +++ b/test/tool/gfx_metrics.xml @@ -102,7 +102,70 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -118,23 +181,187 @@ + + + + + + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/tool/metrics.xml b/test/tool/metrics.xml index a920ff04..2f8e10dd 100644 --- a/test/tool/metrics.xml +++ b/test/tool/metrics.xml @@ -65,7 +65,123 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 4fccecb090f1205440b3a12d71acbfdc9a70c7fb Mon Sep 17 00:00:00 2001 From: Kiumars Sabeti Date: Fri, 15 Apr 2022 04:38:04 +0000 Subject: [PATCH 152/168] SWDEV-320429: wrapping the comma-containing names in the .csv in double quotes at the time the .csv is generated Change-Id: I62f94a1cf8895eb324080f8aacac3f13c02d7050 (cherry picked from commit 4d99f8d8e56fc5d615ba4eea439e32f0dd8466dc) --- bin/sqlitedb.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/sqlitedb.py b/bin/sqlitedb.py index 7aadd257..d1584e54 100644 --- a/bin/sqlitedb.py +++ b/bin/sqlitedb.py @@ -129,6 +129,11 @@ def dump_csv(self, table_name, file_name): with 
open(file_name, mode='w') as fd: fd.write(','.join(fields) + '\n') for raw in self._get_raws(table_name): + tmp = list(raw) + for idx in range(len(tmp)): + if type(tmp[idx]) == str: + if(not(tmp[idx][0] == tmp[idx][-1] == '"')): tmp[idx] = '"'+tmp[idx]+'"' + raw = tuple(tmp) fd.write(reduce(lambda a, b: str(a) + ',' + str(b), raw) + '\n') # dump JSON trace From 8d448fe7f53bf95f4ce972575dac5079d10b818d Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Fri, 14 Oct 2022 17:00:28 -0700 Subject: [PATCH 153/168] SWDEV-362165 - Escape argument strings in the JSON file "a_string\x000" -> \"a_string\\x000\" Change-Id: I8bf054702b34577f84080080d9538b16abc996fe (cherry picked from commit 1f2723d85638aacf4c355e955d096909b249db9c) --- bin/tblextr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index ab81add6..9f4abb46 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -431,7 +431,8 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep mfixformat = ptrn_fixformat.match(record) if mfixformat: #replace '=' in args with parentheses - reformated_args = kernel_arg + mfixformat.group(2).replace('=','(').replace(',',')')+')' + reformated_args = kernel_arg + mfixformat.group(2).replace('=','(').replace(',',')') \ + .replace('\\', "\\\\").replace('\"', "\\\"")+')' record = mfixformat.group(1) + '( ' + reformated_args + ')' m = ptrn_val.match(record) From be70c6eab46ee4e05179fa4f53ef193f86c28ed3 Mon Sep 17 00:00:00 2001 From: Laurent Morichetti Date: Fri, 14 Oct 2022 17:00:28 -0700 Subject: [PATCH 154/168] SWDEV-362165 - Escape argument strings in the JSON file "a_string\x000" -> \"a_string\\x000\" Change-Id: I8bf054702b34577f84080080d9538b16abc996fe (cherry picked from commit 1f2723d85638aacf4c355e955d096909b249db9c) --- bin/tblextr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/tblextr.py b/bin/tblextr.py index ab81add6..9f4abb46 100755 --- a/bin/tblextr.py +++ 
b/bin/tblextr.py @@ -431,7 +431,8 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep mfixformat = ptrn_fixformat.match(record) if mfixformat: #replace '=' in args with parentheses - reformated_args = kernel_arg + mfixformat.group(2).replace('=','(').replace(',',')')+')' + reformated_args = kernel_arg + mfixformat.group(2).replace('=','(').replace(',',')') \ + .replace('\\', "\\\\").replace('\"', "\\\"")+')' record = mfixformat.group(1) + '( ' + reformated_args + ')' m = ptrn_val.match(record) From b7d002d61b4726026fcc897c7f2776e62cb40021 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Tue, 1 Nov 2022 23:21:05 -0500 Subject: [PATCH 155/168] Suppressing the hipGraphLaunch issue hipGraphs acts as a separate activity group as they have their own memory copies and their kernel dispatches and more activities, currently hip api that roctracer depend on doesn't have a way to represent this type, so we are temporarily suppressing the issue up till we have a complete support for the hipGraphs Change-Id: I5d889be05c9414530672c781e5a712d572ea4104 (cherry picked from commit bbbd177dd7aa07b138062a799eda8e21bc48bddb) --- bin/tblextr.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bin/tblextr.py b/bin/tblextr.py index 9f4abb46..8ebd9ae4 100755 --- a/bin/tblextr.py +++ b/bin/tblextr.py @@ -382,6 +382,7 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, dep_pid, dep_list, dep hip_wait_event_ptrn = re.compile(r'WaitEvent') hip_sync_event_ptrn = re.compile(r'hipStreamSynchronize') hip_sync_dev_event_ptrn = re.compile(r'hipDeviceSynchronize') + hip_graph_ptrn = re.compile(r'hipGraphLaunch') wait_event_ptrn = re.compile(r'WaitEvent|hipStreamSynchronize|hipDeviceSynchronize') hip_stream_wait_write_ptrn = re.compile(r'hipStreamWaitValue64|hipStreamWriteValue64|hipStreamWaitValue32|hipStreamWriteValue32') prop_pattern = re.compile("([\w-]+)\((\w+)\)"); @@ -514,6 +515,10 @@ def fill_api_db(table_name, db, indir, api_name, api_pid, 
dep_pid, dep_list, dep mcopy_found = 1 op_found = 1 + # HIP Graph API + if hip_graph_ptrn.search(record_name): + op_found = 1 + # HIP WaitEvent API if wait_event_ptrn.search(record_name): op_found = 1 From 74ecd34d80a743a7ee7eda7e12b2cc187e04d0a3 Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Fri, 18 Nov 2022 16:10:18 -0600 Subject: [PATCH 156/168] GPU Index to use HSA AMD Agent Driver Node ID Change-Id: Ia814f64419615f1d77fc09fc88f11bbaf75afd45 (cherry picked from commit 553a4c7ee76074a60ec82a922716bf869715f5b4) --- src/util/hsa_rsrc_factory.cpp | 8 +++++++- test/util/hsa_rsrc_factory.cpp | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/util/hsa_rsrc_factory.cpp b/src/util/hsa_rsrc_factory.cpp index 6d60882b..50d4ec1e 100644 --- a/src/util/hsa_rsrc_factory.cpp +++ b/src/util/hsa_rsrc_factory.cpp @@ -366,7 +366,13 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { agent_info->vgpr_block_size = 4; // Set GPU index - agent_info->dev_index = gpu_list_.size(); + uint32_t driver_node_id; + status = hsa_api_.hsa_agent_get_info( + agent, + static_cast(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), + &driver_node_id); + CHECK_STATUS("hsa_agent_get_info(gpu hsa_driver_node_id)", status); + agent_info->dev_index = driver_node_id; gpu_list_.push_back(agent_info); gpu_agents_.push_back(agent); } diff --git a/test/util/hsa_rsrc_factory.cpp b/test/util/hsa_rsrc_factory.cpp index f20e66f0..21f340e0 100644 --- a/test/util/hsa_rsrc_factory.cpp +++ b/test/util/hsa_rsrc_factory.cpp @@ -355,7 +355,13 @@ const AgentInfo* HsaRsrcFactory::AddAgentInfo(const hsa_agent_t agent) { CHECK_ITER_STATUS("hsa_amd_agent_iterate_memory_pools(gpu pool)", status); // Set GPU index - agent_info->dev_index = gpu_list_.size(); + uint32_t driver_node_id; + status = hsa_api_.hsa_agent_get_info( + agent, + static_cast(HSA_AMD_AGENT_INFO_DRIVER_NODE_ID), + &driver_node_id); + CHECK_STATUS("hsa_agent_get_info(gpu hsa_driver_node_id)", status); + 
agent_info->dev_index = driver_node_id; gpu_list_.push_back(agent_info); gpu_agents_.push_back(agent); } From e80f7cb91b7524dfde16cca8d5302557b5a633cb Mon Sep 17 00:00:00 2001 From: Ammar ELWazir Date: Mon, 20 Feb 2023 09:17:41 -0600 Subject: [PATCH 157/168] Squashed commit for rocprofilerv2 support of the following: commit 34a9a1ced9f7c15002e56dc0dd562b443ebe85c6 Author: Ammar ELWazir Date: Fri Feb 17 11:12:27 2023 -0600 Fixing rocprofilerv2 naming and CMake issues Change-Id: Ib6d336349a056731e5c0f35151296d6fea671360 commit e5439b38c25f9517ec977a943059ba2d17f1baf9 Author: Ammar ELWazir Date: Fri Feb 17 09:28:06 2023 -0600 Fixing Installation Path Change-Id: I98e098f0d06e9862a5df81ddee18c3a081c6157a commit 288f98290ba727f9cccbce9c67a2b6ab5577dfb0 Author: Ammar ELWazir Date: Thu Feb 16 19:48:35 2023 +0000 Fixing Packages installation path Change-Id: Ide3536b3c6050effcc9337e612375a7e5ddb6522 commit 1bfc40c8e7ea4863a448d8167bbb355f5e496dbe Author: gobhardw Date: Thu Feb 16 22:19:17 2023 +0530 Fixing Jenkins unknkown build id issue with samples Change-Id: Ibae653c44115e1360ca1145ed4eaa4627a54cb4b commit 710491d8350f14a4e04d58ea70abf74b95b73dd8 Author: Sriraksha Nagaraj Date: Wed Feb 15 20:09:43 2023 +0000 SWDEV-382125: Adding a description when counter is not supported on a hardware Change-Id: Ic835f59029aa1a95d32b80bf79375978e8ea9635 commit ae56bc36b17f11f4a1a4d1cd4491f6d4efe393f1 Author: Ammar ELWazir Date: Tue Feb 14 22:59:52 2023 -0600 Disabling samples compilation Change-Id: Ibd4ec66cfbc4751034ad98143b9ca129947760f1 commit 108032e9cb6db7423ed7aab6ec11bd004cbb992e Author: Ammar ELWazir Date: Tue Feb 14 22:32:02 2023 -0600 Disabling Samples Change-Id: Ic51ca32b14e5926620b3a21f6ec762a800f5cee1 commit 92955c33acb1984a0fe04235637780e45c89ad22 Author: Ammar ELWazir Date: Tue Feb 14 21:45:03 2023 -0600 Removing the compilation of PC Sampling sample Change-Id: Ice539983e7f1d9510a2fa227a2620f0877fcbb5a commit 65984340d189d23d472f1043fb12c85590160d80 Author: Ammar ELWazir 
Date: Tue Feb 14 18:56:43 2023 -0600 Excluding PC Sampling Sample from samples package Change-Id: I27865f8040ea519909db45041b0aaad6b786ce8e commit e96c1f7d2513a281b4928fdcd417ddb3f05d221a Author: Ranjith Ramakrishnan Date: Wed Feb 1 10:47:35 2023 -0800 Get ROCm path relative to hsakmt header file path SWDEV-351540 - ROCm path was derived from hsakmt library path. For ASAN build, the library will be installed in /lib/asan rather than /lib Since the header file installation path remains the same for ASAN and normal build, using the same for getting the ROCm path Change-Id: I735dbd3c0f67a69e833059a05851da3971098ec5 commit bf53002b108191d4d22ec52278c20217abb35680 Author: gobhardw Date: Sat Feb 11 02:33:38 2023 +0530 Fixed Outpt Path and recv_0 for ATT Change-Id: I94248e217d5af14152be82cbe6095de90a489387 commit 88d319f9a6377d81af90ec916c8955bd1c0d81df Author: Ammar ELWazir Date: Mon Nov 21 13:31:27 2022 -0600 Modifying Public header file with gpu index definition Change-Id: I9c07ac419b9894c6dc65daa763bbc4cd4a7511b0 commit 736a7aaf288f99840551fc4c91a95e0a4bccad66 Author: Saurabh Verma Date: Fri Feb 10 16:19:03 2023 +0000 Adding Make for bulding code samples Change-Id: I7b62a4c65c5560239e69ea121c6fdaef188f709d commit 8e0280cbe8ff53c42e0877afdd222e0efd23ed3d Author: gobhardw Date: Tue Feb 7 13:06:02 2023 +0530 Making ATT work with Profilerv2 Change-Id: Ic9334aa80e40faaaf5c1a79ba37dbe52e8d31253 commit d61123001e9c9953bddf0fe67c26dd9f9e66503c Author: Ammar ELWazir Date: Fri Dec 2 11:16:45 2022 -0600 GPU ID issue Before, the GPU IDs were counted starting from zero, now CPU IDs are counted from zero and then GPU IDs from the last CPU_ID+1 Change-Id: I3f815195ad97933e02f249841e53b64b674370d9 commit ccf7595d44c52d73a441984ad5eacccbc985d802 Author: Ammar ELWazir Date: Fri Feb 3 12:31:39 2023 -0600 Adding rocprofilerv2 Change-Id: Ic0cc280ba207d2b8f6ccae1cd4ac3184152fc1ad Change-Id: Ib34d9e875e7191061523532b80bd922cd17881e2 --- _clang-format => .clang-format | 2 +- .gitignore | 4 + 
CMakeLists.txt | 651 +- bin/rocprofv2 | 296 + build.sh | 67 +- cmake_modules/FindLibDw.cmake | 28 + cmake_modules/FindLibElf.cmake | 30 + cmake_modules/env.cmake | 65 +- doc/Doxyfile.in | 2447 + doc/Rocprofiler_Documentation.pdf | Bin 0 -> 256419 bytes inc/rocprofiler.h | 2324 +- inc/rocprofiler_plugin.h | 135 + plugin/CMakeLists.txt | 26 + plugin/att/CMakeLists.txt | 66 + plugin/att/att.cpp | 195 + plugin/att/att.py | 518 + plugin/att/trace_view.py | 465 + plugin/att/ui/index.html | 1047 + plugin/att/ui/logo.svg | 132 + plugin/att/ui/styles.css | 106 + plugin/ctf/.gitignore | 1 + plugin/ctf/CMakeLists.txt | 161 + plugin/ctf/README.adoc | 260 + plugin/ctf/barectf_event_record.h | 67 + plugin/ctf/barectf_platform.h | 192 + plugin/ctf/barectf_tracer.h | 124 + plugin/ctf/barectf_writer.h | 178 + plugin/ctf/config.yaml | 165 + plugin/ctf/ctf.cpp | 107 + plugin/ctf/dst_base.yaml | 28 + plugin/ctf/gen_api_files.py | 645 + plugin/ctf/gen_env_yaml.py | 33 + plugin/ctf/plugin.cpp | 869 + plugin/ctf/plugin.h | 146 + plugin/exportmap | 7 + plugin/file/CMakeLists.txt | 44 + plugin/file/file.cpp | 475 + plugin/perfetto/CMakeLists.txt | 27 + plugin/perfetto/perfetto.cpp | 804 + plugin/perfetto/perfetto_sdk/LICENSE | 189 + plugin/perfetto/perfetto_sdk/OWNERS | 35 + .../perfetto/perfetto_sdk/docs/tracing-sdk.md | 394 + plugin/perfetto/perfetto_sdk/sdk/perfetto.cc | 74282 +++++++ plugin/perfetto/perfetto_sdk/sdk/perfetto.h | 151604 +++++++++++++++ plugin/utils.h | 63 + samples/.clangd | 10 + samples/CMakeLists.txt | 146 + samples/Makefile | 50 + samples/README.md | 92 + samples/att.txt | 1 + samples/common/common.h | 351 + samples/common/helper.cpp | 180 + samples/common/helper.h | 72 + samples/input.txt | 1 + .../pcsampler/code_printing_sample/.clangd | 10 + .../pcsampler/code_printing_sample/Makefile | 68 + .../pcsampler/code_printing_sample/README.md | 149 + .../code_printing_sample/code_printing.cpp | 1281 + .../code_printing_sample/code_printing.hpp | 126 + 
.../code_printing_sample/disassembly.cpp | 215 + .../code_printing_sample/disassembly.hpp | 32 + .../pcsampler/code_printing_sample/main.cpp | 447 + .../code_printing_sample/program.hpp | 54 + .../code_printing_sample/program_options.hpp | 49 + .../profiler/application_replay_sample.cpp | 72 + samples/profiler/device_profiling_sample.cpp | 64 + .../kernel_profiling_no_replay_sample.cpp | 68 + samples/profiler/kernel_replay_sample.cpp | 68 + samples/profiler/user_replay_sample.cpp | 78 + samples/run_samples.sh | 28 + samples/tracer/sample.cpp | 82 + script/gen_ostream_ops.py | 242 + script/hsaap.py | 581 + src/CMakeLists.txt | 19 +- src/api/CMakeLists.txt | 260 + src/api/exportmap | 88 + src/api/rocmtool.cpp | 242 + src/api/rocmtool.h | 122 + src/api/rocmtools.cpp | 755 + src/core/activity.cpp | 2 +- src/core/activity.h | 2 + src/core/context.h | 3 +- src/core/counters/basic/basic_counter.h | 86 + src/core/counters/basic/gfx_metrics.xml | 696 + src/core/counters/basic/xml_parser_basic.py | 233 + src/core/counters/counter.cpp | 46 + src/core/counters/counter.h | 52 + .../counters/derived/derived_counter.cpp.in | 61 + src/core/counters/derived/derived_counter.h | 64 + src/core/counters/derived/metrics.xml | 427 + .../counters/derived/xml_parser_derived.py | 88 + src/core/counters/metrics/basic_counters.xml | 683 + .../counters/metrics/derived_counters.xml | 538 + src/core/counters/metrics/eval_metrics.cpp | 204 + src/core/counters/metrics/eval_metrics.h | 74 + src/core/counters/metrics/exception.h | 75 + src/core/counters/metrics/expr.h | 449 + src/core/counters/metrics/metrics.cpp | 28 + src/core/counters/metrics/metrics.h | 367 + src/core/counters/metrics/types.h | 51 + src/core/counters/metrics/xml.h | 512 + src/core/hardware/gfx10/gfx10.xml | 405 + src/core/hardware/gfx8/gfx8.xml | 26 + src/core/hardware/gfx9/gfx9.xml | 374 + src/core/hardware/hsa_info.cpp | 123 + src/core/hardware/hsa_info.h | 110 + src/core/hsa/hsa_common.cpp | 112 + src/core/hsa/hsa_common.h | 
61 + src/core/hsa/hsa_support.cpp | 895 + src/core/hsa/hsa_support.h | 136 + src/core/hsa/packets/packets_generator.cpp | 631 + src/core/hsa/packets/packets_generator.h | 79 + src/core/hsa/queues/queue.cpp | 1024 + src/core/hsa/queues/queue.h | 91 + src/core/memory/generic_buffer.cpp | 236 + src/core/memory/generic_buffer.h | 166 + src/core/rocprofiler.cpp | 265 +- src/core/session/att/att.cpp | 57 + src/core/session/att/att.h | 76 + src/core/session/device_profiling.cpp | 329 + src/core/session/device_profiling.h | 87 + src/core/session/filter.cpp | 248 + src/core/session/filter.h | 81 + src/core/session/profiler/profiler.cpp | 145 + src/core/session/profiler/profiler.h | 106 + src/core/session/session.cpp | 357 + src/core/session/session.h | 135 + src/core/session/spm/spm.cpp | 425 + src/core/session/spm/spm.h | 57 + .../session/tracer/src/correlation_id.cpp | 99 + src/core/session/tracer/src/correlation_id.h | 50 + src/core/session/tracer/src/exception.h | 44 + src/core/session/tracer/src/loader.h | 194 + .../session/tracer/src/registration_table.h | 101 + src/core/session/tracer/src/roctracer.cpp | 847 + src/core/session/tracer/src/roctracer.h | 472 + src/core/session/tracer/tracer.cpp | 434 + src/core/session/tracer/tracer.h | 104 + src/pcsampler/README.md | 12 + src/pcsampler/core/pc_sampler.cpp | 119 + src/pcsampler/gfxip/aldebaran_ip_offset.h | 1738 + .../gfxip/aldebaran_reg_offset_init.cpp | 33 + src/pcsampler/gfxip/arct_ip_offset.h | 1650 + src/pcsampler/gfxip/arct_reg_offset_init.cpp | 33 + src/pcsampler/gfxip/gc/gc_9_0_offset.h | 7279 + src/pcsampler/gfxip/gc/gc_9_0_sh_mask.h | 30029 +++ src/pcsampler/gfxip/gc/gc_9_4_1_offset.h | 266 + src/pcsampler/gfxip/gc/gc_9_4_1_sh_mask.h | 764 + src/pcsampler/gfxip/gc/gc_9_4_2_offset.h | 7687 + src/pcsampler/gfxip/gc/gc_9_4_2_sh_mask.h | 33003 ++++ src/pcsampler/gfxip/gfxip.cpp | 207 + src/pcsampler/gfxip/gfxip.h | 138 + src/pcsampler/gfxip/gfxip_v9.cpp | 299 + .../gfxip/osssys/osssys_4_0_offset.h | 327 + 
.../gfxip/osssys/osssys_4_0_sh_mask.h | 1204 + .../gfxip/osssys/osssys_4_2_0_offset.h | 345 + .../gfxip/osssys/osssys_4_2_0_sh_mask.h | 1300 + src/pcsampler/gfxip/vega10_enum.h | 22532 +++ src/pcsampler/gfxip/vega10_ip_offset.h | 1265 + .../gfxip/vega10_reg_offset_init.cpp | 33 + src/pcsampler/gfxip/vega20_ip_offset.h | 1051 + .../gfxip/vega20_reg_offset_init.cpp | 33 + src/pcsampler/session/pc_sampler.h | 60 + src/tools/CMakeLists.txt | 52 + src/tools/amdsys/CMakeLists.txt | 27 + src/tools/amdsys/amdsys.cpp | 238 + src/tools/ctrl.cpp | 25 + src/tools/exportmap | 1 + src/tools/rocprofv2/CMakeLists.txt | 29 + src/tools/rocprofv2/rocprofv2.cpp | 256 + src/tools/tool.cpp | 599 + src/util/hsa_rsrc_factory.h | 2 - src/utils/access_control.h | 116 + src/utils/exception.h | 53 + src/utils/handle.h | 114 + src/utils/helper.cpp | 238 + src/utils/helper.h | 72 + src/utils/logger.h | 171 + test/CMakeLists.txt | 18 +- test/app/c_test.c | 2 +- test/tool/tool.cpp | 2 + test/util/hsa_rsrc_factory.h | 2 +- tests/CMakeLists.txt | 9 + tests/README.md | 53 + tests/featuretests/CMakeLists.txt | 2 + tests/featuretests/profiler/CMakeLists.txt | 225 + .../profiler/discretetests/api/att_test.cpp | 259 + .../discretetests/api/multithreaded_test.cpp | 145 + .../profiler/discretetests/api/spm_test.cpp | 233 + .../profiler/discretetests/basic_metrics.txt | 1 + .../profiler/discretetests/binary/copy.cl | 32 + .../binary/multiprocess_test.cpp | 88 + .../discretetests/binary/multiqueue_test.cpp | 113 + .../binary/multiqueue_testapp.cpp | 284 + .../discretetests/binary/multiqueue_testapp.h | 343 + .../binary/multithreaded_test.cpp | 103 + .../binary/multithreaded_testapp.cpp | 83 + .../discretetests/run_discrete_tests.sh | 87 + .../hip_helloworld_golden_traces.txt | 20 + .../hip_vectoradd_golden_traces.txt | 16 + .../hsa_async_mem_copy_golden_traces.txt | 16 + .../gtests/apps/goldentraces/input.txt | 1 + .../mpi_vectoradd_golden_traces.txt | 37 + .../openmp_helloworld_golden_traces.txt | 5 + 
.../profiler/gtests/apps/hip/hello_world.cpp | 84 + .../gtests/apps/hip/hello_world_gtest.cpp | 88 + .../profiler/gtests/apps/hip/vector_add.cpp | 130 + .../gtests/apps/hip/vector_add_gtest.cpp | 86 + .../gtests/apps/hsa/async_mem_copy.cpp | 387 + .../gtests/apps/hsa/async_mem_copy_gtest.cpp | 54 + .../profiler/gtests/apps/mpi/mpi_run.sh | 37 + .../profiler/gtests/apps/mpi/vector_add.cpp | 132 + .../gtests/apps/mpi/vector_add_gtest.cpp | 94 + .../gtests/apps/openmp/hello_world.cpp | 91 + .../gtests/apps/openmp/hello_world_gtest.cpp | 94 + .../profiler/gtests/apps/profiler_gtest.cpp | 187 + .../profiler/gtests/apps/profiler_gtest.h | 107 + .../gtests/functional/loadunload_gtest.cpp | 67 + .../gtests/functional/multithread_gtest.cpp | 152 + .../profiler/gtests/gtests_main.cpp | 10 + .../profiler/utils/csv_parser.cpp | 170 + .../featuretests/profiler/utils/csv_parser.h | 118 + .../profiler/utils/test_utils.cpp | 64 + .../featuretests/profiler/utils/test_utils.h | 57 + tests/featuretests/tracer/CMakeLists.txt | 40 + .../hip_helloworld_golden_traces.txt | 14 + .../tracer/gtests/apps/hip/hello_world.cpp | 71 + .../gtests/apps/hip/hello_world_gtest.cpp | 73 + .../tracer/gtests/apps/tracer_gtest.cpp | 148 + .../tracer/gtests/apps/tracer_gtest.h | 103 + .../tracer/gtests/gtests_main.cpp | 9 + .../featuretests/tracer/utils/test_utils.cpp | 54 + tests/featuretests/tracer/utils/test_utils.h | 53 + tests/memorytests/CMakeLists.txt | 4 + tests/memorytests/input.txt | 1 + tests/memorytests/run_asan_tests.sh | 9 + tests/memorytests/suppr.txt | 4 + tests/memorytests/test_mem.py | 22 + tests/run_tests.sh | 23 + tests/unittests/CMakeLists.txt | 2 + tests/unittests/core/CMakeLists.txt | 107 + tests/unittests/core/gtests_main.cpp | 9 + .../core/hardware/hsa_info_gtest.cpp | 39 + tests/unittests/core/memory/memory_gtest.cpp | 55 + .../unittests/core/session/session_gtest.cpp | 206 + tests/unittests/profiler/CMakeLists.txt | 94 + .../unittests/profiler/api/rocmtool_gtest.cpp | 91 + 
tests/unittests/profiler/tools/amdsys.cpp | 265 + tests/unittests/profiler/tools/tool_gtest.cpp | 37 + 249 files changed, 378388 insertions(+), 463 deletions(-) rename _clang-format => .clang-format (98%) create mode 100644 .gitignore create mode 100755 bin/rocprofv2 create mode 100644 cmake_modules/FindLibDw.cmake create mode 100644 cmake_modules/FindLibElf.cmake create mode 100644 doc/Doxyfile.in create mode 100644 doc/Rocprofiler_Documentation.pdf create mode 100644 inc/rocprofiler_plugin.h create mode 100644 plugin/CMakeLists.txt create mode 100644 plugin/att/CMakeLists.txt create mode 100644 plugin/att/att.cpp create mode 100755 plugin/att/att.py create mode 100755 plugin/att/trace_view.py create mode 100644 plugin/att/ui/index.html create mode 100644 plugin/att/ui/logo.svg create mode 100644 plugin/att/ui/styles.css create mode 100644 plugin/ctf/.gitignore create mode 100644 plugin/ctf/CMakeLists.txt create mode 100644 plugin/ctf/README.adoc create mode 100644 plugin/ctf/barectf_event_record.h create mode 100644 plugin/ctf/barectf_platform.h create mode 100644 plugin/ctf/barectf_tracer.h create mode 100644 plugin/ctf/barectf_writer.h create mode 100644 plugin/ctf/config.yaml create mode 100644 plugin/ctf/ctf.cpp create mode 100644 plugin/ctf/dst_base.yaml create mode 100644 plugin/ctf/gen_api_files.py create mode 100644 plugin/ctf/gen_env_yaml.py create mode 100644 plugin/ctf/plugin.cpp create mode 100644 plugin/ctf/plugin.h create mode 100644 plugin/exportmap create mode 100644 plugin/file/CMakeLists.txt create mode 100644 plugin/file/file.cpp create mode 100644 plugin/perfetto/CMakeLists.txt create mode 100644 plugin/perfetto/perfetto.cpp create mode 100644 plugin/perfetto/perfetto_sdk/LICENSE create mode 100644 plugin/perfetto/perfetto_sdk/OWNERS create mode 100644 plugin/perfetto/perfetto_sdk/docs/tracing-sdk.md create mode 100644 plugin/perfetto/perfetto_sdk/sdk/perfetto.cc create mode 100644 plugin/perfetto/perfetto_sdk/sdk/perfetto.h create mode 100644 
plugin/utils.h create mode 100644 samples/.clangd create mode 100644 samples/CMakeLists.txt create mode 100644 samples/Makefile create mode 100644 samples/README.md create mode 100644 samples/att.txt create mode 100644 samples/common/common.h create mode 100644 samples/common/helper.cpp create mode 100644 samples/common/helper.h create mode 100644 samples/input.txt create mode 100644 samples/pcsampler/code_printing_sample/.clangd create mode 100644 samples/pcsampler/code_printing_sample/Makefile create mode 100644 samples/pcsampler/code_printing_sample/README.md create mode 100644 samples/pcsampler/code_printing_sample/code_printing.cpp create mode 100644 samples/pcsampler/code_printing_sample/code_printing.hpp create mode 100644 samples/pcsampler/code_printing_sample/disassembly.cpp create mode 100644 samples/pcsampler/code_printing_sample/disassembly.hpp create mode 100644 samples/pcsampler/code_printing_sample/main.cpp create mode 100644 samples/pcsampler/code_printing_sample/program.hpp create mode 100644 samples/pcsampler/code_printing_sample/program_options.hpp create mode 100644 samples/profiler/application_replay_sample.cpp create mode 100644 samples/profiler/device_profiling_sample.cpp create mode 100644 samples/profiler/kernel_profiling_no_replay_sample.cpp create mode 100644 samples/profiler/kernel_replay_sample.cpp create mode 100644 samples/profiler/user_replay_sample.cpp create mode 100755 samples/run_samples.sh create mode 100644 samples/tracer/sample.cpp create mode 100755 script/gen_ostream_ops.py create mode 100755 script/hsaap.py create mode 100644 src/api/CMakeLists.txt create mode 100644 src/api/exportmap create mode 100644 src/api/rocmtool.cpp create mode 100644 src/api/rocmtool.h create mode 100644 src/api/rocmtools.cpp create mode 100644 src/core/counters/basic/basic_counter.h create mode 100755 src/core/counters/basic/gfx_metrics.xml create mode 100644 src/core/counters/basic/xml_parser_basic.py create mode 100644 
src/core/counters/counter.cpp create mode 100644 src/core/counters/counter.h create mode 100644 src/core/counters/derived/derived_counter.cpp.in create mode 100644 src/core/counters/derived/derived_counter.h create mode 100755 src/core/counters/derived/metrics.xml create mode 100644 src/core/counters/derived/xml_parser_derived.py create mode 100755 src/core/counters/metrics/basic_counters.xml create mode 100755 src/core/counters/metrics/derived_counters.xml create mode 100644 src/core/counters/metrics/eval_metrics.cpp create mode 100644 src/core/counters/metrics/eval_metrics.h create mode 100644 src/core/counters/metrics/exception.h create mode 100644 src/core/counters/metrics/expr.h create mode 100644 src/core/counters/metrics/metrics.cpp create mode 100755 src/core/counters/metrics/metrics.h create mode 100644 src/core/counters/metrics/types.h create mode 100644 src/core/counters/metrics/xml.h create mode 100755 src/core/hardware/gfx10/gfx10.xml create mode 100755 src/core/hardware/gfx8/gfx8.xml create mode 100755 src/core/hardware/gfx9/gfx9.xml create mode 100644 src/core/hardware/hsa_info.cpp create mode 100644 src/core/hardware/hsa_info.h create mode 100644 src/core/hsa/hsa_common.cpp create mode 100644 src/core/hsa/hsa_common.h create mode 100644 src/core/hsa/hsa_support.cpp create mode 100644 src/core/hsa/hsa_support.h create mode 100644 src/core/hsa/packets/packets_generator.cpp create mode 100644 src/core/hsa/packets/packets_generator.h create mode 100644 src/core/hsa/queues/queue.cpp create mode 100644 src/core/hsa/queues/queue.h create mode 100644 src/core/memory/generic_buffer.cpp create mode 100644 src/core/memory/generic_buffer.h create mode 100644 src/core/session/att/att.cpp create mode 100644 src/core/session/att/att.h create mode 100644 src/core/session/device_profiling.cpp create mode 100644 src/core/session/device_profiling.h create mode 100644 src/core/session/filter.cpp create mode 100644 src/core/session/filter.h create mode 100644 
src/core/session/profiler/profiler.cpp create mode 100644 src/core/session/profiler/profiler.h create mode 100644 src/core/session/session.cpp create mode 100644 src/core/session/session.h create mode 100644 src/core/session/spm/spm.cpp create mode 100644 src/core/session/spm/spm.h create mode 100644 src/core/session/tracer/src/correlation_id.cpp create mode 100644 src/core/session/tracer/src/correlation_id.h create mode 100644 src/core/session/tracer/src/exception.h create mode 100644 src/core/session/tracer/src/loader.h create mode 100644 src/core/session/tracer/src/registration_table.h create mode 100644 src/core/session/tracer/src/roctracer.cpp create mode 100644 src/core/session/tracer/src/roctracer.h create mode 100644 src/core/session/tracer/tracer.cpp create mode 100644 src/core/session/tracer/tracer.h create mode 100644 src/pcsampler/README.md create mode 100644 src/pcsampler/core/pc_sampler.cpp create mode 100644 src/pcsampler/gfxip/aldebaran_ip_offset.h create mode 100644 src/pcsampler/gfxip/aldebaran_reg_offset_init.cpp create mode 100644 src/pcsampler/gfxip/arct_ip_offset.h create mode 100644 src/pcsampler/gfxip/arct_reg_offset_init.cpp create mode 100644 src/pcsampler/gfxip/gc/gc_9_0_offset.h create mode 100644 src/pcsampler/gfxip/gc/gc_9_0_sh_mask.h create mode 100644 src/pcsampler/gfxip/gc/gc_9_4_1_offset.h create mode 100644 src/pcsampler/gfxip/gc/gc_9_4_1_sh_mask.h create mode 100644 src/pcsampler/gfxip/gc/gc_9_4_2_offset.h create mode 100644 src/pcsampler/gfxip/gc/gc_9_4_2_sh_mask.h create mode 100644 src/pcsampler/gfxip/gfxip.cpp create mode 100644 src/pcsampler/gfxip/gfxip.h create mode 100644 src/pcsampler/gfxip/gfxip_v9.cpp create mode 100644 src/pcsampler/gfxip/osssys/osssys_4_0_offset.h create mode 100644 src/pcsampler/gfxip/osssys/osssys_4_0_sh_mask.h create mode 100644 src/pcsampler/gfxip/osssys/osssys_4_2_0_offset.h create mode 100644 src/pcsampler/gfxip/osssys/osssys_4_2_0_sh_mask.h create mode 100644 src/pcsampler/gfxip/vega10_enum.h 
create mode 100644 src/pcsampler/gfxip/vega10_ip_offset.h create mode 100644 src/pcsampler/gfxip/vega10_reg_offset_init.cpp create mode 100644 src/pcsampler/gfxip/vega20_ip_offset.h create mode 100644 src/pcsampler/gfxip/vega20_reg_offset_init.cpp create mode 100644 src/pcsampler/session/pc_sampler.h create mode 100644 src/tools/CMakeLists.txt create mode 100644 src/tools/amdsys/CMakeLists.txt create mode 100644 src/tools/amdsys/amdsys.cpp create mode 100644 src/tools/ctrl.cpp create mode 100644 src/tools/exportmap create mode 100644 src/tools/rocprofv2/CMakeLists.txt create mode 100644 src/tools/rocprofv2/rocprofv2.cpp create mode 100644 src/tools/tool.cpp create mode 100644 src/utils/access_control.h create mode 100644 src/utils/exception.h create mode 100644 src/utils/handle.h create mode 100644 src/utils/helper.cpp create mode 100644 src/utils/helper.h create mode 100644 src/utils/logger.h create mode 100644 tests/CMakeLists.txt create mode 100644 tests/README.md create mode 100644 tests/featuretests/CMakeLists.txt create mode 100644 tests/featuretests/profiler/CMakeLists.txt create mode 100644 tests/featuretests/profiler/discretetests/api/att_test.cpp create mode 100644 tests/featuretests/profiler/discretetests/api/multithreaded_test.cpp create mode 100644 tests/featuretests/profiler/discretetests/api/spm_test.cpp create mode 100644 tests/featuretests/profiler/discretetests/basic_metrics.txt create mode 100644 tests/featuretests/profiler/discretetests/binary/copy.cl create mode 100644 tests/featuretests/profiler/discretetests/binary/multiprocess_test.cpp create mode 100644 tests/featuretests/profiler/discretetests/binary/multiqueue_test.cpp create mode 100644 tests/featuretests/profiler/discretetests/binary/multiqueue_testapp.cpp create mode 100644 tests/featuretests/profiler/discretetests/binary/multiqueue_testapp.h create mode 100644 tests/featuretests/profiler/discretetests/binary/multithreaded_test.cpp create mode 100644 
tests/featuretests/profiler/discretetests/binary/multithreaded_testapp.cpp create mode 100755 tests/featuretests/profiler/discretetests/run_discrete_tests.sh create mode 100755 tests/featuretests/profiler/gtests/apps/goldentraces/hip_helloworld_golden_traces.txt create mode 100755 tests/featuretests/profiler/gtests/apps/goldentraces/hip_vectoradd_golden_traces.txt create mode 100644 tests/featuretests/profiler/gtests/apps/goldentraces/hsa_async_mem_copy_golden_traces.txt create mode 100644 tests/featuretests/profiler/gtests/apps/goldentraces/input.txt create mode 100755 tests/featuretests/profiler/gtests/apps/goldentraces/mpi_vectoradd_golden_traces.txt create mode 100755 tests/featuretests/profiler/gtests/apps/goldentraces/openmp_helloworld_golden_traces.txt create mode 100755 tests/featuretests/profiler/gtests/apps/hip/hello_world.cpp create mode 100755 tests/featuretests/profiler/gtests/apps/hip/hello_world_gtest.cpp create mode 100755 tests/featuretests/profiler/gtests/apps/hip/vector_add.cpp create mode 100755 tests/featuretests/profiler/gtests/apps/hip/vector_add_gtest.cpp create mode 100644 tests/featuretests/profiler/gtests/apps/hsa/async_mem_copy.cpp create mode 100755 tests/featuretests/profiler/gtests/apps/hsa/async_mem_copy_gtest.cpp create mode 100755 tests/featuretests/profiler/gtests/apps/mpi/mpi_run.sh create mode 100755 tests/featuretests/profiler/gtests/apps/mpi/vector_add.cpp create mode 100755 tests/featuretests/profiler/gtests/apps/mpi/vector_add_gtest.cpp create mode 100755 tests/featuretests/profiler/gtests/apps/openmp/hello_world.cpp create mode 100755 tests/featuretests/profiler/gtests/apps/openmp/hello_world_gtest.cpp create mode 100644 tests/featuretests/profiler/gtests/apps/profiler_gtest.cpp create mode 100644 tests/featuretests/profiler/gtests/apps/profiler_gtest.h create mode 100644 tests/featuretests/profiler/gtests/functional/loadunload_gtest.cpp create mode 100644 tests/featuretests/profiler/gtests/functional/multithread_gtest.cpp 
create mode 100644 tests/featuretests/profiler/gtests/gtests_main.cpp create mode 100644 tests/featuretests/profiler/utils/csv_parser.cpp create mode 100644 tests/featuretests/profiler/utils/csv_parser.h create mode 100644 tests/featuretests/profiler/utils/test_utils.cpp create mode 100644 tests/featuretests/profiler/utils/test_utils.h create mode 100644 tests/featuretests/tracer/CMakeLists.txt create mode 100755 tests/featuretests/tracer/gtests/apps/goldentraces/hip_helloworld_golden_traces.txt create mode 100755 tests/featuretests/tracer/gtests/apps/hip/hello_world.cpp create mode 100755 tests/featuretests/tracer/gtests/apps/hip/hello_world_gtest.cpp create mode 100644 tests/featuretests/tracer/gtests/apps/tracer_gtest.cpp create mode 100644 tests/featuretests/tracer/gtests/apps/tracer_gtest.h create mode 100644 tests/featuretests/tracer/gtests/gtests_main.cpp create mode 100644 tests/featuretests/tracer/utils/test_utils.cpp create mode 100644 tests/featuretests/tracer/utils/test_utils.h create mode 100644 tests/memorytests/CMakeLists.txt create mode 100644 tests/memorytests/input.txt create mode 100755 tests/memorytests/run_asan_tests.sh create mode 100644 tests/memorytests/suppr.txt create mode 100644 tests/memorytests/test_mem.py create mode 100755 tests/run_tests.sh create mode 100644 tests/unittests/CMakeLists.txt create mode 100644 tests/unittests/core/CMakeLists.txt create mode 100644 tests/unittests/core/gtests_main.cpp create mode 100644 tests/unittests/core/hardware/hsa_info_gtest.cpp create mode 100644 tests/unittests/core/memory/memory_gtest.cpp create mode 100644 tests/unittests/core/session/session_gtest.cpp create mode 100644 tests/unittests/profiler/CMakeLists.txt create mode 100644 tests/unittests/profiler/api/rocmtool_gtest.cpp create mode 100644 tests/unittests/profiler/tools/amdsys.cpp create mode 100644 tests/unittests/profiler/tools/tool_gtest.cpp diff --git a/_clang-format b/.clang-format similarity index 98% rename from _clang-format 
rename to .clang-format index 0c81671e..7f49cb07 100644 --- a/_clang-format +++ b/.clang-format @@ -1,6 +1,6 @@ --- Language: Cpp -# BasedOnStyle: Google +BasedOnStyle: Google AccessModifierOffset: -1 ConstructorInitializerIndentWidth: 4 AlignEscapedNewlinesLeft: false diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..7f0b629f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +build +compile_commands.json +.cache +.DS_Store diff --git a/CMakeLists.txt b/CMakeLists.txt index c453eeb4..9729b97b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -################################################################################ +# ############################################################################## # Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -8,259 +8,472 @@ # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -################################################################################ +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# ############################################################################## -cmake_minimum_required ( VERSION 2.8.12 ) +cmake_minimum_required(VERSION 3.18.0) -## Verbose output. -set ( CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "Verbose Output" FORCE ) +# Build is not supported on Windows plaform +if(WIN32) + message(FATAL_ERROR "Windows build is not supported.") +endif() -## Set module name and project name. -set ( ROCPROFILER_NAME "rocprofiler" ) -set ( ROCPROFILER_TARGET "${ROCPROFILER_NAME}64" ) -set ( ROCPROFILER_LIBRARY "lib${ROCPROFILER_TARGET}" ) -project ( ${ROCPROFILER_NAME} ) +# Set module name and project name. +set(ROCPROFILER_NAME "rocprofiler") +set(ROCPROFILER_TARGET "${ROCPROFILER_NAME}64") +set(ROCPROFILER_LIBRARY "lib${ROCPROFILER_TARGET}") +project(rocprofiler VERSION 2.0.0) include(GNUInstallDirs) -## Adding default path cmake modules -list ( APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules" ) -## Include common cmake modules -include ( utils ) -## Set build environment -include ( env ) - -## Setup the package version. 
-get_version ( "1.0.0" ) -message ( "-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" ) - -set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) -set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) -set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) -if ( DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "" ) - message ( "VERSION BUILD DEFINED ${VERSION_BUILD}" ) - set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) -endif () -set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) - -set ( LIB_VERSION_MAJOR ${VERSION_MAJOR} ) -set ( LIB_VERSION_MINOR ${VERSION_MINOR} ) -if ( ${ROCM_PATCH_VERSION} ) - set ( LIB_VERSION_PATCH ${ROCM_PATCH_VERSION} ) + +# set default ROCM_PATH +if(NOT DEFINED ROCM_PATH) + set(ROCM_PATH + "/opt/rocm" + CACHE STRING "Default ROCM installation directory") +endif() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +add_compile_options(-Wall) + +set(THREADS_PREFER_PTHREAD_FLAG ON) + +# Adding default path cmake modules +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake_modules") +# Set build environment +include(utils) +include(env) + +# Setup the package version. 
+get_version("2.0.0") +message("-- LIB-VERSION: ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}") + +set(BUILD_VERSION_MAJOR ${VERSION_MAJOR}) +set(BUILD_VERSION_MINOR ${VERSION_MINOR}) +set(BUILD_VERSION_PATCH ${VERSION_PATCH}) +if(DEFINED VERSION_BUILD AND NOT ${VERSION_BUILD} STREQUAL "") + message("VERSION BUILD DEFINED ${VERSION_BUILD}") + set(BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}") +endif() +set(BUILD_VERSION_STRING + "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}") + +set(LIB_VERSION_MAJOR ${VERSION_MAJOR}) +set(LIB_VERSION_MINOR ${VERSION_MINOR}) +if(${ROCM_PATCH_VERSION}) + set(LIB_VERSION_PATCH ${ROCM_PATCH_VERSION}) else() - set ( LIB_VERSION_PATCH ${VERSION_PATCH} ) + set(LIB_VERSION_PATCH ${VERSION_PATCH}) endif() -set ( LIB_VERSION_STRING "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}" ) -message ( "-- LIB-VERSION STRING: ${LIB_VERSION_STRING}" ) +set(LIB_VERSION_STRING + "${LIB_VERSION_MAJOR}.${LIB_VERSION_MINOR}.${LIB_VERSION_PATCH}") +message("-- LIB-VERSION STRING: ${LIB_VERSION_STRING}") + +# Set target and root/lib/test directory +set(TARGET_NAME "${ROCPROFILER_TARGET}") +set(ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}") +set(LIB_DIR "${ROOT_DIR}/src") +set(TEST_DIR "${ROOT_DIR}/test") + +find_package( + amd_comgr + REQUIRED + CONFIG + HINTS + ${CMAKE_INSTALL_PREFIX} + PATHS + ${ROCM_PATH} + PATH_SUFFIXES + lib/cmake/amd_comgr) +message(STATUS "Code Object Manager found at ${amd_comgr_DIR}.") +link_libraries(amd_comgr) -## Set target and root/lib/test directory -set ( TARGET_NAME "${ROCPROFILER_TARGET}" ) -set ( ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ) -set ( LIB_DIR "${ROOT_DIR}/src" ) -set ( TEST_DIR "${ROOT_DIR}/test" ) +find_package(Threads REQUIRED) +find_package( + hsa-runtime64 + REQUIRED + CONFIG + HINTS + ${CMAKE_INSTALL_PREFIX} + PATHS + ${ROCM_PATH}) +find_package( + HIP + REQUIRED + CONFIG + HINTS + ${CMAKE_INSTALL_PREFIX} + PATHS + ${ROCM_PATH}) + +get_property( + 
HSA_RUNTIME_INCLUDE_DIRECTORIES + TARGET hsa-runtime64::hsa-runtime64 + PROPERTY INTERFACE_INCLUDE_DIRECTORIES) +find_file( + HSA_H hsa.h + PATHS ${HSA_RUNTIME_INCLUDE_DIRECTORIES} + PATH_SUFFIXES hsa + NO_DEFAULT_PATH REQUIRED) +get_filename_component(HSA_RUNTIME_INC_PATH ${HSA_H} DIRECTORY) +include_directories(${HSA_RUNTIME_INC_PATH}) + +if(NOT DEFINED LIBRARY_TYPE) + set(LIBRARY_TYPE SHARED) +endif() -## Enable tracing API -if (NOT USE_PROF_API) +# Enable tracing API +if(NOT USE_PROF_API) set(USE_PROF_API 1) endif() # Protocol header lookup set(PROF_API_HEADER_NAME prof_protocol.h) if(USE_PROF_API EQUAL 1) - find_path(PROF_API_HEADER_DIR ${PROF_API_HEADER_NAME} - HINTS - ${PROF_API_HEADER_PATH} - PATHS - /opt/rocm/include - PATH_SUFFIXES - roctracer/ext + find_path( + PROF_API_HEADER_DIR ${PROF_API_HEADER_NAME} + HINTS ${PROF_API_HEADER_PATH} + PATHS /opt/rocm/include + PATH_SUFFIXES roctracer/ext) + if(NOT PROF_API_HEADER_DIR) + message( + FATAL_ERROR + "Profiling API header not found. Tracer integration disabled. Use -DPROF_API_HEADER_PATH=" ) - if(NOT PROF_API_HEADER_DIR) - MESSAGE(FATAL_ERROR "Profiling API header not found. Tracer integration disabled. 
Use -DPROF_API_HEADER_PATH=") - else() - add_definitions(-DUSE_PROF_API=1) - include_directories(${PROF_API_HEADER_DIR}) - MESSAGE(STATUS "Profiling API: ${PROF_API_HEADER_DIR}/${PROF_API_HEADER_NAME}") - endif() + else() + include_directories(${PROF_API_HEADER_DIR}) + message( + STATUS "Profiling API: ${PROF_API_HEADER_DIR}/${PROF_API_HEADER_NAME}") + endif() endif() -## Build library -include ( ${LIB_DIR}/CMakeLists.txt ) - -## Set the VERSION and SOVERSION values -set_property ( TARGET ${TARGET_NAME} PROPERTY VERSION "${LIB_VERSION_STRING}" ) -set_property ( TARGET ${TARGET_NAME} PROPERTY SOVERSION "${LIB_VERSION_MAJOR}" ) - -## If the library is a release, strip the target library -if ( "${CMAKE_BUILD_TYPE}" STREQUAL release ) - add_custom_command ( TARGET ${ROCPROFILER_TARGET} POST_BUILD COMMAND ${CMAKE_STRIP} *.so ) -endif () - -## Build tests -add_subdirectory ( ${TEST_DIR} ${PROJECT_BINARY_DIR}/test ) - -## Installation and packaging -set ( DEST_NAME ${ROCPROFILER_NAME} ) -if ( DEFINED CMAKE_INSTALL_PREFIX ) - get_filename_component ( prefix_name ${CMAKE_INSTALL_PREFIX} NAME ) - get_filename_component ( prefix_dir ${CMAKE_INSTALL_PREFIX} DIRECTORY ) - if ( prefix_name STREQUAL ${DEST_NAME} ) - set ( CMAKE_INSTALL_PREFIX ${prefix_dir} ) - endif () -endif () -if ( DEFINED CPACK_PACKAGING_INSTALL_PREFIX ) - get_filename_component ( prefix_name ${CPACK_PACKAGING_INSTALL_PREFIX} NAME ) - get_filename_component ( prefix_dir ${CPACK_PACKAGING_INSTALL_PREFIX} DIRECTORY ) - if ( prefix_name STREQUAL ${DEST_NAME} ) - set ( CPACK_PACKAGING_INSTALL_PREFIX ${prefix_dir} ) - endif () -else () - set ( CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX} ) -endif () -message ( "CMake-install-prefix: ${CMAKE_INSTALL_PREFIX}" ) -message ( "CPack-install-prefix: ${CPACK_PACKAGING_INSTALL_PREFIX}" ) -message ( "-----------Dest-name: ${DEST_NAME}" ) - -## set components -set ( CPACK_COMPONENTS_ALL runtime dev ) -## Enable Component Install -set(CPACK_RPM_COMPONENT_INSTALL ON) 
-set(CPACK_DEB_COMPONENT_INSTALL ON) - -## Install libraries: Non versioned lib file in dev package -install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT dev NAMELINK_ONLY ) -install ( TARGETS ${ROCPROFILER_TARGET} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT runtime NAMELINK_SKIP ) -## Install headers -install ( FILES - ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h - ${CMAKE_CURRENT_SOURCE_DIR}/src/core/activity.h - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${DEST_NAME} - COMPONENT dev ) +# Build libraries +add_subdirectory(src) + +if(${LIBRARY_TYPE} STREQUAL SHARED) + # Build samples + add_subdirectory(samples) + + # Build tests + add_subdirectory(tests) +endif() + +# Build Plugins +add_subdirectory(plugin) + +# Build tests +add_subdirectory(${TEST_DIR} ${PROJECT_BINARY_DIR}/test) + +# Installation and packaging +set(DEST_NAME ${ROCPROFILER_NAME}) +if(DEFINED CMAKE_INSTALL_PREFIX) + get_filename_component(prefix_name ${CMAKE_INSTALL_PREFIX} NAME) + get_filename_component(prefix_dir ${CMAKE_INSTALL_PREFIX} DIRECTORY) + if(prefix_name STREQUAL ${DEST_NAME}) + set(CMAKE_INSTALL_PREFIX ${prefix_dir}) + endif() +endif() +if(DEFINED CPACK_PACKAGING_INSTALL_PREFIX) + get_filename_component(prefix_name ${CPACK_PACKAGING_INSTALL_PREFIX} NAME) + get_filename_component(prefix_dir ${CPACK_PACKAGING_INSTALL_PREFIX} DIRECTORY) + if(prefix_name STREQUAL ${DEST_NAME}) + set(CPACK_PACKAGING_INSTALL_PREFIX ${prefix_dir}) + endif() +else() + set(CPACK_PACKAGING_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}) +endif() +message("CMake-install-prefix: ${CMAKE_INSTALL_PREFIX}") +message("CPack-install-prefix: ${CPACK_PACKAGING_INSTALL_PREFIX}") +message("-----------Dest-name: ${DEST_NAME}") + +# Install headers +install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/core/activity.h + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${ROCPROFILER_NAME} + COMPONENT dev) # rpl_run.sh -install ( FILES - ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh - DESTINATION 
${CMAKE_INSTALL_BINDIR} - PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE - RENAME rocprof - COMPONENT runtime ) - -install ( FILES - ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh - ${CMAKE_CURRENT_SOURCE_DIR}/bin/merge_traces.sh - ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2params.py - ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py - ${CMAKE_CURRENT_SOURCE_DIR}/bin/dform.py - ${CMAKE_CURRENT_SOURCE_DIR}/bin/mem_manager.py - ${CMAKE_CURRENT_SOURCE_DIR}/bin/sqlitedb.py - DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${DEST_NAME} - PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE - COMPONENT runtime ) +install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/rpl_run.sh + DESTINATION ${CMAKE_INSTALL_BINDIR} + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ + WORLD_EXECUTE + RENAME rocprof + COMPONENT runtime) + +configure_file(bin/rocprofv2 ${PROJECT_BINARY_DIR} COPYONLY) +install( + FILES ${PROJECT_SOURCE_DIR}/bin/rocprofv2 + DESTINATION ${CMAKE_INSTALL_BINDIR} + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ + WORLD_EXECUTE + COMPONENT runtime) + +install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2xml.sh + ${CMAKE_CURRENT_SOURCE_DIR}/bin/merge_traces.sh + ${CMAKE_CURRENT_SOURCE_DIR}/bin/txt2params.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/tblextr.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/dform.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/mem_manager.py + ${CMAKE_CURRENT_SOURCE_DIR}/bin/sqlitedb.py + DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${ROCPROFILER_NAME} + PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ + WORLD_EXECUTE + COMPONENT runtime) + # gfx_metrics.xml metrics.xml -install ( FILES - ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml - ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/gfx_metrics.xml - DESTINATION ${CMAKE_INSTALL_LIBDIR}/${DEST_NAME} - COMPONENT runtime ) +install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/test/tool/metrics.xml + 
${CMAKE_CURRENT_SOURCE_DIR}/test/tool/gfx_metrics.xml + DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} + COMPONENT runtime) + # librocprof-tool.so -install ( FILES ${PROJECT_BINARY_DIR}/test/librocprof-tool.so DESTINATION ${CMAKE_INSTALL_LIBDIR}/${DEST_NAME} - COMPONENT runtime ) -install ( FILES ${PROJECT_BINARY_DIR}/test/rocprof-ctrl DESTINATION ${CMAKE_INSTALL_LIBDIR}/${DEST_NAME} - PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE - COMPONENT runtime ) +install( + FILES ${PROJECT_BINARY_DIR}/test/librocprof-tool.so + DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} + COMPONENT runtime) + +install( + FILES ${PROJECT_BINARY_DIR}/test/rocprof-ctrl + DESTINATION ${CMAKE_INSTALL_LIBDIR}/${ROCPROFILER_NAME} + PERMISSIONS + OWNER_READ + OWNER_WRITE + OWNER_EXECUTE + GROUP_READ + GROUP_EXECUTE + WORLD_READ + WORLD_EXECUTE + COMPONENT runtime) # File reorg Backward compatibility -option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" ON) +option(FILE_REORG_BACKWARD_COMPATIBILITY + "Enable File Reorg with backward compatibility" ON) if(FILE_REORG_BACKWARD_COMPATIBILITY) - include (rocprofiler-backward-compat.cmake) + include(rocprofiler-backward-compat.cmake) endif() -## Packaging directives -set ( CPACK_GENERATOR "DEB" "RPM" "TGZ" CACHE STRING "CPACK GENERATOR e.g. DEB;RPM" ) -set ( ENABLE_LDCONFIG ON CACHE BOOL "Set library links and caches using ldconfig.") -set ( CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." 
) -set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) -set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} ) -set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} ) -set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" ) -set ( CPACK_PACKAGE_CONTACT "ROCm Profiler Support " ) -set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "ROCPROFILER library for AMD HSA runtime API extension support" ) -set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE" ) - -# Install license file -install(FILES ${CPACK_RESOURCE_FILE_LICENSE} DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT runtime) - -if ( DEFINED ENV{ROCM_LIBPATCH_VERSION} ) - set ( CPACK_PACKAGE_VERSION "${CPACK_PACKAGE_VERSION}.$ENV{ROCM_LIBPATCH_VERSION}" ) - message ( "Using CPACK_PACKAGE_VERSION ${CPACK_PACKAGE_VERSION}" ) -endif() +if(${LIBRARY_TYPE} STREQUAL SHARED) + # Packaging directives + set(CPACK_GENERATOR "DEB" "RPM" "TGZ") + set(ENABLE_LDCONFIG + ON + CACHE BOOL "Set library links and caches using ldconfig.") + set(CPACK_PACKAGE_NAME "${PROJECT_NAME}") + set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc.") + set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR}) + set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR}) + set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH}) + set(CPACK_PACKAGE_VERSION + "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}" + ) + set(CPACK_PACKAGE_CONTACT + "ROCm Profiler Support ") + set(CPACK_PACKAGE_DESCRIPTION_SUMMARY + "ROCPROFILER library for AMD HSA runtime API extension support") + set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") -## Debian package specific variables -if ( DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) - set ( CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE} ) -else() - set ( CPACK_DEBIAN_PACKAGE_RELEASE "local" ) -endif() -message ( "Using CPACK_DEBIAN_PACKAGE_RELEASE 
${CPACK_DEBIAN_PACKAGE_RELEASE}" ) -set ( CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT" ) - -set ( CPACK_DEBIAN_RUNTIME_PACKAGE_NAME "${ROCPROFILER_NAME}" ) -set ( CPACK_DEBIAN_RUNTIME_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-core" ) -set ( CPACK_DEBIAN_DEV_PACKAGE_NAME "${ROCPROFILER_NAME}-dev" ) -set ( CPACK_DEBIAN_DEV_PACKAGE_DEPENDS "${ROCPROFILER_NAME}, hsa-rocr-dev, rocm-core" ) -## Process the Debian install/remove scripts to update the CPACK variables -configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst.in DEBIAN/postinst @ONLY ) -configure_file ( ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm.in DEBIAN/prerm @ONLY ) -set ( CPACK_DEBIAN_RUNTIME_PACKAGE_CONTROL_EXTRA "DEBIAN/postinst;DEBIAN/prerm" ) - -## RPM package specific variables -if ( DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE} ) - set ( CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE} ) -else() - set ( CPACK_RPM_PACKAGE_RELEASE "local" ) -endif() -message ( "Using CPACK_RPM_PACKAGE_RELEASE ${CPACK_RPM_PACKAGE_RELEASE}" ) + if(DEFINED ENV{ROCM_LIBPATCH_VERSION}) + set(CPACK_PACKAGE_VERSION + "${CPACK_PACKAGE_VERSION}.$ENV{ROCM_LIBPATCH_VERSION}") + message("Using CPACK_PACKAGE_VERSION ${CPACK_PACKAGE_VERSION}") + endif() -set( CPACK_RPM_PACKAGE_LICENSE "MIT" ) + # Install license file + install( + FILES ${CPACK_RESOURCE_FILE_LICENSE} + DESTINATION ${CMAKE_INSTALL_DOCDIR} + COMPONENT runtime) -## 'dist' breaks manual builds on debian systems due to empty Provides -execute_process( COMMAND rpm --eval %{?dist} - RESULT_VARIABLE PROC_RESULT - OUTPUT_VARIABLE EVAL_RESULT - OUTPUT_STRIP_TRAILING_WHITESPACE ) -message("RESULT_VARIABLE ${PROC_RESULT} OUTPUT_VARIABLE: ${EVAL_RESULT}") + # Debian package specific variables + if(DEFINED ENV{CPACK_DEBIAN_PACKAGE_RELEASE}) + set(CPACK_DEBIAN_PACKAGE_RELEASE $ENV{CPACK_DEBIAN_PACKAGE_RELEASE}) + else() + set(CPACK_DEBIAN_PACKAGE_RELEASE "local") + endif() -if ( PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "" ) - string ( APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}" ) 
-endif() -set ( CPACK_RPM_FILE_NAME "RPM-DEFAULT" ) - -set ( CPACK_RPM_RUNTIME_PACKAGE_NAME "${ROCPROFILER_NAME}" ) -set ( CPACK_RPM_RUNTIME_PACKAGE_REQUIRES "hsa-rocr-devel, rocm-core" ) -set ( CPACK_RPM_DEV_PACKAGE_NAME "${ROCPROFILER_NAME}-devel" ) -set ( CPACK_RPM_DEV_PACKAGE_REQUIRES "${ROCPROFILER_NAME}, hsa-rocr-devel, rocm-core" ) -set ( CPACK_RPM_DEV_PACKAGE_PROVIDES "${ROCPROFILER_NAME}-dev" ) -set ( CPACK_RPM_DEV_PACKAGE_OBSOLETES "${ROCPROFILER_NAME}-dev" ) -## Process the Rpm install/remove scripts to update the CPACK variables -configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/post.in" RPM/post @ONLY ) -configure_file ( "${CMAKE_CURRENT_SOURCE_DIR}/RPM/postun.in" RPM/postun @ONLY ) - -set ( CPACK_RPM_RUNTIME_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/post" ) -set ( CPACK_RPM_RUNTIME_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_BINARY_DIR}/RPM/postun" ) -# Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake -if(NOT ROCM_DEP_ROCMCORE) - string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_RUNTIME_PACKAGE_REQUIRES ${CPACK_RPM_RUNTIME_PACKAGE_REQUIRES}) - string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_DEV_PACKAGE_REQUIRES ${CPACK_RPM_DEV_PACKAGE_REQUIRES}) - string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_RUNTIME_PACKAGE_DEPENDS ${CPACK_DEBIAN_RUNTIME_PACKAGE_DEPENDS}) - string(REGEX REPLACE ",? 
?rocm-core" "" CPACK_DEBIAN_DEV_PACKAGE_DEPENDS ${CPACK_DEBIAN_DEV_PACKAGE_DEPENDS}) + message("Using CPACK_DEBIAN_PACKAGE_RELEASE ${CPACK_DEBIAN_PACKAGE_RELEASE}") + set(CPACK_DEB_COMPONENT_INSTALL ON) + set(CPACK_DEBIAN_FILE_NAME "DEB-DEFAULT") + set(CPACK_DEBIAN_RUNTIME_PACKAGE_NAME "${PROJECT_NAME}") + set(CPACK_DEBIAN_RUNTIME_PACKAGE_DEPENDS "hsa-rocr-dev, rocm-core") + set(CPACK_DEBIAN_DEV_PACKAGE_NAME "${PROJECT_NAME}-dev") + set(CPACK_DEBIAN_DEV_PACKAGE_DEPENDS + "${PROJECT_NAME}, hsa-rocr-dev, rocm-core") + set(CPACK_DEBIAN_TESTS_PACKAGE_NAME "${PROJECT_NAME}-tests") + set(CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS + "${PROJECT_NAME}-dev, hsa-rocr-dev, rocm-core") + set(CPACK_DEBIAN_SAMPLES_PACKAGE_NAME "${PROJECT_NAME}-samples") + set(CPACK_DEBIAN_SAMPLES_PACKAGE_DEPENDS + "${PROJECT_NAME}-dev, hsa-rocr-dev, rocm-core") + set(CPACK_DEBIAN_DOCS_PACKAGE_NAME "${PROJECT_NAME}-docs") + set(CPACK_DEBIAN_DOCS_PACKAGE_DEPENDS + "${PROJECT_NAME}-dev, hsa-rocr-dev, rocm-core") + set(CPACK_DEBIAN_PLUGINS_PACKAGE_NAME "${PROJECT_NAME}-plugins") + set(CPACK_DEBIAN_PLUGINS_PACKAGE_DEPENDS + "${PROJECT_NAME}, hsa-rocr-dev, rocm-core") + + # RPM package specific variables + if(DEFINED ENV{CPACK_RPM_PACKAGE_RELEASE}) + set(CPACK_RPM_PACKAGE_RELEASE $ENV{CPACK_RPM_PACKAGE_RELEASE}) + else() + set(CPACK_RPM_PACKAGE_RELEASE "local") + endif() + + message("Using CPACK_RPM_PACKAGE_RELEASE ${CPACK_RPM_PACKAGE_RELEASE}") + + set(CPACK_RPM_PACKAGE_LICENSE "MIT") + + # 'dist' breaks manual builds on debian systems due to empty Provides + execute_process( + COMMAND rpm --eval %{?dist} + RESULT_VARIABLE PROC_RESULT + OUTPUT_VARIABLE EVAL_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE) + message("RESULT_VARIABLE ${PROC_RESULT} OUTPUT_VARIABLE: ${EVAL_RESULT}") + + if(PROC_RESULT EQUAL "0" AND NOT EVAL_RESULT STREQUAL "") + string(APPEND CPACK_RPM_PACKAGE_RELEASE "%{?dist}") + endif() + + set(CPACK_RPM_COMPONENT_INSTALL ON) + set(CPACK_RPM_FILE_NAME "RPM-DEFAULT") + 
set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${PROJECT_NAME}") + set(CPACK_RPM_RUNTIME_PACKAGE_REQUIRES "hsa-rocr-dev, rocm-core") + set(CPACK_RPM_DEV_PACKAGE_NAME "${PROJECT_NAME}-devel") + set(CPACK_RPM_DEV_PACKAGE_REQUIRES "${PROJECT_NAME}, hsa-rocr-dev, rocm-core") + set(CPACK_RPM_DEV_PACKAGE_PROVIDES "${PROJECT_NAME}-dev") + set(CPACK_RPM_DEV_PACKAGE_OBSOLETES "${PROJECT_NAME}-dev") + set(CPACK_RPM_TESTS_PACKAGE_NAME "${PROJECT_NAME}-tests") + set(CPACK_RPM_TESTS_PACKAGE_REQUIRES + "${PROJECT_NAME}-devel, hsa-rocr-dev, rocm-core") + set(CPACK_RPM_SAMPLES_PACKAGE_NAME "${PROJECT_NAME}-samples") + set(CPACK_RPM_SAMPLES_PACKAGE_REQUIRES + "${PROJECT_NAME}-devel, hsa-rocr-dev, rocm-core") + set(CPACK_RPM_DOCS_PACKAGE_NAME "${PROJECT_NAME}-docs") + set(CPACK_RPM_DOCS_PACKAGE_REQUIRES + "${PROJECT_NAME}-devel, hsa-rocr-dev, rocm-core") + set(CPACK_RPM_PLUGINS_PACKAGE_NAME "${PROJECT_NAME}-plugins") + set(CPACK_RPM_PLUGINS_PACKAGE_REQUIRES + "${PROJECT_NAME}, hsa-rocr-dev, rocm-core") + message("CPACK_RPM_PACKAGE_RELEASE: ${CPACK_RPM_PACKAGE_RELEASE}") + + # Remove dependency on rocm-core if -DROCM_DEP_ROCMCORE=ON not given to cmake + if(NOT ROCM_DEP_ROCMCORE) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_RUNTIME_PACKAGE_REQUIRES + ${CPACK_RPM_RUNTIME_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_DEV_PACKAGE_REQUIRES + ${CPACK_RPM_DEV_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_TESTS_PACKAGE_REQUIRES + ${CPACK_RPM_TESTS_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_SAMPLES_PACKAGE_REQUIRES + ${CPACK_RPM_SAMPLES_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_DOCS_PACKAGE_REQUIRES + ${CPACK_RPM_DOCS_PACKAGE_REQUIRES}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_RPM_PLUGINS_PACKAGE_REQUIRES + ${CPACK_RPM_PLUGINS_PACKAGE_REQUIRES}) + string(REGEX + REPLACE ",? 
?rocm-core" "" CPACK_DEBIAN_RUNTIME_PACKAGE_DEPENDS + ${CPACK_DEBIAN_RUNTIME_PACKAGE_DEPENDS}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_DEV_PACKAGE_DEPENDS + ${CPACK_DEBIAN_DEV_PACKAGE_DEPENDS}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS + ${CPACK_DEBIAN_TESTS_PACKAGE_DEPENDS}) + string(REGEX + REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_SAMPLES_PACKAGE_DEPENDS + ${CPACK_DEBIAN_SAMPLES_PACKAGE_DEPENDS}) + string(REGEX REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_DOCS_PACKAGE_DEPENDS + ${CPACK_DEBIAN_DOCS_PACKAGE_DEPENDS}) + string(REGEX + REPLACE ",? ?rocm-core" "" CPACK_DEBIAN_PLUGINS_PACKAGE_DEPENDS + ${CPACK_DEBIAN_PLUGINS_PACKAGE_DEPENDS}) + endif() + + set(CPACK_COMPONENTS_ALL runtime dev tests docs plugins samples) + include(CPack) + + cpack_add_component( + runtime + DISPLAY_NAME "Runtime" + DESCRIPTION "Dynamic libraries for the ROCProfiler") + + cpack_add_component( + dev + DISPLAY_NAME "Development" + DESCRIPTION "Development needed header files for ROCProfiler" + DEPENDS runtime) + + cpack_add_component( + plugins + DISPLAY_NAME "ROCProfile Plugins" + DESCRIPTION "Plugins for handling ROCProfiler data output" + DEPENDS runtime) + + cpack_add_component( + tests + DISPLAY_NAME "Tests" + DESCRIPTION "Tests for the ROCProfiler" + DEPENDS dev) + + cpack_add_component( + samples + DISPLAY_NAME "Samples" + DESCRIPTION "Samples for the ROCProfiler" + DEPENDS dev) + + cpack_add_component( + docs + DISPLAY_NAME "Documentation" + DESCRIPTION "Documentation for the ROCProfiler API" + DEPENDS dev) endif() -include ( CPack ) +find_package(Doxygen) + +if(DOXYGEN_FOUND) + # # Set input and output files + set(DOXYGEN_IN ${CMAKE_CURRENT_SOURCE_DIR}/doc/Doxyfile.in) + set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) + + # # Request to configure the file + configure_file(${DOXYGEN_IN} ${DOXYGEN_OUT} @ONLY) + + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/doc/html/index.html + 
${CMAKE_CURRENT_BINARY_DIR}/doc/latex/refman.pdf + COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT} + COMMAND make -C ${CMAKE_CURRENT_BINARY_DIR}/doc/latex pdf + MAIN_DEPENDENCY ${DOXYGEN_OUT} + ${DOXYGEN_IN} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler.h + ${CMAKE_CURRENT_SOURCE_DIR}/inc/rocprofiler_plugin.h + COMMENT "Generating documentation") + + add_custom_target( + doc DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/doc/html/index.html + ${CMAKE_CURRENT_BINARY_DIR}/doc/latex/refman.pdf) + + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/doc/latex/refman.pdf" + DESTINATION ${CMAKE_INSTALL_DOCDIR} + RENAME "${PROJECT_NAME}.pdf" + OPTIONAL + COMPONENT docs) + + install( + DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/doc/html/" + DESTINATION ${CMAKE_INSTALL_DATADIR}/html/${PROJECT_NAME} + OPTIONAL + COMPONENT docs) +endif() diff --git a/bin/rocprofv2 b/bin/rocprofv2 new file mode 100755 index 00000000..d1fedd0c --- /dev/null +++ b/bin/rocprofv2 @@ -0,0 +1,296 @@ +#!/bin/bash + +ROCPROFV2_DIR=$(dirname -- $(realpath ${BASH_SOURCE[0]})); +ROCM_DIR=$(dirname -- ${ROCPROFV2_DIR}) +RUN_FROM_BUILD=0 +if [[ $ROCPROFV2_DIR == *"/build"* ]]; then + RUN_FROM_BUILD=1 +elif [[ $ROCPROFV2_DIR == *"/rocprofiler"* ]]; then + RUN_FROM_BUILD=1 + ROCM_DIR=$ROCPROFV2_DIR +fi + +usage() { + echo -e "ROCProfilerV2 Run Script Usage:" + echo -e "-h | --help For showing this message" + echo -e "--list-counters For showing all available counters for the current GPUs" + if [ $RUN_FROM_BUILD == 1 ]; then + echo -e "-b | --build For compiling" + echo -e "-cb | --clean-build For full clean build" + echo -e "-t | --test For Running the tests" + echo -e "-ct | --clean-build-test For Running the tests after a clean build" + echo -e "-mt | --mem-test For Running the Memory Leak tests. 
This run requires building using -acb | --asan-clean-build option" + echo -e "-acb | --asan-clean-build For compiling with ASAN library attached" + echo -e "--install For installing ROCProfilerV2 without clean build in the default installation folder (review build.sh to know more about the default paths)" + echo -e "--clean-install For installing ROCProfilerV2 with new clean build in the default installation folder (review build.sh to know more about the default paths)" + fi + echo -e "--hip-api For Collecting HIP API Traces" + echo -e "--hip-activity For Collecting HSA API Activities Traces" + echo -e "--hsa-api For Collecting HIP API Traces" + echo -e "--hsa-activity For Collecting HSA API Activities Traces" + echo -e "--roctx-trace For Collecting ROCTx Traces" + echo -e "--kernel-trace For Collecting Kernel dispatch Traces" + echo -e "--sys-trace For Collecting HIP and HSA APIs and their Activities Traces along ROCTX and Kernel Dispatch traces" + echo -e "--plugin PLUGIN_NAME For enabling a plugin (file/perfetto/att)" + echo -e "-i | --input For adding counters file path (every line in the text file represents a counter)" + echo -e "-o | --output-file For the output file name" + echo -e "-d | --output-directory For adding output path where the output files will be saved" + echo -e "-fi | --flush-interval For adding a flush interval in milliseconds, every \"flush interval\" the buffers will be flushed" + # echo -e "\n###ATT Plugin options: ###" + # if [ $RUN_FROM_BUILD == 1 ]; then + # ATT_PATH=$ROCPROFV2_DIR/build/plugin/att/att/att.py + # else + # ATT_PATH=$ROCPROFV2_DIR/../libexec/rocprofiler/att/att.py + # fi + # eval "python3 $ATT_PATH --help" + exit 1 +} + +if [ -z "$1" ] ; then + usage + exit 1 +fi + +while [ 1 ] ; do + if [[ "$1" = "-h" || "$1" = "--help" ]] ; then + usage + exit 1 + elif [[ "$1" = "-b" || "$1" = "--build" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + TO_CLEAN=no ./build.sh + exit 1 + fi + elif [[ "$1" = "-acb" || "$1" = 
"--asan-clean-build" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + ASAN=yes TO_CLEAN=yes ./build.sh + exit 1 + fi + elif [[ "$1" = "-cb" || "$1" = "--clean-build" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + TO_CLEAN=yes ./build.sh + exit 1 + fi + elif [[ "$1" = "-t" || "$1" = "--test" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + export ROCPROFILER_METRICS_PATH=$ROCM_DIR/build/counters/derived_counters.xml + TO_CLEAN=no $ROCM_DIR/build.sh + pushd build + ./run_tests.sh + exit 1 + fi + elif [[ "$1" = "-mt" || "$1" = "--mem-test" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + ASAN=yes TO_CLEAN=yes ./build.sh + ./tests/memorytests/run_asan_tests.sh $ROCM_DIR/build/tests/featuretests/profiler/gtests/apps/hip_vectoradd $ROCM_DIR/build/memleaks.log + exit 1 + fi + elif [[ "$1" = "-ct" || "$1" = "--clean-build-test" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + TO_CLEAN=yes $ROCM_DIR/build.sh + pushd build + ./run_tests.sh + exit 1 + fi + elif [[ "$1" = "--install" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + TO_CLEAN=no $ROCM_DIR/build.sh + pushd build + make install + exit 1 + fi + elif [[ "$1" = "--clean-install" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + TO_CLEAN=yes $ROCM_DIR/build.sh + pushd build + make install + exit 1 + fi + elif [[ "$1" = "--list-counters" ]] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + export ROCPROFILER_METRICS_PATH=$ROCM_DIR/build/counters/derived_counters.xml + eval $ROCM_DIR/build/src/tools/ctrl + else + export ROCPROFILER_METRICS_PATH=$ROCPROFV2_DIR/../libexec/rocprofiler/counters/derived_counters.xml + export LD_LIBRARY_PATH=$ROCPROFV2_DIR/../lib:$LD_LIBRARY_PATH + export LD_PRELOAD=$ROCPROFV2_DIR/../lib/librocprofiler_tool.so + eval $ROCPROFV2_DIR/../libexec/rocprofiler/ctrl + fi + exit 1 + elif [[ "$1" = "-i" || "$1" = "--input" ]] ; then + if [ $2 ] && [ -n $2 ] && [ -r $2 ] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + export ROCPROFILER_METRICS_PATH=$ROCM_DIR/build/counters/derived_counters.xml + else + export 
ROCPROFILER_METRICS_PATH=$ROCPROFV2_DIR/../libexec/rocprofiler/counters/derived_counters.xml + fi + export COUNTERS_PATH=$2 + else + echo -e "Error: \"$2\" doesn't exist!" + usage + exit 1 + fi + shift + shift + elif [[ "$1" = "-o" || "$1" = "--output-file-name" ]] ; then + if [ $2 ] ; then + export OUT_FILE_NAME=$2 + else + usage + exit 1 + fi + shift + shift + elif [[ "$1" = "-d" || "$1" = "--output-directory" ]] ; then + if [ $2 ] ; then + mkdir -p $2 + export OUTPUT_PATH=$2 + OUTPUT_PATH_INTERNAL=$2 + else + usage + exit 1 + fi + shift + shift + elif [[ "$1" = "-fi" || "$1" = "--flush-interval" ]] ; then + if [ $2 ] && [ $2 -gt 0 ] ; then + export ROCPROFILER_FLUSH_INTERVAL=$2 + else + echo -e "Wrong input \"$2\" for flush interval, it needs to be integer greater than zero!" + usage + exit 1 + fi + shift + shift + elif [ "$1" = "--hip-api" ] ; then + export ROCPROFILER_HIP_API_TRACE=1 + shift + elif [ "$1" = "--hip-activity" ] ; then + export ROCPROFILER_HIP_API_TRACE=1 + export ROCPROFILER_HIP_ACTIVITY_TRACE=1 + shift + elif [ "$1" = "--hsa-api" ] ; then + export ROCPROFILER_HSA_API_TRACE=1 + shift + elif [ "$1" = "--hsa-activity" ] ; then + export ROCPROFILER_HSA_API_TRACE=1 + export ROCPROFILER_HSA_ACTIVITY_TRACE=1 + shift + elif [ "$1" = "--roctx-trace" ] ; then + export ROCPROFILER_ROCTX_TRACE=1 + shift + elif [ "$1" = "--kernel-trace" ] ; then + export ROCPROFILER_KERNEL_TRACE=1 + shift + elif [ "$1" = "--sys-trace" ] ; then + export ROCPROFILER_HIP_API_TRACE=1 + export ROCPROFILER_HIP_ACTIVITY_TRACE=1 + export ROCPROFILER_HSA_API_TRACE=1 + export ROCPROFILER_HSA_ACTIVITY_TRACE=1 + export ROCPROFILER_ROCTX_TRACE=1 + export ROCPROFILER_KERNEL_TRACE=1 + shift + elif [ "$1" = "--amd-sys" ] ; then + export ROCPROFILER_ENABLE_AMDSYS=$2 + shift + shift + elif [ "$1" = "--plugin" ] ; then + if [ -n $2 ] ; then + PLUGIN=$2 + if [ $RUN_FROM_BUILD == 1 ]; then + export ROCPROFILER_PLUGIN_LIB=lib${PLUGIN}_plugin.so + else + export 
ROCPROFILER_PLUGIN_LIB=rocprofiler/lib${PLUGIN}_plugin.so + fi + else + echo -e "Wrong input \"$2\" for plugin!" + usage + exit 1 + fi + if [ "$2" = "att" ] ; then + if [ $RUN_FROM_BUILD == 1 ]; then + ATT_PATH=$ROCPROFV2_DIR/build/plugin/att/att/att.py + else + ATT_PATH=$ROCPROFV2_DIR/../libexec/rocprofiler/att/att.py + fi + ATT_ARGV=$3 + shift + + ATT_OPTIONS="Not done" + while [ "$ATT_OPTIONS" = "Not done" ]; do + if [[ "$3" = "--trace_file" ]]; then + ATT_ARGV="$ATT_ARGV $3 \"$4\"" + shift + shift + elif [[ "$3" = "--genasm" || "$3" = "--target_cu" || "$3" = "-o" || "$3" == "-k" || "$3" == "--att_kernel" ]]; then + ATT_ARGV="$ATT_ARGV $3 $4" + shift + shift + else + ATT_OPTIONS="Done" + fi + done + if [ $RUN_FROM_BUILD == 1 ]; then + ATT_PATH=$ROCPROFV2_DIR/build/plugin/att/att/att.py + else + ATT_PATH=$ROCPROFV2_DIR/../libexec/rocprofiler/att/att.py + fi + fi + shift + shift + elif [[ "$1" = "-"* || "$1" = "--"* ]] ; then + echo -e "Wrong option \"$1\", Please use the following options:\n" + usage + exit 1 + else + break + fi +done + +PMC_LINES=() +if [ -n "$COUNTERS_PATH" ]; then + input=$COUNTERS_PATH + while IFS= read -r line || [[ -n "$line" ]]; do + # if in att mode, only add the first line + if [[ ! -n "$PMC_LINES" ]] || [[ ! -n "$ATT_ARGV" ]]; then + PMC_LINES+=( "$line" ) + fi + done < $input +fi + +if [ -n "$PMC_LINES" ]; then + COUNTER=1 + for i in ${!PMC_LINES[@]}; do + export ROCPROFILER_COUNTERS="${PMC_LINES[$i]}" + if [ -n "$OUTPUT_PATH" ]; then + if [ ! 
-n "$ATT_ARGV" ]; then + FINAL_PATH="$OUTPUT_PATH_INTERNAL/pmc_$COUNTER" + else + FINAL_PATH="$OUTPUT_PATH" + fi + echo -e "\nThe output path for the following counters: $FINAL_PATH" + mkdir -p $FINAL_PATH + echo $ROCPROFILER_COUNTERS > $FINAL_PATH/pmc.txt + export OUTPUT_PATH=$FINAL_PATH + let COUNTER=COUNTER+1 + fi + if [ $RUN_FROM_BUILD == 1 ]; then + LD_PRELOAD=$LD_PRELOAD:$ROCM_DIR/build/librocprofiler_tool.so $* + else + LD_PRELOAD=$LD_PRELOAD:$ROCM_DIR/lib/librocprofiler_tool.so $* + fi + done +elif [ ! -n "$ATT_ARGV" ]; then + if [ $RUN_FROM_BUILD == 1 ]; then + LD_PRELOAD=$LD_PRELOAD:$ROCM_DIR/build/librocprofiler_tool.so $* + else + LD_PRELOAD=$LD_PRELOAD:$ROCM_DIR/lib/librocprofiler_tool.so $* + fi +fi + +if [ -n "$ATT_PATH" ]; then + if [ -n "$ATT_ARGV" ]; then + eval "python3 $ATT_PATH $ATT_ARGV" + elif [ ! -n "$PMC_LINES" ]; then + echo "ATT File is required!" + fi +fi diff --git a/build.sh b/build.sh index f4bea3a7..eb7bfd8d 100755 --- a/build.sh +++ b/build.sh @@ -22,19 +22,54 @@ # IN THE SOFTWARE. 
################################################################################ -SRC_DIR=`dirname $0` -TO_CLEAN=yes +SRC_DIR=$(dirname "$0") COMPONENT="rocprofiler" ROCM_PATH="${ROCM_PATH:=/opt/rocm}" LD_RUNPATH_FLAG=" -Wl,--enable-new-dtags -Wl,--rpath,$ROCM_PATH/lib:$ROCM_PATH/lib64" +usage() { + echo -e "ROCProfiler Build Script Usage:" + echo -e "\nTo run ./build.sh PARAMs, PARAMs can be the following:" + echo -e "-h | --help For showing this message" + echo -e "-b | --build For compiling" + echo -e "-cb | --clean-build For full clean build" + echo -e "-acb | --asan-clean-build For compiling with ASAN library attached" + exit 1 +} + +while [ 1 ] ; do + if [[ "$1" = "-h" || "$1" = "--help" ]] ; then + usage + exit 1 + elif [[ "$1" = "-b" || "$1" = "--build" ]] ; then + TO_CLEAN=no + shift + elif [[ "$1" = "-acb" || "$1" = "--asan-clean-build" ]] ; then + ASAN=True TO_CLEAN=yes + shift + elif [[ "$1" = "-cb" || "$1" = "--clean-build" ]] ; then + TO_CLEAN=yes + shift + elif [[ "$1" = "-"* || "$1" = "--"* ]] ; then + echo -e "Wrong option \"$1\", Please use the following options:\n" + usage + exit 1 + else + break + fi +done + +umask 022 + if [ -z "$ROCPROFILER_ROOT" ]; then ROCPROFILER_ROOT=$SRC_DIR; fi -if [ -z "$BUILD_DIR" ] ; then BUILD_DIR=$SRC_DIR/build; fi -if [ -z "$BUILD_TYPE" ] ; then BUILD_TYPE="release"; fi +if [ -z "$BUILD_DIR" ] ; then BUILD_DIR=build; fi +if [ -z "$BUILD_TYPE" ] ; then BUILD_TYPE="RelWithDebInfo"; fi if [ -z "$PACKAGE_ROOT" ] ; then PACKAGE_ROOT=$ROCM_PATH; fi -if [ -z "$PACKAGE_PREFIX" ] ; then PACKAGE_PREFIX="${ROCM_PATH}/${COMPONENT}"; fi -if [ -z "$PREFIX_PATH" ] ; then PREFIX_PATH="${ROCM_PATH}/include/hsa:${ROCM_PATH}"; fi +if [ -z "$PREFIX_PATH" ] ; then PREFIX_PATH=$PACKAGE_ROOT; fi +if [ -z "$HIP_VDI" ] ; then HIP_VDI=0; fi if [ -n "$ROCM_RPATH" ] ; then LD_RUNPATH_FLAG=" -Wl,--enable-new-dtags -Wl,--rpath,${ROCM_RPATH}"; fi +if [ -z "$TO_CLEAN" ] ; then TO_CLEAN=yes; fi +if [ -z "$ASAN" ] ; then ASAN=False; fi
ROCPROFILER_ROOT=$(cd $ROCPROFILER_ROOT && echo $PWD) @@ -43,15 +78,19 @@ mkdir -p $BUILD_DIR pushd $BUILD_DIR cmake \ - -DCMAKE_MODULE_PATH=$ROCPROFILER_ROOT/cmake_modules \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_PREFIX_PATH="${PREFIX_PATH}" \ - -DCMAKE_INSTALL_PREFIX=$PACKAGE_ROOT \ - -DCPACK_PACKAGING_INSTALL_PREFIX=$PACKAGE_PREFIX \ - -DCPACK_GENERATOR="${CPACKGEN:-"DEB;RPM"}" \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=TRUE \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-'RelWithDebInfo'} \ + -DCMAKE_MODULE_PATH=$ROCM_PATH/hip/cmake \ + -DCMAKE_PREFIX_PATH="$PREFIX_PATH" \ + -DCMAKE_INSTALL_PREFIX="$PACKAGE_ROOT" \ -DCMAKE_SHARED_LINKER_FLAGS="$LD_RUNPATH_FLAG" \ + -DCPACK_PACKAGING_INSTALL_PREFIX=$PACKAGE_ROOT \ + -DCPACK_GENERATOR=${CPACKGEN:-'DEB;RPM'} \ + -DCMAKE_INSTALL_RPATH=${ROCM_RPATH} \ + -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=FALSE \ $ROCPROFILER_ROOT make -j -make mytest -make package +make -j package + +exit 0 diff --git a/cmake_modules/FindLibDw.cmake b/cmake_modules/FindLibDw.cmake new file mode 100644 index 00000000..23eefe09 --- /dev/null +++ b/cmake_modules/FindLibDw.cmake @@ -0,0 +1,28 @@ +# Try to find LIBDW +# +# Once found, this will define: +# - LIBDW_FOUND - system has libdw +# - LIBDW_INCLUDES - the libdw include directory +# - LIBDW_LIBRARIES - Link these to use libdw +# - LIBDW_DEFINITIONS - Compiler switches required for using libdw +find_path(FIND_LIBDW_INCLUDES + NAMES + elfutils/libdw.h + PATHS + /usr/include + /usr/local/include) + +find_library(FIND_LIBDW_LIBRARIES + NAMES + dw + PATHS + /usr/lib + /usr/local/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(LibDw DEFAULT_MSG + FIND_LIBDW_INCLUDES FIND_LIBDW_LIBRARIES) +mark_as_advanced(FIND_LIBDW_INCLUDES FIND_LIBDW_LIBRARIES) + +set(LIBDW_INCLUDES ${FIND_LIBDW_INCLUDES}) +set(LIBDW_LIBRARIES ${FIND_LIBDW_LIBRARIES}) diff --git a/cmake_modules/FindLibElf.cmake b/cmake_modules/FindLibElf.cmake new file mode 100644 index 00000000..30081404 --- /dev/null
+++ b/cmake_modules/FindLibElf.cmake @@ -0,0 +1,30 @@ +# Try to find LIBELF +# +# Once found, this will define: +# - LIBELF_FOUND - system has libelf +# - LIBELF_INCLUDES - the libelf include directory +# - LIBELF_LIBRARIES - Link these to use libelf +# - LIBELF_DEFINITIONS - Compiler switches required for using libelf +find_path(FIND_LIBELF_INCLUDES + NAMES + libelf.h + PATHS + /usr/include + /usr/include/libelf + /usr/local/include + /usr/local/include/libelf) + +find_library(FIND_LIBELF_LIBRARIES + NAMES + elf + PATHS + /usr/lib + /usr/local/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(LibElf DEFAULT_MSG + FIND_LIBELF_INCLUDES FIND_LIBELF_LIBRARIES) +mark_as_advanced(FIND_LIBELF_INCLUDES FIND_LIBELF_LIBRARIES) + +set(LIBELF_INCLUDES ${FIND_LIBELF_INCLUDES}) +set(LIBELF_LIBRARIES ${FIND_LIBELF_LIBRARIES}) diff --git a/cmake_modules/env.cmake b/cmake_modules/env.cmake index 2e9613be..66475780 100644 --- a/cmake_modules/env.cmake +++ b/cmake_modules/env.cmake @@ -20,43 +20,14 @@ # THE SOFTWARE. ################################################################################ -## Build is not supported on Windows plaform -if ( WIN32 ) - message ( FATAL_ERROR "Windows build is not supported." ) -endif () - -## Compiler Preprocessor definitions.
-add_definitions ( -D__linux__ ) -add_definitions ( -DUNIX_OS ) -add_definitions ( -DLINUX ) -add_definitions ( -D__AMD64__ ) -add_definitions ( -D__x86_64__ ) -add_definitions ( -DLITTLEENDIAN_CPU=1 ) -add_definitions ( -DHSA_LARGE_MODEL= ) -add_definitions ( -DHSA_DEPRECATED= ) - ## Linux Compiler options -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=return-type" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-math-errno" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-threadsafe-statics" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fmerge-all-constants" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror=unused-result" ) -set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC" ) - -add_link_options ("-Bdynamic -z,neexecstack") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions") add_definitions ( -DNEW_TRACE_API=1 ) ## CLANG options -if ( "$ENV{CXX}" STREQUAL "/usr/bin/clang++" ) - set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000" ) +if("$ENV{CXX}" STREQUAL "/usr/bin/clang++") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ferror-limit=1000000") endif() ## Enable debug trace @@ -74,25 +45,6 @@ if ( DEFINED ENV{CMAKE_LD_AQLPROFILE} ) add_definitions ( -DROCP_LD_AQLPROFILE=1 ) endif() -## Make env vars -if ( NOT DEFINED CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "" ) - if ( DEFINED ENV{CMAKE_BUILD_TYPE} ) - set ( CMAKE_BUILD_TYPE $ENV{CMAKE_BUILD_TYPE} ) - endif() -endif() -if ( NOT DEFINED CMAKE_PREFIX_PATH AND DEFINED ENV{CMAKE_PREFIX_PATH} ) - set ( CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH} ) -endif() - -## Extend Compiler 
flags based on build type -string ( TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE ) -if ( "${CMAKE_BUILD_TYPE}" STREQUAL debug ) - set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb" ) - set ( CMAKE_BUILD_TYPE "debug" ) -else () - set ( CMAKE_BUILD_TYPE "release" ) -endif () - ## Find hsa-runtime find_package(hsa-runtime64 CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm PATH_SUFFIXES lib/cmake/hsa-runtime64 ) @@ -100,12 +52,13 @@ find_package(hsa-runtime64 CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX} PATHS / find_package(hsakmt CONFIG REQUIRED HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm PATH_SUFFIXES lib/cmake/hsakmt ) ## Find ROCm -find_library ( HSA_KMT_LIB "libhsakmt.so" ) -if ( "${HSA_KMT_LIB_PATH}" STREQUAL "" ) - find_library ( HSA_KMT_LIB "libhsakmt.a" ) +## TODO: Need a better method to find the ROCm path +find_path ( HSA_KMT_INC_PATH "hsakmt/hsakmt.h" ) +if ( NOT HSA_KMT_INC_PATH ) + get_target_property(HSA_KMT_INC_PATH hsakmt::hsakmt INTERFACE_INCLUDE_DIRECTORIES) endif() -get_filename_component ( HSA_KMT_LIB_PATH "${HSA_KMT_LIB}" DIRECTORY ) -get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_LIB_PATH}" DIRECTORY ) +## Include path: /opt/rocm-ver/include. Go up one level to get ROCm path +get_filename_component ( ROCM_ROOT_DIR "${HSA_KMT_INC_PATH}" DIRECTORY ) ## Basic Tool Chain Information message ( "----------Build-Type: ${CMAKE_BUILD_TYPE}" ) diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in new file mode 100644 index 00000000..a34c460c --- /dev/null +++ b/doc/Doxyfile.in @@ -0,0 +1,2447 @@ +## Copyright (c) 2018-2022 Advanced Micro Devices, Inc.
+## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal in the Software without restriction, including without limitation the +## rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +## sell copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in +## all copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +## IN THE SOFTWARE. + +# Doxyfile 1.8.11 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. 
The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "@PROJECT_NAME@" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = @PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_PATCH@ + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. 
+ +OUTPUT_DIRECTORY = @CMAKE_CURRENT_BINARY_DIR@/doc/ + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise cause +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES.
+ +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. 
+# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful if your file system doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = YES + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO.
+ +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. 
+ +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = YES + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. 
+# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibility issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only.
Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). 
+# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. 
+# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. 
+ +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. 
+ +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. 
The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = YES + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. 
Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = @CMAKE_CURRENT_SOURCE_DIR@/inc/rocprofiler.h @CMAKE_CURRENT_SOURCE_DIR@/inc/rocprofiler_plugin.h + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, +# *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. 
+ +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. 
+ +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e.
when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO.
+ +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. 
+ +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+ +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. 
+# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. 
the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. 
The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries to 1 will produce a fully collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a fully expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES.
+ +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. 
+ +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). 
+# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. 
To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. 
Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes take effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. 
However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. 
It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /DAo+>c-BeEV{o2ZP3= zbU`qZ>dKjmMRQW7W&_Qmxz^l{SM0Bi8s%Q?$Ll%AH(jA>tvL|KubvxMZ(ZPw0n#_V z1S>saCgRn{s}yuL?7b{|D>vZGET|+^vFnN-g_TL|7UrG#a^LaM(L8m-h)e~Gwz1z& zLDy391H_TRGH7VV57uy6=47rgriqhyUm1&UK=#2}JAA_|3YH+G1HyPrT7Zi-7@Gj0 zP2?~`)-iQw{l{;lL5X{~qQ$-2#Vpl>aME%~Aj;KR|EBflXrUgrbdtdXgT2d1S$fSd z(oX$cHTdnTC7*_I4{3KbUMcvT7!`^7OXAnJF@G6&|=k5dh3RGwcrcqh^uhY=j}DA^E76oSpW1N?wR zLG}RYKPLjW=pc=y60J0v7i9hqSlYLrb`)R4X>84m52dLG4PZhO%S_8=)8cXzxd(_H ze4ZQKn;5nU_`rX*>~=j96wIgP9!1W*GEY0!LD`HcMSAnv8%y}%`lGF2Itsfq?XqzDf|;)M=Dr6q1NM}$ zXHNrWbZLYQJc}*LQd;xcRTUO#O_c9zUmVj@IVAeqt|?ZM=BEn4zKV8!AfxHgPYP4% zg4|a7cU(|z8&4HrF-+B4H%cI5>pmMV2$ZU`9ljAaXhib$u{F!>P>B&pz~K7T(pWA@ z*y$&^AolqL8~e#b7E}#fd4xd|^;l38B>3Y6N1?HbrlN4Ol=CPp&~g%uC&IV=aG~s- zr0g&JntA}Gn0}44zURUGUnnc1&+E8jS*Y4)hvq>lTqvLqHNF;>A~AL89aI-at72%c zac+e7g0g?)snouBZ%Fc$>>DZc&+49dGl^m);A&v!r(PGyrj=$Rr9M|6u<;<0S+g116N-vTi0+r3(sJ zp>)>gss7b`!9hO`IK;P4r6TGt&{cdnV>o{*~A1|>tj#tI^RW@ zVE0Zh>g%G7=Envc=lHx;=*I#2GXle&)?g(M1_z`_+gcbx((Pq$K0-A<@BhuM0+^Sj znE@4*LXN-=m*6SV0iy0PQ;WM`GnCleb29*gP3a;Q?5UXu4iwy&vf>~SQeQ~(D7rHP z66|zVa&06Q9KR=}mfW;5OD%(kq{=uM?J$grCd>Wod!5v@rmZ_5U5-^4|Eq!#t@M|L z^o+7LXtKKI)0cO2pw{Mz0Qq6K{;lrZ#ap_RI)qp#Kk>KnMh>|o_&1KB=d2MLS95`O8461R~!0? 
z%H*(n!BB+JGDJwmm*LaS^2ZSjs@7U3(8A#BF?0M@9yZ?eYiK zfCmw?sVbZBrw_Po#=MEDaVNrexYJM&;5xt2%7fx^DO7VjkIp|^#W{<|>vlpw`{B9= zeeMp);(TGIk0!m>+BF6nf=OHjwzTlfm@CTZGwwX`uVe0^yPT=iUHk)ufgL0GN@JsU z6OdOss}6AXjl?u2lh0t$9}lpRF2`$yQ5MO;eG+y>iA(=?EGyrEk)BKStS5*Kf!~>y z8>GCSwi=4&1AcaB3@oiOi~_9Z@=i~OV1lE^>$c3E%5sMyjOEF;T zlot0~q&Fu&Cr;NyDB9@WNNb@;c|1~4=gW{UIwaGq-x?9Ljl8scl~r~fV1_fw7s_HI zBr9txnhggM!tIPZ9#M@_DTLE;3f)#lc$YYoERsHxc);y{`gW7ih!=PlS=wY;WJnDN zbE_aNWpXE63}37kjb>~#O$cz4KOJ8F-EiDw(}sRmo(=hHjT{OQ>D(_YXOmKs{PGygDrF~i}7n!Ya@3W-1rgsNP*)=5vklxpF8{9Nl@`9_1%2A?6 zzRmoCUqtuHDcGv-ue}yYQwyk7+ zxOvaZMe^o=z(?EN%C8FS;gcxAxk>hsn_2~| z=^AW8lA-f!y7pD9Ym_fdPrHsn3E@zrr6ri+lOu%0ZR%QwLEwxb;-w7!%kVwLf2Jx; zNe@uPWGa}__vt5qy~gCh+ygH8T{Q{=R8Z1s3&I)naYPmCLwMzgB=z?agZCL9`ViQy zk9t|qTSnGy&3iSFb}NgQ)aA@hKK*Y*S1JS-Cn@EQVyp=N_>@Od74_X*(_ih(%Oc7K z4F#JJS#A8bb71$lG4rSp5LA7anm59SZn!fXYiH5c2-5LM+?2Z7eym_dY@$A9NlIORiS;V_mN-fAL1EUZ^f#F`sid+R1=c3+$BG&R)30e@mErS1M2 z;bj6`L+aw0b%j=oe52Bb1W0*?_bAG~kzEsy!gUW!Ss}zpbD}+Mxp9_G&jr^?>f*w*K2O;2)cPj_O>b)2C6jPBssOA zXzP#ty1I%8yy}@;U~*7sUX9GDj~dv@J|rYRW;K>tebxkZp5K>o z+er)fkD-+ekeCUQvkN`}qPV%ap6ZXp8i4Sq+)sKTEbv%l|5Lr~le$fD?8h--GGBL! zhPQy4Al{A7-iHi4&mI-W+}<^vpnlp&ZC~;O{wq>qZR~YfDf;01VA>;Z z;j_ul@Nop~mRw5KX~l9aMCj0fnl!&-l9y~BzYTk;OM1JCD8W>Q8;qj7TBUt!MPwxn zUqX_Mw%G~?@KD>5_(AB-)tG;0zp|&z!8(`SY5oq0+GCA1B0q5!|4#E^@Lp9+DbFj8 zC6>-Nb?2-t7IGM71QH}`-Z>4It63dekQQN;^Gvu-Fg56V+T=-5?P!WvD;zWjkw zWtE?}s11PGCyoSeD0N7Y*|AV-6UuK%aGWtZ);Bl!vGnBMT2N|f$JK73n?}RRvYTE>eU4(&KM(S%p80REU;8j{n?xiIr>v`W7o1hd3ECggGfj)sbS~14KQV0^ zC>;Lui9H^&7S~U4W-lxG!aXFc@Wt;>$2nj}EvGXxHUgQCO`9n(!7fWZnV#=^#J8$y zU37&MQB61Fpt~2%{VbU+m9*oOo!nb{n*K~9R1Q`z1(1lF!eHq)XdNjD)e}HI>|;({ zoD0>GcWD}D3<>P7*hFlX@SGw)_qjJiA%dY_{!TB z>)wM*WTX*3CwDSTAP||K>7bW8JIs@4+6p&U@uu;_kt|iLsy3mGMn4{9=W;s!z+n0H zF;4#+8>qdu0Ltj=53=@6_pK|G5BrG;I-H{}Ohjb~CSS4{qeF+{?<7V= z1XeeYey*qkt>Gnx>+}qa{T>?Dm&_JJ{D$P`>2zU!r$L z)c?*Y%O(QXp_VBnPhz*SjrZ?AKtuX|=j&1dveE)`bnA)4Qtu86{Pt+u zG_-MUvM$=B>*Vl5`_3k=>lp7hc#Ms)_U_-#T0T?78?2*oJn`Dp;wR>vb?mtlSyav! 
z5@phy@SvBa5Ma8w34}Kk_Bj>7J2h}!^5n@r55rU8Hwf%odh|qefr876aWZF1I|7on zS6qA3GS?%lWq_?3SE=T_J24a75TJY5l5%=f?uhy56liAPzEO1!#z{O?vBLW))<(y~ zOb@n#u6|mqIN4h!vLBkHlr5!{t9w!W(7|qu3#X+7+?gvRBGD*)4y>c9d^s4w zt3bgjOqp_maC!wooeq5k=d>wxUwqpBM|#OGCtvNTqrsJ$dwTc~PeIeXocL){uf(k$ zWJ(!3P9m1>(Wp(#R*t2We{9&Y4~11PNEzhd%J=va)%EOf zt?%4)APkOshmb3_C(i7+YXznBlfuZeDA0DVB>|}*+w_hC4v-2^=z{5}!JWPq_H(}) z91cX!KhR>Mmo6s%Y9#yh{$Bia`JBxAowD=aD+!b}-$>LaW0!^gMAkx?IjvyJ_*D%O zWa&@a2YHK7f8@ovWLQN3nw8r_P7Ieg?ZMJ>%>HfY`Li`Rr6frK1uWd)Kon7*7a}4Y z%DETBQj8_bC1PTBiwSm4SvW-VhA?-W=2K?~Z#|Xl;w@m)kP5-p@4rg*UA2z|6MjC@ z{E#LKJ&~0pK8>KU)Cz`5gghSc!PoX@1P1e~Xc+2}Mq`7|#~s{z=cBi4wJYk?&d$>C z2Q7DG>47PSK+&OB9iQAh(?Nc-3KRoyCxgs+RS7Feyz84(m?&kyhiBF^*!6yWL&im~ z;jPu+c$-|y%d^2dCGx^z4J$LeU#p>Gt|7hbMhZI#;@;uf-UJ|1-ez0CXg@`%I^Zf1 zGQRqx;oBs?ZIO^4muV+7N{BCOTc~FpO8w4|C~u3Z4>;sodXcl1gl4&Ciw`BAyr2NC zWl2@U(@CItT`xFDCV^7mQ;pd&%~CTrClvs`X$$KZXFUvc#R)skSM!oDnk}#@6EXej zHI5~L>jp*(Gy2{wGGZW1(>qfOwxNOb9u{7jlbzA)*WMKJ;}Yw zeG1ErTFkSh#ZjUYThzy(-rqygjh0}t26xbtMr12Nxi<6V_yEbOJ?V*==&gv3b1zS2 zX|Xip`Geyp0d$J&C0KLC)|Y*!Onr}9_dt0Wm?n>^6t_7>IPoYM7{*Sx{IowCqL&>K z_ER1y!M<`ZxiTTFFbakgbfZbf!9alZ8vAY@bB`0s5m)uSN`3a4)Vx)cdh?om+q3lY zt(FGq)xvvl9o;=(T44d>XTXgf}W}!!*u8X)8{9wj6fy-5MCmNfUH^8GlpnEau~)a=3uWGxW=;1MPBo{&Y%1ZPUlM$w_s?6+NG!CdUZqtl z;7<;r-rR33s*J1cn%-mtYbUcDmK&wN)h7+sS4cnsXkG%n%qtrItg;~)1HTY$L1>=K zagP-6HrmBUIK@*=9N_1nwW|=YU4KD3RV8MIf^TkM5X-12R$4UBy|EPdNB)YXA?&PdNFGQXA=<m3a~K8e2mqdY ze_#P3fcSJ232i6{5D4IaL_gq!TO|OKdh%(wd|C*1e-Qyi5gaKAHGH@9AYv@#vpX#S z_F!;;#AKASKU_Ek`+&|N0(}qx>}c2EPJJq&etUmHB%q)|pWlSQuU=xT8!A$gJAR9*iy_z7vN~li&2Gc?Q zrvido1~nEb1kj+rOvQ6GtZB9?g6rgXIywaqEYasGev@==l)w!)`~v>4t7rjF!tY<* zjUfcr*S^vECwo*TvHoqIe5^@71`OndJ{vd%76B9rN-{DiAb}kK1-tN~`8psmzS{Y{ z**()~7(m}z2{nISQy6oAX}rh}qDSEX-U5K;E^wCqJAT|hZNvZp0B#Hr*!mEz{{I?A zv$2fAH-DN&vwOIwu=N0j=iop9em=i0W)TMIxJZC+AMD?6h|m=kROwjd4?mb6dUB$o z#q9m^0_sQv1e9PP{sIJy{_X7)1p9d&&;9lMKHuWiV1sy&@!wTC(|kWWjc539_J4>F z@cVfk&hZ$hLV(+UWS_+O6bOMG{=ak1f4Gl-Mjv%Ff7Ai=`SpWZ}jo#{%%$Tf*36o1UU3DXaPFOA-dnva6^?s 
zJcMta_07YN^r}CmY`>-L>@k2U!$byrc?}p81@`?u7>I&6jB<%6U|9XS@-bk4nrfkd z0-L``*$fmJ0oHeLuZ4p-t)`_=_Q40s7vU~{vZetD1woKvAOZDFVF1>N{~ONX(nz2X z7)Nao!wfLG7Wy5By0>>31;1v2q!0Sl`Bh;-y@U#dpPI!ZTriZW8Qh$)Lc!<)Wf8km zSLG~3ocbshn$K>S`NnF#Z&)VTMzXom-@=%k&!mXUOusYV`*PyH_%7#QcW2k;#*Hdn zXsPj_P+)xfZr=cyv=$RUaQN!*S-3y(tZ6O!ydR#Z5uXbGC zI>d!L$y1INaQ1e%$N+qQ$%S6mQWjrgVs@&z4jt3*9w#hX-8y=gy)|q?9uQ2sWW0|A zjGspxc`=Zo6LuquPIp%ZNO8eQ@xo&7te&IF8+!K$?d!D*Ud3W0DVXhT>AS_ZjO-C^ z{5L4Viu8%z zJ0KEV-5+kaDq)39tsFM5O{Gm}9+{oI_#w@?G!;VC;5KJ0lxea^66ha?J_c_`adk%; zzgD}qc=zGEM76N`qacyqFU8#VR8S;a0k;PH+#{lXWP5l*F8VaFFZ_^k;ysc#8nBvd z5{G}XHTDz!*XpxqA8%Kxfc}Djm$S)=*>3yujjKF1^y@x4y>{f)gR5<^wR;JRUe+F# z+$wCT1Z-_Eaz8l=vY=X0;(KXs6y;m5`X$L6;5EnnHJWrcOc2F<;e|a=tD1-945#h) zubq+#*fz;wCPX^2x^EutF!_0iG?oeSAK3?)Lly|nE&>rx5W=uyvmME%|C9yYJj$(R zJ-6@xl9C0Qk)C_Xz9%|-A1&*Z-WStgt+qUO|QtUtZCt{!lPx(bf*jsy- z7O(k_HbTi+d3qTa^>mm<6{R?-*CtJ*(t3H{|5j~ggBt(pEbu@r74@+>0(F_HPB(mM zJs8pY(ILqdx!ORI|1-YYBL_2jZpsr2e_W5<9$VhsHZ6l@51e2Sc26l<>*=b4V*Rtx zP;D2nj_(Ug3AIo?hm#v^vfPgJVSTwAd(E)}L*cJkP%d56(7Ee1g>xTw+N%+G>ZLBp z?lk$PS)23w*<$|zs2?x2#Ka-5c~T$5Zi@u$Ge^p)wyjiHeKM5ojCC0>_V^}`P`W}; zDTi#NGg@e3wWHUlEu%{j?g-u>OjdXB& zNx_Xcmnb`~`FH{|4|SU0%C=h?-q;*!LuHwLsjjEKZ0tU5qLjX70bB9O=^!*CD%j;m z$}j>8dV>jzeqp)1{$UZH5?6RW7}gV;uG)fr4SD71LLQ}o()68XWVdoSM;pI0tq2n< z-tO(I9z*@x|F}2IH(O-!SnnJ)Z~ZB<_@mipiN zGKt30g(4BnrdfmBU=Is0|80Rj0>)iw({S^CR;GXC>i2^uxt=#|v0?5}Gv=j#sv_+%6~QzjSreqE z9b1iJXBBp)e9CLKld1WOpZdfs8gRk{fB#$BQzC_Ys#$Ax{jSMfas+bQo(4_*;dPcS zQ|ydSmRG1XM17`_-C@)g4_`w`qJIgAEmMIu+rVH)D8kB0m1m<`EXyvipJZ&kN=5H< z>`K)lkQ-*>Kf428iH@hb0j3x-WjLNm1m&ax4VotrT2u5Yq^>dPa%@Xeh<=|@DvTb%U*=zX$`xMM@H)eglR-jxzh(A?c6 z2{J68Du?G+^=GYF@J;i*30enFR&u4)u~n}rHq6@lq$wOPZ*)I+xqfC; zFVPJ^`Q#~ol#$SQG;HzjdF5y(&3uz87Xc#t{w(pxHM;!VRL@5!k!R>U>2S`Ja5Hfk z>&pK9Z782jv1Mhd12|0JO`v~&^Gty$1OV9_UakM*`@6xqjX_JgsH0}IG*PjCK#=tU zQ>NN8dGo&V$-uX?_SHsx3niXz1|;3qar0cVB!)Zp4&hs7o&A!teBpz6vWrmd9oY7h z_rLz4QZU(TVGR8(I1qhm80@z+Miu?`kX>iON%9aU;em+K?rWGwWp4TA&b`lNb^6SdD-K8r}2bG9?XTPN#RXz|@lt85WP 
z$%#M#`}tUnq)!GSyL5s~_ZMbj!_TeJ5|%tvyxhv@S7urAt{iAhVDPs)@tJpPerAT6 zUW6B`*wQkNH7!}9%;jI{wA_9b%Ufeb(?R+9*7Z4@t(bp z&nEgDPjFnE)F|2uA`7%^zd<&R$HbFMRKt9^K7Q5lGF)+n-nq8M2i>RzWqo73KOo?9 zNsWTnMDye!+D>vkG#&fO-}C`Co6FfiRVrr_(dQr@Vsj|`z3iEj;C4mVf)5&{TU5GG z%3WnQyrip3jp|7a&6jFd_JzlOEw>4KB!c)hRbl#YLmJFUjp;gb(&uUiE%$sb?Fe^c z^@wpCFOv8Ub;dNLoNMB#(PNPnW;t(4Av?8$qLp*V^T6%Ef*G}m`bLWAL;=d>?-KmXAaAu^MQRK&{kK#7 z;@zPSIxFN>OCCe)hQksx97k97b6Q92%;(xf+SSNc@MP+@<|Gb&eM82e^ziW6 zekH=N-q0TH(cM2v8mR&n45Hj8n@o-mf}T!7qu2xOHAM){V9fe23(u?u1%~U z+VRnIP*T_8vMZt$Xiu=6bO#6C&qo3z4SGH0u2ej48=v+f222X2&QkfG4(WuA<739J zf?_EHYT3dhm<42MPJxrMGGATxs+tKA(b-6qJ`Nd|l^w6j^cM?aGx>t*$5+d+t*VA0 z1E$oJwJGE`tb3z$;wYrCK7ZZIjXe)}b(d*=qxCd;*T!Anq=yVI#R}0yrM9ve@x#~9 zO?-!Da-D!(Gt29>4;X_HZlgdzP}-ul-4eEx1z(J-f4CD;AeQeHtJ zZtX=v2=eAz(08)U0V33G-{0R!)_O=1PGR14aAl#=V!Mk@8qBM0bQbeNTveNc{o{OFA zhdZXlk8wKCoMK-95u9;qdUu5r^RHW09c?vGM79UwC3>vhrsYX9Jj>hPBGm4Ab}Z3C z2)J7&E?bFtx!#u7GIlmHb{=}&kws1-=u2wf>(_Kx(>JIzPG96qJ?@@^-G zpj|mN+x#Q?u!}18W_*5AqS8($XK2RH!44sO@(j3EKP18hYPHuxV#)2?r+}3^GT4e z{Fr{zimjG*WFicxZSY>`LZOtQfT%i>;Uw`}r>8TrT$bcyJCtNMU=dHwOo0<;23oygmkiTceb`gd1 zMr);b7Cj3Y@UMSk$PUtqGr8Q>cvi)Az38MQZ||=#c)Gqyp4wz;F8R$e4-EqUo85GnE|&c%l)W?MWmK0s6 zySOW&%eZH_zxhmra2&RkWVYaEIFnb&ikuLj%2;f@+47VVf!M>BOvA%|z(1crF4J}t zt{$0l#Wt`XBq!*sDNzHF&5;^rYF5PweNN_#tm05l{-ah-MT321<*~1&rMBbGZKX1e z_HhBD2qjH}={)OwxS-m|5s9oJ##t)71Sp4eF9O)FXoG*$o}-x2G%|5oi1VP`ZwO|? 
zaRmT6)u4KLCJWV4Q%VKmomxM76Hc3_M-ykxN*Mp~7$zm=<4e%cJ(IpKalvIGm$j+!)Uz{fDA@;R ze9t)qk1>`Czt%_JUp{Tgmbk>-|3o<@I{)@+JMuIe$%(*BWA>w|EUU|B$x?H5T8s<05*7Tj7}|;= zW-o%jwQo_uj-5cU`LEhn-CD-W z)$Wr5s;XJ_{4~YIIPsm`2q7J9%3ekAT9yj#7QA*8@xte~!_v_+EN_XKeX-WPSV}NT zg+lvL0!nnO@}wLF&F46jVDR;d_Su(CmfSoW6_1R0(aqEY$>$2753OpgGt)jU{*Xp( zB}gLdLOjqbzX6_F6}$EbG0HwE&>TO^V+20?@DDTscd#TyU+NaMRUsVx1jRAz9#O;NI=bQ2KddEQY%Hpq+ZxrPU_r7^iFFlPEQ9b-p8Qor>gv##q`!+pFV7>=VZZ z!kt#~6FISJA1^YyZ?v=i-|}s4di5Bp5LYE6#EO$67g;4HcCf7*GRnJ-6&5qCE-c7t zYMIq-&dFBY*!?r6WRA&ba6rZ1>{4OID_GPQtG3U9<8NeRn~$^!7hDnRF&$!M+s`rE zXYE#uS{=kMpW?Yn8g8I_N#3s8LNh8Ic+=x288hxK3SGSSpMTMOeKkO?^@E0nr${QZ zkS~8VP4+NTM^g!WEmYQpsmPOrhSG&q&StbA6S0$?mlEB3 zzndickc)Rk?GB31=yIA?e2m?eQ&I<6H2=)jb8|f>+muf)-KF~!F?cq5x{+-xcTcHj zwu$4(_CQR$f250PkaoF}V_CbkJck5QxaFQE##Q$By+3?iLbq@Zz~Kp{Q_dO9JNB?X zGM6ocifTQ9Dm7CV?=tY(Y>J%s7VxrH(Fg6$xL$CA5iiVt9(XS3$J0VobDtmxkKidb{dbIek3L4~J#r1gm| z7g(*m|5kWDTN9q+)z;j`s_=_F)}s(NFJwJB91m`7VHgkW#7xz^T`N6usy~;W+9J5V zp$Q=gVo_Dln+vo%q~01dsE7ml$@@KU^DQH`n0)WsiQ8s`lLu>lJ1TNO9Y4A@{_{id za}u0J8)S|=+$72RR}L&th9rd7K+(LHy3p8;Pe!-;yg z--;aRHLiAH&!CF@sC$0OPbsyM{jWs&ExUZRmdD78Xn3C38~$$=(JfLz;Yft~T{~_7 z(Gr1%G*3ucuM3REBbJ?|=O<8DuEvA$T7#aA)kcMS9NU*Vlj%-E;n;Wv4%#0h59P~w zl!&v?+Ez!~dIpOtYqF(YkMfH4SL%a#Wbz{O1o&}G03+4jB^o&cUTc%lx<8{DwU|oy z8~?*{^C#&%YK$q<7Ei*dyrEQi(s-rohe`1E*wR%aUthO*%Bj*LDn03FJd{_>f0du6 z(;rTPEo_&o(36wfLQ@^g%~|=YyAzU@M9iP)oCdPFwQC2i!xO^H$E9#ZgXp9!@Z7Gb zGRpAkBT6O17vn@R*?@KCw2|UL@z}I4Y0L`%xv&>M@iI0`nEdc8KOob5W4@)lQSKWdc`+FDe?xby{~Nkvp=bCXSjX^>o{fQ-^?#24uLhZ(osRwg z!n^-JyQ^w7bSl=aqru`lFEXrMYvFL=XtA!TS~D4Kv38!TzmBmkG~D^=@&1n;TzTH) zI=;p-k8oC6@H{JVGoPkK1_;Rq^RF%SuSpH`;|=und}3< zSswA_e1d#+@I(VU7RYu8IzWiC`dq+uVaSa z&q+&NgQ(tzsa(Q|rD+5Rh+N0o7}7p~yP)Wqy3fsVnsuJsuxEhC+XZk*Ha zrvy~)cei}%ZI|r9H*n~e8u+($?DJQ4>$hV5*Cp)tH|O*hntI>BlFHWT)fMRHFyPNu zw>}P#gP-pWh9=S8FV)uA%*4v!Z{2<`xJBAG#P}})j(Obtt_kF(<`-H}di*s^^|OVvhJW{?ey^~`VOr(F|Z47IfM-$>czg^8h&pWGG4uh7Y< zou}S}U#f-PrSzhrAz~)uv!BcC58I>z-D~~|ycpl`?^uN%&a|J}VO|VELiigX+K^td 
zIRE5mn7(&b0#=yF*U`;QGMutqzG(@)CO)PwE*@8N4MRrfD2k&W$v-p6cJ znP1eE^l!}SFT>X%i~S$q--?gh@L$clU#(xxGZT{=6H-X3UR@xLWXv60526|-C<#QWrxJiA8pq&qpcf2#bE%<#e=RSCK? zKxJAa=#kD%Y`eK{twM~`O1dbI)5Lvy7kE8TIRSwtpQc2Jg0_?VMGB-{r!?2RR*qjDD*cxGTTvtoU;B|iNsZ9E zKk*Si`#lcX@GtWeNw+7DxIm07Jx1(;bMtW)bBf$v4xf+b7BvjSaL{4>u?4%2nfTlj4Dx7S!`Yl+ zqatoLO63}^8Nl+$6BqZHKEvFKF4Hm6dJ^W&t1!e=?`S) zifLM6G9h-l07zF|(i#}-2*fHa+?Yn1!(qf5Sh1}0*W{?O>&W5;Ey$zS-0uTlX2o;& zDDD`JX5B&Z3M=rOoDt1~&l8Qs0e_102BOyzsE#HTJj^FK@g91D^ExZu)MCDT8X2{> zl#hQ0ikb(^_$(^<_995)Fc#ycf*%y9aBnu5mj zMk058E|_V$*04P>s|w;9slsprM#dPLn!Ox$EW%~+BAZ%|+=o*(i(bh3K9NHG+ic>9 z2Egv~7Nwdht98|eAowv{1ZRKziJO(_agru=HG}k~C0gj3>Cz5HmNq7qcP7s-phHm2(5_548wy)gds zauc&+xax&BZ!Ic(*YN&2-JW4vo8qDU*;Eym5#s%joW7`{@=^ggY~H5Xe4%Lo?hB-E zpCv+dzBNnjZVC+> z6JHrQk3A>}1i7TsHkZ%^FAv8e5=3LbRbRwPHaGyUu*4>O>M@-vrTBUlgNHRUFjyN2W6GMZUbMnBA)lQ-LvW!fu zN%Md79cRF{tlsA343`Vf!P*sFGAE>G&u6pZ&)Y8a?d4hGy8Clxk^fh_D8aw|Sc#f& zq0yeWceH`au59=O%!N=j;-WIuzh7ytW)R~9v}2^*@0VC z-7ZFg<~BHVeI>`FwNJ2`PzkOlo7CDl$-Ua&iI_QVg^ZIC+U-#7|m5eXEpi~ z`lF}uC_xm-yHAwvG(NlqQE({APq z%tx`G(kAUY%etP=2{o&mNovN&G*2uW^Z4iK{u#KHJ)!I)aWO$tiNM-J!4^uY?o-5Y zY+Eh{P@*i;zH%w1gy+9rGHkqJG5S$Xn;w=uzVaF^p?l7mESpTeqm6_`7A&dQNhUOY zoi<6e8P+;3DZDcl*riCZHZ3DZte#PL() zUuE>&!|RhxdYoeXifebH#?o^I#cY-Ju`Tbxj}|oAVVm)Y7+m1FFWmykz3d^Oi3G!T zB}>j;7>W7pZ`p`T&u4L1rYQT6(=-iE zl?iQM>|T9XIiM6!o|mT~D+wbC%hB!_yz9HxQT60Ltajp4h?{=_MssV>A>+6%^V?qk z6u6q_k-Gmx^CIG1Ksr{OjX+#<$EhEx?&=BB|993rsG5I+OkuSUv?l&Vc45UgX5c#Z zx4(33LL{DCpenE`kopZLZR25rDSxT@xJ5|*B<|`BIqVM}I8ATpfozM~r$mm=uH9pK zm68JuKM9xEuw^wf9RsAyQ&)X)w=QF4^PJvc`I?@tl&>tCDOYDIZ=m|ukMKGZ>U<8E zjPr-qDJGDh9yMlHz)OdeZD9d$7S0c<85!<>VNj;TUl}_#x9{pTibiqomId7r4hS0v zN0Y} z5bmA;0{9lZA#wUmW|`W36%j{b&Ts$S~wLQ zd6(?7crVB&z?+RUD#1v^Xs!MVLuwkwmjC=*+RE&E?g-SSHTTWo&b(zr{{t1v#kOm! 
zqI(g1Mz5i~QZG1D^=*^68!C=a1b7$q>si&@Q~ZmOAOZuF)e(6p(QQ16yVdj4_}Tgb z?RW;DaK(r#hsMGa9eZ9k^7^-ril^5!B7R+WcI?AOKfC`sOPaCg1JYJJT|TJt{VHep z&qBqqWxK$21N3IX5c38D*$_BOpZ~f-%Y}bgw$ctMvIK0NSfh{IyrObAG5-YLVI;5C znHR;@@%N&ZS)9#cfoCR5U12VZsCGmrqlJ4Dn_@-wI`8`uSKKVSdO}UAQ*R4|rl8!y zvehfRuEDq}?L}+PSo5r|?EuV|FetzWUWUXOlqSnPrl*yqI0+AFh!ySzRZf|lUPM5^LFxt)^r>?hFgs7K zq<*Yw;E=~ws-Xc^hHEaVq4*j)lX(3Yr0Ud|rQ#-ew0YW9eRE5`3&j%$M9U;^pVxBV zumwApI+^-w(GDgYx<~w)iQ4Wz9wMSHV`FFb1x#zI0&tP(>STG0MZ2Tj3`YzlROn-& zf(-jbc*BEL6cE?Jg@ES=%ZBUVLd^HPc2L*qzUiazyRBm3dSK$ln&)^scQet}CM0@0)KG)uCimWWm*6XTE*z-C#yvFrVdN( z$QZQADq>YexBf*T%cBBPB3yY92a?tne|2PSujykbwMSv5^eLNB<_U@pFER*k@??a_ z!k|NYtb(96PT;Xj-iQ2gxvhrtYuIz|YZ1jRt2>3%A`&a66#@^fc^(sMJrLrS5Wwm6H3k^ypTx=wX&|!XLd0Y1^rZ@;j z3gIzhehrYG?DMz-BX(lV5g7ZqFF$ABMo1Y4F+p@(6ydu@h326GrJx#u?dC~Y>xw)5 z`yE2Ic1@Mc$AB!|TwTMpt4rU)$&+4AoiUBsBHF3}7(6)N!x+NX!G^YGHFvMF822!e z3FZM4<2;+n=}A&h4kv4UjYE&n;-q@Hq_s*NT}P<=^0uS~D)o(jZwlRGDYz0M`W;6l zvJ0jbuB~X%f6Ev`ZMgBusjMGiKyzv;C5VM1-94@Yc1L%x!{QMjVnk$2yeH(k^^NG} z91npxP9c-7T!KF{plD(%5)!egRm{@hW|-`ty%$&b?<;aRW=lN|5tfBtr2@_`rT#wq zN1|~dMsJ_1LDfGs8*nw}oY4sE)2Lw!e|_8Ahf0=BQPr(=pQ97}-zF_mwR;0gk+aE2 zzJ?JZnFg9GPPFah!XrkW`7@dqQ!)h^+xU$f>4J;}bcO(gNNT;lcX#jWZM;q~ zG49-60yTss3>E4nMsT`LY ztRRX5zY?xSvC|4>_weUg@Dm6%DcL_OXVD!Z)cDAMWg#!*QJ@4(N8ViSC8I#OE=(mU zfI}5R`e*J366(@sTqwmf8rB+B>DEi0umUn}6ntk1rLQ}oj``xOGTjTpTAM$m83iH{ zNCsLC$2hIxQe*I8|8h(k(H4RNZ*8$gj9ipv{;L;1_jSo@L26${H{Qk(kXMjcEoO5k z=Wjw%N-kRl5eD-mf71aJs%bfZ&RI!(vd1mSd7Q=0OZO4LGuFvZ?HzfQ0aH8;-7E^A zDWDyyx!9Hw5L!P7(-o**BJ}g9TBchk=fV#Uc@AdYY03qVBS*&v6~z+Dmp?Q;xSw2R z-ve+F;Od>=!Zaj)w|pft*$^N~kg{x*r508L$gk4GAB*G9;hNE{x1J$+Qb$HN3X&vq zkeOG(w#S%Hpsq(T!g&7fkkZE{Q3tJ73DKRhFzSfW{7Op37PjBEE z9Kt_b^(;TM*q;ae-0x%X-X=Yc7;S=( zfBx+>+JkW9q8&f);Eo(<4Uq7Mt;ricrCIf_j|93NorX|+JInNfp$AJb4`W%FdNp1W zTA(f}U;7X98TS}NswMMI>K3s-qv``Y(n^gaW@Fi53pTRue2S&@0Mu0OL^@~L%zDS#tt$0f%J();iJ^VV-HZ)Rin&6ekF_nGK??EJ8!)a~$^&B1b=2c=@FCq2-E9;wo8 zU=^0a4GX6RQA~{Wdh(Fd4H{?DG8ZC2f?v;ZTl5D4kT;69t|_P~^xn@CZlV|Q_C@E7 
z(IhK+xG>t9#OaAOeKzKlZ45^>ou>E6a)&XlXtHcip_T-!Hv)P7z3E|vX)^5@n!ron z_m%#3Qz`B+%9;a3e2>ilna7ZSVGIRGq5NxT_eAYf&IsRx&aw6lt1^6oAvE|oI5q7X ze)}&NtlcVK<$O&$@>Fol@G%(qkgCi&C+e0 zJmQ40^^ncMbb-hpi(+^>rQO|+fKJ&#a)hVYOfS3)EqO6f8-lnN)o{*<A% z-0ff|jx_T|^Y`OwSpDKZ&>kxg$2>@op=47&fCNci)QFz4pw@Qcd8O)nAQcvo@tYD` zLAZp;oxDW<2E76@y?ZdghcUNIGAU@H+;^!>g&Hy(ZD+PhAns3pI)QB@F8JAs+25rC^ zK_+WTMwhZlt&pwxra1zEczFIG)o=|UO@4*$#@w})b62@|2BtTH@Gve`GOUW3MnICH ze01cyD~_+HgxyJMV|P#xx^NOHnSWn#UvAD*O^Jacg|o@qtd$+O9E#q%2PX|NREX#E zC51}rhUo@Q_0$(*au?=V@#FUnKOmJMm}rBPeqERGf$=Z0#8VL1SWXnXwu%#YWs-&Aj=$hut3alR*j> z>O`=I*3=L)+7a=^xQe%VXG(Q?BcbE?LI*?r3*frU#3mSdrdF;SMN$DIayc=zWww!M z43cI?oSg~5-y z*c}NC(wnLM-S;^iMUb$yZW}qCidQk*qxU1Sn zx=2?7vPpBjjj}&#+u{+P?Q{neRB5$Hqf{xexa#R5rEu{hKLJZ=pb?T0!H?)Wr4;O_ zsFWK#x|au!O?2I41+zQ7-*5W4>^6B$nt-crPj*RKkP-X=AKy|bZ$ZgkX@_M=%nj&Z zzFu)3&P;5G&`lwfi(O9@1+`3hPrX#S-lfor!0;W5jP}}Jn3nN};)Y#q$%d&oAsd+t z=hx`)i}}RSR7cZ?qo@m(;P=@UlIzzZRdA^CA4IM!>|PqOgz*_1U()d$UQ*F<@3ISz z4Aq}1DJ%`dDK_{@4vpaFDAiiDv7wPnkySC5D^=wh!blq#r5^EO7>v68)ezpm(BOS zzFB{YBpV53W%h7y!0p?;Vd*fKABwNCb@rNCUv zGsd6v(5A}eVS-{p-X|ErK@Dy$@PU8ch$%yKx5MrCJ`Be z?_YntXoC{zER}IC531a6e_DCTcYPO0iBC0j3|4o%>Jr&uGbgqot(v}C@e4wzt>#xJ zD#y-g9*~J^i_0a2vb{q_aeNIebNcp|9KS}e$?3t;0d)#JI-~y8o9*9xV38&QkXYvI z>x0r~Z_*9`7mX^jSowo5`Thp-8MJso|J;DM?S~V0QrCR7{2-gP%h`(i^fs(tsT;Qvdp2r6F2D^I-FZwIgd%VcqsR)s~hZl=u;MZwM1;|rrc@Gy4cO} zn~f?)o^DY2OM0?nW^cgk!*q^n_Y7`lf|OXfskyvGC1Re|2JkX3E8mr(;x3%2hV6wL z?jU1#T?j|C4$zbnXLikxWJ)xjR1Uz{JFcete4K7XQxa1XMh($lML^1>b=3R&PdUL3 z6PY}?@^;&6$vsvfuE5vua+#&ff2-}{EEvG86fzALK5}!-pK~kd^M^F^k2Z)UjJ%ev z=gAQYACGxtW92xkr>Xj3*BLSKA!s`6rxfRaPAOojV4AtR4R=gvD#O^XuBHGJax6CX zn39R6!Go_rHgwBU~;8Wxxc!fijKW!MWNmuFMI2%QB~4GL_>i}zuah@0ge%L2^MmE008eO+ zh|9~M>StR;CJXB6ZgpGme8|D_deSidG_(2Yr(o$!rd7w_fLh*$)k_=135E$|I5E*- zru?J3cz81XPrt?GN@p4NSCn}TybxYm$;^>vc~SCC6HW03k{*!94kpVl*6Y>*t9%wS z&e0%>o^I{uvUwq^zr&?`o@bk}suW!>WCJot2xQbw9j#+m!txXq8$kt*!yF!$3uy!X zHa_-Zyy)OrC+(fPk#eX?{V{9laeN$ysM=9Xr8lG09C;HEzB!>tY;b~kb 
zX<)r2u}bm1jGiUyBwEQ3I569a;SkNNPIH2q{{GICWO&f(y=?2Vig2ABqA%uXB^#it zhIWfxGA1T(bq4g-P+)4Abf(F}or; z-0L+lO&w;IcMMlQWvbSn$1t!JCPD8d>)tkcGR$+(@5p(19;y&!)AvL5f#n*} z5Oj2&UHn9NUK8@tAu6pfC>yfnLbpFa{LqnmVcksz_nh<@50%iZlgtKp2%}kj9d(av z_^mhcbgPxD*wb-2iiNn_AIPOHJqOUWBH24AjR2rx;fIq&&+KKKj2MCDw(K%%M%Q?^ zj3pHKugnpR%hoPWnBKmyv)`HbMjz2l4o@|K8(cxFcbAtojW!=8O?P!mn@d7ulUi`_bN9iqHh@uS3pqA0RssL~DCY}MNFyW>b#-%qF z{5QHD(ASNy+TF9rIBpKSU6sRjhWw-pe{qfJrR(MI;;LJ)CHv%0)tE=fOF>t#gCz_+ zW_4&@pkG(b;E@G3y$`Yx$xB%qQ0rqHYx~43S*aoC!XZRf6_6x76CQ&MgcGVMGIp zeL?uqMrqFZV5+1A4Y@*T6UzTB;dFUg)y~l|`fBO_PJVBKCgI{6`(3cSO!MF{-FJd|VFe23oAOZoL( zuL_H$yJpG`Vy(JAa;hfuM~+k!h8;^E@5vBzG63bYHRkwChfrAVXM}Kh`5|Ss)r|ak zaTtgcu`aEeE(+ed5lTH)AD?KFxn@%XzBC=E-(BxXz~OLy^2nLV#v!FPe>vJL%V&6c zzJsr+f7ufRUvqAH_x^sEaT;S7CPm!U6xIGZ)L-yjo_sntzvPBve4Y}ga#{1`+aWs# z{?u|tT9vx{JYd++d7-4tac@}6Tq8QT57-k|9@I@ltLm?oip7w(Qk+3NAFjd=1daNZ z#e6+aoM6UJpMiGR>mJeHl`x47_sv#xbbNi&tyzDtl>u{hFm7B#2{o>_K->n%OO>Y* z_75GN^Vtn@h`*X`w+OolrEC6O&YT+JT(75TGd&M-PUo<{u^x{FDr33Q=yNE^s&E2W z2dueEP;PVXM;x@9;THFL0JgZfOsSGpnc`&LW`9*y9In(ukj0AJFd`q9zGH=WbK%p7 zQDP0J*$~ZB?S?lCfuqN(jp4L>#AKR4|3QDD&;h~OxlTv&?aiVu`(D_=Mi49CiB|F;k+O906xY^jm&@^mlN+6pLE7-xDnG0 z6x6oTB5Ahp=Orf}A*rE97X&o3T8~}UC`NcStJRz|xLz}{xbquagZ!lz;dK=R_GDxBy?Qh#v(tgqiqbx4I@flCzgxYer0_dZdUYLV z>2UQ|^`AUh?b|1W*w{7$OS%|}P_uYa;OdELM^#>E!6Gos*{tdhe?S@4 zhgMP3+&RzfdOjs8NIzrRKQ_%5#&>Gop{{VIJ?jCWq{`cs%L;Tv7O2|gYd=?XPU>+T zer0UzUh-jR@Tcd6UROThjIAw%_|Ke!ISEzCU+2`eUHdQ9SA@EgB1)FdgD#N8-$_#P ziVY*|x{q?~&Quh-b*~iN+t)@1hy4&9OulPP3aEbfbs zMCEr~-xs5=5rn$uSV%n^Kw-CPhmiX3zuQT89?LOO_u<{6E7X!7e_C{2)c4DVodRe7 z0{iEgPXyi3q3E07uk^nnjNHaKvMyzlIweNsig^pS8*)%beKh)j>>Ls6rmUoDFOOAF#eR2NSX2+>$Utu(=cldupMZuxj3Qda@HU!F*aAkVlDKsOeLr8s`{ zsG*Pn<+Zlk1Emp!Dn^Gt_vJ;GUDn8;t#+n&g>3Ir=1&c#QNx0-vEeBwwWZ9b~?2bKYj+Y6zBT#6ViO&T8b2v}dJ6L;cOjs&t zLpt#A!Q0jg9)R>dwTLXpN4AvnYHAPzFle`Gs&JD4b;_E`YujOs5Gku45GcH8SF-A6p)RM&6SdF6^{w#M?pI7^UXm(@-OZ}F5*z= z_Ez_czj@uBKvPRjw#iNYE5{5R`iRP)REh*N1qdY&%2V+JQ;-c%(8I4qfN8OtB(gd= 
zP7WN`9HY4D5{>qu3oL@xrsc`z8^$Agfh*vijd{9HyZD68BgwzAkW|ckO!re{MTC%O zPSw)P=RC7=YVe!+a9q7e_qmiBXZvEahy6FLx4(@VMgsb)jwC)nCtc||F{dZ;9aGm( zRk&&wqTKWY{0;(K`L}Xb3p%R~T&m==JEPwOP^969_f}OmGi)~78Es`P&^Wiwbi~d_ zh-SehLm*@rksvF#`Cz4NXd%@_RROG?wlo&}I*H9EY1y6Dbu0eqTJKKrT(u&sCb&D$ z%4L}QBeY{3Gb@OI9r`+NVO09HUS#!TXjcaJlE9{ksrTntkv(FMl)a{J7v$2S=68OYR*eyz#M#W-xpo zXrGXogaxmcm$ZHJtU@pZAN}X~8pDQoJ&s7(CaB^+=t84`!uf*j3VpeO_&xA4afB{M z|EUZ6M$*-R^2vy(g|m}k?8_2W5sNnNP)ZHVmioe}F;u$UA)c?&YO8Y!Ljn+maF!~!=MfW{(X9BaUo!#Av&GV73Wc=jHX0>cy0;kmk(hQ`TqPzc z%x5&wYd*i>2Wx;yzyCETzXAR)S0EzXcACT;?5qJ?qf}GYCM+@`bWEmE8YHa$Xg$iy zl5K#|Fe$9bTG3VZ^dSYkdWK_42-;$M6m`pX!?hP*y5%{p8pcdA{ zko~4h$BUeFIDr^dDUlt)ZXxzk%5bsC{4NPS%NiOvXuNTLIH2mBTc3M#Fy!%SJA{Rw z-*GGHF_60?drIt;ZoUKkvQ7rSqw-4?)PsTdih$5<7IO zCh(nyyv&J-!V9H!n2CSJvzVujL=~-Uw(W9HBDZn&&Y_K|CuFDzx=JFpyqygG)XD!j z%_W%Lz+eWFlQ*#Z787Y>%W4P*L!sOrY#kIsnYApKiW6q+4BhXt7*^CDg*yWv!v7C6 zm4qYG-2@oy_oxnxC^CdhbsJ@qlnRSk+EH@f8#7UKOZhG^P}goub&}$|R4|k`AGR?? z6#C9?z#?-EX(=C{z}p16oPWyVsuW_=9h5oPg?|`Ex4@6O7WyyS*_zk>_h2c%C(;^Q z6S1>XeNancW^|bqT93*13Ri+}7}$a{Jw_MKrFTK$8<(cezu_S8_pk9eu?L6eKeTXsyO*eed!42E;S=J>E6I;4(L!B6mM~ zvN*kYhh-sw37gjZ=g(D3_>{kx5-~Sx0U#de?xxUiYCFXwBP~V3cwNqX&fIUAUw8hZMZ+Y1(pR*4L;hQeVF-N-cO`^hY$`=gup3XkJKq?vPr~l*z;Lw-^z(;pft7t1`}z2sBN4oc1@S5WmN*648q{uTvsj4NYly z2Dz=O5Yj{i`_JhF9lK-CkCcl4xM5pWg?_qDNv?=)hHhW}{dwkHH}En-lLNX1cJd9E58A%oP8T3nddUnKCu6il*?is}pt?bck4{m^hPao}i$06-_92Z0wYJtm z?ufB9`FvL$f4YMCX-#20T1}1rO}c~Xgf_7U6E)qD`;E+T`47h9!^LPvWb<6SavmWf z)Gay=&dbpSljVrAW4n(L9@&Q*&A>ITUz*kX4_>jQa#-EclhF?}Yr6i^&D{Z7GN5r3 zh9=qCHPKP0W%5{ajqj{dU75Cu(9sNC#Ro-o*a7dZi_I9Jmhx42@_u zTtQ3tt3?B}GM3FL+o^y8g{%6N`(Fk9z=9igX$7hS1%hx>S$><*FhOb(=(I7_e#eOw7gIBGz5i&>VofygB=zNZ}s-9?Re{ltI+IZNwzDW zleH1pY@8R#JoZK_Fk+dED`3 zC~nDiu07#9=W4oHM^({Fg*gt-M8YgwlRr&2sm-RJgUW_R3U%RY!{F|{gw_g1M@~c@ zqz{PBjVJ*nKyJyfephi-)#xER!k9%%oJGQ4#Uk&S-5dK&oX$&$52%vvU?UHVOObgW2Pj47%m~Htw}6F_wig*7W&bHKzhw9Lh~e0E(3zM-$C^1yM~TX)k#laLn8*i2 zcf{xr*O)vg13#&d_25u@m3s$cEiP@0&yN~v?)W{2`!gR;K8^ZPGMeY$L|+eOO&YED 
zW7jR$5jn`Z)Hpq*f*#Hk$b$e&2@?}$srJZ6o>%r3OFd)R_x6sb4*J6aZ&wBVl<_pR8x z6=>8M?ofx%X|LuIGDfppcH;bcv9z5#RvP7{hU$M*@%7AU)@D1-*0AtA_mDNPt2(w7 zvz3PpBb^b`k72P;Biys(`ioC^|FpBwsbGE+EL{>fy!`lUQO_zBL%u;bj`#p7{!K#` zON0Jk#%NKOCTwJJj8@Ic$5c^Et;mq1zBp1x<_R=*tQ+Rws%K#iMC{Su0KR*&Ye7_x zqRDEUT8kTbYSBGfW16hhlXupB~wFgfLcKl*5{x~*C>lcf{ODR-f~Kb09-lzVHZI45^|jXz_)tR~ELO`S zP{?f~@eFk#gJ>kty_iC?i&>#n@R2hEBI{tKZ^(@&UXrichz?LStb|Tc?pLwy4J+)B zS&n^1INizGlKti~-JV$5FyM?0`1b_R4X#jC}N zn%dI-q2))?kk<7kyESW_rmvmY|1G%7%T7=@3=9r~YdzcQ@_Z`(g&)ELXe*xSs=SEu zCOJ|^`Gc<;_P07`;hqzW-~vKFr9-xH@i~!4mwsm*S8mCA+K1}r59>fO+v2fi)~grh z>@>Gaul<;BqJSCxS*zDS%R=6ok%14&{xzM_Sc>Kg740DD3ye+{Bx>w1RG7l7Z~`K9 z4__pHwyz>9g?m=K;71Ne>S)u;#63;o&=Eil90;AH5;bRTn2RIQov(I34FWY$g`8uO zcKGjl;2GbQ5efN^n*@GY&tPAYSD5~g%xJfi_pnaDmGnYYF@K1D zYbn4+kiaDi3?}{aN(I)B(@0X`9#_72yIglv4Wq3@p*n1lzPbd_ew5rp5U$Z6*|j+0 zsk)0;r9f^}ri!PoJmZ4AR<4}`U9ls%Ska%sL{2uh_55t8$t6EqK0lny2_U8 zbmQ~(f)#@fXHb3=TRjlt)CUdYTKa{I6CPKM957{R@u8!*9+3Y@Mq z|M&UW#)*h2Af&l%O+7?8c0(OL7*659 zYcsBnh?ZsT#j|AO6}=CXUR{c@$O}OiQO=L|%|VuRGXoNPXBN^*LE~VUd<4nHjI1S% zzmhi9Le}VND@k7Zzb=)#i26}XgETP`jJr!MT;rRw<-->Zd3_TF0+>K6#ONnSsYhti zJ#bG?6jXDtwAIVd+4Efck08cJkcpVit>BjR!UWgtApu)enS$e(#0xn{_WeI3dnSLD z+>bX;I$}M~PSCWxbAcv_rBqRKb;G7-Y8wb0+IJrrry)rA6S@$kvm{%;7hX(E`!Jzx zZ3;;D6iYZIZ`N0}a`a4my`qlNHR;ro@u=}s>p%k$hR#56AJW{ZA2lU(>B&+>c^TN> z87#8sv~?%OE9(ZG7eM}zP906fGXBcPhN+aLf<8mcAgmpFzr(ygns$(tG#-1bX@AMf zSZiThe1jHw?YcE0=?HOlviNW!?2Sb;02Q=H@&_*HIn#G@;n-sWkJ1-2V6w>2ZNk%8&5UdmQ#Aaw z)~vmZy}-OH&Z)S}o7s2(S9UJptapXR)-PRvQDseJ5)=!1>`CaL5KzPnL*Su-fXveV zWet<%9V2ZpG(Oa839@{)EiY8aS+&Mt5VV(NJ4DPpK)b%csuHn4p?`k)Am&Io>qVnD z4KUqI$RcPmET;bJ;!qfhKtA$=T**XzdD$5BIGi) zZq+7o{C<{yZ>iZ-j=!Mv$TaN8szZ>fg0w*|(_GpV4%6h%K8HBc1w5>KcAIjYsT3G` zVG~+kSpP^kx*-zY`llLenFu4#UX6|nLG(0j)E(C~*qFl(iz-v}0H^UveIbJUtY%eT zc@V<<0G!^BV!pg0b_Q|pBn-iG8jEaO{I};rH#+xvG^JUZ%{wbPHLhp(K&ngWS{jA# z!{0$_L=+h#f9)eaA0FVa7gGLoO-B(s0-;=hc}aTpP+noX=@xdeEPO z(Hz|<%1W^HA(nCWEwv_GnYErjS~O!!OBt*%a*8@k{gn%6XL 
zmSz33E4>kmj44bre!Oz`xfYfru+A^6iyVF}wCcZbaYfd&lywW{AMZ>xD3vZ&;H08+dofSANc6c)a}qh%O|a zXn;XC%FN{DKJ;{w-L$4L8H2s_b99XoQsD>VnK(_z7x>bekA|hxvr*UV)(bvz_G-N+ z&c{=0LB)P2`4|@RU2eCb(ZI0yKIysjL)G#PY^QtL46`a|w%Hia;!A+zg~9mww`0of z3+rCP)VV2166&|NA#~awpd4ayWMFPBrTQD#C@uBmc~LFIER7*GJVo1(tqpaMJyz4_ z74nFJbeIN-{8#|`6vT!GCgf&$F>86{dqR%pJ#28o$KFFfwLCDKWvtIS36eO?)BdR; zQ0~gP^x+B`m+lL0s9lVTny~?;%duYi6YP!X_w8qH$Ws;8i?Q=9({`NrfvY~iW>{`U zy!Kd{t;aM;<$?GlKX(4Cl*Z#jzBwcr7g?%2WwWb>%t9?y{Q=|B2LV+Aq%!|Et{2vD z3^tD{8}?}auX#_;ezn1-WcsAE7WFRLT53NZ`s{VeC+4E*`cMK=-n;cb$h_nWIv$#5 zJx&mmSp>84B1O-sy2zLg!q|=WS{IVJg4|^0lH#X{hSk?)|nA781RZOW6W z4&#lakxf3*C{3W>tPwa!z)f4(40ZpeGH9TU_esU+6Z}N-5wdLL4?}HGT84gCy6N|A z^5yq$B3b^JqKp}o6)PcG-Qy0sWq2XTba?W|*X5#zu=(1jz^%%H(O}+P>{H|A`+_x7){>g{qmAx(rTWus0^vp+T!Fpw; z<1M%wi6^)S8NL$c1K#SUxG06J^GZ4H)h1R@Af}sLGR$cAtGY@E^pC2ZoC5v(CQ4Oo znx#%MVRTn1al;|SF3-ZvWF0ybz78mf>rJXOYI5l0S6tuSX3Gnn7k!l5DuF!XA}~w! zzvTV0x_A8fKMDbW-EaW)BSAVb+SnG!~9;@_L%E+BZdi|lX8InnLF99nv zI3IoA5UNNmgVh)&AJh>}zCv0m9mD{lDsGi_@GkeUBY<^}ypfge1e`-AExzH)HOa~5 zhH+xx`QG7V#~6U>_rEg}A3^3hJDNX7_JEWIRzEhzvX4Ji{yey< zi&rd2MF(M;(Qkhrpt~#?Qn2>rX}sAtz9WfIm#UElB~DQfR)G2tF@jeea6~3i&OLA8 zVMGqsvf_wb+c7kgMZdNiCDhQn4M>fxm+VCr!)+!Pi+~5jz`rwWKuDVN_pQebZYXD1 z_>f(cG7r>TE$USgC1b2X7kr}H;M4bngv%zgwgyf@!Bb znK-qZjQBq16~p>*#+u$IuNBBKSCihOE6sm=M{qX=Eo}4fWM&{)cdtYstN}GKVOeJ; zO6tC5DokH$h6H|3AovnB!KeNQXQ^Iuh<9JkJVmqt5#*$p5AwMuQ{9K{*WY|r^J07i zAhpI%pq=eB-qqA$qu;zkTv&^gBLW8qfA!e!=yfZ!R2mPPx?81Aa zxign!4W#s%l65ep%@Y2+?y=H?yaoHKqi=-1M6o3^NG`&KatQ|UQAf7=kMwr7 zhVqz$4lqR6zM8@bm4fm%XGnz}IwjD_O)EvF346JXiVOSZU;1V~L}Jfs^ZmGJv!^~V zO~$aAg%wZ|oq^t|KC_TK?Ibatm4mAx29mP11Pg8UvqKMBJ+k|o4Y1uCqdBvaA6GVi zUG?b6LIiuHO7A=}hl9vO$h*~*095|!+auaq0wkEm@Z(SELX1V@G$Fy`4ewf2GSpGk zdodY!vOIy8(6TXWgum#5G*=fEhDhrn-JJX7d!)jjB7a2#93BUH#|hxW4Iq&eDyX;b z874rfouA|QlUNky;WV}HB^t+zp8{oeU>=1?!4iL8Y)1t~206_8aWl~agU=4z^?JbS zoA^s3A@9$o*{DR>bZJpwkOoV^x{gpIKRxNQZJ@xIi^%JPU`My{szbj`u2*p1g*NE7 zL0QuC1mjSg`^k?D6kHBo)|ks8lnh8QYWAWMX4Rx_?!-Tp-m?tDL3|;r1K1T 
zew2$DKj;~SDo3k(c~dV$K)7mmJ+e}|$6aKUr!FBwn!TZo^n3DSyvUzj+K{HC87 z_68f={GTJewJn9UmNigR@9h2vdaur7 z5t}=gIT?Ai=>-e<6sI?iQh&XbUp~9$EoIrO-zZ$ z{C+2hx9|r^j5H3-Dpb_iWVm0%(_wHZj(g`>6)Beb!iUimm^<_1OUo9nW;P{Kt;WNOz(7Zjs^v%n$hFKEM__h&8b$BpU{SOVm5yAEEO!z^UXTmIs5lgOva zi-qL{t2||Zbm_mWr|44Mrq_6j+s)YtJ3!@)YrjD?VOCZ~lcsK~;Ot81FdonztalEO z8xr?ub2mj=Ew)|V94I`dJOfQ_ZKTo08@Jq##iG{HSS+GQ4s5=W8PXIFbJ312YpW_mEe=^)T+}JZ;#QOo(Z9}(P>#pj(dn)m*6aR(^>(L zp9Wjr3LrpD?i6U z6X7BL9!r4~rMrHKXCOAtV~~c`!rgJ(qCpYRs9Q#K`@G^O;`<2S+%a{EDmtD=>$%b&II!T4&!}XAsnx(XvqL?g7On|?{8`xr;Z%) z_(gO#{1`diX7>b2nw^hb+Q{hw74v&6_hC=s2 z)ka>jPq$EiaV)e)Al0Eh-pZkg=uR^P(d!PKjXiZD)QUe4!}JiBotw+qs;Ao-;Z(NO z(MQbqX$_*SFGk(o#!>R~36-619dd)je|WYz)Mm-z9Tk`lTY)qs(uiZw3S4(J<0?Fp z!4FWPMzs*{Om|d7{zh!*cYS%p5l=EG{26hiVg$3^!eU-g=0HddwW!-Nf}<~^28Dv| zcc{F}PZ{MA^x>+%6BgF7h;>a$DJJbuMZdnFSaQKYDFbXRIIqc*+7pI9ByV;4+#I{j zCITGkRhE|S5@R1NQE)<^2F^eo3&R>|S}E7*$bVqtgm~}$3S|mFNf7N#%Dny0oLH=! z|546H#nZtQ!1x8ilwEA$807%WOicff2r7@o0(ELkt1i)g#aJUH$&CfQB7=(IJ}Ds< zEJzVHq|ondd6GqhaI-dGG7qyhm{wpwZHwRKsd7dbpjo--e1run@RfmP_Lkc!5h~!J zpnRtwe5gUFvoQO(21*0y>@t#(hg1q6Aly5#Xf0)uDv9hGr5KjcVbr$eP&GgjTtelr zf*2ym0T!rIxdC-z zAnK{AIjElN&G_3>Y53b9O!G#+@fWy=qCSTj{lthy3EbdwHf&Fd&In#(gaG3&M7x zJZb&H0vJ}gi_uyx>6m@Vy&>-<@VU-RGWPfs{(MZw*NYd^bN96A;j}5Iq<#}3aLD-d z(J&OuNwPa_c80@XPkOWyVg)1XvLNb|u^7t^<<=wru-j!gY~+e&Vo-NHsJE{?LsDKSAC) zV^uPpl9MrUQZp0U^E?zX@r#rpF&?PXGHfRZt#C zMkTc|agw^i&o+~mYTQHw<7WBp=p)<^G34S3nxk@#7^x=^d<0d+N^XI_L_q9m0X13?&Gk7!p z;<)9{u8}X#xLc-N+$Q4q!Ji#-$e|%HcJ~LDO1PX`#ND((${M4FrBEVd;(TDj1*t4s zcCJzK56*@GwJK%IxVQj7Ydzwgp>$D!I_4Qix=G7$&VWRV`b-o(rUIj-v4VXCfKXC{ z5jGKQeVuazz@*L zIZO~Os$8@ISL-dEH#25qQ7U0-DTu{v`MR0z@r+e+nb^lik?WT7&`TKD-eSW)t3=cm)w?5CHhnizz`+G*zKAobS znq906zJr&VYSy+>a3;a9C?K{F8VDD3S3|cG-b!(}%p2fwo1<9jMn6=T7WWw;q3#Nm zxO__tFcmINMp4Qb?prDQaWD5L-%whe&M}w4H>=}WSdCCJ=>+QoL&m?7E$Byh^NHf$4C&nhL2n&@(M zVY1kw1Pj=BZhty6>=%3!sy>R5CRfiSlOUAl)|>q04Be zL%3m3JZdABXNA2h-<y<=16GrVV4fw^JxpUp9LAZCxX1^^VJo{Ei z+eSa+$VpX7zSxGLJ1;bPOHWWJ`SQ%(D?tYpS9=sGUwAP7u6b 
zexOAB6oJ#gnq((B2%4-anI znA@*6RroYq())xH^Ke$d>sLE!JCV0I_zb+f@%?}o*k*i^{>bXxzkHI|JS=;Q$-hl& zExP@e8wccBI!N&#(%J@IAQ-J%Yz#;^Bf}o>~fVm^K@o4-{SKy zk}>nl@d5pLJHk5<40={ry{HK*5c20@^YpPZpXH09Xb4CmPHf}yW{!e>Ei{xzIjM)a zi1ZvDqBCrZ$rN;oYT&CMEIl^TcmMo8(fTS8aC*h>hpYG`wq@Q@R8lqTq+9<}2htkZ zfloGoPdeefD&T>l|Dgxu(f)(>GnV*TKGLTw`C}#BjopUPEczzoC-_FwwFB?K{Yyta zZq`X4hmf8E@g~dt$xL>Y8wB#(bT7ty9F)W0Jnl~}>Gxu#ixQ6`+D~rcnbq3`!&*7{ z_KcimzhE<`zb@Z}8s`$<7(di`aO+-_1%~ryklg%ve4zaOo?P-pR84s16bBMg9L{S* z{%%Ncx`f||x>cY)cHCjC#k;j|lfGu4^7(((%IssO&7kt*dL#gn{ z(2SCy>t@*66KFs)r|B1)#!q<*J zmOGQ85=pgsD!=CPM(#xD^ zbL!{oTff=uRp7mHyeZK0+WhR+?cTK6nL^uLC3pGvy1Q-$>2#x$XJgIov1Ost&*$>d z?8cRKpaXZcx?r}sD5su5N%-aS7vaQ_j++#8lXDh)T6p!HK=k6v{x-q$U@0YpbcZDV z4d1~oQ;Bo8)3M*p)?GoAUGCqaZtG`g(F(@0jeS9TZDP^QYM{TQRt`>GvnNkJVxng$ zq^uf@xIyDDe{L*sMm(eh(Tw|jeSDj1f%snuE|IhD$q)EVv?Gfp6?*KL+F4mU2OKVO z78|;J6byGbtzz*ix_tO&%x)DDHAi!m)^UGkl^C>69Q>uvxuI~J)noOHDyGEQ+V`Qr z@6e>W;Y!Z%>d}~_os;6DxqglCc&T=4kyfu`-Ip4dY=YElfmJVJu@x4Hhay^~Sxh+e z;;o2j+X`wk%%zzXWvx4buR2Axtc$NR*iXUTHNw9Sv_pDzt}dfJ+b&*!F{bx@LMxk{ zvi|2{0LaDi9|&@VTwN^eov5f8RV-a>OuxRBU5%_vja|MxkuQ--m#;nIN6)H8k;&%IhdGzrO`7p0GSw=IjI??O+DT1olKmm zsNw!KlBKhtRKJm-8bs29Hy%FT4i$B2{^u8d z4u4&)XI>w}%Z&A*Kkgo9!as?0n@=x$TO9?qK0Q4gKc5fE4wtt+UOhfP+Ad=Xg1?jn zcCW8aMh-c`yRFk@FOe=kISJxtys^9URzE#_o~}NwXzu#YE5%+k{C}f)N1HybU0m45 z`|}gon|&@`wyqIKh`ti;xy-aOjBCz$AF09kq`n z*e8FcEZJVd_N#UwN1%-XVL{V0E3&e7EMxQmGiqlQL*vD^0TaRK|eb@_s(#vE~55OCelnhFF}IAwB?)B%+-g%T!J2iBQ3 zDub3Iyk(P;@w8N$gc2rAI1_3h=BQEz9b_X0g>)cYC|?caO?cKww}Dj$oq3t%B5k---NB>&xJrEsjkf{v1Tem z%V)fpCCq$kiA$Nzp`$j)`4#PgB#HhH(!f+1&32wZ&{Tldi&K z|CXj!MQ0>2dwV*YBL=V&oou3ZbT;A_3)Tnkbc|=W3Y9*SQ5bu+;N9n>w7+NlDFgP0 z7$vHD6nY@x_tUJJ{fBZ;Yp`0v;u#g!zKU~-iDMTJqnO!SVWJa8p`Dn$09C~A=jbwPkrw9ezw=uSmR`mJ7l1AJ0WREj6$1Kk#1I6Ck zDW))0tgk~!V>>Q~C-1DV`1fX`jB2HgF9WxxEJ=78B{T(fBMcS=mj|$jDsclrMN}Lz zBf>Ve@~lkAS^=Pl8@a8pQu(zI5e*u4{5uHr-uEx5<>D?;m!^*vf&wmD$xs<6J!Nvx zE2#(7gO3o8XRCFTUb|X_wCtL(=VnB~gY%HHrpZ-FLroBr730hX^u|FJeM%F5(>#f^ 
z&MJ0r8vKBIxzV0+g4`T;)yoTFIRO~hbbn*`)LO4U?1j25ea_EaGPrfe&WF_M@Y~^( z>9>F8+IlURoJ+6~x7qcO8pNDdKp58UBF}j@+L33V*`Ll7szEOv--tMq> zrR`T=YG%xyjI~4PQ>(<+W@pz0m&)Xny$%)f554eq>2x?;m+nf{59v4VP9}4EN4tEU z-gm!Vg_pO!Hu_lGbgoTzWL-4D^8QS6^RE2P<0k!A`BnAE#q74d;Cz*WmxF8G5o5)b z(t==Rs?6@^{VsyPug`+S36(0h+>%R9(Xn9JZie)s{b;(t^vl^L>vYqiV|p?ZH%)S! zaB}XVk5%KhNLbUP=ruI z$}g49AE(A0w>sd%9D!ajh8#SFYw}%WE6VD^D<$yHt`R{fTkLq=ezZSrh8E zB6Ye4p0%K4$bce>6RJRNn!L0&+qcs6&kW;5LBjkamlSG3RY~dhv(r~;qQdL$?<313 zgui4*8dBm!sA^t9kx|L5e7fhdS|gv5swJLhwbV7~FYgxqmLSh6|KR8NxaoC=c7NP` zzy3JC1^2UgnbkM-S{*pM_+saKdbozAC-(QXy5zR~c-vhJAKc2{BDnDL<;vlCG`vKT z7tlar@O>*e`2_VBrcC-j0q8Hu2*5vhke3&Z@n0xVL&?+(j#1Li#MA?z%>)2)0d?UR zl`XwY|8e0M)dAYffUgzBUyVxk_AXzI{{Ts)fA%o5|BC)W(=J@-IGl9SMxE zzdW7un3&WwhdJLcCHDpQbDXiZ^?#UgbJ*$R&%eJxs)m8~hy9pl&}nz1#Bs13^@+>I zb~a-)znp%Y#&y$O_9b-LD4^ZQ8Jj*Dy=&Vq?=!$_u=c9$QAaWyc3H9Kc$DKp_i~1w z>{gu)LUmT3>{gr(<2umG^sdQ)zn$!6eU#!N#-eKNlisJIQ?U!h9%t4l=7pI_jW&$D zq2g6$2sIryzEWu?P@{?;MBkUvDB(qnPpvXUtPXXpCt~jt{{xY22yYW=TTkfLC-(=k z(-8SA)VH2^`0v-!{<-zn^)ViC=)Va!?CVNI>>c*YF)!&?OY3Hww2hE?*;E^o6*~54 zas7^*pp<+*Ge;OXVXOv${al>L$_}6m1LSQ8>f1Cr!KV)3cEUZ!wxe5)*f(nzgWw)8 z@c<4U+O9OQg#3WcrI)VzREm_BYyEHJH%8T=0}p970{KqNEt6ZWlv-*L-F zO4aB$acPbgmWj~5W(^30X!5=)4Jex^%)TEQkdv?n5Y9nd{W%75Oq3bu|4aBCy%=_W zh`)Vhy55jkuLJY5_ZW0rnjmP<0U~Yy;tJq_!V5z*Bv~M+0H8uSMu7|h6AJuSfB^#s z@xO+mJ*@wsVWvEX=i{f#DgWNgj?ljk1c4j`-5>06Y2Vz~(+3ik@NPw6lAZ;*l5Hm9 zR@|Q8Yr%iS|87D3`IdNk9!AKd*0FKuB)5Bv9qhKi2q{~tkEA8tiv~>bW2+8qu18uL zg!%)M-S>R%+O3y;+S*U_N4q1k?z!8`Q9jU5K&EecKGuIyeng1ptT{%H^|A?_pK4=d znW*A$@;Ib>X;~m>q~z6y3EKlq{x7IZ+Qmrob3g1X%ZqY>*n{~t-Cp6LAsj=WSQ|w8 z-+#YQ1N5`&FFX#|7ea1rz#UI&<=#DOlTl>p$7AO-#%;DP>ix;G+7w1yd3sZ(8yX4T za^7_%f`0=tknNxY>Xv}qg56NH()zLQ`X_oY>fCsi(N(@3b#goh4%l=boIjo zW=65c8G0KK!rYroL`wvxI@A1JDfe6_$Hrqjup6nF3Y7|osDkZ3r&U9gkkk9!Ilj`! 
zWpg1ambzeywIH>~HOQ5JY5Zf1>HUwyPN}q}SEtBa9a>@E{P&NvaQ&RgB$g-Y{l*d# zPDMOa!g{xumI_Nb?hLzVZr>uc1Ru<#`WK*h}1Y;Ci2g7#H@@dX1 zqI05i1Y-x^*?;^@Hho&tP!IFpJpBM4U_>YW&;OjxE{0An9$(K7;8@t$nAkYs$jQaz H#o_)Nq`Zt3 literal 0 HcmV?d00001 diff --git a/inc/rocprofiler.h b/inc/rocprofiler.h index 390965d9..0e50f95e 100644 --- a/inc/rocprofiler.h +++ b/inc/rocprofiler.h @@ -36,29 +36,2327 @@ THE SOFTWARE. // HSA extension. The library implementation requires HSA API intercepting and // a profiling queue supporting a submit callback interface. // +// +/** \mainpage ROCProfiler API Specification + * + * \section introduction Introduction + * + * ROCProfiler library, GPU Applications Profiling/Tracing APIs. + * The API provides functionality for profiling GPU applications in kernel and + * application and user mode and also with no replay mode at all and it + * provides the records pool support with an easy sequence of calls, so the + * user can be able to profile and trace in easy small steps, our samples code + * can give good examples of how to use the API calls for both profiling and + * tracing + * + * This document is going to discuss the following: + * 1. @ref symbol_versions_group + * 2. @ref versioning_group + * 3. @ref status_codes_group + * 4. @ref rocprofiler_general_group + * 5. @ref timestamp_group + * 6. @ref generic_record_group + * - @ref record_agents_group + * - @ref record_queues_group + * - @ref record_kernels_group + * 7. @ref profiling_api_group + * - @ref profiling_api_counters_group + * 8. @ref tracing_api_group + * - @ref roctx_tracer_api_data_group + * - @ref hsa_tracer_api_data_group + * - @ref hip_tracer_api_data_group + * 9. @ref memory_storage_buffer_group + * 10. @ref sessions_handling_group + * - @ref session_filter_group + * - @ref session_range_group + * - @ref session_user_replay_pass_group + * 11. @ref device_profiling + * 12. @ref rocprofiler_plugins + */ +// +/** + * \file + * ROCPROFILER API interface. 
+ */ //////////////////////////////////////////////////////////////////////////////// #ifndef INC_ROCPROFILER_H_ #define INC_ROCPROFILER_H_ -#include -#include -#include -#include +/* Placeholder for calling convention and import/export macros */ +#if !defined(ROCPROFILER_CALL) +#define ROCPROFILER_CALL +#endif /* !defined (ROCPROFILER_CALL) */ + +#if !defined(ROCPROFILER_EXPORT_DECORATOR) +#if defined(__GNUC__) +#define ROCPROFILER_EXPORT_DECORATOR __attribute__((visibility("default"))) +#elif defined(_MSC_VER) +#define ROCPROFILER_EXPORT_DECORATOR __declspec(dllexport) +#endif /* defined (_MSC_VER) */ +#endif /* !defined (ROCPROFILER_EXPORT_DECORATOR) */ + +#if !defined(ROCPROFILER_IMPORT_DECORATOR) +#if defined(__GNUC__) +#define ROCPROFILER_IMPORT_DECORATOR +#elif defined(_MSC_VER) +#define ROCPROFILER_IMPORT_DECORATOR __declspec(dllimport) +#endif /* defined (_MSC_VER) */ +#endif /* !defined (ROCPROFILER_IMPORT_DECORATOR) */ + +#define ROCPROFILER_EXPORT ROCPROFILER_EXPORT_DECORATOR ROCPROFILER_CALL +#define ROCPROFILER_IMPORT ROCPROFILER_IMPORT_DECORATOR ROCPROFILER_CALL + +#if !defined(ROCPROFILER) +#if defined(ROCPROFILER_EXPORTS) +#define ROCPROFILER_API ROCPROFILER_EXPORT +#else /* !defined (ROCPROFILER_EXPORTS) */ +#define ROCPROFILER_API ROCPROFILER_IMPORT +#endif /* !defined (ROCPROFILER_EXPORTS) */ +#endif /* !defined (ROCPROFILER) */ + +#include #include -#define ROCPROFILER_VERSION_MAJOR 8 -#define ROCPROFILER_VERSION_MINOR 0 - #ifdef __cplusplus extern "C" { -#endif // __cplusplus +#endif /* __cplusplus */ + +/** \defgroup symbol_versions_group Symbol Versions + * + * The names used for the shared library versioned symbols. + * + * Every function is annotated with one of the version macros defined in this + * section. Each macro specifies a corresponding symbol version string. 
After + * dynamically loading the shared library with \p dlopen, the address of each + * function can be obtained using \p dlvsym with the name of the function and + * its corresponding symbol version string. An error will be reported by \p + * dlvsym if the installed library does not support the version for the + * function specified in this version of the interface. + * + * @{ + */ + +/** + * The function was introduced in version 2.0 of the interface and has the + * symbol version string of ``"ROCPROFILER_2.0"``. + */ +#define ROCPROFILER_VERSION_2_0 + +/** @} */ + +/** \defgroup versioning_group Library Versioning + * + * Version information about the interface and the associated installed + * library. + * + * The semantic version of the interface following semver.org rules. A client + * that uses this interface is only compatible with the installed library if + * the major version numbers match and the interface minor version number is + * less than or equal to the installed library minor version number. + * + * @{ + */ + +/** + * The major version of the interface as a macro so it can be used by the + * preprocessor. + */ +#define ROCPROFILER_VERSION_MAJOR 2 + +/** + * The minor version of the interface as a macro so it can be used by the + * preprocessor. + */ +#define ROCPROFILER_VERSION_MINOR 0 + +/** + * Query the major version of the installed library. + * + * Return the major version of the installed library. This can be used to + * check if it is compatible with this interface version. This function can be + * used even when the library is not initialized. + */ +ROCPROFILER_API uint32_t rocprofiler_version_major(); + +/** + * Query the minor version of the installed library. + * + * Return the minor version of the installed library. This can be used to + * check if it is compatible with this interface version. This function can be + * used even when the library is not initialized.
+ */ +ROCPROFILER_API uint32_t rocprofiler_version_minor(); + +/** @} */ + +#ifndef ROCPROFILER_V1 + +// TODO(aelwazir): Fix them to use the new Error codes +/** \defgroup status_codes_group Status Codes + * + * Most operations return a status code to indicate success or error. + * + * @{ + */ + +/** + * ROCProfiler API status codes. + */ +typedef enum { + /** + * The function has executed successfully. + */ + ROCPROFILER_STATUS_SUCCESS = 0, + /** + * A generic error has occurred. + */ + ROCPROFILER_STATUS_ERROR = -1, + /** + * ROCProfiler is already initialized. + */ + ROCPROFILER_STATUS_ERROR_ALREADY_INITIALIZED = -2, + /** + * ROCProfiler is not initialized. + */ + ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED = -3, + /** + * Missing Buffer for a session. + */ + ROCPROFILER_STATUS_ERROR_SESSION_MISSING_BUFFER = -4, + /** + * Timestamps can't be collected + */ + ROCPROFILER_STATUS_ERROR_TIMESTAMP_NOT_APPLICABLE = -5, + /** + * Agent is not found with given identifier. + */ + ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND = -6, + /** + * Agent information is missing for the given identifier + */ + ROCPROFILER_STATUS_ERROR_AGENT_INFORMATION_MISSING = -7, + /** + * Queue is not found for the given identifier. + */ + ROCPROFILER_STATUS_ERROR_QUEUE_NOT_FOUND = -8, + /** + * The requested information about the queue is not found. + */ + ROCPROFILER_STATUS_ERROR_QUEUE_INFORMATION_MISSING = -9, + /** + * Kernel is not found with given identifier. + */ + ROCPROFILER_STATUS_ERROR_KERNEL_NOT_FOUND = -10, + /** + * The requested information about the kernel is not found. + */ + ROCPROFILER_STATUS_ERROR_KERNEL_INFORMATION_MISSING = -11, + /** + * Counter is not found with the given identifier. + */ + ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND = -12, + /** + * The requested Counter information for the given kernel is missing. + */ + ROCPROFILER_STATUS_ERROR_COUNTER_INFORMATION_MISSING = -13, + /** + * The requested Tracing API Data for the given data identifier is missing. 
+ */ + ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_NOT_FOUND = -14, + /** + * The requested information for the tracing API Data is missing. + */ + ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_INFORMATION_MISSING = -15, + /** + * The given Domain is incorrect. + */ + ROCPROFILER_STATUS_ERROR_INCORRECT_DOMAIN = -16, + /** + * The requested Session given the session identifier is not found. + */ + ROCPROFILER_STATUS_ERROR_SESSION_NOT_FOUND = -17, + /** + * The requested Session Buffer given the session identifier is corrupted or + * deleted. + */ + ROCPROFILER_STATUS_ERROR_CORRUPTED_SESSION_BUFFER = -18, + /** + * The requested record given the record identifier is corrupted or deleted. + */ + ROCPROFILER_STATUS_ERROR_RECORD_CORRUPTED = -19, + /** + * Incorrect Replay mode. + */ + ROCPROFILER_STATUS_ERROR_INCORRECT_REPLAY_MODE = -20, + /** + * Missing Filter for a session. + */ + ROCPROFILER_STATUS_ERROR_SESSION_MISSING_FILTER = -21, + /** + * The size given for the buffer is not applicable. + */ + ROCPROFILER_STATUS_ERROR_INCORRECT_SIZE = -22, + /** + * Incorrect Flush interval. + */ + ROCPROFILER_STATUS_ERROR_INCORRECT_FLUSH_INTERVAL = -23, + /** + * The session filter can't accept the given data. + */ + ROCPROFILER_STATUS_ERROR_SESSION_FILTER_DATA_MISMATCH = -24, + /** + * The given filter data is corrupted. + */ + ROCPROFILER_STATUS_ERROR_FILTER_DATA_CORRUPTED = -25, + /** + * The given label is corrupted. + */ + ROCPROFILER_STATUS_ERROR_CORRUPTED_LABEL_DATA = -26, + /** + * There is no label in the labels stack to be popped. + */ + ROCPROFILER_STATUS_ERROR_RANGE_STACK_IS_EMPTY = -27, + /** + * There is no pass that started. 
+ */ + ROCPROFILER_STATUS_ERROR_PASS_NOT_STARTED = -28, + /** + * There is already an active session; can't activate two sessions at the same + * time + */ + ROCPROFILER_STATUS_ERROR_HAS_ACTIVE_SESSION = -29, + /** + * Can't terminate a non active session + */ + ROCPROFILER_STATUS_ERROR_SESSION_NOT_ACTIVE = -30, + /** + * The required filter is not found for the given session + */ + ROCPROFILER_STATUS_ERROR_FILTER_NOT_FOUND = -31, + /** + * The required buffer is not found for the given session + */ + ROCPROFILER_STATUS_ERROR_BUFFER_NOT_FOUND = -32, + /** + * The required Filter is not supported + */ + ROCPROFILER_STATUS_ERROR_FILTER_NOT_SUPPORTED = -33 +} rocprofiler_status_t; + +/** + * Query the textual description of the given error for the current thread. + * + * Returns a NULL terminated string describing the error of the given ROCProfiler + * API call by the calling thread that did not return success. + * + * @return The error string. + */ +ROCPROFILER_API const char* rocprofiler_error_str(rocprofiler_status_t status) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** \defgroup rocprofiler_general_group General ROCProfiler Requirements + * @{ + */ + +// TODO(aelwazir): More clear description, (think about nested!!??) + +/** + * Initialize the API Tools + * + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + * @retval ::ROCPROFILER_STATUS_ERROR_ALREADY_INITIALIZED If ROCProfiler + * is already initialized + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_initialize() ROCPROFILER_VERSION_2_0; + +/** + * Finalize the API Tools + * + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully.
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED If initialize wasn't + * called or finalize was called twice + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_finalize() ROCPROFILER_VERSION_2_0; + +/** + * \addtogroup sessions_handling_group + * @{ + * ROCProfiler Session Modes. + */ + +/** + * Session Identifier + */ +typedef struct { + /** + * Session Identifier to get the session or to be used to call any API that + * needs to deal with a specific session + */ + uint64_t handle; +} rocprofiler_session_id_t; + +/** @} */ + +/** @} */ + +/** \defgroup timestamp_group Timestamp Operations + * + * For this group we are focusing on timestamps collection and timestamp + * definition + * + * @{ + */ + +/** + * ROCProfiling Timestamp Type. + */ +typedef struct { + uint64_t value; +} rocprofiler_timestamp_t; + +/** + * Get the system clock timestamp. + * + * @param[out] timestamp The system clock timestamp in nanoseconds. + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + * @retval ::ROCPROFILER_STATUS_ERROR_TIMESTAMP_NOT_APPLICABLE The function + * failed to get the timestamp using HSA Function. + * + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_get_timestamp(rocprofiler_timestamp_t* timestamp) ROCPROFILER_VERSION_2_0; + +/** + * Timestamps (start & end), it will be used for kernel dispatch tracing as + * well as API Tracing + */ +typedef struct { + rocprofiler_timestamp_t begin; + rocprofiler_timestamp_t end; +} rocprofiler_record_header_timestamp_t; + +/** @} */ + +/** \defgroup generic_record_group General Records Type + * @{ + */ + +/** + * A unique identifier for every record + */ +typedef struct { + /** + * Record ID handle + */ + uint64_t handle; +} rocprofiler_record_id_t; + +/** + * Record kind + */ +typedef enum { + /** + * Represents records that have profiling data (ex. counter collection + * records) + */ + ROCPROFILER_PROFILER_RECORD = 0, + /** + * Represents records that have tracing data (ex.
hip api tracing records) + */ + ROCPROFILER_TRACER_RECORD = 1, + /** + * Represents a ATT tracing record (Not available yet) + */ + ROCPROFILER_ATT_TRACER_RECORD = 2, + /** + * Represents a PC sampling record + */ + ROCPROFILER_PC_SAMPLING_RECORD = 3, + /** + * Represents SPM records + */ + ROCPROFILER_SPM_RECORD = 4 +} rocprofiler_record_kind_t; + +/** + * Generic ROCProfiler record header. + */ +typedef struct { + /** + * Represents the kind of the record using ::rocprofiler_record_kind_t + */ + rocprofiler_record_kind_t kind; + /** + * Represents the id of the record + */ + rocprofiler_record_id_t id; +} rocprofiler_record_header_t; + +/** \defgroup record_agents_group Agents(AMD CPU/GPU) Handling + * \ingroup generic_record_group + * @{ + */ + +/** + * Agent ID handle, which represents a unique id to the agent reported as it + * can be used to retrieve Agent information using + * ::rocprofiler_query_agent_info, Agents can be CPUs or GPUs + */ +typedef struct { + /** + * a unique id to represent every agent on the system, this handle should be + * unique across all nodes in multi-node system + */ + uint64_t handle; // Topology folder serial number +} rocprofiler_agent_id_t; + +/** + * Using ::rocprofiler_query_agent_info, user can determine the type of the agent + * the following struct will be the output in case of retrieving + * ::ROCPROFILER_AGENT_TYPE agent info + */ +typedef enum { + /** + * CPU Agent + */ + ROCPROFILER_CPU_AGENT = 0, + /** + * GPU Agent + */ + ROCPROFILER_GPU_AGENT = 1 +} rocprofiler_agent_type_t; + +// TODO(aelwazir): check if we need to report the family name as well!!?? 
OR +// return the agent itself so that they can use HSA API +/** + * Types of information that can be requested about the Agents + */ +typedef enum { + /** + * GPU Agent Name + */ + ROCPROFILER_AGENT_NAME = 0, + /** + * GPU Agent Type + */ + ROCPROFILER_AGENT_TYPE = 1 +} rocprofiler_agent_info_kind_t; + +/** + * Query Agent Information size to allow the user to allocate the right size + * for the information data requested, the information will be collected using + * ::rocprofiler_agent_id_t to identify one type of information available in + * ::rocprofiler_agent_info_kind_t + * + * @param[in] kind Information kind requested by the user + * @param[in] agent_id Agent ID + * @param[out] data_size Size of the information data output + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND, if the agent was not found + * in the saved agents + * @retval ::ROCPROFILER_STATUS_ERROR_AGENT_INFORMATION_MISSING, if the agent + * was found in the saved agents but the required information is missing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_agent_info_size(rocprofiler_agent_info_kind_t kind, + rocprofiler_agent_id_t agent_id, + size_t* data_size) ROCPROFILER_VERSION_2_0; + +/** + * Query Agent Information Data using an allocated data pointer by the user, + * user can get the size of the data using ::rocprofiler_query_agent_info_size, + * the user can get the data using ::rocprofiler_agent_id_t and the user need to + * identify one type of information available in ::rocprofiler_agent_info_kind_t + * + * @param[in] kind Information kind requested by the user + * @param[in] descriptor Agent ID + * @param[out] name Requested information data output + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval
::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_AGENT_NOT_FOUND, if the agent was not found + * in the saved agents + * @retval ::ROCPROFILER_STATUS_ERROR_AGENT_INFORMATION_MISSING, if the agent + * was found in the saved agents but the required information is missing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_agent_info(rocprofiler_agent_info_kind_t kind, + rocprofiler_agent_id_t descriptor, + const char** name) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** \defgroup record_queues_group Queues(AMD HSA QUEUES) Handling + * \ingroup generic_record_group + * @{ + */ + +/** + * Unique ID handle to represent an HSA Queue of type \p hsa_queue_t, this id + * can be used by the user to get queue information using + * ::rocprofiler_query_queue_info + */ +typedef struct { + /** + * Unique Id for every queue for one agent for one system + */ + uint64_t handle; +} rocprofiler_queue_id_t; + +// TODO(aelwazir): Check if there is anymore Queue Information needed +/** + * Types of information that can be requested about the Queues + */ +typedef enum { + /** + * AMD HSA Queue Size. 
+ */ + ROCPROFILER_QUEUE_SIZE = 0 +} rocprofiler_queue_info_kind_t; + +/** + * Query Queue Information size to allow the user to allocate the right size + * for the information data requested, the information will be collected using + * ::rocprofiler_queue_id_t by using ::rocprofiler_query_queue_info and the user + * need to identify one type of information available in + * ::rocprofiler_queue_info_kind_t + * + * @param[in] kind Information kind requested by the user + * @param[in] agent_id Queue ID + * @param[out] data_size Size of the information data output + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_QUEUE_NOT_FOUND, if the queue was not found + * in the saved queues + * @retval ::ROCPROFILER_STATUS_ERROR_QUEUE_INFORMATION_MISSING, if the queue + * was found in the saved queues but the required information is missing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_queue_info_size(rocprofiler_queue_info_kind_t kind, + rocprofiler_queue_id_t agent_id, + size_t* data_size) ROCPROFILER_VERSION_2_0; + +/** + * Query Queue Information Data using an allocated data pointer by the user, + * user can get the size of the data using ::rocprofiler_query_queue_info_size, + * the user can get the data using ::rocprofiler_queue_id_t and the user need to + * identify one type of information available in ::rocprofiler_queue_info_kind_t + * + * @param[in] kind Information kind requested by the user + * @param[in] descriptor Queue ID + * @param[out] name Requested information data output + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_QUEUE_NOT_FOUND, if the queue
was not found + * in the saved queues + * @retval ::ROCPROFILER_STATUS_ERROR_QUEUE_INFORMATION_MISSING, if the queue + * was found in the saved queues but the required information is missing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_queue_info(rocprofiler_queue_info_kind_t kind, + rocprofiler_queue_id_t descriptor, + const char** name) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** \defgroup record_kernels_group Kernels Handling + * \ingroup generic_record_group + * @{ + */ + +/** + * Kernel identifier that represents a unique id for every kernel + */ +typedef struct { + /** + * Kernel object identifier + */ + uint64_t handle; +} rocprofiler_kernel_id_t; + +/** + * Kernel Information Types, can be used by ::rocprofiler_query_kernel_info + */ +typedef enum { + /** + * Kernel Name Information Type + */ + ROCPROFILER_KERNEL_NAME = 0 +} rocprofiler_kernel_info_kind_t; + +/** + * Query Kernel Information Data size to allow the user to allocate the right + * size for the information data requested, the information will be collected + * using + * ::rocprofiler_kernel_id_t by using ::rocprofiler_query_kernel_info and the + * user need to identify one type of information available in + * ::rocprofiler_kernel_info_kind_t + * + * @param[in] kind The type of information needed + * @param[in] kernel_id Kernel ID + * @param[out] data_size Kernel Information Data size + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_KERNEL_NOT_FOUND, if the kernel was not + * found in the saved kernels + * @retval ::ROCPROFILER_STATUS_ERROR_KERNEL_INFORMATION_MISSING, if the kernel + * was found in the saved kernels but the required information is missing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_kernel_info_size(rocprofiler_kernel_info_kind_t kind, +
rocprofiler_kernel_id_t kernel_id, + size_t* data_size) ROCPROFILER_VERSION_2_0; + +/** + * Query Kernel Information Data using an allocated data pointer by the user, + * user can get the size of the data using ::rocprofiler_query_kernel_info_size, + * the user can get the data using ::rocprofiler_kernel_id_t and the user need + * to identify one type of information available in ::rocprofiler_kernel_info_t + * + * @param[in] kind Information kind requested by the user + * @param[in] kernel_id Kernel ID + * @param[out] data Information Data + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_KERNEL_NOT_FOUND, if the kernel was not + * found in the saved kernels + * @retval ::ROCPROFILER_STATUS_ERROR_KERNEL_INFORMATION_MISSING, if the kernel + * was found in the saved kernels but the required information is missing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_kernel_info(rocprofiler_kernel_info_kind_t kind, + rocprofiler_kernel_id_t kernel_id, + const char** data) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** + * Holds the thread id + */ +typedef struct { + /** + * Thread ID + */ + uint32_t value; +} rocprofiler_thread_id_t; + +/** @} */ + +/** \defgroup profiling_api_group Profiling Part Handling + * + * The profiling records are asynchronously logged to the pool and can be + * associated with the respective GPU kernels. + * Profiling API can be used to enable collecting of the records with or + * without timestamping data for the GPU Application in continuous mode or + * kernel mode. 
+ * + * @{ + */ + +/** \defgroup profiling_api_counters_group Counter Collection Handling + * records + * \ingroup profiling_api_group + * @{ + */ + +typedef struct { + const char* name; + const char* description; + const char* expression; + uint32_t instances_count; + const char* block_name; + uint32_t block_counters; +} rocprofiler_counter_info_t; + +typedef int (*rocprofiler_counters_info_callback_t)(rocprofiler_counter_info_t counter, + const char* gpu_name, uint32_t gpu_index) ROCPROFILER_VERSION_2_0; + +ROCPROFILER_API rocprofiler_status_t +rocprofiler_iterate_counters(rocprofiler_counters_info_callback_t counters_info_callback) ROCPROFILER_VERSION_2_0; + +/** + * Counter ID to be used to query counter information using + * ::rocprofiler_query_counter_info + */ +typedef struct { + /** + * A unique id generated for every counter requested by the user + */ + uint64_t handle; +} rocprofiler_counter_id_t; + +/** + * Counter Information Types, can be used by ::rocprofiler_query_counter_info + * (note: the enumerators are 0, 2 and 3; value 1 is not assigned here) + */ +typedef enum { + /** + * Can be used to get the counter name + */ + ROCPROFILER_COUNTER_NAME = 0, + /** + * Can be used to get the block id of a counter + */ + ROCPROFILER_COUNTER_BLOCK_ID = 2, + /** + * This is the level of hierarchy from the GFX_IP where the counter value + * should be collected + */ + ROCPROFILER_COUNTER_HIERARCHY_LEVEL = 3 +} rocprofiler_counter_info_kind_t; + +/** + * Query Counter Information Data size to allow the user to allocate the right + * size for the information data requested, the information will be collected + * using + * ::rocprofiler_counter_id_t by using ::rocprofiler_query_counter_info and the + * user needs to identify one type of information available in + * ::rocprofiler_counter_info_kind_t + * + * @param[in] session_id Session id where this data was collected + * @param[in] counter_info_type The type of information needed + * @param[in] counter_id Counter ID + * @param[out] data_size Counter Information Data size + * @retval
::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND, if the counter was not + * found in the saved counters + * @retval ::ROCPROFILER_STATUS_ERROR_COUNTER_INFORMATION_MISSING, if the counter + * was found in the saved counters but the required information is missing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_counter_info_size( + rocprofiler_session_id_t session_id, rocprofiler_counter_info_kind_t counter_info_type, + rocprofiler_counter_id_t counter_id, size_t* data_size) ROCPROFILER_VERSION_2_0; + +/** + * Query Counter Information Data using an allocated data pointer by the user, + * user can get the size of the data using ::rocprofiler_query_counter_info_size, + * the user can get the data using ::rocprofiler_counter_id_t and the user need + * to identify one type of information available in ::rocprofiler_counter_info_t + * + * @param[in] session_id Session id where this data was collected + * @param[in] kind Information kind requested by the user + * @param[in] counter_id Counter ID + * @param[out] data Information Data + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_COUNTER_NOT_FOUND, if the counter was not + * found in the saved counters + * @retval ::ROCPROFILER_STATUS_ERROR_COUNTER_INFORMATION_MISSING, if the counter + * was found in the saved counters but the required information is missing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_counter_info(rocprofiler_session_id_t session_id, + rocprofiler_counter_info_kind_t kind, + rocprofiler_counter_id_t counter_id, + const char** data) ROCPROFILER_VERSION_2_0; + +typedef 
struct { + /** + * queue index value + */ + uint64_t value; +} rocprofiler_queue_index_t; + +// TODO(aelwazir): add more types to the values should we use unions??!! +/** + * Counter Value Structure + */ +typedef struct { + /** + * Counter value + */ + double value; +} rocprofiler_record_counter_value_t; + +/** + * Counter Instance Structure, it will represent every counter reported in the + * array of counters reported by every profiler record if counters were needed + * to be collected + */ +typedef struct { + /** + * Counter Instance Identifier + */ + rocprofiler_counter_id_t counter_handler; // Counter Handler + /** + * Counter Instance Value + */ + rocprofiler_record_counter_value_t value; // Counter Value +} rocprofiler_record_counter_instance_t; + +/** + * Counters Instances Count Structure, every profiling record has this + * structure included to report the number of counters collected for this + * kernel dispatch + */ +typedef struct { + /** + * Counters Instances Count for every record + */ + uint64_t value; +} rocprofiler_record_counters_instances_count_t; + +/** + * Kernel properties, this will represent the kernel properties + * such as its grid size, workgroup size, wave_size + */ + +typedef struct { + /** + * Grid Size + */ + uint64_t grid_size; + /** + * workgroup size + */ + uint64_t workgroup_size; + /** + * lds_size + */ + uint64_t lds_size; + /** + * scratch_size + */ + uint64_t scratch_size; + /** + * arch vgpr count + */ + uint64_t arch_vgpr_count; + /** + * accum vgpr count + */ + uint64_t accum_vgpr_count; + /** + * sgpr_count + */ + uint64_t sgpr_count; + /** + * wave size + */ + uint64_t wave_size; + /** + * Dispatch completion signal handle + */ + uint64_t signal_handle; + +} rocprofiler_kernel_properties_t; +/** + * Profiling record, this will represent all the information reported by the + * profiler regarding kernel dispatches and their counters that were collected + * by the profiler and requested by the user, this can be used as the 
type of + * the flushed records that is reported to the user using + * ::rocprofiler_buffer_callback_t + */ +typedef struct { + /** + * ROCProfiler General Record base header to identify the id and kind of every + * record + */ + rocprofiler_record_header_t header; + /** + * Kernel Identifier to be used by the user to get the kernel info using + * ::rocprofiler_query_kernel_info + */ + rocprofiler_kernel_id_t kernel_id; + /** + * Agent Identifier to be used by the user to get the Agent Information using + * ::rocprofiler_query_agent_info + */ + rocprofiler_agent_id_t gpu_id; + /** + * Queue Identifier to be used by the user to get the Queue Information using + * ::rocprofiler_query_queue_info + */ + rocprofiler_queue_id_t queue_id; + /** + * Timestamps, start and end timestamps of the record data (ex. Kernel + * Dispatches) + */ + rocprofiler_record_header_timestamp_t timestamps; + /** + * Counters, including identifiers to get counter information and Counters + * values + */ + rocprofiler_record_counter_instance_t* counters; + /** + * kernel properties, including the grid size, work group size, + * registers count, wave size and completion signal + */ + rocprofiler_kernel_properties_t kernel_properties; + /** + * Thread id + */ + rocprofiler_thread_id_t thread_id; + /** + * Queue Index - packet index in the queue + */ + rocprofiler_queue_index_t queue_idx; + /** + * The count of the counters that were collected by the profiler + */ + rocprofiler_record_counters_instances_count_t counters_count; /* Counters Count */ +} rocprofiler_record_profiler_t; + +typedef struct { + uint32_t value; + +} rocprofiler_event_id_t; + +typedef struct { + uint16_t value; // Counter Value + +} rocprofiler_record_spm_counters_instances_count_t; + +/** + * SPM counter data: a fixed array of 32 + * ::rocprofiler_record_spm_counters_instances_count_t entries + */ +typedef struct { + rocprofiler_record_spm_counters_instances_count_t counters_data[32]; + +} rocprofiler_record_se_spm_data_t; + + +/** + * SPM
record, this will represent all the information reported by the + * SPM regarding counters and their timestamps this can be used as the type of + * the flushed records that is reported to the user using + * ::rocprofiler_buffer_callback_t + */ +typedef struct { + /** + * ROCProfiler General Record base header to identify the id and kind of every + * record + */ + rocprofiler_record_header_t header; + + /** + * Timestamps at which the counters were sampled. + */ + rocprofiler_record_header_timestamp_t timestamps; + /** + * Counter values per shader engine + */ + rocprofiler_record_se_spm_data_t shader_engine_data[4]; + +} rocprofiler_record_spm_t; + +/** + * struct to store the trace data from a shader engine. + */ +typedef struct { + void* buffer_ptr; + uint32_t buffer_size; +} rocprofiler_record_se_att_data_t; + + /** + * ATT tracing record structure. + * This will represent all the information reported by the + * ATT tracer such as the kernel and its thread trace data. + * This record can be flushed to the user using + * ::rocprofiler_buffer_callback_t + */ +typedef struct { + /** + * ROCProfiler General Record base header to identify the id and kind of every + * record + */ + rocprofiler_record_header_t header; + /** + * Kernel Identifier to be used by the user to get the kernel info using + * ::rocprofiler_query_kernel_info + */ + rocprofiler_kernel_id_t kernel_id; + /** + * Agent Identifier to be used by the user to get the Agent Information using + * ::rocprofiler_query_agent_info + */ + rocprofiler_agent_id_t gpu_id; + /** + * Queue Identifier to be used by the user to get the Queue Information using + * ::rocprofiler_query_queue_info + */ + rocprofiler_queue_id_t queue_id; + /** + * kernel properties, including the grid size, work group size, + * registers count, wave size and completion signal + */ + rocprofiler_kernel_properties_t kernel_properties; + /** + * Thread id + */ + rocprofiler_thread_id_t thread_id; + /** + * Queue Index - packet index in the
queue + */ + rocprofiler_queue_index_t queue_idx; + /** + * ATT data output from each shader engine. + */ + rocprofiler_record_se_att_data_t* shader_engine_data; + /** + * The count of the shader engine ATT data + */ + uint64_t shader_engine_data_count; +} rocprofiler_record_att_tracer_t; + + + +/** @} */ + +/** \defgroup tracing_api_group Tracer Part Handling + * @{ + */ + +/** + * Traced API domains + */ +typedef enum { + /** + * HSA API domain + */ + ACTIVITY_DOMAIN_HSA_API = 0, + /** + * HSA async activity domain + */ + ACTIVITY_DOMAIN_HSA_OPS = 1, + /** + * HIP async activity domain + */ + ACTIVITY_DOMAIN_HIP_OPS = 2, + /** + * HIP API domain + */ + ACTIVITY_DOMAIN_HIP_API = 3, + /** + * KFD API domain + */ + ACTIVITY_DOMAIN_KFD_API = 4, + /** + * External ID domain + */ + ACTIVITY_DOMAIN_EXT_API = 5, + /** + * ROCTX domain + */ + ACTIVITY_DOMAIN_ROCTX = 6, + // TODO(aelwazir): Used in kernel Info, memcpy, ..etc, refer to hsa_support + // TODO(aelwazir): Move HSA Events to hsa_support + /** + * HSA events (Device Activity) + */ + ACTIVITY_DOMAIN_HSA_EVT = 7, + ACTIVITY_DOMAIN_NUMBER +} rocprofiler_tracer_activity_domain_t; + +/** + * Tracing Operation ID for HIP/HSA + */ +typedef struct { + uint32_t id; +} rocprofiler_tracer_operation_id_t; + +/** + * Correlation identifier + */ +typedef struct { + /** + * Correlation ID Value + */ + uint64_t value; +} rocprofiler_tracer_activity_correlation_id_t; + +/** + * Tracer API Calls Data Handler + */ +typedef struct { + /** + * Data Handler Identifier + */ + const void* handle; + /** + * API Data Size + */ + size_t size; +} rocprofiler_tracer_api_data_handle_t; + +/** \defgroup roctx_tracer_api_data_group Tracer ROCTX API Data + * \ingroup tracing_api_group + * @{ + */ + +/** + * ROCTX Tracer Data Information Kinds + */ +typedef enum { + /** + * ROCTX Tracer Data kind that can be used to return ROCTX message + */ + ROCPROFILER_ROCTX_MESSAGE = 0, + /** + * ROCTX Tracer Data kind that can be used to return ROCTX id + */ 
+ ROCPROFILER_ROCTX_ID = 1 +} rocprofiler_tracer_roctx_api_data_info_t; + +/** + * Query Tracer API Call Data Information size to allow the user to allocate + * the right size for the information data requested, the information will be + * collected using + * ::rocprofiler_tracer_api_data_handle_t by using + * ::rocprofiler_query_roctx_tracer_api_data_info and the user needs to identify one + * type of information available in + * ::rocprofiler_tracer_roctx_api_data_info_t + * + * @param[in] session_id Session id where this data was collected + * @param[in] kind The type of information needed + * @param[in] api_data_id API Data ID + * @param[in] operation_id API Operation ID + * @param[out] data_size API Data Information size + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_NOT_FOUND, if the api data + * was not found in the saved api data + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_INFORMATION_MISSING, if the + * api data was found in the saved data but the required information is + * missing + * @retval ::ROCPROFILER_STATUS_ERROR_INCORRECT_DOMAIN if the user sent a handle + * that is not related to the requested domain + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_roctx_tracer_api_data_info_size( + rocprofiler_session_id_t session_id, rocprofiler_tracer_roctx_api_data_info_t kind, + rocprofiler_tracer_api_data_handle_t api_data_id, rocprofiler_tracer_operation_id_t operation_id, + size_t* data_size) ROCPROFILER_VERSION_2_0; + +/** + * Query API Data Information using an allocated data pointer by the user, + * user can get the size of the data using + * ::rocprofiler_query_roctx_tracer_api_data_info_size, the user can get the data + * using ::rocprofiler_tracer_api_data_handle_t and the user needs to identify one + * type of information
available in ::rocprofiler_tracer_api_data_info_t + * + * @param[in] session_id Session id where this data was collected + * @param[in] kind Information kind requested by the user + * @param[in] api_data_id API Data ID + * @param[in] operation_id API Operation ID + * @param[out] data API Data Data + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_NOT_FOUND, if the api data + * was not found in the saved api data + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_INFORMATION_MISSING, if the + * api data was found in the saved data but the required information is + * missing + * @retval ::ROCPROFILER_STATUS_ERROR_INCORRECT_DOMAIN if the user sent a handle + * that is not related to the requested domain + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_roctx_tracer_api_data_info( + rocprofiler_session_id_t session_id, rocprofiler_tracer_roctx_api_data_info_t kind, + rocprofiler_tracer_api_data_handle_t api_data_id, rocprofiler_tracer_operation_id_t operation_id, + char** data) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** \defgroup hsa_tracer_api_data_group Tracer HSA API Data + * \ingroup tracing_api_group + * @{ + */ + +/** + * hsa Tracer Data Information Kinds + */ +typedef enum { + /** + * HSA Tracer Data kind that can be used to return to a pointer to all the + * API Call Data + */ + ROCPROFILER_HSA_FUNCTION_NAME = 0, + /** + * HSA API Data in string format. 
+ */ + ROCPROFILER_HSA_API_DATA_STR = 1, + /** + * HSA Activity Name + */ + ROCPROFILER_HSA_ACTIVITY_NAME = 2, + /** + * HSA Data + * User has to reinterpret_cast to hsa_api_data_t* + */ + ROCPROFILER_HSA_API_DATA = 3 +} rocprofiler_tracer_hsa_api_data_info_t; + +/** + * Query Tracer API Call Data Information size to allow the user to allocate + * the right size for the information data requested, the information will be + * collected using + * ::rocprofiler_tracer_api_data_handle_t by using + * ::rocprofiler_query_hsa_tracer_api_data_info and the user needs to identify one + * type of information available in + * ::rocprofiler_tracer_hsa_api_data_info_t + * + * @param[in] session_id Session id where this data was collected + * @param[in] kind The type of information needed + * @param[in] api_data_id API Data ID + * @param[in] operation_id API Operation ID + * @param[out] data_size API Data Information size + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_NOT_FOUND, if the api data + * was not found in the saved api data + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_INFORMATION_MISSING, if the + * api data was found in the saved data but the required information is + * missing + * @retval ::ROCPROFILER_STATUS_ERROR_INCORRECT_DOMAIN if the user sent a handle + * that is not related to the requested domain + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_hsa_tracer_api_data_info_size( + rocprofiler_session_id_t session_id, rocprofiler_tracer_hsa_api_data_info_t kind, + rocprofiler_tracer_api_data_handle_t api_data_id, rocprofiler_tracer_operation_id_t operation_id, + size_t* data_size) ROCPROFILER_VERSION_2_0; + +/** + * Query API Data Information using an allocated data pointer by the user, + * user can get the size of the data using + *
::rocprofiler_query_tracer_api_data_info_length, the user can get the data + * using ::rocprofiler_tracer_api_data_id_t and the user need to identify one + * type of information available in ::rocprofiler_tracer_api_data_info_t + * + * @param[in] session_id Session id where this data was collected + * @param[in] kind Information kind requested by the user + * @param[in] api_data_id API Data ID + * @param[in] operation_id API Operation ID + * @param[out] data API Data Data + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_NOT_FOUND, if the api data + * was not found in the saved api data + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_INFORMATION_MISSING, if the + * api data was found in the saved data but the required information is + * missing + * @retval ::ROCPROFILER_STATUS_ERROR_INCORRECT_DOMAIN if the user sent a handle + * that is not related to the requested domain + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_hsa_tracer_api_data_info( + rocprofiler_session_id_t session_id, rocprofiler_tracer_hsa_api_data_info_t kind, + rocprofiler_tracer_api_data_handle_t api_data_id, rocprofiler_tracer_operation_id_t operation_id, + char** data) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** \defgroup hip_tracer_api_data_group Tracer HIP API Data + * \ingroup tracing_api_group + * @{ + */ + +/** + * hip Tracer Data Information Kinds + */ +typedef enum { + // TODO(aelwazir): Get the data from hip_api_data_t + /** + * hip Tracer Data kind that can be used to return to a pointer to all the + * API Call Data + */ + ROCPROFILER_HIP_FUNCTION_NAME = 0, + /** + * Only available for HIP Functions that lead to kernel launch to get the + * kernel name + */ + ROCPROFILER_HIP_KERNEL_NAME = 1, + /** + * Only available to hip calls that has memory copy 
operation with source + * available + */ + ROCPROFILER_HIP_MEM_COPY_SRC = 2, + /** + * Only available to hip calls that have a memory copy operation with + * destination available + */ + ROCPROFILER_HIP_MEM_COPY_DST = 3, + /** + * Only available to hip calls that have a memory copy operation with data size + * available + */ + ROCPROFILER_HIP_MEM_COPY_SIZE = 4, + /** + * Reporting the whole API data as one string + */ + ROCPROFILER_HIP_API_DATA_STR = 5, + /** + * HIP Activity Name + */ + ROCPROFILER_HIP_ACTIVITY_NAME = 6, + /** + * Stream ID + */ + ROCPROFILER_HIP_STREAM_ID = 7, + /** + * HIP API Data + * User has to reinterpret_cast to hip_api_data_t* + */ + ROCPROFILER_HIP_API_DATA = 8 +} rocprofiler_tracer_hip_api_data_info_t; + +/** + * Query Tracer API Call Data Information size to allow the user to allocate + * the right size for the information data requested, the information will be + * collected using + * ::rocprofiler_tracer_api_data_handle_t by using + * ::rocprofiler_query_hip_tracer_api_data_info and the user needs to identify one + * type of information available in + * ::rocprofiler_tracer_hip_api_data_info_t + * + * @param[in] session_id Session id where this data was collected + * @param[in] kind The type of information needed + * @param[in] api_data_id API Data ID + * @param[in] operation_id API Operation ID + * @param[out] data_size API Data Information size + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_NOT_FOUND, if the api data + * was not found in the saved api data + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_INFORMATION_MISSING, if the + * api data was found in the saved data but the required information is + * missing + * @retval ::ROCPROFILER_STATUS_ERROR_INCORRECT_DOMAIN if the user sent a handle + * that is not related to the requested domain + */ +ROCPROFILER_API
rocprofiler_status_t rocprofiler_query_hip_tracer_api_data_info_size( + rocprofiler_session_id_t session_id, rocprofiler_tracer_hip_api_data_info_t kind, + rocprofiler_tracer_api_data_handle_t api_data_id, rocprofiler_tracer_operation_id_t operation_id, + size_t* data_size) ROCPROFILER_VERSION_2_0; + +/** + * Query API Data Information using an allocated data pointer by the user, + * user can get the size of the data using + * ::rocprofiler_query_hip_tracer_api_data_info_size, the user can get the data + * using ::rocprofiler_tracer_api_data_handle_t and the user needs to identify one + * type of information available in ::rocprofiler_tracer_hip_api_data_info_t + * + * @param[in] session_id Session id where this data was collected + * @param[in] kind Information kind requested by the user + * @param[in] api_data_id API Data ID + * @param[in] operation_id API Operation ID + * @param[out] data Requested API data information + * @retval ::ROCPROFILER_STATUS_SUCCESS, if the information was found + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_NOT_FOUND, if the api data + * was not found in the saved api data + * @retval ::ROCPROFILER_STATUS_ERROR_TRACER_API_DATA_INFORMATION_MISSING, if the + * api data was found in the saved data but the required information is + * missing + * @retval ::ROCPROFILER_STATUS_ERROR_INCORRECT_DOMAIN if the user sent a handle + * that is not related to the requested domain + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_query_hip_tracer_api_data_info( + rocprofiler_session_id_t session_id, rocprofiler_tracer_hip_api_data_info_t kind, + rocprofiler_tracer_api_data_handle_t api_data_id, rocprofiler_tracer_operation_id_t operation_id, + char** data) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** + * Tracing external ID + */ +typedef struct { + uint64_t id; +} rocprofiler_tracer_external_id_t; + +/** + * Tracing record, this will represent all the information
reported by the + * tracer regarding APIs and their data that were traced and collected + * by the tracer and requested by the user, this can be used as the type of + * the flushed records that is reported to the user using + * ::rocprofiler_buffer_async_callback_t + */ +typedef struct { + /** + * ROCProfiler General Record base header to identify the id and kind of every + * record + */ + rocprofiler_record_header_t header; + /** + * Tracing external ID + */ + rocprofiler_tracer_external_id_t external_id; + /** + * Activity domain id, represents the type of the APIs that are being traced + */ + rocprofiler_tracer_activity_domain_t domain; + /** + * Tracing Operation ID for HIP/HSA + */ + rocprofiler_tracer_operation_id_t operation_id; + /** + * API Data Handler to be used by + * ::rocprofiler_query_roctx_tracer_api_data_info or + * ::rocprofiler_query_hsa_tracer_api_data_info or + * ::rocprofiler_query_hip_tracer_api_data_info depending on the domain type + */ + rocprofiler_tracer_api_data_handle_t api_data_handle; + /** + * Activity correlation ID + */ + rocprofiler_tracer_activity_correlation_id_t correlation_id; + /** + * Timestamps + */ + rocprofiler_record_header_timestamp_t timestamps; + /** + * Agent identifier that can be used as a handler in + * ::rocprofiler_query_agent_info + */ + rocprofiler_agent_id_t agent_id; + /** + * Queue identifier that can be used as a handler in + * ::rocprofiler_query_queue_info + */ + rocprofiler_queue_id_t queue_id; + /** + * Thread id + */ + rocprofiler_thread_id_t thread_id; +} rocprofiler_record_tracer_t; + +/** + * Kernel dispatch correlation ID, unique across all dispatches + */ +typedef struct { + uint64_t value; +} rocprofiler_kernel_dispatch_id_t; + +/** + * An individual PC sample + */ +typedef struct { + /** + * Kernel dispatch ID. 
This is used by PC sampling to associate samples with + * individual dispatches and is unrelated to any user-supplied correlation ID + */ + rocprofiler_kernel_dispatch_id_t dispatch_id; + union { + /** + * Host timestamp + */ + rocprofiler_timestamp_t timestamp; + /** + * GPU clock counter (not currently used) + */ + uint64_t cycle; + }; + /** + * Sampled program counter + */ + uint64_t pc; + /** + * Sampled shader element + */ + uint32_t se; + /** + * Sampled GPU agent + */ + rocprofiler_agent_id_t gpu_id; +} rocprofiler_pc_sample_t; + +/** + * PC sample record: contains the program counter/instruction pointer observed + * during periodic sampling of a kernel + */ +typedef struct { + /** + * ROCProfiler General Record base header to identify the id and kind of every + * record + */ + rocprofiler_record_header_t header; + /** + * PC sample data + */ + rocprofiler_pc_sample_t pc_sample; +} rocprofiler_record_pc_sample_t; + +/** @} */ + +/** \defgroup memory_storage_buffer_group Memory Storage Buffer + * Sessions + * + * In this group, Memory Pools and their types will be discussed. + * @{ + */ + +/** + * Buffer Property Options + */ +typedef enum { + /** + * Flush interval + */ + ROCPROFILER_BUFFER_PROPERTY_KIND_INTERVAL_FLUSH = 0, + // Periodic Flush + // Size + // Think of using the kind as an end of the array!!?? +} rocprofiler_buffer_property_kind_t; + +typedef struct { + rocprofiler_buffer_property_kind_t kind; + uint64_t value; +} rocprofiler_buffer_property_t; + +typedef struct { + uint64_t value; +} rocprofiler_buffer_id_t; + +typedef struct { + uint64_t value; +} rocprofiler_filter_id_t; + +/** + * Memory pool buffer callback. + * The callback that will be invoked when a memory pool buffer becomes full or + * is flushed by the user or using flush thread that was initiated using the + * flush interval set by the user ::rocprofiler_create_session. 
+ * The user needs to read the record header to identify the record kind and + * depending on the kind they can reinterpret_cast to either + * ::rocprofiler_record_profiler_t if the kind was ::ROCPROFILER_PROFILER_RECORD or + * ::rocprofiler_record_tracer_t if the kind is ::ROCPROFILER_TRACER_RECORD + * + * @param[in] begin pointer to first entry in the buffer. + * @param[in] end pointer to one past the end entry in the buffer. + * @param[in] session_id The session id associated with that record + * @param[in] buffer_id The buffer id associated with that record + */ +typedef void (*rocprofiler_buffer_callback_t)(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* end, + rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id); + +/** + * Flush specific Buffer + * + * @param[in] session_id The created session id + * @param[in] buffer_id The buffer ID of the created filter group + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_NOT_FOUND may return if + * the session is not found + * @retval ::ROCPROFILER_STATUS_ERROR_CORRUPTED_SESSION_BUFFER may return if + * the session buffer is corrupted + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_flush_data(rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) ROCPROFILER_VERSION_2_0; + +/** + * Get a pointer to the next profiling record. + * A memory pool generates buffers that contain multiple profiling records. + * This function steps to the next profiling record. + * + * @param[in] record Pointer to the current profiling record in a memory pool + * buffer. + * @param[out] next Pointer to the following profiling record in the memory + * pool buffer. 
+ * @param[in] session_id Session ID + * @param[in] buffer_id Buffer ID + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_RECORD_CORRUPTED if the function couldn't + * get the next record because of corrupted data reported by the previous + * record + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_next_record(const rocprofiler_record_header_t* record, + const rocprofiler_record_header_t** next, + rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** \defgroup sessions_handling_group ROCProfiler Sessions + * @{ + */ + +// TODO(aelwazir): Replay mode naming !!?? (If changed, reflect on start&stop) +/** + * Replay Profiling Modes. + */ +typedef enum { + /** + * No Replay to be done, Mostly for tracing tool or if the user wants to make + * sure that no replays will be done + */ + ROCPROFILER_NONE_REPLAY_MODE = -1, + /** + * Replaying the whole application to get multi passes (Not Yet Supported) + */ + ROCPROFILER_APPLICATION_REPLAY_MODE = 0, + /** + * Replaying every kernel dispatch to get multi passes + */ + ROCPROFILER_KERNEL_REPLAY_MODE = 1, + /** + * Replaying an user-specified range to get multi passes (Not Yet Supported) + */ + ROCPROFILER_USER_REPLAY_MODE = 2 +} rocprofiler_replay_mode_t; + +/** + * Create Session + * A ROCProfiler Session is having enough information about what needs to be + * collected or traced and it allows the user to start/stop profiling/tracing + * whenever required. 
+ * Session will hold multiple mode, that can be added using + * ::rocprofiler_add_session_mode, it is required to add at least one session + * mode, if it is tracing or profiling and using ::rocprofiler_session_set_filter + * can set specific data that is required for the profiler or the tracer such + * as the counters for profiling or the APIs for tracing before calling + * ::rocprofiler_start_session, also + * ::rocprofiler_session_set_filter can be used to set optional filters like + * specific GPUs/Kernel Names/API Names and more. Session can be started using + * ::rocprofiler_start_session and can be stopped using + * ::rocprofiler_terminate_session + * + * @param[in] replay_mode The Replay strategy that should be used if replay is + * needed + * @param[out] session_id Pointer to the created session id, the session is + * alive up till ::rocprofiler_destroy_session being called, however, the session + * id can be + * used while the session is active which can be activated using + * ::rocprofiler_start_session and deactivated using + * ::rocprofiler_terminate_session but ::rocprofiler_flush_data can use session_id + * even if it is deactivated for flushing the saved records + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_create_session(rocprofiler_replay_mode_t replay_mode, + rocprofiler_session_id_t* session_id) ROCPROFILER_VERSION_2_0; + +/** + * Destroy Session + * Destroy session created by ::rocprofiler_create_session, please refer to + * the samples for how to use. + * This marks the end of session and its own id life and none of the session + * related functions will be available after this call. 
+ * + * @param[in] session_id The created session id + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_NOT_FOUND may return if + * the session is not found + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_destroy_session(rocprofiler_session_id_t session_id) ROCPROFILER_VERSION_2_0; + +/** \defgroup session_filter_group Session Filters Handling + * \ingroup sessions_handling_group + * @{ + */ +typedef enum { + /** + * Kernel Dispatch Timestamp collection. + */ + ROCPROFILER_DISPATCH_TIMESTAMPS_COLLECTION = 1, + /** + * GPU Application counter collection. + */ + ROCPROFILER_COUNTERS_COLLECTION = 2, + /** + * PC Sampling collection. (Not Yet Supported) + */ + ROCPROFILER_PC_SAMPLING_COLLECTION = 3, + /** + * ATT Tracing. (Not Yet Supported) + */ + ROCPROFILER_ATT_TRACE_COLLECTION = 4, + /** + * SPM collection. (Not Yet Supported) + */ + ROCPROFILER_SPM_COLLECTION = 5, + /** + * HIP/HSA/ROCTX/SYS Trace. + */ + ROCPROFILER_API_TRACE = 6 +} rocprofiler_filter_kind_t; + +/** + * Data Filter Types to be used by ::rocprofiler_session_set_filter to add + * filters to a specific session + */ +typedef enum { + /** + * Add HSA API calls that will be only traced (ex. hsa_amd_memory_async_copy) + */ + ROCPROFILER_FILTER_HSA_TRACER_API_FUNCTIONS = 1, + /** + * Add HIP API calls that will be only traced (ex. hipLaunchKernel) + */ + ROCPROFILER_FILTER_HIP_TRACER_API_FUNCTIONS = 2, + /** + * Add GPU names that will be only profiled or traced + */ + ROCPROFILER_FILTER_GPU_NAME = 3, + // TODO(aelwazir): Add more clear description on how to use? 
+ /** + * Add Range of calls to be traced or kernels to be profiled + */ + ROCPROFILER_FILTER_RANGE = 4, + /** + * Add Kernel names that will be only profiled or traced + */ + ROCPROFILER_FILTER_KERNEL_NAMES = 5 +} rocprofiler_filter_property_kind_t; + +// TODO(aelwazir): Another way to define this as needed +typedef const char* rocprofiler_hip_function_name_t; +typedef const char* rocprofiler_hsa_function_name_t; + +// ATT tracing parameter names +typedef enum { + ROCPROFILER_ATT_COMPUTE_UNIT_TARGET = 0, + ROCPROFILER_ATT_VM_ID_MASK = 1, + ROCPROFILER_ATT_MASK = 2, + ROCPROFILER_ATT_TOKEN_MASK = 3, + ROCPROFILER_ATT_TOKEN_MASK2 = 4, + ROCPROFILER_ATT_SE_MASK = 5, + ROCPROFILER_ATT_SAMPLE_RATE = 6, + ROCPROFILER_ATT_PERF_MASK = 240, + ROCPROFILER_ATT_PERF_CTRL = 241, + ROCPROFILER_ATT_PERFCOUNTER = 242, + ROCPROFILER_ATT_PERFCOUNTER_NAME = 243, + ROCPROFILER_ATT_MAXVALUE +} rocprofiler_att_parameter_name_t; + +// att tracing parameters object +typedef struct { + rocprofiler_att_parameter_name_t parameter_name; + union { + uint32_t value; + const char* counter_name; + }; +} rocprofiler_att_parameter_t; + +/** + * Filter Data Type + * filter data will be used to report required and optional filters for the + * sessions using ::rocprofiler_session_add_filters + */ +typedef struct { + /** + * Filter Property kind + */ + rocprofiler_filter_property_kind_t kind; + // TODO(aelwazir): get HIP or HSA or counters as enums + /** + * Array of data required for the filter type chosen + */ + union { + const char** name_regex; + rocprofiler_hip_function_name_t* hip_functions_names; + rocprofiler_hsa_function_name_t* hsa_functions_names; + uint32_t range[2]; + }; + /** + * Data array count + */ + uint64_t data_count; +} rocprofiler_filter_property_t; + +typedef struct { + /** + * Counters to profile + */ + const char** counters_names; + /** + * Counters count + */ + int counters_count; + /** + * Sampling rate + */ + uint32_t sampling_rate; + /** + * Preferred agents to collect SPM 
on + */ + rocprofiler_agent_id_t* gpu_agent_id; + +} rocprofiler_spm_parameter_t; + +/** + * Filter Kind Data + */ +typedef union { + /** + * APIs to trace + */ + rocprofiler_tracer_activity_domain_t* trace_apis; + /** + * Counters to profile + */ + const char** counters_names; + /** + * att parameters + */ + rocprofiler_att_parameter_t* att_parameters; + /** + * spm counters parameters + */ + rocprofiler_spm_parameter_t* spm_parameters; +} rocprofiler_filter_data_t; + +/** + * Create Session Filter + * This function will create a filter and associate it with a specific session + * For every kind, one filter only is allowed per session + * + * @param[in] session_id Session id where these filters will be applied to + * @param[in] filter_kind Filter kind associated with these filters + * @param[in] data Pointer to the filter data + * @param[in] data_count Count of data in the data array given in ::data + * @param[out] filter_id The id of the filter created + * @param[in] property property needed for more filtering requests by the + * user (Only one property is allowed per filter) (Optional) + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully.
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_SESSION_NOT_FOUND Couldn't find session + * associated with the given session identifier + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_FILTER_DATA_MISMATCH The session + * filter can't accept the given data + * @retval ::ROCPROFILER_STATUS_ERROR_FILTER_DATA_CORRUPTED Data can't be read or + * corrupted + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_create_filter(rocprofiler_session_id_t session_id, + rocprofiler_filter_kind_t filter_kind, + rocprofiler_filter_data_t data, + uint64_t data_count, + rocprofiler_filter_id_t* filter_id, + rocprofiler_filter_property_t property = {}) ROCPROFILER_VERSION_2_0; + +/** + * Set Session Filter Buffer + * This function will associate buffer to a specific filter + * + * if the user wants to get the API traces for the api calls synchronously then + * the user is required to call ::rocprofiler_set_api_trace_sync_callback + * + * @param[in] session_id Session id where these filters will applied to + * @param[in] filter_id The id of the filter + * @param[in] buffer_id The id of the buffer + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. 
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_SESSION_NOT_FOUND Couldn't find session + * associated with the given session identifier + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_set_filter_buffer(rocprofiler_session_id_t session_id, + rocprofiler_filter_id_t filter_id, + rocprofiler_buffer_id_t buffer_id) ROCPROFILER_VERSION_2_0; + +/** + * Synchronous Callback + * To be only used by ::rocprofiler_set_api_trace_sync_callback, please refer to + * ::rocprofiler_set_api_trace_sync_callback for more details + * + * @param[in] record The tracer record (passed by value). + * @param[in] session_id The session id associated with that record + */ +typedef void (*rocprofiler_sync_callback_t)(rocprofiler_record_tracer_t record, + rocprofiler_session_id_t session_id); + +/** + * Set Session API Tracing Filter Synchronous Callback + * This function will associate a synchronous callback with a specific filter + * + * Currently Synchronous callbacks are only available to API Tracing filters + * for the api calls tracing and not available for the api activities or any + * other filter type, the user is responsible to create and set buffer for the + * other types + * + * @param[in] session_id Session id where these filters will be applied to + * @param[in] filter_id The id of the filter + * @param[in] callback Synchronous callback + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully.
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_SESSION_NOT_FOUND, Couldn't find session + * associated with the given session identifier + * @retval ::ROCPROFILER_STATUS_ERROR_FILTER_NOT_SUPPORTED, if the filter is not + * related to API tracing + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_set_api_trace_sync_callback( + rocprofiler_session_id_t session_id, rocprofiler_filter_id_t filter_id, + rocprofiler_sync_callback_t callback) ROCPROFILER_VERSION_2_0; + +/** + * Destroy Session Filter + * This function will destroy a specific filter + * + * @param[in] session_id Session id where these filters will applied to + * @param[in] filter_id The id of the filter + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_SESSION_NOT_FOUND Couldn't find session + * associated with the given session identifier + * @retval ::ROCPROFILER_STATUS_FILTER_NOT_FOUND Couldn't find session filter + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_destroy_filter(rocprofiler_session_id_t session_id, + rocprofiler_filter_id_t filter_id) ROCPROFILER_VERSION_2_0; + +/** + * Create Buffer + * This function will create a buffer that can be associated with a filter + * + * @param[in] session_id Session id where these filters will applied to + * @param[in] buffer_callback Providing a callback for the buffer specialized + * for that filters + * @param[in] buffer_size Providing size for the buffer that will be created + * @param[in] buffer_properties Array of Flush Properties provided by the user + * @param[in] buffer_properties_count The count of the flush properties in the + * array + * @param[out] buffer_id Buffer id that was created + * @retval 
::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_SESSION_NOT_FOUND Couldn't find session + * associated with the given session identifier + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_PROPERTIES_MISMATCH The given + * properties data are mismatching the properties kind + * @retval ::ROCPROFILER_STATUS_ERROR_PROPERTY_DATA_CORRUPTED Data can't be read + * or corrupted + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_create_buffer( + rocprofiler_session_id_t session_id, rocprofiler_buffer_callback_t buffer_callback, + size_t buffer_size, rocprofiler_buffer_id_t* buffer_id) ROCPROFILER_VERSION_2_0; + +/** + * Setting Buffer Properties + * This function will set buffer properties + * + * @param[in] session_id Session id where the buffer is associated with + * @param[in] buffer_id Buffer id of the buffer that the properties are going + * to be associated with for that filters + * @param[in] buffer_properties Array of Flush Properties provided by the user + * @param[in] buffer_properties_count The count of the flush properties in the + * array + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. 
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_SESSION_NOT_FOUND Couldn't find session + * associated with the given session identifier + * @retval ::ROCPROFILER_STATUS_BUFFER_NOT_FOUND Couldn't find buffer + * associated with the given buffer identifier + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_PROPERTIES_MISMATCH The given + * properties data are mismatching the properties kind + * @retval ::ROCPROFILER_STATUS_ERROR_PROPERTY_DATA_CORRUPTED Data can't be read + * or corrupted + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_set_buffer_properties( + rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id, + rocprofiler_buffer_property_t* buffer_properties, uint32_t buffer_properties_count) ROCPROFILER_VERSION_2_0; + +/** + * Destroy Buffer + * This function will destroy a buffer given its id and session id + * + * @param[in] session_id Session id where these filters will be applied to + * @param[in] buffer_id Buffer id that will be destroyed + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully.
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_SESSION_NOT_FOUND Couldn't find session + * associated with the given session identifier + * @retval ::ROCPROFILER_STATUS_BUFFER_NOT_FOUND Couldn't find buffer + * associated with the given buffer identifier + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_PROPERTIES_MISMATCH The given + * properties data are mismatching the properties kind + * @retval ::ROCPROFILER_STATUS_ERROR_PROPERTY_DATA_CORRUPTED Data can't be read + * or corrupted + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_destroy_buffer(rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** + * Create Ready Session + * A one call to create a ready profiling or tracing session, so that the + * session will be ready to collect counters with a one call to + * ::rocprofiler_start_session. + * ::rocprofiler_session_set_filter can be used to set optional filters like + * specific GPUs/Kernel Names/Counter Names and more. The Creation of the + * session is responsible for the creation of the buffer saving the records + * generated while the session is active. 
Session can be started using + * ::rocprofiler_start_session and can be stopped using + * ::rocprofiler_terminate_session + * + * @param[in] counters counter filter data, it is required from the user to + * create the filter with ::ROCPROFILER_FILTER_PROFILER_COUNTER_NAMES and to + * provide an array of counter names needed and their count + * @param[in] replay_mode The Replay strategy that should be used if replay is + * needed + * @param[in] filter_kind Filter kind associated with these filters + * @param[in] data Pointer to the filter data + * @param[in] data_count Filter data array count + * @param[in] buffer_size Size of the memory pool that will be used to save the + * data from profiling or/and tracing, if the buffer was allocated before it + * will be reallocated with the new size in addition to the old size + * @param[in] buffer_callback Asynchronous callback using Memory buffers saving + * the data and then it will be flushed if the user called + * ::rocprofiler_flush_data or if the buffer is full or if the application + * finished execution + * @param[out] session_id Pointer to the created session id, the session is + * alive up till ::rocprofiler_destroy_session being called, however, the session + * id can be used while the session is active which can be activated using + * ::rocprofiler_start_session and deactivated using + * ::rocprofiler_terminate_session but ::rocprofiler_flush_data can use session_id + * even if it is deactivated for flushing the saved records + * @param[in] property Filter Property (Optional) + * @param[in] callback Synchronous callback for API traces (Optional) + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. 
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_MODE_FILTER_MISMATCH The session + * doesn't have the required mode for that filter type + * @retval ::ROCPROFILER_STATUS_ERROR_FILTER_DATA_CORRUPTED Data can't be read or + * corrupted + * @retval ::ROCPROFILER_STATUS_ERROR_INCORRECT_SIZE If the size is less than one + * potential record size + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_create_ready_session( + rocprofiler_replay_mode_t replay_mode, rocprofiler_filter_kind_t filter_kind, + rocprofiler_filter_data_t data, uint64_t data_count, size_t buffer_size, + rocprofiler_buffer_callback_t buffer_callback, rocprofiler_session_id_t* session_id, + rocprofiler_filter_property_t property = {}, rocprofiler_sync_callback_t callback = nullptr) ROCPROFILER_VERSION_2_0; + +// TODO(aelwazir): Multiple sessions activate for different set of filters +/** + * Activate Session + * Activating session created by ::rocprofiler_create_session, please refer to + * the samples for how to use. 
+ * + * @param[in] session_id Session ID representing the created session + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_NOT_FOUND may return if + * the session is not found + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_MODE_NOT_ADDED if there is no + * session_mode added + * @retval ::ROCPROFILER_STATUS_ERROR_MISSING_SESSION_CALLBACK if any + * session_mode is missing callback set + * @retval ::ROCPROFILER_STATUS_ERROR_HAS_ACTIVE_SESSION if there is already + * active session + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_start_session(rocprofiler_session_id_t session_id) ROCPROFILER_VERSION_2_0; + +/** + * Deactivate Session + * Deactivate session created by ::rocprofiler_create_session, please refer to + * the samples for how to use. + * + * @param[in] session_id Session ID for the session that will be terminated + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_NOT_FOUND may return if + * the session is not found + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_NOT_ACTIVE if the session is not + * active + */ + +ROCPROFILER_API rocprofiler_status_t rocprofiler_terminate_session(rocprofiler_session_id_t session_id) ROCPROFILER_VERSION_2_0; + +/** \defgroup session_range_group Session Range Labeling + * \ingroup sessions_handling_group + * @{ + */ + +/** + * Setting a label to a block range + * This can be used to label a range of code that is having active profiling + * session or labeling a pass + * + * @param[in] label The label given for a certain block or pass to name/label. + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. 
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_CORRUPTED_LABEL_DATA may return if + * the label pointer can't be read by the API + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_push_range(const char* label) ROCPROFILER_VERSION_2_0; + +/** + * Setting an endpoint for a range + * This function can be used to set an endpoint to range labeled by + * ::rocprofiler_push_range + * + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_RANGE_STACK_IS_EMPTY may return if + * ::rocprofiler_push_range wasn't called correctly + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_pop_range() ROCPROFILER_VERSION_2_0; + +/** @} */ + +/** \defgroup session_user_replay_pass_group Session User Replay Pass Mode + * \ingroup sessions_handling_group + * @{ + */ + +/** + * Create and Start a pass + * A Pass is a block of code that can be replayed if required by the + * profiling/tracing and it mainly depends on the profiling data given in the + * ::rocprofiler_create_session + * + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_SESSION_NOT_FOUND If no active session is + * found + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_start_replay_pass() ROCPROFILER_VERSION_2_0; + +/** + * End a pass + * End a pass created and started by ::rocprofiler_start_replay_pass + * + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully.
+ * @retval ::ROCPROFILER_STATUS_ERROR_NOT_INITIALIZED, if rocprofiler_initialize + * wasn't called before or if rocprofiler_finalize is called + * @retval ::ROCPROFILER_STATUS_ERROR_PASS_NOT_STARTED if there is no pass + * started before this call + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_end_replay_pass() ROCPROFILER_VERSION_2_0; + +/** @} */ +/** @} */ + +/** \defgroup device_profiling Device Profiling API + * @{ + */ + +typedef struct { + double value; +} rocprofiler_counter_value_t; + +typedef struct { + char metric_name[64]; + rocprofiler_counter_value_t value; +} rocprofiler_device_profile_metric_t; + +/** + * Create a device profiling session + * + * A device profiling session allows the user to profile the GPU device + * for counters irrespective of the running applications on the GPU. + * This is different from application profiling. A device profiling session + * doesn't care about the host running processes and threads. It directly + * provides low level profiling information. + * + * @param[in] counter_names The names of the counters to be collected. + * @param[in] num_counters The number of counters specified to be collected + * @param[out] session_id Pointer to the created session id. + * @param[in] cpu_index index of the cpu to be used + * @param[in] gpu_index index of the gpu to be used + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_device_profiling_session_create( + const char** counter_names, uint64_t num_counters, rocprofiler_session_id_t* session_id, + int cpu_index, int gpu_index) ROCPROFILER_VERSION_2_0; + +/** + * Start the device profiling session that was created previously. + * This will enable the GPU device to start incrementing counters + * + * @param[in] session_id session id of the session to start + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully.
+ */ +ROCPROFILER_API rocprofiler_status_t +rocprofiler_device_profiling_session_start(rocprofiler_session_id_t session_id) ROCPROFILER_VERSION_2_0; + +/** + * Poll the device profiling session to read counters from the GPU device. + * This will read out the values of the counters from the GPU device at the + * specific instant when this API is called. This is a thread-blocking call. + * Any thread that calls this API will have to wait until + * the counter values have been read out. + * + * @param[in] session_id session id of the session to poll + * @param[out] data records of counter data read out from device + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + */ +ROCPROFILER_API rocprofiler_status_t rocprofiler_device_profiling_session_poll( + rocprofiler_session_id_t session_id, rocprofiler_device_profile_metric_t* data) ROCPROFILER_VERSION_2_0; + +/** + * Stop the device profiling session that was created previously. + * This will inform the GPU device to stop counters collection. + * + * @param[in] session_id session id of the session to stop + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully. + */ +ROCPROFILER_API rocprofiler_status_t +rocprofiler_device_profiling_session_stop(rocprofiler_session_id_t session_id) ROCPROFILER_VERSION_2_0; + +/** + * Destroy the device profiling session that was created previously. + * + * @param[in] session_id session id of the session to destroy + * @retval ::ROCPROFILER_STATUS_SUCCESS The function has been executed + * successfully.
+ */ +ROCPROFILER_API rocprofiler_status_t +rocprofiler_device_profiling_session_destroy(rocprofiler_session_id_t session_id) ROCPROFILER_VERSION_2_0; + +/** @} */ + +#endif + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// Old ROCProfiler +//////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// -// Returning library version -uint32_t rocprofiler_version_major(); -uint32_t rocprofiler_version_minor(); +#include +#include +#include +#include +#include //////////////////////////////////////////////////////////////////////////////// // Global properties structure @@ -243,7 +2541,7 @@ typedef struct { // Profiling callback data typedef struct { hsa_agent_t agent; // GPU agent handle - uint32_t agent_index; // GPU index + uint32_t agent_index; // GPU index (GPU Driver Node ID as reported in the sysfs topology) const hsa_queue_t* queue; // HSA queue uint64_t queue_index; // Index in the queue uint32_t queue_id; // Queue id @@ -362,7 +2660,7 @@ typedef union { // Profiling info data typedef struct { - uint32_t agent_index; // GPU HSA agent index + uint32_t agent_index; // GPU HSA agent index (GPU Driver Node ID as reported in the sysfs topology) rocprofiler_info_kind_t kind; // info data kind union { struct { diff --git a/inc/rocprofiler_plugin.h b/inc/rocprofiler_plugin.h new file mode 100644 index 00000000..e660853a --- /dev/null +++ b/inc/rocprofiler_plugin.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +/** \section rocprofiler_plugin_api ROCProfiler Plugin API + * + * The ROCProfiler Plugin API is used by the ROCProfiler Tool to output all + * profiling information. Different implementations of the ROCProfiler Plugin + * API can be developed that output the data in different formats. The + * ROCProfiler Tool can be configured to load a specific library that supports + * the user desired format. + * + * The API is not thread safe. It is the responsibility of the ROCProfiler Tool + * to ensure the operations are synchronized and not called concurrently. There + * is no requirement for the ROCProfiler Tool to report trace data in any + * specific order. If the format supported by plugin requires specific + * ordering, it is the responsibility of the plugin implementation to perform + * any necessary sorting. + */ + +/** + * \file + * ROCProfiler Tool Plugin API interface. 
+ */ + +#ifndef ROCPROFILER_PLUGIN_H_ +#define ROCPROFILER_PLUGIN_H_ + +#include + +#include "rocprofiler.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +/** \defgroup rocprofiler_plugins ROCProfiler Plugin API Specification + * @{ + */ + +/** \defgroup initialization_group Initialization and Finalization + * \ingroup rocprofiler_plugins + * + * The ROCProfiler Plugin API must be initialized before using any of the + * operations to report trace data, and finalized after the last trace data has + * been reported. + * + * @{ + */ + +/** + * Initialize plugin. + * Must be called before any other operation. + * + * @param[in] rocprofiler_major_version The major version of the ROCProfiler API + * being used by the ROCProfiler Tool. An error is reported if this does not + * match the major version of the ROCProfiler API used to build the plugin + * library. This ensures compatibility of the trace data format. + * @param[in] rocprofiler_minor_version The minor version of the ROCProfiler API + * being used by the ROCProfiler Tool. An error is reported if the + * \p rocprofiler_major_version matches and this is greater than the minor + * version of the ROCProfiler API used to build the plugin library. This ensures + * compatibility of the trace data format. + * @return Returns 0 on success and -1 on error. + */ +ROCPROFILER_EXPORT int rocprofiler_plugin_initialize(uint32_t rocprofiler_major_version, + uint32_t rocprofiler_minor_version); + +/** + * Finalize plugin. + * This must be called after ::rocprofiler_plugin_initialize and after all + * profiling data has been reported by + * ::rocprofiler_plugin_write_kernel_records + */ +ROCPROFILER_EXPORT void rocprofiler_plugin_finalize(); + +/** @} */ + +/** \defgroup profiling_record_write_functions Profiling data reporting + * \ingroup rocprofiler_plugins + * Operations to output profiling data. + * @{ + */ + +// TODO(aelwazir): Recheck wording of the description + +/** + * Report Buffer Records. 
+ * + * @param[in] begin Pointer to the first record. + * @param[in] end Pointer to one past the last record. + * @param[in] session_id Session ID + * @param[in] buffer_id Buffer ID + * @return Returns 0 on success and -1 on error. + */ +ROCPROFILER_EXPORT int rocprofiler_plugin_write_buffer_records(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* end, + rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id); + +/** + * Report Synchronous Record. + * + * @param[in] record Pointer to the Synchronous Tracer record. + * @param[in] session_id Session ID + * @return Returns 0 on success and -1 on error. + */ +ROCPROFILER_EXPORT int rocprofiler_plugin_write_record(rocprofiler_record_tracer_t record, + rocprofiler_session_id_t session_id); + +/** @} */ + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif /* ROCPROFILER_PLUGIN_H_ */ diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt new file mode 100644 index 00000000..1576b72a --- /dev/null +++ b/plugin/CMakeLists.txt @@ -0,0 +1,26 @@ +################################################################################ +## Copyright (c) 2022 Advanced Micro Devices, Inc. +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal in the Software without restriction, including without limitation the +## rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +## sell copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in +## all copies or substantial portions of the Software. 
+## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +## IN THE SOFTWARE. +################################################################################ + +add_subdirectory(file) +add_subdirectory(perfetto) +add_subdirectory(ctf) +add_subdirectory(att) diff --git a/plugin/att/CMakeLists.txt b/plugin/att/CMakeLists.txt new file mode 100644 index 00000000..d3b666b0 --- /dev/null +++ b/plugin/att/CMakeLists.txt @@ -0,0 +1,66 @@ +# ############################################################################## +# # Copyright (c) 2022 Advanced Micro Devices, Inc. # # Permission is hereby +# granted, free of charge, to any person obtaining a copy # of this software and +# associated documentation files (the "Software"), to # deal in the Software +# without restriction, including without limitation the # rights to use, copy, +# modify, merge, publish, distribute, sublicense, and/or # sell copies of the +# Software, and to permit persons to whom the Software is # furnished to do so, +# subject to the following conditions: # # The above copyright notice and this +# permission notice shall be included in # all copies or substantial portions of +# the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +# KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# Locate the ATT parser library that att.py loads through ctypes.
find_library(
  ROCPROFV2_ATT rocprofv2_att
  HINTS ${CMAKE_INSTALL_PREFIX}
  PATHS ${ROCM_PATH}
  PATH_SUFFIXES hsa-amd-aqlprofile)

# CMake only expands variables written as ${VAR}; a bare `$ROCPROFV2_ATT` is
# passed through as literal text, so the exported path was never valid. Note
# that set(ENV{...}) only affects the CMake process itself (configure time),
# not the later build or run environment.
if(ROCPROFV2_ATT)
  set(ENV{ROCPROFV2_ATT_LIB_PATH} "${ROCPROFV2_ATT}")
else()
  message(STATUS "rocprofv2_att library not found; "
                 "ROCPROFV2_ATT_LIB_PATH is left unset")
endif()

# Building att plugin library
file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp)
file(GLOB FILE_SOURCES att.cpp)
add_library(att_plugin SHARED ${FILE_SOURCES} ${ROCPROFILER_UTIL_SRC_FILES})

set_target_properties(
  att_plugin
  PROPERTIES CXX_VISIBILITY_PRESET hidden
             LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/../exportmap
             LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR})

target_compile_definitions(att_plugin PRIVATE HIP_PROF_HIP_API_STRING=1
                                              __HIP_PLATFORM_HCC__=1)

target_include_directories(
  att_plugin PRIVATE ${PROJECT_SOURCE_DIR}/inc ${PROJECT_SOURCE_DIR}
                     ${CMAKE_CURRENT_SOURCE_DIR})
# Export only the symbols listed in the shared plugin export map and fail the
# link on unresolved references.
target_link_options(
  att_plugin PRIVATE
  -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exportmap
  -Wl,--no-undefined)
target_link_libraries(att_plugin PRIVATE ${ROCPROFILER_TARGET} systemd
                                         hsa-runtime64::hsa-runtime64 stdc++fs)

install(TARGETS att_plugin
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}
        COMPONENT runtime)

# Stage the Python post-processing scripts and the web UI next to each other
# in the build tree so they can be installed as one directory.
configure_file(att.py att/att.py COPYONLY)
configure_file(trace_view.py att/trace_view.py COPYONLY)
# configure_file(t.db att/t.db COPYONLY)
configure_file(ui/index.html att/ui/index.html COPYONLY)
configure_file(ui/logo.svg att/ui/logo.svg COPYONLY)
configure_file(ui/styles.css att/ui/styles.css COPYONLY)
# configure_file(ui/trace.json att/ui/trace.json COPYONLY)
install(
  DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/att
  DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/rocprofiler
  USE_SOURCE_PERMISSIONS
  COMPONENT runtime)
*/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocprofiler.h" +#include "rocprofiler_plugin.h" +#include "../utils.h" + +namespace { + +class att_plugin_t { + public: + att_plugin_t() {} + + std::mutex writing_lock; + bool is_valid_{true}; + + inline bool att_file_exists(const std::string& name) { + struct stat buffer; + return stat(name.c_str(), &buffer) == 0; + } + + bool IsValid() const { return is_valid_; } + + void FlushATTRecord(const rocprofiler_record_att_tracer_t* att_tracer_record, + rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) { + std::lock_guard lock(writing_lock); + + if (!att_tracer_record) { + printf("No att data buffer received\n"); + return; + } + + size_t name_length; + CHECK_ROCPROFILER(rocprofiler_query_kernel_info_size(ROCPROFILER_KERNEL_NAME, + att_tracer_record->kernel_id, &name_length)); + const char* kernel_name_c = static_cast(malloc(name_length * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_kernel_info(ROCPROFILER_KERNEL_NAME, + att_tracer_record->kernel_id, &kernel_name_c)); + + std::string name_demangled = rocmtools::truncate_name(rocmtools::cxx_demangle(kernel_name_c)); + + // Get the number of shader engine traces + int se_num = att_tracer_record->shader_engine_data_count; + std::string outpath; + if (getenv("OUTPUT_PATH") == nullptr) { + outpath = ""; + } else { + outpath = std::string(getenv("OUTPUT_PATH")) + "/"; + } + // Find if this filename already exists. If so, increment vname. 
+ int file_iteration = -1; + bool bIncrementVersion = true; + while (bIncrementVersion) { + file_iteration += 1; + std::string fss = name_demangled + "_v" + std::to_string(file_iteration); + bIncrementVersion = att_file_exists(outpath + fss + "_kernel.txt"); + } + + std::string fname = + outpath + name_demangled + "_v" + std::to_string(file_iteration) + "_kernel.txt"; + std::ofstream(fname.c_str()) << name_demangled << ": " << kernel_name_c << '\n'; + + // iterate over each shader engine att trace + for (int i = 0; i < se_num; i++) { + if (!att_tracer_record->shader_engine_data && + !att_tracer_record->shader_engine_data[i].buffer_ptr) + continue; + printf("--------------collecting data for shader_engine %d---------------\n", i); + rocprofiler_record_se_att_data_t* se_att_trace = &att_tracer_record->shader_engine_data[i]; + uint32_t size = se_att_trace->buffer_size; + const char* data_buffer_ptr = reinterpret_cast(se_att_trace->buffer_ptr); + + // dump data in binary format + std::ostringstream oss; + oss << outpath + name_demangled << "_v" << file_iteration << "_se" << i << ".att"; + std::ofstream out(oss.str().c_str(), std::ios::binary); + if (out.is_open()) { + out.write((char*)data_buffer_ptr, size); + out.close(); + } else { + std::cerr << "\t" << __FUNCTION__ << " Failed to open file: " << oss.str().c_str() << '\n'; + } + } + } + + int WriteBufferRecords(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* end, + rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) { + while (begin < end) { + if (!begin) return 0; + switch (begin->kind) { + case ROCPROFILER_PROFILER_RECORD: + case ROCPROFILER_TRACER_RECORD: + case ROCPROFILER_PC_SAMPLING_RECORD: + case ROCPROFILER_SPM_RECORD: + printf("Invalid record Kind: %d", begin->kind); + break; + + case ROCPROFILER_ATT_TRACER_RECORD: { + rocprofiler_record_att_tracer_t* att_record = + const_cast( + reinterpret_cast(begin)); + FlushATTRecord(att_record, session_id, buffer_id); 
+ break; + } + } + rocprofiler_next_record(begin, &begin, session_id, buffer_id); + } + + return 0; + } + + private: +}; + +att_plugin_t* att_plugin = nullptr; + +} // namespace + +ROCPROFILER_EXPORT int rocprofiler_plugin_initialize(uint32_t rocprofiler_major_version, + uint32_t rocprofiler_minor_version) { + if (rocprofiler_major_version != ROCPROFILER_VERSION_MAJOR || + rocprofiler_minor_version < ROCPROFILER_VERSION_MINOR) + return -1; + + if (att_plugin != nullptr) return -1; + + att_plugin = new att_plugin_t(); + if (att_plugin->IsValid()) return 0; + + // The plugin failed to initialied, destroy it and return an error. + delete att_plugin; + att_plugin = nullptr; + return -1; +} + +ROCPROFILER_EXPORT void rocprofiler_plugin_finalize() { + if (!att_plugin) return; + delete att_plugin; + att_plugin = nullptr; +} + +ROCPROFILER_EXPORT int rocprofiler_plugin_write_buffer_records( + const rocprofiler_record_header_t* begin, const rocprofiler_record_header_t* end, + rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) { + if (!att_plugin || !att_plugin->IsValid()) return -1; + return att_plugin->WriteBufferRecords(begin, end, session_id, buffer_id); +} + +ROCPROFILER_EXPORT int rocprofiler_plugin_write_record(rocprofiler_record_tracer_t record, + rocprofiler_session_id_t session_id) { + if (!att_plugin || !att_plugin->IsValid()) return -1; + if (record.header.id.handle == 0) return 0; + return 0; +} diff --git a/plugin/att/att.py b/plugin/att/att.py new file mode 100755 index 00000000..7ded24e4 --- /dev/null +++ b/plugin/att/att.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 +import sys +if sys.version_info[0] < 3: + raise Exception("Must be using Python 3") + +import os +import argparse +from pathlib import Path +from struct import * +from ctypes import * +import ctypes +from copy import deepcopy +from trace_view import view_trace +import sys +import glob +import numpy as np +import matplotlib.pyplot as plt +import json + +class 
class PerfEvent(ctypes.Structure):
    """One performance-counter sample returned by the native ATT parser library.

    ``time`` is the sample timestamp and ``event0``..``event3`` are the four
    counter values of the sample. ``cu`` and ``bank`` select which of the
    16 timeline rows the sample contributes to (presumably the compute unit and
    counter bank it was collected on -- TODO confirm against the parser).
    """
    _fields_ = [
        ('time', c_uint64),
        ('event0', c_uint16),
        ('event1', c_uint16),
        ('event2', c_uint16),
        ('event3', c_uint16),
        ('cu', c_uint8),
        ('bank', c_uint8),
    ]
    def toTuple(self):
        """Return the sample as plain ints: (time, event0..event3, cu, bank)."""
        return (int(self.time), int(self.event0), int(self.event1),
                int(self.event2), int(self.event3), int(self.cu), int(self.bank))


class CodeWrapped(ctypes.Structure):
    """ Mirrors one code entry of the array returned by wrapped_parse_binary.

    ``line`` and ``loc`` are byte strings decoded as UTF-8 by parse_binary; an
    empty ``loc`` and a negative ``to_line`` both stand for "no value".
    """
    _fields_ = [('line', ctypes.c_char_p),
                ('loc', ctypes.c_char_p),
                ('value', ctypes.c_int),
                ('to_line', ctypes.c_int),
                ('index', ctypes.c_int),
                ('line_num', ctypes.c_int)]


class KvPair(ctypes.Structure):
    """ Mirrors one (key, value) integer pair of the native jump table. """
    _fields_ = [('key', ctypes.c_int),
                ('value', ctypes.c_int)]


class ReturnAssemblyInfo(ctypes.Structure):
    """ Return type of wrapped_parse_binary.

    ``code`` points to ``code_len`` CodeWrapped entries and ``jumps`` to
    ``jumps_len`` KvPair entries.
    """
    _fields_ = [('code', POINTER(CodeWrapped)),
                ('jumps', POINTER(KvPair)),
                ('code_len', ctypes.c_int),
                ('jumps_len', ctypes.c_int)]
def parse_binary(filename, kernel=None):
    """Parse an assembly file through the native ATT parser library.

    Args:
        filename: Path handed to ``SO.wrapped_parse_binary``; it is made
            absolute first.
        kernel: Glob pattern of a ``*_kernel.txt`` file whose first line is
            ``<demangled name>: <kernel symbol>``. ``None`` or ``''`` asks the
            parser for all kernels.

    Returns:
        ``(code, jumps)``. ``code`` is a list of
        ``(line, value, to_line, loc, index, line_num)`` tuples in which a
        negative ``to_line`` and an empty ``loc`` are mapped to ``None``.
        ``jumps`` maps each native jump key to its value.

    Raises:
        FileNotFoundError: ``kernel`` was given but matches no file.
        ValueError: the kernel file's first line has no ``': '`` separator.
    """
    if kernel is None or kernel == '':
        kernel = ctypes.c_char_p(0)
        print('Parsing all kernels')
    else:
        matches = glob.glob(kernel)
        if not matches:
            raise FileNotFoundError('No kernel file matches: ' + str(kernel))
        with open(matches[0], 'r') as file:
            # Drop the line terminator so it cannot leak into the symbol when
            # the name carries no '.kd' suffix.
            first_line = file.readline().rstrip('\r\n')
        # Split on the last ': ' so that a demangled name which itself contains
        # ': ' does not truncate the symbol.
        display_name, separator, symbol = first_line.rpartition(': ')
        if not separator:
            raise ValueError('Malformed kernel file line: ' + repr(first_line))
        print('Parsing kernel:', display_name)
        kernel = symbol.split('.kd')[0].encode('utf-8')
    filename = os.path.abspath(str(filename))
    info = SO.wrapped_parse_binary(str(filename).encode('utf-8'), kernel)

    code = []
    for k in range(info.code_len):
        code_entry = info.code[k]

        # decode() already copies the string memory owned by the native side
        line = code_entry.line.decode("utf-8")
        # a NULL char* is surfaced by ctypes as None
        loc = (code_entry.loc or b'').decode("utf-8")

        # Transform empty entries back to python's None
        to_line = int(code_entry.to_line) if (code_entry.to_line >= 0) else None
        loc = loc if len(loc) > 0 else None

        code.append((line, int(code_entry.value), to_line, loc,
                     int(code_entry.index), int(code_entry.line_num)))

    jumps = {info.jumps[k].key: info.jumps[k].value for k in range(info.jumps_len)}

    return code, jumps
def mem_max(array):
    """Merge per-wave waitcnt records into one entry per assembly line.

    Args:
        array: nested sequence ``[shader engine][wave][record]`` where each
            record is ``(line, issued, waitcnt)``.

    Returns:
        dict mapping ``line`` to ``[max issued seen, waitcnt]``; the waitcnt of
        the first record seen for a line is kept.
    """
    mem_dict = {}
    for SE in array:
        for wave in SE:
            for inst in wave:
                key = inst[0]
                if key in mem_dict:
                    mem_dict[key][0] = max(mem_dict[key][0], inst[1])
                else:
                    # list() keeps the entry mutable even for tuple records, so
                    # the max above cannot fail and get silently discarded.
                    mem_dict[key] = list(inst[1:])
                    assert(mem_dict[key][1] == inst[2])

    return mem_dict

def lgk(count):
    """Format an ``lgkmcnt`` operand for s_waitcnt."""
    return 'lgkmcnt({0})'.format(count)
def vmc(count):
    """Format a ``vmcnt`` operand for s_waitcnt."""
    return 'vmcnt({0})'.format(count)
def both_cnt(count):
    """Format both counters with the same value, lgkmcnt first."""
    return lgk(count)+' '+vmc(count)
def get_delta_time(events):
    """Return the smallest gap between consecutive bank-0 samples of any CU.

    Only CUs (ids 0..15) with more than two bank-0 samples are considered.
    Falls back to 1 when no CU qualifies or the samples cannot be processed.
    """
    try:
        CUS = [[e.time for e in events if e.cu==k and e.bank==0] for k in range(16)]
        CUS = [np.asarray(c).astype(np.int64) for c in CUS if len(c) > 2]
        return np.min([np.min(abs(c[1:]-c[:-1])) for c in CUS])
    except Exception:
        # np.min([]) raises when no CU has enough samples; keep the best-effort
        # fallback but let KeyboardInterrupt/SystemExit propagate.
        return 1

def draw_wave_metrics(selections, normalize):
    """Plot the smoothed performance-counter timelines to ``timeline.png``.

    Reads the module globals ``EVENTS`` (one list of samples per trace file),
    ``EVENT_NAMES`` and ``PIC_SAVE_FOLDER``. Also writes the counter names to
    ``counters.json`` in ``PIC_SAVE_FOLDER``.

    Args:
        selections: per-counter booleans; only selected counters are plotted.
        normalize: when true each counter is scaled to percent of its maximum.
    """
    global PIC_SAVE_FOLDER
    global EVENTS
    global EVENT_NAMES

    #event_names = ['Busy CUs', 'Occupancy', 'Eligible waves', 'Waves waiting']
    with open(PIC_SAVE_FOLDER+'counters.json', 'w') as f:
        f.write(json.dumps({"counters": EVENT_NAMES}))

    plt.figure(figsize=(15,3))

    delta_time = int(0.5+np.min([get_delta_time(events) for events in EVENTS]))
    # Empty sample lists carry no timestamp and would make np.max raise.
    maxtime = np.max([np.max([e.time for e in events])
                      for events in EVENTS if len(events) > 0])+1
    event_timeline = np.zeros((16, maxtime), dtype=np.int32)
    print('Delta:', delta_time)
    print('Max_cycles:', maxtime)

    kernsize = 2*(delta_time//14)+1
    trim = max(maxtime//5000,1)
    cycles = 4*np.arange(maxtime)[::trim]

    # Gaussian smoothing kernel, normalized so the traces are averaged over all
    # input files and over the sampling interval.
    kernel = np.asarray([np.exp(-abs(k/kernsize)**2) for k in range(-kernsize*3,kernsize*3+1)])
    kernel /= np.sum(kernel)*len(EVENTS)*delta_time

    for events in EVENTS:
        # Every sample, including the last one, is spread over one sampling
        # interval in the four rows of its own bank. (The last sample used to
        # reuse the bank of the previous sample, or an unbound name when the
        # list held a single sample.)
        for event in events:
            bk = event.bank*4
            start = event.time
            end = start+delta_time
            event_timeline[bk:bk+4, start:end] += np.asarray(event.toTuple()[1:5])[:, None]

    event_timeline = [np.convolve(e, kernel)[3*kernsize:-3*kernsize] for e in event_timeline]

    if normalize:
        event_timeline = [100*e/max(e.max(), 1E-5) for e in event_timeline]

    colors = ['blue', 'green', 'gray', 'red', 'orange', 'cyan', 'black', 'darkviolet',
              'yellow', 'darkred', 'pink', 'lime', 'gold', 'tan', 'aqua', 'olive']
    for e, n, c, sel in zip(event_timeline, EVENT_NAMES, colors, selections):
        if sel:
            plt.plot(cycles, e[::trim], '-', label=n, color=c)

    plt.legend()
    if normalize:
        plt.ylabel('As % of maximum')
    else:
        plt.ylabel('Value')
    plt.subplots_adjust(left=0.05, right=1, top=1, bottom=0.07)
    plt.savefig(PIC_SAVE_FOLDER+'timeline.png', dpi=150)
    #plt.show()
def GeneratePIC(selections=[True] * 16, normalize=True, bScounter=True):
    """Render ``timeline.png`` from counters when available, else wave states.

    The counter plot is used only when ``bScounter`` is set and the global
    ``EVENTS`` holds more than 32 samples in total; every other case falls back
    to the wave-state plot. ``selections`` and ``normalize`` are passed through
    unchanged to the chosen drawing function.
    """
    def has_enough_counter_samples():
        return len(EVENTS) > 0 and np.sum([len(e) for e in EVENTS]) > 32

    if not bScounter or not has_enough_counter_samples():
        draw_wave_states(selections, normalize)
        return
    draw_wave_metrics(selections, normalize)
att_kernel = glob.glob(args.att_kernel) + + if len(att_kernel) == 0: + print('Could not find att output kernel:', args.att_kernel) + exit(1) + elif len(att_kernel) > 1: + print('Found multiple kernel matching given filters:') + for n, k in enumerate(att_kernel): + print('\t', n, '->', k) + + bValid = False + while bValid == False: + try: + args.att_kernel = att_kernel[int(input("Please select number: "))] + bValid = True + except KeyboardInterrupt: + exit(0) + except: + print('Invalid option.') + else: + args.att_kernel = att_kernel[0] + + print('Att kernel:', args.att_kernel) + code, jumps = parse_binary(args.assembly_code, args.att_kernel) + + # Trace Parsing + if args.trace_file is None: + filenames = glob.glob(args.att_kernel.split('_kernel.txt')[0]+'*.att') + assert(len(filenames) > 0) + else: + filenames = glob.glob(args.trace_file) + + print('Trace filenames:', filenames) + + Copy_Files(args.output_ui) + DBFILES = [] + global TIMELINES + global EVENTS + TIMELINES = [np.zeros(int(1E4),dtype=np.int32) for k in range(5)] + EVENTS = [] + for name in filenames: + SIMD, perfevents = getWaves(name, args.target_cu, args.verbose) + EVENTS.append(perfevents) + DBFILES.append( persist(args.output_ui, name, SIMD) ) + for wave in SIMD: + time_acc = 0 + tuples1 = wave.timeline.split('(') + tuples2 = [t.split(')')[0].split(',') for t in tuples1 if t != ''] + tuples3 = [(int(t[0]),int(t[1])) for t in tuples2] + + for state in tuples3: + if state[1] > 1E7: + print('Warning: Time limit reached for ',state[0], state[1]) + break + + if time_acc+state[1] > TIMELINES[state[0]].size: + TIMELINES[state[0]] = np.hstack([ + TIMELINES[state[0]], + np.zeros_like(TIMELINES[state[0]]) + ]) + TIMELINES[state[0]][time_acc:time_acc+state[1]] += 1 + time_acc += state[1] + + if args.genasm and len(args.genasm) > 0: + flight_count = view_trace(args, 0, code, jumps, DBFILES, filenames, True, None) + + with open(args.assembly_code, 'r') as file: + lines = file.readlines() + assembly_code = 
def get_ip():
    """Return the address the local hostname resolves to, else '127.0.0.1'.

    The address is accepted only if a UDP socket can be connected to it. Any
    failure (resolution or connect) yields the loopback address.

    Note: the previous code passed ``{IPAddr}`` (a set) as the host, which
    always raised, so the function always returned '127.0.0.1'.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.settimeout(0)
        hostname = socket.gethostname()
        IPAddr = socket.gethostbyname(hostname)
        s.connect((IPAddr, 1))
        return IPAddr
    except Exception:
        return '127.0.0.1'
    finally:
        # only closes the probe socket; no return here so nothing is swallowed
        s.close()
2 for SMEM + while i < N: + inst = insts[i] + if line >= len(code): + break + + as_line = code[line] + if inst[1] == as_line[1]: + if line in jumps: + loopCount[line-1] += 1 # label is the previous line + matched, next = True, line + 1 + + num_inflight = NUM_FLAT + NUM_SMEM + NUM_VMEM + + if inst[1] == 1 or inst[1] == 5: # SMEM, LDS + ordering = 2 if inst[1] == 1 else ordering + SMEM_INST.append([line, num_inflight]) + NUM_SMEM += 1 + elif inst[1] == 3 or (inst[1] == 4 and 'global_' in as_line[0]): # VMEM R/W + VMEM_INST.append([line, num_inflight]) + NUM_VMEM += 1 + elif inst[1] == 4: # FLAT + ordering = max(ordering, 1) + FLAT_INST.append([line, num_inflight]) + NUM_FLAT += 1 + elif inst[1] == 9 and 'waitcnt' in as_line[0]: + + if 'lgkmcnt' in as_line[0]: + wait_N = int(as_line[0].split('lgkmcnt(')[1].split(')')[0]) + flight_count.append([as_line[-1], num_inflight, wait_N]) + if wait_N == 0: + ordering = 0 + if ordering == 0: + offset = len(SMEM_INST)-wait_N + mem_unroll.append( [line, SMEM_INST[:offset]+FLAT_INST] ) + SMEM_INST = SMEM_INST[offset:] + FLAT_INST = [] + NUM_FLAT = 0 + NUM_SMEM = 0 + else: + NUM_SMEM = min(max(wait_N-NUM_FLAT, 0), NUM_SMEM) + NUM_FLAT = min(max(wait_N-NUM_SMEM, 0), NUM_FLAT) + + if 'vmcnt' in as_line[0]: + wait_N = int(as_line[0].split('vmcnt(')[1].split(')')[0]) + flight_count.append([as_line[-1], num_inflight, wait_N]) + if wait_N == 0 and ordering != 2: + ordering = 0 + if ordering == 0: + offset = len(VMEM_INST)-wait_N + mem_unroll.append( [line, VMEM_INST[:offset]+FLAT_INST] ) + VMEM_INST = VMEM_INST[offset:] + FLAT_INST = [] + NUM_FLAT = 0 + NUM_VMEM = 0 + else: + NUM_VMEM = min(max(wait_N-NUM_FLAT, 0), NUM_VMEM) + NUM_FLAT = min(max(wait_N-NUM_VMEM, 0), NUM_FLAT) + + + elif inst[1] == 7 and as_line[1] == 10: # jump + matched, next = True, as_line[2] + elif inst[1] == 8 and as_line[1] == 10: # next + matched, next = True, line + 1 + else: + # instructions with almost same timestamp swapped + # if i+1 < N and line+1 < len(code) 
def extract_tuple(content, num):
    """Parse ``'(a,b,...'`` (closing parenthesis optional) into a tuple of ints.

    The first character is dropped as the opening parenthesis and ``content``
    must hold exactly ``num`` comma-separated fields.
    """
    fields = content.split(',')
    assert (len(fields) == num)
    tail = fields[-1]
    if tail.endswith(')'):
        tail = tail[:-1]
    head = fields[0][1:]
    return tuple(int(field) for field in [head] + fields[1:-1] + [tail])


def get_top_n(stitched):
    """Return the ten source lines with the largest accumulated cost.

    Each stitched record ends in ``(..., s2i, run_time, line_num)``. The result
    rows are ``(line_num, hit count, summed s2i, summed run_time)`` ordered by
    ``summed s2i + summed run_time``, largest first.
    """
    TOP_N = 10
    totals = {}
    for (_, _, s2i, run_time, line_num) in stitched:
        bucket = totals.setdefault(line_num, [0, 0, 0])
        bucket[0] += 1
        bucket[1] += s2i
        bucket[2] += run_time
    rows = [(line_num, hits, s2i_sum, time_sum)
            for (line_num, (hits, s2i_sum, time_sum)) in totals.items()]
    ranked = sorted(rows, key=lambda row: row[2] + row[3], reverse=True)
    return ranked[:TOP_N]


def rjust_html(s, n):
    """Right-justify ``str(s)`` to width ``n`` using ``SP`` as the pad character."""
    text = str(s)
    missing = n - len(text)
    if missing > 0:
        return SP * missing + text
    return text


def rjust_html_format(msg, n1, inst, n2, n3, stall):
    """Join a padded label, a padded count, ``n3`` pad characters and ``stall``."""
    pieces = [rjust_html(msg, n1), rjust_html(inst, n2), SP * n3, stall]
    return ''.join(str(piece) for piece in pieces)
df['flat_stalls'][id] + lds_ins, lds_stalls = df['lds_ins'][id], df['lds_stalls'][id] + br_ins, br_stalls = df['br_ins'][id], df['br_stalls'][id] + + return 'Issued:' + str(rjust_html(issued_ins,8)) + str(SP*2) + 'Mem:' + str(mem_ins) \ + + "-" * 26 + rjust_html_format("VALU:",6,valu_ins,8,4,valu_stalls) \ + + rjust_html_format("SALU:",6,salu_ins,8,4,salu_stalls) \ + + rjust_html_format("VMEM:",6,vmem_ins,8,4,vmem_stalls) \ + + rjust_html_format("SMEM:",6,smem_ins,8,4,smem_stalls) \ + + rjust_html_format("FLAT:",6,flat_ins,8,4,flat_stalls) \ + + rjust_html_format("LDS:",6,lds_ins,8,4,lds_stalls) \ + + rjust_html_format("BR:",6,br_ins,8,4,br_stalls) + + +def extract_waves(waves): + result, slot2seq = [], {} + for id in waves['id']: + row = {key: waves[key][id] for key in waves.keys()} + + insts, timeline = [], [] + for x in row['instructions'].split('),'): + if len(x) > 0: + insts.append(extract_tuple(x, 4)) + for x in row['timeline'].split('),'): + if len(x) > 0: + timeline.append(extract_tuple(x, 2)) + + # aggregate per wave slot + if (row['simd'], row['wave_slot']) in slot2seq: + slot = result[slot2seq[(row['simd'], row['wave_slot'])]] + last_end_time = slot[2][-1][-1] + slot[2] += (row['id'], row['begin_time'], row['end_time']), + slot[3] += insts + # filler between waves + slot[4] += (0, row['begin_time'] - last_end_time), + slot[4] += timeline + else: + slot2seq[row['simd'], row['wave_slot']] = len(result) + result.append([row['simd'], row['wave_slot'], + [(row['id'], row['begin_time'], row['end_time'])], + insts, + timeline]) + + return result + + +def extract_data(df, output_ui, se_number, code, jumps): + if len(df['id']) == 0 or len(df['instructions']) == 0 or len(df['timeline']) == 0: + return None + + cu_waves = extract_waves(df) + all_filenames = [] + flight_count = [] + + for wave_id in df['id']: + insts, timeline = [], [] + if len(df['instructions'][wave_id]) == 0 or len(df['timeline'][wave_id]) == 0: + continue + + for x in 
df['instructions'][wave_id].split('),'): + insts.append(extract_tuple(x, 4)) + for x in df['timeline'][wave_id].split('),'): + timeline.append(extract_tuple(x, 2)) + + stitched, loopCount, mem_unroll, count = stitch(insts, code, jumps) + flight_count.append(count) + + wave_entry = { + "id": int(df['id'][wave_id]), + "simd": int(df['simd'][wave_id]), + "slot": int(df['wave_slot'][wave_id]), + "begin": int(df['begin_time'][wave_id]), + "end": int(df['end_time'][wave_id]), + "info": wave_info(df, wave_id), + "instructions": stitched, + "timeline": timeline, + "code": code, + "waitcnt": mem_unroll + } + data_obj = { + "name": 'SE'.format(se_number), + "kernel": code[0][0], + "duration": sum(dur for (_, dur) in timeline), + "wave": wave_entry, + "simd_waves": [], + "cu_waves": cu_waves, + "loop_count": loopCount, + "top_n": get_top_n(stitched), + "websocket_port": WebSocketPort, + "generation_time": time.ctime() + } + if len(data_obj["cu_waves"]) == 0: + continue + + OUT = output_ui+'/ui/se'+str(se_number)+'_sm'+str(df['simd'][wave_id])+\ + '_wv'+str(df['wave_slot'][wave_id])+'.json' + + with open(OUT, 'w') as f: + f.write(json.dumps(data_obj)) + all_filenames.append(OUT.split('/')[-1]) + + return flight_count, all_filenames + +def open_browser(): + time.sleep(0.1) + webbrowser.open_new_tab('http://{0}:{1}'.format(IPAddr, PORT)) + + +class NoCacheHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): + def end_headers(self): + self.send_my_headers() + http.server.SimpleHTTPRequestHandler.end_headers(self) + + def send_my_headers(self): + self.send_header("Cache-Control", "no-cache, no-store, must-revalidate") + self.send_header("Pragma", "no-cache") + self.send_header("Expires", "0") + + def do_GET(self): + global PICTURE_CALLBACK + if 'timeline.png?' 
in self.path: + selections = [int(s)!=0 for s in self.path.split('timeline.png?')[1]] + PICTURE_CALLBACK(selections[1:], selections[0]) + #PICTURE_CALLBACK(selections[2:], selections[1], selections[0]) + http.server.SimpleHTTPRequestHandler.do_GET(self) + +class RocTCPServer(socketserver.TCPServer): + def server_bind(self): + self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self.socket.bind(self.server_address) + + +def run_server(): + global RS_HOME + Handler = NoCacheHTTPRequestHandler + + os.chdir(RS_HOME+'/ui') + try: + with RocTCPServer((IPAddr, PORT), Handler) as httpd: + httpd.serve_forever() + except KeyboardInterrupt: + pass + + +def fix_space(line): + line = line.replace(' ', SP) + line = line.replace('\t', SP*4) + return line + + +def WebSocketserver(websocket, path): + data = websocket.recv() + print(354, data) + cpp, ln, _ = data.split(':') + ln = int(ln) + HL, EMP = 'highlight', '' + content = None + print("loading...") + try: + f = open(cpp, 'r', errors='replace') + content = ''.join('
  • '+str(i).ljust(5)+fix_space(l)+'
  • ' + for i, l in enumerate(f.readlines(), 1)) + except FileNotFoundError: + content = cpp + ' not found!' + websocket.send(content) + + +def run_websocket(): + start_server = websockets.serve(WebSocketserver, IPAddr, WebSocketPort) + try: + asyncio.get_event_loop().run_until_complete(start_server) + asyncio.get_event_loop().run_forever() + except KeyboardInterrupt: + pass + + +def assign_ports(ports): + ps = [int(port) for port in ports.split(',')] + if ps[0] <= 5000 or ps[1] <= 5000: + print('Need to have port values > 5000') + sys.exit(1) + elif ps[0] == ps[1]: + print('Can not use the same port for both web server and websocket server: '+ps[0]) + sys.exit(1) + global IPAddr, PORT, WebSocketPort + PORT, WebSocketPort = ps[0], ps[1] + + +def view_trace(args, wait, code, jumps, dbnames, att_filenames, bReturnLoc, pic_callback): + global PICTURE_CALLBACK + PICTURE_CALLBACK = pic_callback + pic_thread = Process(target=pic_callback) + pic_thread.start() + + assert(len(dbnames) > 0) + global RS_HOME + output_ui = args.output_ui + RS_HOME = output_ui + + att_filenames = [Path(f).name for f in att_filenames] + se_numbers = [int(a.split('_se')[1].split('.att')[0]) for a in att_filenames] + flight_count = [] + simd_wave_filenames = {} + + for se_number, dbname in zip(se_numbers, dbnames): + if len(dbname['id']) == 0: + continue + + count, wv_filenames = extract_data(dbname, output_ui, se_number, code, jumps) + + if count is not None: + flight_count.append(count) + simd_wave_filenames[se_number] = wv_filenames + + if bReturnLoc: + return flight_count + + for key in simd_wave_filenames.keys(): + wv_array = [[ + int(s.split('_sm')[1].split('_wv')[0]), + int(s.split('_wv')[1][0]), + s + ] for s in simd_wave_filenames[key]] + + wv_dict = {} + for wv in wv_array: + try: + wv_dict[wv[0]][wv[1]] = wv[2] + except: + try: + wv_dict[wv[0]] = {wv[1]: wv[2]} + except: + exit(-1) + + simd_wave_filenames[key] = wv_dict + + with open(output_ui+'/ui/filenames.json', 'w') as f: + 
f.write(json.dumps({"filenames": simd_wave_filenames})) + + if args.ports: + assign_ports(args.ports) + print('serving at ports: {0},{1}'.format(PORT, WebSocketPort)) + + if wait == 0: + try: + PROCS = [Process(target=run_server), + Process(target=open_browser), + Process(target=run_websocket)] + if pic_thread is not None: + pic_thread.join() + + for p in PROCS: + p.start() + for p in PROCS: + p.join() + except KeyboardInterrupt: + print("Exitting.") diff --git a/plugin/att/ui/index.html b/plugin/att/ui/index.html new file mode 100644 index 00000000..b1dbbc77 --- /dev/null +++ b/plugin/att/ui/index.html @@ -0,0 +1,1047 @@ + + + + + + MI Trace Viewer + + + +
    +
    + +
    + +
    + +
    + +
    + +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
      +
      + + + + + + diff --git a/plugin/att/ui/logo.svg b/plugin/att/ui/logo.svg new file mode 100644 index 00000000..340c490f --- /dev/null +++ b/plugin/att/ui/logo.svg @@ -0,0 +1,132 @@ + + + + + + + + + + + MI Trace View + + diff --git a/plugin/att/ui/styles.css b/plugin/att/ui/styles.css new file mode 100644 index 00000000..1949a608 --- /dev/null +++ b/plugin/att/ui/styles.css @@ -0,0 +1,106 @@ +h2,h3,h4 { + text-align: center; + font-family: Calibri, Candara, Optima, Arial, 'Trebuchet MS', sans-serif; +} + +ul, ol { + list-style-type: none; + padding: 0; + margin: 20px 5px; + /*font-family: Calibri, Candara, Segoe, "Segoe UI", Optima, Arial, sans-serif; font-size: 15px; font-style: normal; font-variant: normal; */ + font-family: 'Courier New', monospace; font-size: 15px; font-style: normal; +} + +a { + color: royalblue; + cursor: pointer; +} + +div#flexbox { + display: flex; + flex-wrap: wrap; +} + +div#flexbox > div { + flex: 50%; +} + +div#minimap { + position: absolute; + width: 350px; + top: 340px; + display: flex; + justify-content: right; + right: 5px; +} + +div#wave, div#cu_wave { + overflow:scroll; + overflow-y:hidden; +} + +div#ma_code { + height: calc(100vh - 180px); + overflow: auto; + overflow-x: hidden; + display: block; + margin: 20px; +} + +nav { + padding-right: 8px; + position: absolute; + top: 1rem; + right: 1rem; + background: #FFFDE3; + z-index: 5; + margin-left: 10px; + width:330px; +} + +.highlight { + background-color: lightgray; + font-weight: bold; +} + +.clickable { + cursor: pointer; + font-weight: 180; +} + +ul li { + border-bottom: 1px dotted black; +} + +.tooltip { + display: none; + position: relative; + left: 10px; + z-index: 100; + background: #333; + border: 1px solid #c0c0c0; + opacity: 0.8; + color: white; +} +li:hover .tooltip { + display: inline-block; /*block;*/ +} +.loop { + margin-left: 30px; + background: lightseagreen; +} + +.btn { + border: 1px solid black; + background-color: #D7D7D7; + color: black; + padding: 3px 
4px; + font-size: 14px; + cursor: pointer; + border-style: ridge; + border-radius: 6px; +} + +.btn:hover { + color: blue; +} \ No newline at end of file diff --git a/plugin/ctf/.gitignore b/plugin/ctf/.gitignore new file mode 100644 index 00000000..daa30a3f --- /dev/null +++ b/plugin/ctf/.gitignore @@ -0,0 +1 @@ +README.html diff --git a/plugin/ctf/CMakeLists.txt b/plugin/ctf/CMakeLists.txt new file mode 100644 index 00000000..d8c17a6d --- /dev/null +++ b/plugin/ctf/CMakeLists.txt @@ -0,0 +1,161 @@ +################################################################################ +## Copyright (c) 2022 Advanced Micro Devices, Inc. +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal in the Software without restriction, including without limitation the +## rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +## sell copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## The above copyright notice and this permission notice shall be included in +## all copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +## IN THE SOFTWARE. +################################################################################ + +# Plugin shared object. 
+add_library(ctf_plugin SHARED + ctf.cpp + plugin.cpp + barectf.c "${CMAKE_CURRENT_BINARY_DIR}/barectf.h" + ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp + hsa_begin.cpp.i hsa_end.cpp.i + hip_begin.cpp.i hip_end.cpp.i) +set_target_properties(ctf_plugin PROPERTIES + CXX_VISIBILITY_PRESET hidden + LINK_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/../exportmap" + LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}") +set(METADATA_STREAM_FILE_DIR "${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/plugin/ctf") +target_compile_definitions(ctf_plugin PRIVATE + HIP_PROF_HIP_API_STRING=1 + __HIP_PLATFORM_HCC__=1 + CTF_PLUGIN_METADATA_FILE_PATH="${CMAKE_INSTALL_PREFIX}/${METADATA_STREAM_FILE_DIR}/metadata") +target_include_directories(ctf_plugin PRIVATE + "${PROJECT_SOURCE_DIR}/inc" + "${PROJECT_SOURCE_DIR}" + "${CMAKE_BINARY_DIR}/src/api" + "${CMAKE_CURRENT_BINARY_DIR}") +target_link_options(ctf_plugin PRIVATE + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exportmap" + -Wl,--no-undefined) +target_link_libraries(ctf_plugin PRIVATE + ${ROCPROFILER_TARGET} + hsa-runtime64::hsa-runtime64 + systemd + stdc++fs + dl) +install(TARGETS ctf_plugin LIBRARY + DESTINATION "${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}" + COMPONENT plugins) + +# `gen_api_files.py` and `gen_env_yaml.py` require Python 3, +# CppHeaderParser, PyYAML, and barectf. +find_package(Python3 COMPONENTS Interpreter REQUIRED) + +message("Python: ${Python3_EXECUTABLE})") + +execute_process(COMMAND Python3::Interpreter -c "print('hello')") + +function(check_py3_pkg pkg_name) + execute_process(COMMAND "${Python3_EXECUTABLE}" -c "import ${pkg_name}" + RESULT_VARIABLE PY3_IMPORT_RES + OUTPUT_QUIET) + + if(NOT (${PY3_IMPORT_RES} EQUAL 0)) + message(FATAL_ERROR "Cannot find Python 3 package `${pkg_name}`") + endif() + + message(STATUS "Found Python 3 package `${pkg_name}`") +endfunction() + +check_py3_pkg(CppHeaderParser) +check_py3_pkg(yaml) +find_program(BARECTF_RES barectf REQUIRED) + +# Generate barectf YAML and C++ files for HSA API. 
+get_property(HSA_RUNTIME_INCLUDE_DIRS + TARGET hsa-runtime64::hsa-runtime64 + PROPERTY INTERFACE_INCLUDE_DIRECTORIES) +find_file(HSA_H hsa.h + PATHS ${HSA_RUNTIME_INCLUDE_DIRS} + PATH_SUFFIXES hsa + NO_DEFAULT_PATH + REQUIRED) +get_filename_component(HSA_RUNTIME_INC_PATH "${HSA_H}" DIRECTORY) +add_custom_command( + OUTPUT hsa_erts.yaml hsa_begin.cpp.i hsa_end.cpp.i + COMMAND ${CMAKE_C_COMPILER} -E "${HSA_RUNTIME_INC_PATH}/hsa.h" -o hsa.h.i + COMMAND ${CMAKE_C_COMPILER} -E "${HSA_RUNTIME_INC_PATH}/hsa_ext_amd.h" + -o hsa_ext_amd.h.i + COMMAND ${CMAKE_COMMAND} -E cat hsa.h.i + hsa_ext_amd.h.i + "${CMAKE_BINARY_DIR}/src/api/hsa_prof_str.h" + > hsa_input.h + COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/gen_api_files.py" + hsa hsa_input.h + BYPRODUCTS hsa.h.i hsa_ext_amd.h.i hsa_input.h + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/gen_api_files.py" + "${HSA_RUNTIME_INC_PATH}/hsa.h" + "${HSA_RUNTIME_INC_PATH}/hsa_ext_amd.h" + "${CMAKE_BINARY_DIR}/src/api/hsa_prof_str.h" + COMMENT "Generating HSA API files for the `ctf` plugin...") + +# Generate barectf YAML and C++ files for HIP API. 
+get_property(HIP_INCLUDE_DIRS TARGET hip::amdhip64 + PROPERTY INTERFACE_INCLUDE_DIRECTORIES) +find_file(HIP_RUNTIME_API_H hip_runtime_api.h + PATHS ${HIP_INCLUDE_DIRS} + PATH_SUFFIXES hip + NO_DEFAULT_PATH + REQUIRED) +find_file(HIP_PROF_STR_H hip_prof_str.h + PATHS ${HIP_INCLUDE_DIRS} + PATH_SUFFIXES hip hip/amd_detail + NO_DEFAULT_PATH + REQUIRED) +list(TRANSFORM HIP_INCLUDE_DIRS PREPEND -I) +add_custom_command( + OUTPUT hip_erts.yaml hip_begin.cpp.i hip_end.cpp.i + COMMAND ${CMAKE_C_COMPILER} ${HIP_INCLUDE_DIRS} + -E "${HIP_RUNTIME_API_H}" + -D__HIP_PLATFORM_HCC__=1 + -D__HIP_ROCclr__=1 + -o hip_runtime_api.h.i + COMMAND cat hip_runtime_api.h.i "${HIP_PROF_STR_H}" > hip_input.h + BYPRODUCTS hip_runtime_api.h.i hip_input.h + COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/gen_api_files.py" + hip hip_input.h + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/gen_api_files.py" + "${HIP_RUNTIME_API_H}" + "${HIP_PROF_STR_H}" + COMMENT "Generating HIP API files for the `ctf` plugin...") + +# Generate `env.yaml` (trace environment for barectf). +add_custom_command( + OUTPUT env.yaml + COMMAND "${Python3_EXECUTABLE}" "${CMAKE_CURRENT_SOURCE_DIR}/gen_env_yaml.py" + ${PROJECT_VERSION} + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/gen_env_yaml.py" + COMMENT "Generating `env.yaml`...") + +# Generate raw CTF tracer with barectf. 
+add_custom_command( + OUTPUT barectf.c barectf.h barectf-bitfield.h metadata + COMMAND "${BARECTF_RES}" gen "-I${CMAKE_CURRENT_BINARY_DIR}" + "-I${CMAKE_CURRENT_SOURCE_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/config.yaml" + DEPENDS hsa_erts.yaml + hip_erts.yaml + env.yaml + "${CMAKE_CURRENT_SOURCE_DIR}/config.yaml" + "${CMAKE_CURRENT_SOURCE_DIR}/dst_base.yaml" + COMMENT "Generating raw CTF tracer with barectf...") +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/metadata" + DESTINATION "${METADATA_STREAM_FILE_DIR}" COMPONENT plugins) diff --git a/plugin/ctf/README.adoc b/plugin/ctf/README.adoc new file mode 100644 index 00000000..fe82a0d0 --- /dev/null +++ b/plugin/ctf/README.adoc @@ -0,0 +1,260 @@ += CTF plugin for ROCProfiler +13 December 2022 +Philippe Proulx + +This plugin writes the received ROCProfiler tracer and profiler records to +a https://diamon.org/ctf/[CTF] trace. + +== Build requirements + +* Python ≥ 3.10 +* barectf ≥ 3.1.1 (`pip3 install barectf`) +* PyYAML (`apt-get install python3-yaml`) +* CppHeaderParser (`pip3 install CppHeaderParser`) + +== Usage + +Once installed, you may load this plugin with `rocprofv2` using +the `--plugin ctf` command-line arguments. + +This plugin honours the `OUTPUT_PATH` environment variable which +`rocprofv2` sets with the `-d` option. If you pass `-d my-dir` to +`rocprofv2`, then the plugin will write the CTF trace to the +`my-dir/trace` directory. + +IMPORTANT: This plugin performs important cleanup tasks at finalization +time, so the resulting CTF trace could be corrupted if the plugin is +never finalized. + +Once the plugin is finalized, open the resulting trace directory with +either https://babeltrace.org/[Babeltrace{nbsp}2] or +https://www.eclipse.org/tracecompass/[Trace Compass] to view or analyze +it. + +=== Event record types + +This plugin writes to different CTF data streams having different types. 
+On the file system, the prefix of a data stream file name indicates the +data stream type, that is: + +`roctx_`:: + rocTX messages. ++ +Each CTF event record is named `roctx` and corresponds to a rocTX +tracer record. ++ +The fields are: ++ +-- +[horizontal] +`thread_id`:: + Thread ID. + +`id`:: + rocTX ID. + +`msg`:: + rocTX message. +-- + +`hsa_api_`:: + HSA API beginning and end function calls. ++ +All CTF event records have the following common fields: ++ +-- +[horizontal] +`thread_id`:: + Thread ID. + +`queue_id`:: + Queue ID. + +`agent_id`:: + Agent ID. + +`correlation_id`:: + Correlation ID. +-- ++ +For each ROCProfiler HSA API tracer record for the HSA function named +`__name__`, this plugin writes two event records: ++ +`__name___begin`::: + Beginning of the function call. ++ +The event record contains fields which correspond to most of the +parameters of the HSA function. + +`__name___end`::: + End of the function call. + +`hip_api_`:: + HIP API beginning and end function calls. ++ +All CTF event records have the following common fields: ++ +-- +[horizontal] +`thread_id`:: + Thread ID. + +`queue_id`:: + Queue ID. + +`agent_id`:: + Agent ID. + +`correlation_id`:: + Correlation ID. + +`kernel_name`:: + Kernel name (empty string if not available). +-- ++ +For each ROCProfiler HIP API tracer record for the HIP function named +`__name__`, this plugin writes two event records: ++ +`__name__Begin`::: + Beginning of the function call. ++ +The event record contains fields which correspond to most of the +parameters of the HIP function. + +`__name__End`::: + End of the function call. + +`api_ops_`:: + HSA/HIP API beginning and end operations. ++ +All CTF event records have the following common fields: ++ +-- +[horizontal] +`thread_id`:: + Thread ID. + +`queue_id`:: + Queue ID. + +`agent_id`:: + Agent ID. + +`correlation_id`:: + Correlation ID. +-- ++ +The possible CTF event records are: ++ +`hsa_op_begin`::: + HSA API operation beginning. 
+ +`hsa_op_end`::: + HSA API operation end. + +`hip_op_begin`::: + HIP API operation beginning. ++ +Such an event record also has the field `kernel_name` which is the +kernel name (empty string if not available). + +`hip_op_end`::: + HIP API operation end. + +`profiler_`:: + Profiler records. ++ +All CTF event records have the following common fields: ++ +-- +[horizontal] +`dispatch`:: + Dispatch ID. + +`gpu_id`:: + GPU ID. + +`queue_id`:: + Queue ID. + +`queue_index`:: + Queue index. + +`process_id`:: + Process ID. + +`thread_id`:: + Thread ID. + +`kernel_id`:: + Kernel ID. + +`kernel_name`:: + Kernel name (empty string if not available). + +`counter_names`:: + Array of counter names, each one having a corresponding integral + value in the `counter_values` field. + +`counter_values`:: + Array of integers, each one being the value of a counter of which + the name is a corresponding string in the `counter_names` field. +-- ++ +The possible CTF event records are: ++ +`profiler_record`::: + Profiler record. + +`profiler_record_with_kernel_properties`::: + Profiler record with kernel properties. ++ +Such an event record also has the following fields: ++ +-- +`grid_size`:: + Grid size. + +`workgroup_size`:: + Workgroup size. + +`lds_size`:: + Local memory size. + +`scratch_size`:: + Scratch size. + +`arch_vgpr_count`:: + Architecture vector general purpose register count. + +`accum_vgpr_count`:: + Accum. vector general purpose register count + +`sgpr_count`:: + Scalar general purpose register count. + +`wave_size`:: + Wavefront size. + +`signal_handle`:: + Signal handle. +-- + +`hsa_handles_`:: + HSA handle type mappings. ++ +Each CTF event record is named `hsa_handle_type` and maps an HSA handle +to a processor unit type (CPU or GPU). ++ +The clock value of those event records is irrelevant (always{nbsp}0). ++ +The fields are: ++ +-- +[horizontal] +`handle`:: + HSA handle. + +`type`:: + Processor unit type (`CPU` or `GPU` enumeration label). 
+-- diff --git a/plugin/ctf/barectf_event_record.h b/plugin/ctf/barectf_event_record.h new file mode 100644 index 00000000..f3b40e1e --- /dev/null +++ b/plugin/ctf/barectf_event_record.h @@ -0,0 +1,67 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef PLUGIN_CTF_BARECTF_EVENT_RECORD_H +#define PLUGIN_CTF_BARECTF_EVENT_RECORD_H + +#include +#include + +struct barectf_default_ctx; + +namespace rocm_ctf { + +// Abstract base class of any barectf event record. +// +// A concrete event record class must implement Write() which must call +// a corresponding barectf tracing function. +// +// `CtxT` is the specific type of the barectf context which Write() +// receives. +template class BarectfEventRecord { + protected: + // Builds a barectf event record having the clock value `clock_val`. 
+ explicit BarectfEventRecord(const std::uint64_t clock_val) noexcept : clock_val_{clock_val} {} + + public: + // Shared pointer to const barectf event record. + using SP = std::shared_ptr; + + virtual ~BarectfEventRecord() = default; + + // Disabled copy operations to make this class simpler. + BarectfEventRecord(const BarectfEventRecord&) = delete; + BarectfEventRecord& operator=(const BarectfEventRecord&) = delete; + + // Clock value of this event record. + std::uint64_t GetClockVal() const noexcept { return clock_val_; } + + // Calls a corresponding barectf tracing function using the barectf + // context `barectf_ctx`. + virtual void Write(CtxT& barectf_ctx) const = 0; + + private: + // Clock value. + std::uint64_t clock_val_; +}; + +} // namespace rocm_ctf + +#endif // PLUGIN_CTF_BARECTF_EVENT_RECORD_H diff --git a/plugin/ctf/barectf_platform.h b/plugin/ctf/barectf_platform.h new file mode 100644 index 00000000..914028de --- /dev/null +++ b/plugin/ctf/barectf_platform.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef PLUGIN_CTF_BARECTF_PLATFORM_H +#define PLUGIN_CTF_BARECTF_PLATFORM_H + +#include +#include +#include +#include +#include +#include + +#include "barectf.h" + +namespace rocm_ctf { + +template class BarectfWriter; + +// A barectf platform for any barectf writer. +// +// The user doesn't deal directly with such an object: it's closely +// coupled with a barectf writer. +// +// Each platform takes care of a single CTF data stream file. +// +// After building such a platform, get the raw barectf context with +// GetCtx() to call tracing functions. The platform must still exist +// when calling a tracing function. +// +// Such a platform opens the data stream file on construction and closes +// it on destruction. +// +// `DescrT` is the specific barectf platform descriptor. It must be a +// structure having: +// +// `Ctx`: +// Specific barectf context type. +// +// `static void OpenPacket(Ctx&)`: +// Packet opening function. +// +// `static void ClosePacket(Ctx&)`: +// Packet closing function. +template class BarectfPlatform final { + friend class BarectfWriter; + + private: + // Builds a barectf platform. + // + // The platform writes CTF packets of size `packet_size` bytes to the + // CTF data stream file `data_stream_file_path`. + // + // For each event record to write, the platform reads `clock_val` to + // know the current timestamp. + explicit BarectfPlatform(const std::size_t packet_size, + const std::experimental::filesystem::path& data_stream_file_path, + const std::uint64_t& clock_val) + : clock_val_{&clock_val}, buffer_(packet_size) { + // Initialize barectf callbacks. 
+ barectf_platform_callbacks callbacks; + + callbacks.default_clock_get_value = GetClockCb; + callbacks.is_backend_full = IsBackendFullCb; + callbacks.open_packet = OpenPacketCb; + callbacks.close_packet = ClosePacketCb; + + // Configure exceptions so that stream operations throw instead of + // just setting flags on error. + output_.exceptions(std::ofstream::failbit | std::ofstream::badbit); + + // Open CTF data stream output file in binary mode. + output_.open(data_stream_file_path, std::ios_base::out | std::ios_base::binary); + + // Initialize the raw barectf context. + barectf_init(&ctx_, buffer_.data(), buffer_.size(), callbacks, this); + + // Open the initial packet. + OpenPacketCb(); + } + + public: + // Disabled copy operations to make this class simpler. + BarectfPlatform(const BarectfPlatform&) = delete; + BarectfPlatform& operator=(const BarectfPlatform&) = delete; + + // Closes/writes any last CTF packet and closes the data stream file. + ~BarectfPlatform() { + if (barectf_packet_is_open(&ctx_) && !barectf_packet_is_empty(&ctx_)) { + // Close and write last CTF packet (not empty). + ClosePacketCb(); + } + + // Close data stream output file. + output_.close(); + } + + // Returns the raw barectf context of this platform. + const typename DescrT::Ctx& GetCtx() const noexcept { return ctx_; } + typename DescrT::Ctx& GetCtx() noexcept { return ctx_; } + + private: + static BarectfPlatform& AsPlatform(void* const data) noexcept { + return *static_cast(data); + } + + // Four callbacks for barectf. + // + // Those four functions receive an instance of this class as `data`. + + static std::uint64_t GetClockCb(void* const data) noexcept { + // Forward to instance method. + return AsPlatform(data).GetClockCb(); + } + + static int IsBackendFullCb(void* const data) noexcept { + // Forward to instance method. + return AsPlatform(data).IsBackendFullCb(); + } + + static void OpenPacketCb(void* const data) { + // Forward to instance method. 
+ AsPlatform(data).OpenPacketCb(); + } + + static void ClosePacketCb(void* const data) { + // Forward to instance method. + AsPlatform(data).ClosePacketCb(); + } + + // Instance version of the "get clock value" callback. + std::uint64_t GetClockCb() noexcept { return *clock_val_; } + + // Instance version of the "is the back end full?" callback. + int IsBackendFullCb() noexcept { + // Never full. + return 0; + } + + // Instance version of the "open packet" callback. + void OpenPacketCb() { + // Forward to user (descriptor) function. + DescrT::OpenPacket(ctx_); + } + + // Instance version of the "close packet" callback. + void ClosePacketCb() { + // Forward to user (descriptor) function to finalize the packet. + DescrT::ClosePacket(ctx_); + + // Write to the data stream file. + WriteCurrentPacket(); + } + + // Writes the current CTF packet (`buffer_`) to the data stream file. + void WriteCurrentPacket() { + output_.write(reinterpret_cast(buffer_.data()), buffer_.size()); + } + + // Clock value pointer. + const std::uint64_t* clock_val_; + + // CTF data stream output file stream. + std::ofstream output_; + + // Raw barectf context. + typename DescrT::Ctx ctx_; + + // CTF packet buffer. + std::vector buffer_; +}; + +} // namespace rocm_ctf + +#endif // PLUGIN_CTF_BARECTF_PLATFORM_H diff --git a/plugin/ctf/barectf_tracer.h b/plugin/ctf/barectf_tracer.h new file mode 100644 index 00000000..1842cdbe --- /dev/null +++ b/plugin/ctf/barectf_tracer.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef PLUGIN_CTF_BARECTF_TRACER_H +#define PLUGIN_CTF_BARECTF_TRACER_H + +#include +#include +#include +#include +#include + +#include "barectf_event_record.h" +#include "barectf_writer.h" + +namespace rocm_ctf { + +// A barectf tracer offers the AddEventRecord() method to add an event +// record which it will ultimately write to some CTF data stream file +// within some specified CTF trace directory. +// +// One important feature of such a tracer is that you don't need to add +// event records in order of time. A barectf tracer manages one or more +// barectf writers, each one managing a single barectf platform/context +// (CTF data stream file). +// +// All the CTF data stream files which a barectf tracer indirectly +// manages share a common specified prefix. You must not use the same +// prefix for two barectf tracers writing to the same CTF trace +// directory. 
+// +// `PlatformDescrT` is the specific barectf platform descriptor (see the +// documentation of the `BarectfPlatform` class template). +template class BarectfTracer final { + public: + // Specific barectf event record type. + using EventRecord = typename BarectfWriter::EventRecord; + + // Builds a barectf tracer to write CTF packets of size `packet_size` + // bytes to CTF data stream files having the prefix + // `data_stream_file_name_prefix` within the CTF trace directory + // `trace_dir`. + // + // The internal barectf writers manage event record queues having a + // maximum size of `max_writer_queue_size`. Increasing + // `max_writer_queue_size` increases the memory footprint of the + // tracer, but may reduce the number of required CTF data stream files + // to ensure time-ordered event records. + explicit BarectfTracer(const std::size_t packet_size, + std::experimental::filesystem::path trace_dir, + const char* const data_stream_file_name_prefix, + const std::size_t max_writer_queue_size = 200) + : packet_size_{packet_size}, + trace_dir_{std::move(trace_dir)}, + data_stream_file_name_prefix_{data_stream_file_name_prefix}, + max_writer_queue_size_{max_writer_queue_size} {} + + // Disabled copy operations to make this class simpler. + BarectfTracer(const BarectfTracer&) = delete; + BarectfTracer& operator=(const BarectfTracer&) = delete; + + // Adds the event record `event_record` to this tracer. + // + // The clock value of `event_record` may be less than the clock value + // of previously added event records. + void AddEventRecord(typename EventRecord::SP event_record) { + // Try to find a barectf writer to accept `event_record`. + for (auto& writer : writers_) { + if (writer->MayAddEventRecord(*event_record)) { + // Found: add the event record to this writer and return. + writer->AddEventRecord(std::move(event_record)); + return; + } + } + + // No barectf writer found: create a new one. 
+ std::ostringstream ss; + + ss << data_stream_file_name_prefix_ << writers_.size(); + writers_.emplace_back(new BarectfWriter{packet_size_, trace_dir_ / ss.str(), + max_writer_queue_size_}); + + // Add the event record to this new barectf writer. + assert(writers_.back()->MayAddEventRecord(*event_record)); + writers_.back()->AddEventRecord(std::move(event_record)); + } + + private: + // CTF packet size. + std::size_t packet_size_; + + // CTF trace directory. + std::experimental::filesystem::path trace_dir_; + + // CTF data stream file name prefix. + std::string data_stream_file_name_prefix_; + + // Maximum event record queue size of a barectf writer. + std::size_t max_writer_queue_size_; + + // barectf writers. + std::vector>> writers_; +}; + +} // namespace rocm_ctf + +#endif // PLUGIN_CTF_BARECTF_TRACER_H diff --git a/plugin/ctf/barectf_writer.h b/plugin/ctf/barectf_writer.h new file mode 100644 index 00000000..73a03546 --- /dev/null +++ b/plugin/ctf/barectf_writer.h @@ -0,0 +1,178 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef PLUGIN_CTF_BARECTF_WRITER_H +#define PLUGIN_CTF_BARECTF_WRITER_H + +#include +#include +#include +#include +#include +#include +#include + +#include "barectf_platform.h" +#include "barectf_event_record.h" + +namespace rocm_ctf { + +template class BarectfTracer; + +// A barectf writer manages a queue of event records, writing them +// through barectf when needed. +// +// Such an object makes it possible to add some event record with a +// clock value V and then some other event record of which the clock +// value is less than V. The barectf writer ensures that actual barectf +// tracing functions are called chronologically, a requirement of CTF. +// +// A barectf writer keeps event records in memory until its queue is +// full (you provide the maximum queue size at construction time), in +// which case it writes the oldest event record to some current CTF +// packet through a barectf tracing function. +// +// Call MayAddEventRecord() to check whether or not you may add an event +// record to the barectf writer, and then AddEventRecord() if you may. +// +// A barectf writer writes all its remaining event records on +// destruction. +// +// `PlatformDescrT` is the specific barectf platform descriptor (see the +// documentation of the `BarectfPlatform` class template). +template class BarectfWriter final { + friend class BarectfTracer; + + public: + // Specific barectf event record type. + using EventRecord = BarectfEventRecord; + + private: + // Builds a barectf writer to write CTF packets of size `packet_size` + // bytes to the CTF data stream file `data_stream_file_path`. + // + // The built barectf writer manages an event record queue having a + // maximum size of `max_queue_size`. 
+ explicit BarectfWriter(const std::size_t packet_size, + const std::experimental::filesystem::path& data_stream_file_path, + const std::size_t max_queue_size) + : platform_{packet_size, data_stream_file_path, clock_val_}, + max_queue_size_{max_queue_size} {} + + public: + // Writes all its remaining event records. + ~BarectfWriter() { + // Write all the remaining event records from the oldest to the + // newest. + while (!queue_.empty()) { + WriteOldestEventRecord(); + } + } + + // Disabled copy operations to make this class simpler. + BarectfWriter(const BarectfWriter&) = delete; + BarectfWriter& operator=(const BarectfWriter&) = delete; + + // Whether or not you may add the event record `event_record` to this + // writer with AddEventRecord(). + bool MayAddEventRecord(const EventRecord& event_record) const noexcept { + if (queue_.empty()) { + return true; + } + + // One may only add an event record if its clock value is greater + // than or equal to the clock value of the most recently written + // event record. + return event_record.GetClockVal() >= clock_val_; + } + + // Adds the event record `event_record` to this writer. + // + // `MayAddEventRecord(*event_record)` must return `true`. + void AddEventRecord(typename EventRecord::SP event_record) { + assert(MayAddEventRecord(*event_record) && "May add event record"); + + // Add event record to queue. + queue_.emplace(std::move(event_record)); + + if (queue_.size() > max_queue_size_) { + // Queue is too large: write the oldest event record now to + // satisfy the requirement. + WriteOldestEventRecord(); + } + } + + private: + // Comparison type for `queue_`. + struct EventRecordQueueCompare final { + bool operator()(const typename EventRecord::SP& left, + const typename EventRecord::SP& right) const noexcept { + // "Greater than" so that the top element of the queue is the + // oldest event record. + return left->GetClockVal() > right->GetClockVal(); + } + }; + + // Oldest event record within `queue_`. 
+ // + // `queue_` must not be empty. + const EventRecord& GetOldestEventRecord() const noexcept { + assert(!queue_.empty() && "Queue isn't empty"); + return *queue_.top(); + } + + // Writes the oldest event record through a barectf tracing function + // and removes it from the event record queue. + void WriteOldestEventRecord() { + auto& oldest_event_record = GetOldestEventRecord(); + + // When calling a barectf tracing function, it calls the clock value + // accessor callback of the platform, which itself reads from + // `clock_val_`. + clock_val_ = oldest_event_record.GetClockVal(); + + // Forward to a barectf tracing function. + oldest_event_record.Write(platform_.GetCtx()); + + // Remove from queue. + queue_.pop(); + } + + // barectf platform (manages file I/O). + BarectfPlatform platform_; + + // Current clock value for `platform_`. + // + // This is also the clock value of the most recently written event + // record, therefore that MayAddEventRecord() can rely on this. + std::uint64_t clock_val_ = 0; + + // Maximum size of `queue_` below. + std::size_t max_queue_size_; + + // Event record queue. + std::priority_queue, + EventRecordQueueCompare> + queue_; +}; + +} // namespace rocm_ctf + +#endif // PLUGIN_CTF_BARECTF_WRITER_H diff --git a/plugin/ctf/config.yaml b/plugin/ctf/config.yaml new file mode 100644 index 00000000..f0b75eb2 --- /dev/null +++ b/plugin/ctf/config.yaml @@ -0,0 +1,165 @@ +################################################################################ +# Copyright (c) 2022 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +################################################################################ + +%YAML 1.2 +--- ! +trace: + $include: + # Environment (generated file). + - env.yaml + type: + $include: + - stdint.yaml + - stdmisc.yaml + native-byte-order: little-endian + clock-types: + default: + origin-is-unix-epoch: true + $c-type: uint64_t + data-stream-types: + hsa_api: + event-record-common-context-field-type: + class: struct + members: + - _thread_id: uint32 + - _queue_id: uint32 + - _agent_id: uint32 + - _correlation_id: uint64 + $include: + # Base. + - dst_base.yaml + + # HSA API event record types (generated file). + - hsa_erts.yaml + hip_api: + event-record-common-context-field-type: + class: struct + members: + - _thread_id: uint32 + - _queue_id: uint32 + - _agent_id: uint32 + - _correlation_id: uint64 + - _kernel_name: str + $include: + # Base. + - dst_base.yaml + + # HIP API event record types (generated file). 
+ - hip_erts.yaml + roctx: + $include: + # Base + - dst_base.yaml + event-record-common-context-field-type: + class: struct + members: + - _thread_id: uint32 + event-record-types: + roctx: + payload-field-type: + class: struct + members: + - _id: sint64 + - _msg: str + hsa_handles: + $include: + # Base. + - dst_base.yaml + event-record-types: + hsa_handle_type: + payload-field-type: + class: struct + members: + - _handle: uint64 + - _type: + field-type: + class: uenum + size: 8 + mappings: + CPU: [0] + GPU: [1] + api_ops: + $include: + # Base. + - dst_base.yaml + event-record-common-context-field-type: + class: struct + members: + - _thread_id: uint32 + - _queue_id: uint32 + - _agent_id: uint32 + - _correlation_id: uint64 + event-record-types: + hsa_op_begin: + payload-field-type: + class: struct + hsa_op_end: + payload-field-type: + class: struct + hip_op_begin: + payload-field-type: + class: struct + members: + - _kernel_name: str + hip_op_end: + payload-field-type: + class: struct + profiler: + $include: + # Base. 
+ - dst_base.yaml + event-record-common-context-field-type: + class: struct + members: + - _dispatch: uint64 + - _gpu_id: uint64 + - _queue_id: uint64 + - _queue_index: uint64 + - _process_id: uint32 + - _thread_id: uint32 + - _kernel_id: uint64 + - _kernel_name: str + - _counter_names: + field-type: + class: dynamic-array + element-field-type: str + - _counter_values: + field-type: + class: dynamic-array + element-field-type: uint64 + event-record-types: + profiler_record: + payload-field-type: + class: struct + profiler_record_with_kernel_properties: + payload-field-type: + class: struct + members: + - _grid_size: uint64 + - _workgroup_size: uint64 + - _lds_size: uint64 + - _scratch_size: uint64 + - _arch_vgpr_count: uint64 + - _accum_vgpr_count: uint64 + - _sgpr_count: uint64 + - _wave_size: uint64 + - _signal_handle: uint64 diff --git a/plugin/ctf/ctf.cpp b/plugin/ctf/ctf.cpp new file mode 100644 index 00000000..829231c3 --- /dev/null +++ b/plugin/ctf/ctf.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include +#include +#include +#include + +#include "rocprofiler.h" +#include "rocprofiler_plugin.h" + +#include "plugin.h" + +namespace fs = std::experimental::filesystem; + +namespace { + +// Global plugin instance +rocm_ctf::Plugin* the_plugin = nullptr; + +} // namespace + +ROCPROFILER_EXPORT int rocprofiler_plugin_initialize(const uint32_t rocprofiler_major_version, + const uint32_t rocprofiler_minor_version) { + if (rocprofiler_major_version != ROCPROFILER_VERSION_MAJOR || + rocprofiler_minor_version < ROCPROFILER_VERSION_MINOR) { + return -1; + } + + if (the_plugin) { + return -1; + } + + const auto output_dir = getenv("OUTPUT_PATH"); + + if (!output_dir) { + std::cerr << "rocprofiler_plugin_initialize(): " + << "`OUTPUT_PATH` environment variable isn't set" << std::endl; + return -1; + } + + // Create the plugin instance. 
+ try { + the_plugin = new rocm_ctf::Plugin{256 * 1024, fs::path{output_dir} / "trace", + CTF_PLUGIN_METADATA_FILE_PATH}; + } catch (const std::exception& exc) { + std::cerr << "rocprofiler_plugin_initialize(): " << exc.what() << std::endl; + return -1; + } + + return 0; +} + +ROCPROFILER_EXPORT void rocprofiler_plugin_finalize() { + delete the_plugin; + the_plugin = nullptr; +} + +ROCPROFILER_EXPORT int rocprofiler_plugin_write_buffer_records( + const rocprofiler_record_header_t* const begin, const rocprofiler_record_header_t* const end, + const rocprofiler_session_id_t session_id, const rocprofiler_buffer_id_t buffer_id) { + assert(the_plugin); + + try { + the_plugin->HandleBufferRecords(begin, end, session_id, buffer_id); + } catch (const std::exception& exc) { + std::cerr << "rocprofiler_plugin_write_buffer_records(): " << exc.what() << std::endl; + return -1; + } + + return 0; +} + +ROCPROFILER_EXPORT int rocprofiler_plugin_write_record(const rocprofiler_record_tracer_t record, + const rocprofiler_session_id_t session_id) { + assert(the_plugin); + + if (record.header.id.handle == 0) { + return 0; + } + + try { + the_plugin->HandleTracerRecord(record, session_id); + } catch (const std::exception& exc) { + std::cerr << "rocprofiler_plugin_write_record(): " << exc.what() << std::endl; + return -1; + } + + return 0; +} diff --git a/plugin/ctf/dst_base.yaml b/plugin/ctf/dst_base.yaml new file mode 100644 index 00000000..18f6f8f2 --- /dev/null +++ b/plugin/ctf/dst_base.yaml @@ -0,0 +1,28 @@ +################################################################################ +# Copyright (c) 2022 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +################################################################################ + +$default-clock-type-name: default +$features: + packet: + beginning-timestamp-field-type: false + discarded-event-records-counter-snapshot-field-type: false + end-timestamp-field-type: false diff --git a/plugin/ctf/gen_api_files.py b/plugin/ctf/gen_api_files.py new file mode 100644 index 00000000..58743bcb --- /dev/null +++ b/plugin/ctf/gen_api_files.py @@ -0,0 +1,645 @@ +################################################################################ +# Copyright (c) 2022 Advanced Micro Devices, Inc. 
# Numeric field type (abstract).
#
# Concrete subclasses provide the `c_type` property.
class _NumericFt:
    # Returns the C++ expression to cast the expression `expr` to the C
    # type of this field type.
    def cast(self, expr):
        return f'static_cast<{self.c_type}>({expr})'


# Integer field type (abstract).
class _IntFt(_NumericFt):
    def __init__(self, size, pref_disp_base='dec'):
        self._size = size
        self._pref_disp_base = pref_disp_base

    # Size (bits).
    @property
    def size(self):
        return self._size

    # Preferred display base (`dec` or `hex`).
    @property
    def pref_disp_base(self):
        return self._pref_disp_base

    # Equivalent barectf field type in YAML (without the `class`
    # property, which subclasses add).
    @property
    def barectf_yaml(self):
        return {
            'size': self._size,
            'preferred-display-base': self._pref_disp_base,
        }


# Signed integer field type.
class _SIntFt(_IntFt):
    # Equivalent barectf field type in YAML.
    @property
    def barectf_yaml(self):
        ret = super().barectf_yaml
        ret['class'] = 'sint'
        return ret

    # Equivalent C type.
    @property
    def c_type(self):
        return f'std::int{self._size}_t'


# Unsigned integer field type.
class _UIntFt(_IntFt):
    # Equivalent barectf field type in YAML.
    @property
    def barectf_yaml(self):
        ret = super().barectf_yaml
        ret['class'] = 'uint'
        return ret

    # Equivalent C type.
    @property
    def c_type(self):
        return f'std::uint{self._size}_t'


# Pointer field type: a 64-bit unsigned integer displayed in base 16.
class _PointerFt(_UIntFt):
    def __init__(self):
        super().__init__(64, 'hex')

    # Returns the C++ expression to cast the expression `expr` to the C
    # type of this field type.
    #
    # A pointer cannot be converted to an integer with `static_cast`,
    # hence the intermediate `reinterpret_cast`.
    def cast(self, expr):
        return f'static_cast<{self.c_type}>(reinterpret_cast<std::uintptr_t>({expr}))'


# Enumeration field type (abstract).
class _EnumFt(_IntFt):
    def __init__(self, size, mappings):
        super().__init__(size)

        # Keep a private copy so that later changes to the dictionary
        # of the caller don't affect this field type.
        self._mappings = mappings.copy()

    # Mappings (names to integers).
    @property
    def mappings(self):
        return self._mappings

    # Equivalent barectf field type in YAML.
    @property
    def barectf_yaml(self):
        ret = super().barectf_yaml

        # barectf expects a list of values/ranges for each mapping.
        ret['mappings'] = {name: [val] for name, val in self._mappings.items()}
        return ret


# Unsigned enumeration field type.
class _UEnumFt(_EnumFt, _UIntFt):
    # Equivalent barectf field type in YAML.
    @property
    def barectf_yaml(self):
        ret = super().barectf_yaml
        ret['class'] = 'uenum'
        return ret


# Signed enumeration field type.
#
# Derives from the _signed_ integer field type so that `c_type` (and
# therefore cast()) yields `std::intN_t`: an enumeration with negative
# enumerators must not be cast to an unsigned C type.
class _SEnumFt(_EnumFt, _SIntFt):
    # Equivalent barectf field type in YAML.
    @property
    def barectf_yaml(self):
        ret = super().barectf_yaml
        ret['class'] = 'senum'
        return ret
# Optional string field type: the C string may be null.
class _OptStrFt:
    # Equivalent barectf field type in YAML.
    @property
    def barectf_yaml(self):
        return {
            'class': 'str',
        }


# String field type.
class _StrFt(_OptStrFt):
    pass


# Floating-point number field type.
class _FloatFt(_NumericFt):
    def __init__(self, size):
        self._size = size

    # Size (bits): 32 or 64.
    @property
    def size(self):
        return self._size

    # Equivalent barectf field type in YAML.
    @property
    def barectf_yaml(self):
        return {
            'class': 'real',
            'size': self._size,
        }

    # Equivalent C type.
    @property
    def c_type(self):
        if self._size == 32:
            return 'float'
        else:
            assert self._size == 64
            return 'double'


# Event record type.
class _Ert:
    def __init__(self, api_func_name, members):
        self._api_func_name = api_func_name
        self._members = members

    # API function name.
    @property
    def api_func_name(self):
        return self._api_func_name

    # Parameters of function (list of `_ErtMember`).
    @property
    def members(self):
        return self._members


# Beginning event record type.
class _BeginErt(_Ert):
    # Name of event record type depending on the API prefix.
    def name(self, api_prefix):
        suffix = '_begin' if api_prefix == 'hsa' else 'Begin'
        return f'{self._api_func_name}{suffix}'


# End event record type.
class _EndErt(_Ert):
    # Name of event record type depending on the API prefix.
    def name(self, api_prefix):
        suffix = '_end' if api_prefix == 'hsa' else 'End'
        return f'{self._api_func_name}{suffix}'


# Event record type member.
class _ErtMember:
    def __init__(self, access, member_names, ft):
        self._access = access

        # Copy: callers reuse and mutate their list.
        self._member_names = member_names.copy()
        self._ft = ft

    # C++ access expression.
    @property
    def access(self):
        return self._access

    # List of member names.
    @property
    def member_names(self):
        return self._member_names

    # Equivalent field type.
    @property
    def ft(self):
        return self._ft


# Makes sure some condition is satisfied, or prints the error message
# `error_msg` and quits with exit status 1 otherwise.
#
# This is an unconditional assertion.
def _make_sure(cond, error_msg):
    if not cond:
        print(f'Error: {error_msg}', file=sys.stderr)
        sys.exit(1)


# Returns the integral value of the enumerator `enum_val` (a dictionary
# with the `name` key and the optional `value` and `raw_value` keys), or
# quits with an error message if there's no such value.
def _enumerator_effective_val(enum_val):
    # Try the value, but this value may be a string (an
    # enumerator/definition).
    val = enum_val.get('value')

    if type(val) is int:
        return val

    # Try the raw value.
    val = enum_val.get('raw_value')

    if val is not None:
        if type(val) is int:
            # Raw value is already an integer.
            return val
        else:
            # Try to parse the raw value string as an integer.
            #
            # Only catch conversion failures: a bare `except` would
            # also swallow `KeyboardInterrupt` and `SystemExit`.
            try:
                return int(val, 0)
            except (TypeError, ValueError):
                pass

    _make_sure(False,
               f'Cannot get the integral value of enumerator `{enum_val["name"]}`')


# Matches a fixed-width integer type name of `<cstdint>`; group 1 is `u`
# for an unsigned type and group 2 is the size (bits).
_FIXED_WIDTH_INT_TYPE_RE = re.compile(r'\b(u?)int(8|16|32|64)_t\b')

# Match the unsigned and signed pointer-sized integer type names.
#
# They map to 64-bit field types, like `_PointerFt` does for pointers.
_UNSIGNED_PTR_SIZED_INT_TYPE_RE = re.compile(r'\b(?:size_t|uintptr_t)\b')
_SIGNED_PTR_SIZED_INT_TYPE_RE = re.compile(r'\b(?:ssize_t|intptr_t|ptrdiff_t)\b')


# Returns the equivalent field type of the C type `c_type`.
#
# `cpp_header` must offer the `enums` attribute: a list of enumeration
# dictionaries having the `name` and `values` keys.
def _number_ft_from_c_type(cpp_header, c_type):
    # Check for known enumeration.
    m = re.match(r'(?:enum\s+)?(\w+)', c_type)

    if m:
        size = 32

        for enum_info in cpp_header.enums:
            if m.group(1) == enum_info.get('name'):
                # Fill enumeration field type mappings.
                mappings = {
                    str(v['name']): _enumerator_effective_val(v)
                    for v in enum_info['values']
                }

                if len(mappings) == 0:
                    # Empty enumeration: fall back to a plain integer.
                    return _SIntFt(64)

                if max(mappings.values()) >= 2**31 or min(mappings.values()) < -2**31:
                    size = 64

                # Create corresponding enumeration field type.
                return _SEnumFt(size, mappings)

    # Fixed-width and pointer-sized integer types carry their own size
    # and signedness: without this, `uint64_t` or `size_t` would fall
    # through to the `int` default below and be truncated to 32 bits.
    m = _FIXED_WIDTH_INT_TYPE_RE.search(c_type)

    if m:
        ft_cls = _UIntFt if m.group(1) == 'u' else _SIntFt
        return ft_cls(int(m.group(2)))

    if _UNSIGNED_PTR_SIZED_INT_TYPE_RE.search(c_type):
        return _UIntFt(64)

    if _SIGNED_PTR_SIZED_INT_TYPE_RE.search(c_type):
        return _SIntFt(64)

    # Find corresponding basic field type.
    is_unsigned = 'unsigned' in c_type

    if 'long' in c_type:
        if is_unsigned:
            return _UIntFt(64)
        else:
            return _SIntFt(64)
    elif 'short' in c_type:
        if is_unsigned:
            return _UIntFt(16)
        else:
            return _SIntFt(16)
    elif 'char' in c_type:
        if is_unsigned:
            return _UIntFt(8)
        else:
            return _SIntFt(8)
    elif 'float' in c_type:
        return _FloatFt(32)
    elif 'double' in c_type:
        return _FloatFt(64)
    else:
        # Assume `int` (often an unresolved C enumeration).
        if is_unsigned:
            return _UIntFt(32)
        else:
            return _SIntFt(32)


# Returns whether or not a property has a pointer type.
def _prop_is_pointer(prop, c_type):
    if prop['pointer'] or prop['function_pointer']:
        return True

    if prop['array'] and 'array_size' in prop:
        return True

    if prop['unresolved']:
        # HSA API function pointers.
        if prop['name'] in ('callback', 'handler'):
            return True

        # HIP API function pointers.
        if c_type.endswith('Fn_t'):
            return True

    # Check the C type itself.
    if '*' in c_type or '*' in prop.get('raw_type', ''):
        return True

    return False


# Returns a list of event record type member objects for the structure
# `struct` considering the initial C++ access expression `access` and
# member names `member_names`.
def _get_ert_members_for_struct(cpp_header, struct, access, member_names):
    members = []

    # Work on a copy of which the last element is the name of the
    # property being visited.
    member_names = member_names.copy()
    member_names.append(None)
    props = struct['properties']['public']

    for prop in props:
        # Property name.
        name = prop['name']

        # Member names, access, and C type.
        member_names[-1] = str(name)
        this_access = f'{access}.{name}'
        c_type = prop['type']
        aliases = prop['aliases']

        # Skip no type.
        if c_type == '':
            continue

        # Skip unnamed or union.
        if name == '' or 'union' in name or re.match(r'\bunion\b', c_type):
            continue

        # Check for known C type alias, following the chain of type
        # definitions while making sure to stop on a self-referential
        # or cyclic one instead of looping forever.
        seen_c_types = {c_type}

        while True:
            c_type_alias = cpp_header.typedefs.get(c_type)

            if c_type_alias is None or c_type_alias in seen_c_types:
                break

            seen_c_types.add(c_type_alias)
            c_type = c_type_alias

        # Check for C string.
        if re.match(r'^((const\s+char)|(char\s+const)|char)\s*\*$',
                    c_type.strip()):
            members.append(_ErtMember(this_access, member_names, _OptStrFt()))
            continue

        # Check for pointer.
        if _prop_is_pointer(prop, c_type):
            # Pointer: use numeric value.
            members.append(_ErtMember(this_access, member_names, _PointerFt()))
            continue

        # Check for substructure.
        sub_struct = cpp_header.classes.get(c_type)

        if sub_struct is None and len(aliases) == 1:
            sub_struct = cpp_header.classes.get(aliases[0])

        if sub_struct is not None:
            # Flatten: the members of the substructure become members
            # of this event record type.
            members += _get_ert_members_for_struct(cpp_header, sub_struct,
                                                   this_access, member_names)
            continue

        # Use a basic field type.
        members.append(_ErtMember(this_access, member_names,
                                  _number_ft_from_c_type(cpp_header, c_type)))

    return members


# Returns the beginning and end event record type objects for the
# callback data structure `struct`.
#
# `retval_info` is what _get_retval_info() returns: when it's not
# `None`, the first nested class of `struct` is taken as the return
# value union and the second one as the `args` union.
def _erts_from_cb_data_struct(api_prefix, cpp_header, retval_info, struct):
    # The location of the `args` union within the nested structures of
    # `struct`.
    args_nested_cls_index = 0

    # Create return value members (to be used later).
    if retval_info is not None:
        args_nested_cls_index = 1
        retval_members = {}
        nested_classes = struct['nested_classes']
        _make_sure(len(nested_classes) >= 1,
                   f"Return value union doesn't exist in `{struct['name']}`")
        retval_union = nested_classes[0]

        for prop in retval_union['properties']['public']:
            name = str(prop['name'])
            member = _ErtMember(f'GetApiData().{name}', ['retval'],
                                _number_ft_from_c_type(cpp_header, prop['type']))
            retval_members[prop['name']] = member

        # Make sure we have everything we need.
        for api_func_name, retval_name in retval_info.items():
            if retval_name is not None:
                _make_sure(retval_name in retval_members,
                           f"Return value union member `{retval_name}` doesn't exist (function {api_func_name}())")

    # Create beginning/end event record type objects.
    begin_erts = []
    end_erts = []
    nested_classes = struct['nested_classes'][args_nested_cls_index]['nested_classes']
    props = struct['nested_classes'][args_nested_cls_index]['properties']['public']
    _make_sure(len(nested_classes) == len(props),
               f'Mismatch between nested structure and member count in `{struct["name"]}`')

    for index, prop in enumerate(props):
        # API function name is the name of the member.
        api_func_name = str(prop['name'])

        # Get the parameters.
        members = _get_ert_members_for_struct(cpp_header,
                                              nested_classes[index],
                                              f'GetApiData().args.{api_func_name}',
                                              [])

        # Append new beginning event record type object.
        begin_erts.append(_BeginErt(api_func_name, members))

        # Append new end event record type object, with the return
        # value member if there's one for this function.
        ret_members = []

        if retval_info is not None:
            retval_type = retval_info.get(api_func_name)

            if retval_type is not None:
                ret_members.append(retval_members[retval_type])

        end_erts.append(_EndErt(api_func_name, ret_members))

    return begin_erts, end_erts
def _get_retval_info(path):
    # Only file names containing `hsa` are scanned.
    if 'hsa' not in os.path.basename(path):
        return None

    retval_info = {}
    cur_api_func_name = None

    with open(path) as f:
        for line in f:
            if 'out << ")' in line and cur_api_func_name is not None:
                # End of the printing code of the current function: the
                # return value member, if any, is accessed on this line.
                #
                # The dot is escaped so that only a real member access
                # (`api_data.xyz_retval`) matches.
                m = re.search(r'api_data\.(\w+_retval)', line)
                retval_info[cur_api_func_name] = m.group(1) if m else None
            else:
                # Beginning of the printing code of some function.
                m = re.search(r'out << "(hsa_\w+)\(";', line)

                if m:
                    cur_api_func_name = m.group(1)

    return retval_info


# Returns a partial barectf data stream type in YAML with the event
# record types `erts`.
def _yaml_dst_from_erts(api_prefix, erts):
    # Base.
    yaml_erts = {}
    yaml_dst = {
        'event-record-types': yaml_erts,
    }

    # Create one event record type per API function.
    for ert in erts:
        # Base.
        yaml_members = []
        yaml_ert = {
            'payload-field-type': {
                'class': 'struct',
                'members': yaml_members,
            },
        }

        # Create one structure field type member per member.
        for member in ert.members:
            # barectf doesn't support nested CTF structures, so join
            # individual member names with `__` to flatten.
            yaml_members.append({
                '_' + '__'.join(member.member_names): {
                    'field-type': member.ft.barectf_yaml,
                },
            })

        # Add event record type.
        yaml_erts[ert.name(api_prefix)] = yaml_ert

    # Convert to YAML.
    return yaml.dump(yaml_dst)
+def _cpp_switch_statement_from_erts(api_prefix, erts): + lines = [] + lines.append('switch (GetOp()) {') + + for ert in erts: + lines.append(f' case {api_prefix.upper()}_API_ID_{ert.api_func_name}:') + lines.append(f' barectf_{api_prefix}_api_trace_{ert.name(api_prefix)}(') + lines.append(f' &barectf_ctx,') + lines.append(f' GetThreadId(),') + lines.append(f' GetQueueId(),') + lines.append(f' GetAgentId(),') + lines.append(f' GetCorrelationId(),') + + if api_prefix == 'hip': + lines.append(f' GetKernelName().c_str(),') + + if len(ert.members) == 0: + # Remove last comma. + lines[-1] = lines[-1].replace(',', '') + + for index, member in enumerate(ert.members): + if type(member.ft) is _OptStrFt: + # Only dereference C string if not null, otherwise use + # an empty string. + lines.append(f' {member.access} ? {member.access} : ""') + elif type(member.ft) is _StrFt: + lines.append(f' {member.access}') + else: + lines.append(f' {member.ft.cast(member.access)}') + + if index + 1 < len(ert.members): + lines[-1] += ',' + + lines.append(' );') + lines.append(' break;') + + lines.append('}') + return lines + + +# Processes the complete API header file `path`. +def _process_file(api_prefix, path): + # Create `CppHeader` object. + try: + cpp_header = CppHeaderParser.CppHeader(path) + except CppHeaderParser.CppParseError as exc: + print(exc, file=sys.stderr) + sys.exit(1) + + # Get return value information dictionary. + retval_info = _get_retval_info(path) + + # Find callback data structure. + for struct_name, struct in cpp_header.classes.items(): + if re.match(r'^' + api_prefix + r'_api_data\w+$', struct_name): + # Process callback data structure. + begin_erts, end_erts = _erts_from_cb_data_struct(api_prefix, + cpp_header, + retval_info, + struct) + + # Write barectf YAML file. + with open(f'{api_prefix}_erts.yaml', 'w') as f: + f.write(_yaml_dst_from_erts(api_prefix, begin_erts + end_erts)) + + # Write C++ code (beginning event record). 
+ with open(f'{api_prefix}_begin.cpp.i', 'w') as f: + f.write('\n'.join(_cpp_switch_statement_from_erts(api_prefix, + begin_erts))) + + # Write C++ code (end event record). + with open(f'{api_prefix}_end.cpp.i', 'w') as f: + f.write('\n'.join(_cpp_switch_statement_from_erts(api_prefix, + end_erts))) + + +if __name__ == '__main__': + # Disable `CppHeaderParser` printing to standard output. + CppHeaderParser.CppHeaderParser.print_warnings = 0 + CppHeaderParser.CppHeaderParser.print_errors = 0 + CppHeaderParser.CppHeaderParser.debug = 0 + CppHeaderParser.CppHeaderParser.debug_trace = 0 + + # Process the complete API header file. + _process_file(sys.argv[1], sys.argv[2]) diff --git a/plugin/ctf/gen_env_yaml.py b/plugin/ctf/gen_env_yaml.py new file mode 100644 index 00000000..009f3689 --- /dev/null +++ b/plugin/ctf/gen_env_yaml.py @@ -0,0 +1,33 @@ +################################################################################ +# Copyright (c) 2022 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +################################################################################ + +import sys +import yaml + + +if __name__ == '__main__': + with open('env.yaml', 'w') as f: + f.write(yaml.dump({ + 'environment': { + 'rocprofiler_version': sys.argv[1], + } + })) diff --git a/plugin/ctf/plugin.cpp b/plugin/ctf/plugin.cpp new file mode 100644 index 00000000..7124f6f2 --- /dev/null +++ b/plugin/ctf/plugin.cpp @@ -0,0 +1,869 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "hsa_prof_str.h" + +#include +#include + +#include "rocprofiler.h" +#include "rocprofiler_plugin.h" +#include "../utils.h" + +#include "barectf.h" +#include "barectf_event_record.h" +#include "barectf_tracer.h" +#include "plugin.h" + +namespace fs = std::experimental::filesystem; + +namespace rocm_ctf { +namespace { + +// Abstract tracer event record using the barectf context type `CtxT`. +template class TracerEventRecord : public BarectfEventRecord { + protected: + explicit TracerEventRecord(const rocprofiler_record_tracer_t& record, const std::uint64_t clock_val) + : BarectfEventRecord{clock_val}, + op_{record.operation_id.id}, + thread_id_{record.thread_id.value}, + queue_id_{record.queue_id.handle}, + agent_id_{record.agent_id.handle}, + correlation_id_{record.correlation_id.value} {} + + std::uint32_t GetOp() const noexcept { return op_; } + std::uint32_t GetThreadId() const noexcept { return thread_id_; } + std::uint64_t GetQueueId() const noexcept { return queue_id_; } + std::uint64_t GetAgentId() const noexcept { return agent_id_; } + std::uint64_t GetCorrelationId() const noexcept { return correlation_id_; } + + private: + std::uint32_t op_; + std::uint32_t thread_id_; + std::uint64_t queue_id_; + std::uint64_t agent_id_; + std::uint64_t correlation_id_; +}; + +// Returns the beginning clock value of the tracer or profiler record +// `record`. +template std::uint64_t GetRecordBeginClockVal(const RecordT& record) { + return record.timestamps.begin.value; +} + +// Returns the end clock value of the tracer or profiler record +// `record`. 
+template std::uint64_t GetRecordEndClockVal(const RecordT& record) { + return record.timestamps.end.value; +} + +// Queries allocated string data using the size query function +// `query_size_func` and the data query function `query_data_func`, +// returning the corresponding string and freeing temporary allocated +// memory. +// +// Returns an empty string if anything goes wrong. +template +std::string QueryAllocStr(QuerySizeFuncT&& query_size_func, QueryDataFuncT&& query_data_func) { + // Query size first. + std::size_t size = 0; + [[maybe_unused]] auto ret = query_size_func(&size); + + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query size"); + + if (size == 0) { + // No size: return empty string. + return {}; + } + + // Query data (allocated by query_data_func()). + char* alloc_str = nullptr; + + ret = query_data_func(&alloc_str); + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query data"); + + if (!alloc_str) { + // No data: return empty string. + return {}; + } + + // Allocate return value. + std::string str_ret{alloc_str}; + + // Free allocated data. + std::free(alloc_str); + + // Return string object. + return str_ret; +} + +// rocTX event record. +class RocTxEventRecord final : public TracerEventRecord { + public: + explicit RocTxEventRecord(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) + : TracerEventRecord{record, GetRecordBeginClockVal(record)}, + id_{QueryId(record, session_id)}, + msg_{QueryMsg(record, session_id)} {} + + void Write(barectf_roctx_ctx& barectf_ctx) const override { + barectf_roctx_trace_roctx(&barectf_ctx, GetThreadId(), id_, msg_.c_str()); + } + + private: + // Queries and returns the rocTX message of the record `record` and + // session ID `session_id`. + // + // Returns an empty string if not available. + static std::string QueryMsg(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) { + // Query size first. 
+ std::size_t msg_size = 0; + [[maybe_unused]] auto ret = rocprofiler_query_roctx_tracer_api_data_info_size( + session_id, ROCPROFILER_ROCTX_MESSAGE, record.api_data_handle, record.operation_id, + &msg_size); + + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query rocTX message size"); + + if (msg_size == 0) { + // No size: return empty string. + return {}; + } + + // Query data (borrowed from the record: no need to free). + char* msg = nullptr; + + ret = rocprofiler_query_roctx_tracer_api_data_info( + session_id, ROCPROFILER_ROCTX_MESSAGE, record.api_data_handle, record.operation_id, &msg); + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query rocTX message"); + + if (!msg) { + // No data: return empty string. + return {}; + } + + return rocmtools::cxx_demangle(msg); + } + + // Queries and returns the rocTX ID of the record `record` and the + // session ID `session_id`. + // + // Returns 0 if anything goes wrong. + static std::uint64_t QueryId(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) { + try { + return std::stoull(QueryAllocStr( + [&record, session_id](const auto size) { + return rocprofiler_query_roctx_tracer_api_data_info_size( + session_id, ROCPROFILER_ROCTX_ID, record.api_data_handle, record.operation_id, size); + }, + [&record, session_id](const auto str) { + return rocprofiler_query_roctx_tracer_api_data_info( + session_id, ROCPROFILER_ROCTX_ID, record.api_data_handle, record.operation_id, str); + })); + } catch (...) { + return 0; + } + } + + std::uint64_t id_; + std::string msg_; +}; + +// Abstract HSA API event record. 
+class HsaApiEventRecord : public TracerEventRecord { + protected: + explicit HsaApiEventRecord(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id, const std::uint64_t clock_val) + : TracerEventRecord{record, clock_val}, + api_data_{QueryApiData(record, session_id)} {} + + const hsa_api_data_t& GetApiData() const noexcept { return api_data_; } + + private: + // Queries and returns the API data of the record `record` and session + // ID `session_id`. + static const hsa_api_data_t& QueryApiData(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) { + // Query size first (only for assertions). + [[maybe_unused]] std::size_t size = 0; + [[maybe_unused]] auto ret = rocprofiler_query_hsa_tracer_api_data_info_size( + session_id, ROCPROFILER_HSA_API_DATA, record.api_data_handle, record.operation_id, &size); + + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query HSA API data size"); + assert(size > 0); + + // Query data (borrowed from the record). + char* data = nullptr; + ret = rocprofiler_query_hsa_tracer_api_data_info( + session_id, ROCPROFILER_HSA_API_DATA, record.api_data_handle, record.operation_id, &data); + + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query HSA API data"); + assert(data); + + // Reinterpret as an HSA API data pointer. + return *reinterpret_cast(data); + } + + hsa_api_data_t api_data_; +}; + +// HSA API event record (beginning). +class HsaApiEventRecordBegin final : public HsaApiEventRecord { + public: + explicit HsaApiEventRecordBegin(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) + : HsaApiEventRecord{record, session_id, GetRecordBeginClockVal(record)} {} + + void Write(barectf_hsa_api_ctx& barectf_ctx) const override { + // Include generated switch statement. +#include "hsa_begin.cpp.i" + } +}; + +// HSA API event record (end). 
+class HsaApiEventRecordEnd final : public HsaApiEventRecord { + public: + explicit HsaApiEventRecordEnd(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) + : HsaApiEventRecord{record, session_id, GetRecordEndClockVal(record)} {} + + void Write(barectf_hsa_api_ctx& barectf_ctx) const override { + // Include generated switch statement. +#include "hsa_end.cpp.i" + } +}; + +// Abstract HIP API event record. +class HipApiEventRecord : public TracerEventRecord { + protected: + explicit HipApiEventRecord(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id, const std::uint64_t clock_val) + : TracerEventRecord{record, clock_val}, + api_data_{QueryApiData(record, session_id)}, + kernel_name_{QueryKernelName(record, session_id)} {} + + const hip_api_data_t& GetApiData() const noexcept { return api_data_; } + const std::string& GetKernelName() const noexcept { return kernel_name_; } + + private: + // Queries and returns the API data of the record `record` and session + // ID `session_id`. + static const hip_api_data_t& QueryApiData(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) { + // Query size first (only for assertions). + [[maybe_unused]] std::size_t size = 0; + [[maybe_unused]] auto ret = rocprofiler_query_hip_tracer_api_data_info_size( + session_id, ROCPROFILER_HIP_API_DATA, record.api_data_handle, record.operation_id, &size); + + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query HIP API data size"); + assert(size > 0); + + // Query data (borrowed from the record). + char* data = nullptr; + + ret = rocprofiler_query_hip_tracer_api_data_info( + session_id, ROCPROFILER_HIP_API_DATA, record.api_data_handle, record.operation_id, &data); + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query HIP API data"); + assert(data); + + // Reinterpret as an HIP API data pointer. 
+ return *reinterpret_cast(data); + } + + // Queries and returns the kernel name of the record `record` and + // session ID `session_id`. + // + // Returns an empty string if not available. + static std::string QueryKernelName(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) { + const auto kernel_name = QueryAllocStr( + [&record, session_id](const auto size) { + return rocprofiler_query_hip_tracer_api_data_info_size( + session_id, ROCPROFILER_HIP_KERNEL_NAME, record.api_data_handle, record.operation_id, + size); + }, + [&record, session_id](const auto str) { + return rocprofiler_query_hip_tracer_api_data_info(session_id, ROCPROFILER_HIP_KERNEL_NAME, + record.api_data_handle, + record.operation_id, str); + }); + + if (kernel_name.size() > 1) { + // Return demangled version. + return rocmtools::cxx_demangle(kernel_name); + } + + return kernel_name; + } + + hip_api_data_t api_data_; + std::string kernel_name_; +}; + +// HIP API event record (beginning). +class HipApiEventRecordBegin final : public HipApiEventRecord { + public: + explicit HipApiEventRecordBegin(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) + : HipApiEventRecord{record, session_id, GetRecordBeginClockVal(record)} {} + + void Write(barectf_hip_api_ctx& barectf_ctx) const override { + // Include generated switch statement. +#include "hip_begin.cpp.i" + } +}; + +// HIP API event record (end). +class HipApiEventRecordEnd final : public HipApiEventRecord { + public: + explicit HipApiEventRecordEnd(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) + : HipApiEventRecord{record, session_id, GetRecordEndClockVal(record)} {} + + void Write(barectf_hip_api_ctx& barectf_ctx) const override { + // Include generated switch statement. +#include "hip_end.cpp.i" + } +}; + +// HSA API handle type event record. 
+class HsaHandleTypeEventRecord final : public BarectfEventRecord { + public: + enum class Type { + CPU = 0, + GPU = 1, + }; + + explicit HsaHandleTypeEventRecord(const std::uint64_t handle, const Type type) + : BarectfEventRecord{0}, handle_{handle}, type_{type} {} + + void Write(barectf_hsa_handles_ctx& barectf_ctx) const override { + barectf_hsa_handles_trace_hsa_handle_type(&barectf_ctx, handle_, + static_cast(type_)); + } + + private: + std::uint64_t handle_; + Type type_; +}; + +// Abstract API operation event record. +class ApiOpEventRecord : public TracerEventRecord { + protected: + explicit ApiOpEventRecord(const rocprofiler_record_tracer_t& record, const std::uint64_t clock_val) + : TracerEventRecord{record, clock_val} {} +}; + +// HSA API operation event record (beginning). +class HsaOpEventRecordBegin final : public ApiOpEventRecord { + public: + explicit HsaOpEventRecordBegin(const rocprofiler_record_tracer_t& record) + : ApiOpEventRecord{record, GetRecordBeginClockVal(record)} {} + + void Write(barectf_api_ops_ctx& barectf_ctx) const override { + barectf_api_ops_trace_hsa_op_begin(&barectf_ctx, GetThreadId(), GetQueueId(), GetAgentId(), + GetCorrelationId()); + } +}; + +// HSA API operation event record (end). +class HsaOpEventRecordEnd final : public ApiOpEventRecord { + public: + explicit HsaOpEventRecordEnd(const rocprofiler_record_tracer_t& record) + : ApiOpEventRecord{record, GetRecordEndClockVal(record)} {} + + void Write(barectf_api_ops_ctx& barectf_ctx) const override { + barectf_api_ops_trace_hsa_op_end(&barectf_ctx, GetThreadId(), GetQueueId(), GetAgentId(), + GetCorrelationId()); + } +}; + +// HIP API operation event record (beginning). 
+class HipOpEventRecordBegin final : public ApiOpEventRecord { + public: + explicit HipOpEventRecordBegin(const rocprofiler_record_tracer_t& record) + : ApiOpEventRecord{record, GetRecordBeginClockVal(record)}, + kernel_name_{QueryKernelName(record)} {} + + void Write(barectf_api_ops_ctx& barectf_ctx) const override { + barectf_api_ops_trace_hip_op_begin(&barectf_ctx, GetThreadId(), GetQueueId(), GetAgentId(), + GetCorrelationId(), kernel_name_.c_str()); + } + + private: + // Queries and returns the kernel name of the record `record`. + // + // Returns an empty string if not available. + static std::string QueryKernelName(const rocprofiler_record_tracer_t& record) { + if (record.operation_id.id == 0) { + if (const auto api_handle = record.api_data_handle.handle) { + const auto str = reinterpret_cast(api_handle); + + if (std::strlen(str) > 1) { + // Return demangled version. + return rocmtools::cxx_demangle(str); + } + } + } + + return {}; + } + + std::string kernel_name_; +}; + +// HIP API operation event record (end). +class HipOpEventRecordEnd final : public ApiOpEventRecord { + public: + explicit HipOpEventRecordEnd(const rocprofiler_record_tracer_t& record) + : ApiOpEventRecord{record, GetRecordEndClockVal(record)} {} + + void Write(barectf_api_ops_ctx& barectf_ctx) const override { + barectf_api_ops_trace_hip_op_end(&barectf_ctx, GetThreadId(), GetQueueId(), GetAgentId(), + GetCorrelationId()); + } +}; + +// Profiler record base. 
+class ProfilerEventRecord : public BarectfEventRecord { + public: + explicit ProfilerEventRecord(const rocprofiler_record_profiler_t& record, + const rocprofiler_session_id_t session_id) + : BarectfEventRecord{GetRecordBeginClockVal(record)}, + dispatch_{record.header.id.handle}, + gpu_id_{record.gpu_id.handle}, + queue_id_{record.queue_id.handle}, + queue_index_{record.queue_idx.value}, + process_id_{GetPid()}, + thread_id_{record.thread_id.value}, + kernel_id_{record.kernel_id.handle}, + kernel_name_{QueryKernelName(record)}, + counter_infos_{QueryCounterInfos(record, session_id)} {} + + void Write(barectf_profiler_ctx& barectf_ctx) const override { + barectf_profiler_trace_profiler_record( + &barectf_ctx, dispatch_, gpu_id_, queue_id_, queue_index_, process_id_, thread_id_, + kernel_id_, kernel_name_.c_str(), counter_infos_.names.size(), counter_infos_.names.data(), + counter_infos_.values.size(), counter_infos_.values.data()); + } + + protected: + // Counter infos. + // + // `names[i]` names the counter value `values[i]`. + struct CounterInfos final { + // `names_storage` owns the strings while the elements of `names` + // point to the internal C strings of `names_storage`. + // + // This is needed because barectf expects an array of contiguous + // C string pointers. + std::vector names_storage; + std::vector names; + + // Counter values. 
+ std::vector values; + }; + + std::uint64_t GetDispatch() const noexcept { return dispatch_; } + std::uint64_t GetGpuId() const noexcept { return gpu_id_; } + std::uint64_t GetQueueId() const noexcept { return queue_id_; } + std::uint64_t GetQueueIndex() const noexcept { return queue_index_; } + std::uint32_t GetProcessId() const noexcept { return process_id_; } + std::uint32_t GetThreadId() const noexcept { return thread_id_; } + std::uint64_t GetKernelId() const noexcept { return kernel_id_; } + const std::string& GetKernelName() const noexcept { return kernel_name_; } + const CounterInfos& GetCounterInfos() const noexcept { return counter_infos_; } + + private: + // Queries and returns the kernel name of the record `record`. + // + // Returns an empty string if not available. + static std::string QueryKernelName(const rocprofiler_record_profiler_t& record) { + const auto kernel_name = QueryAllocStr( + [&record](const auto size) { + return rocprofiler_query_kernel_info_size(ROCPROFILER_KERNEL_NAME, record.kernel_id, size); + }, + [&record](const auto str) { + return rocprofiler_query_kernel_info(ROCPROFILER_KERNEL_NAME, record.kernel_id, + const_cast(str)); + }); + + if (kernel_name.size() <= 1) { + return {}; + } + + // Return truncated and demangled version. + return rocmtools::truncate_name(rocmtools::cxx_demangle(kernel_name)); + } + + // Queries and returns the counter infos of the record `record` and + // session ID `session_id`. + static CounterInfos QueryCounterInfos(const rocprofiler_record_profiler_t& record, + const rocprofiler_session_id_t session_id) { + if (!record.counters) { + // No counters. + return {}; + } + + CounterInfos infos; + + for (std::size_t i = 0; i < record.counters_count.value; ++i) { + auto& counter = record.counters[i]; + + if (counter.counter_handler.handle == 0) { + // Not available: continue. 
+ continue; + } + + // Query counter name size first + std::size_t counter_name_size = 0; + [[maybe_unused]] auto ret = rocprofiler_query_counter_info_size( + session_id, ROCPROFILER_COUNTER_NAME, counter.counter_handler, &counter_name_size); + + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query counter name size"); + + if (counter_name_size == 0) { + // No size: continue. + continue; + } + + // Query counter name (borrowed from `record`: no need to free). + const char* counter_name = nullptr; + + ret = rocprofiler_query_counter_info(session_id, ROCPROFILER_COUNTER_NAME, + counter.counter_handler, &counter_name); + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Query counter name"); + + if (!counter_name) { + // Not available: continue. + continue; + } + + // Push back infos. + infos.names_storage.emplace_back(counter_name); + infos.names.push_back(infos.names_storage.back().c_str()); + infos.values.push_back(counter.value.value); + } + + return infos; + } + + std::uint64_t dispatch_; + std::uint64_t gpu_id_; + std::uint64_t queue_id_; + std::uint64_t queue_index_; + std::uint32_t process_id_; + std::uint32_t thread_id_; + std::uint64_t kernel_id_; + std::string kernel_name_; + CounterInfos counter_infos_; +}; + +// Profiler record base. 
+class ProfilerWithKernelPropsEventRecord final : public ProfilerEventRecord { + private: + // According to `plugin/file/file.cpp`: + // + // > Taken from rocprofiler: The size hasn't changed in recent past + static constexpr std::uint32_t lds_block_size_ = 128 * 4; + + public: + explicit ProfilerWithKernelPropsEventRecord(const rocprofiler_record_profiler_t& record, + const rocprofiler_session_id_t session_id) + : ProfilerEventRecord{record, session_id}, + grid_size_{record.kernel_properties.grid_size}, + workgroup_size_{record.kernel_properties.workgroup_size}, + lds_size_{ + ((record.kernel_properties.lds_size + (lds_block_size_ - 1)) & ~(lds_block_size_ - 1))}, + scratch_size_{record.kernel_properties.scratch_size}, + arch_vgpr_count_{record.kernel_properties.arch_vgpr_count}, + accum_vgpr_count_{record.kernel_properties.accum_vgpr_count}, + sgpr_count_{record.kernel_properties.sgpr_count}, + wave_size_{record.kernel_properties.wave_size}, + signal_handle_{record.kernel_properties.signal_handle} {} + + void Write(barectf_profiler_ctx& barectf_ctx) const override { + barectf_profiler_trace_profiler_record_with_kernel_properties( + &barectf_ctx, GetDispatch(), GetGpuId(), GetQueueId(), GetQueueIndex(), GetProcessId(), + GetThreadId(), GetKernelId(), GetKernelName().c_str(), GetCounterInfos().names.size(), + GetCounterInfos().names.data(), GetCounterInfos().values.size(), + GetCounterInfos().values.data(), grid_size_, workgroup_size_, lds_size_, scratch_size_, + arch_vgpr_count_, accum_vgpr_count_, sgpr_count_, wave_size_, signal_handle_); + } + + private: + std::uint64_t grid_size_; + std::uint64_t workgroup_size_; + std::uint64_t lds_size_; + std::uint64_t scratch_size_; + std::uint64_t arch_vgpr_count_; + std::uint64_t accum_vgpr_count_; + std::uint64_t sgpr_count_; + std::uint64_t wave_size_; + std::uint64_t signal_handle_; +}; + +} // namespace + +Plugin::Plugin(const std::size_t packet_size, const fs::path& trace_dir, + const fs::path& metadata_stream_path) 
+ : roctx_tracer_{packet_size, trace_dir, "roctx_"}, + hsa_api_tracer_{packet_size, trace_dir, "hsa_api_"}, + hip_api_tracer_{packet_size, trace_dir, "hip_api_"}, + api_ops_tracer_{packet_size, trace_dir, "api_ops_"}, + hsa_handles_tracer_{packet_size, trace_dir, "hsa_handles_"}, + profiler_tracer_{packet_size, trace_dir, "profiler_"} { + // Make sure the trace directory doesn't exist. + if (fs::exists(trace_dir)) { + std::ostringstream ss; + + ss << "CTF trace directory `" << trace_dir.string() << "` already exists"; + throw std::runtime_error{ss.str()}; + } + + // Make sure the metadata stream file exists. + if (!fs::exists(metadata_stream_path)) { + std::ostringstream ss; + + ss << "CTF metadata stream file `" << metadata_stream_path.string() << "` doesn't exist"; + throw std::runtime_error{ss.str()}; + } + + // Create trace directory. + if (!fs::create_directory(trace_dir)) { + std::ostringstream ss; + + ss << "Cannot create the CTF trace directory `" << trace_dir.string() << "`"; + throw std::runtime_error{ss.str()}; + } + + // Copy adjusted metadata stream file to trace directory. + try { + CopyAdjustedMetadataStreamFile(metadata_stream_path, trace_dir); + } catch (const std::exception& exc) { + std::ostringstream ss; + + ss << "Cannot adjust and copy metadata stream file `" << metadata_stream_path.string() + << "` to the CTF trace directory `" << trace_dir.string() << "`: " << exc.what(); + throw std::runtime_error{ss.str()}; + } + + // Write HSA handle type event records. + WriteHsaHandleTypes(); +} + +void Plugin::HandleTracerRecord(const rocprofiler_record_tracer_t& record, + const rocprofiler_session_id_t session_id) { + std::lock_guard lock{lock_}; + + // Depending on the domain, create and add an event record to the + // corresponding tracer. 
+ switch (record.domain) { + case ACTIVITY_DOMAIN_ROCTX: + roctx_tracer_.AddEventRecord(std::make_shared(record, session_id)); + break; + case ACTIVITY_DOMAIN_HSA_API: { + hsa_api_tracer_.AddEventRecord( + std::make_shared(record, session_id)); + hsa_api_tracer_.AddEventRecord( + std::make_shared(record, session_id)); + break; + } + case ACTIVITY_DOMAIN_HIP_API: { + hip_api_tracer_.AddEventRecord( + std::make_shared(record, session_id)); + hip_api_tracer_.AddEventRecord( + std::make_shared(record, session_id)); + break; + } + case ACTIVITY_DOMAIN_HSA_OPS: + api_ops_tracer_.AddEventRecord(std::make_shared(record)); + api_ops_tracer_.AddEventRecord(std::make_shared(record)); + break; + case ACTIVITY_DOMAIN_HIP_OPS: + api_ops_tracer_.AddEventRecord(std::make_shared(record)); + api_ops_tracer_.AddEventRecord(std::make_shared(record)); + break; + default: + // Warn + std::cerr << "rocm_ctf::Plugin::HandleTracerRecord(): " + << "ignoring record for unknown domain #" << record.domain << std::endl; + break; + } +} + +void Plugin::HandleProfilerRecord(const rocprofiler_record_profiler_t& record, + const rocprofiler_session_id_t session_id) { + std::lock_guard lock{lock_}; + profiler_tracer_.AddEventRecord( + std::make_shared(record, session_id)); +} + +void Plugin::HandleBufferRecords(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* const end, + const rocprofiler_session_id_t session_id, + const rocprofiler_buffer_id_t buffer_id) { + while (begin && begin < end) { + if (begin->kind == ROCPROFILER_TRACER_RECORD) { + HandleTracerRecord(*reinterpret_cast(begin), session_id); + } else { + assert(begin->kind == ROCPROFILER_PROFILER_RECORD); + HandleProfilerRecord(*reinterpret_cast(begin), + session_id); + } + + rocprofiler_next_record(begin, &begin, session_id, buffer_id); + } +} + +void Plugin::WriteHsaHandleTypes() { + [[maybe_unused]] const auto status = hsa_iterate_agents( + [](const auto agent, const auto user_data) { + auto& tracer = 
*static_cast(user_data); + hsa_device_type_t type; + + if (hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type) != HSA_STATUS_SUCCESS) { + return HSA_STATUS_ERROR; + } + + using Type = HsaHandleTypeEventRecord::Type; + + auto event_record = std::make_shared( + agent.handle, type == HSA_DEVICE_TYPE_CPU ? Type::CPU : Type::GPU); + + tracer.AddEventRecord(std::move(event_record)); + return HSA_STATUS_SUCCESS; + }, + &hsa_handles_tracer_); + + assert(status == HSA_STATUS_SUCCESS && "Iterate HSA agents"); +} + +namespace { + +constexpr std::uint64_t ns_per_s = 1'000'000'000ULL; + +// Samples the ROCProfiler clock and returns the value. +std::uint64_t GetClkVal() { + rocprofiler_timestamp_t ts; + [[maybe_unused]] const auto ret = rocprofiler_get_timestamp(&ts); + + assert(ret == ROCPROFILER_STATUS_SUCCESS && "Get timestamp"); + return ts.value; +} + +// Updates `offset` and `delta`, if needed, to a more accurate clock +// class offset and a smaller ROCProfiler clock value delta. +// +// This function samples the ROCProfiler clock twice, also sampling the +// real-time clock in between, and uses the average ROCProfiler clock +// value to approximate the actual clock class offset. +// +// This strategy is based on the measure_single_clock_offset() function +// of the LTTng-tools project . +void UpdateClkClsOffsetAndDelta(std::uint64_t& offset, std::uint64_t& delta) { + // Sample ROCProfiler clock (first time). + const auto rocm_clk_val1 = GetClkVal(); + + // Sample real-time clock. + timespec realtime_spec = {0, 0}; + [[maybe_unused]] const auto ret = clock_gettime(CLOCK_REALTIME, &realtime_spec); + + assert(ret == 0); + + // Sample ROCProfiler clock (second time). + const auto rocm_clk_val2 = GetClkVal(); + + // Compute the current ROCProfiler clock value delta. + const auto this_delta = rocm_clk_val2 - rocm_clk_val1; + + if (this_delta > delta) { + // Discard larger delta. + return; + } + + // Compute the average ROCProfiler clock value. 
+ const auto rocm_clk_val_avg = (rocm_clk_val1 + rocm_clk_val2) >> 1; + + // Compute the real-time clock value in nanoseconds. + const auto realtime_ns = + (static_cast(realtime_spec.tv_sec) * ns_per_s) + realtime_spec.tv_nsec; + + // Update clock class offset and delta. + assert(rocm_clk_val_avg < realtime_ns); + offset = realtime_ns - rocm_clk_val_avg; + delta = this_delta; +} + +// Computes and returns the most possible accurate clock class offset. +std::uint64_t GetMetadataClkClsOffset() { + std::uint64_t offset = 0; + std::uint64_t delta = std::numeric_limits::max(); + + // Best effort to find the most accurate offset. + for (auto i = 0U; i < 50U; ++i) { + UpdateClkClsOffsetAndDelta(offset, delta); + } + + return offset; +} + +} // namespace + +void Plugin::CopyAdjustedMetadataStreamFile(const fs::path& metadata_stream_path, + const fs::path& trace_dir) { + // Load installed metadata stream file contents. + std::string metadata; + std::getline(std::ifstream{metadata_stream_path}, metadata, '\0'); + + // Replace the original `offset` property. + { + static constexpr auto offset_term = "offset = 0;"; + std::ostringstream ss; + + ss << "offset = " << GetMetadataClkClsOffset() << ';'; + metadata.replace(metadata.find(offset_term), std::strlen(offset_term), ss.str()); + } + + // Write adjusted metadata stream to trace directory. + { + std::ofstream output{trace_dir / "metadata"}; + + output.write(metadata.data(), metadata.size()); + } +} + +} // namespace rocm_ctf diff --git a/plugin/ctf/plugin.h b/plugin/ctf/plugin.h new file mode 100644 index 00000000..3880162d --- /dev/null +++ b/plugin/ctf/plugin.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. 
+ + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#ifndef PLUGIN_CTF_PLUGIN_H +#define PLUGIN_CTF_PLUGIN_H + +#include +#include +#include + +#include "rocprofiler.h" +#include "rocprofiler_plugin.h" + +#include "barectf.h" +#include "barectf_tracer.h" + +namespace rocm_ctf { + +// CTF plugin. +// +// Build a plugin instance, and then call HandleTracerRecord(), +// HandleProfilerRecord(), and HandleBufferRecords() to add event +// records. +// +// A plugin instance performs important tasks at destruction time. +class Plugin final { + public: + // Builds a plugin instance to write a CTF trace in the `trace_dir` + // directory with packets of size `packet_size` bytes. + // + // `trace_dir` must not exist. + // + // This constructor immediately adjusts and copies the metadata stream + // file `metadata_stream_path` to the trace directory (`trace_dir`). 
+ explicit Plugin(std::size_t packet_size, const std::experimental::filesystem::path& trace_dir, + const std::experimental::filesystem::path& metadata_stream_path); + + // Handles a tracer record. + void HandleTracerRecord(const rocprofiler_record_tracer_t& record, + rocprofiler_session_id_t session_id); + + // Handles a profiler record. + void HandleProfilerRecord(const rocprofiler_record_profiler_t& record, + rocprofiler_session_id_t session_id); + + // Handles tracer or profiler records from `begin` to `end` + // (excluded). + void HandleBufferRecords(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* end, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id); + + private: + // rocTX barectf platform descriptor. + struct RocTxPlatformDescr final { + using Ctx = barectf_roctx_ctx; + + static void OpenPacket(Ctx& ctx) { barectf_roctx_open_packet(&ctx); } + static void ClosePacket(Ctx& ctx) { barectf_roctx_close_packet(&ctx); } + }; + + // HSA API barectf platform descriptor. + struct HsaApiPlatformDescr final { + using Ctx = barectf_hsa_api_ctx; + + static void OpenPacket(Ctx& ctx) { barectf_hsa_api_open_packet(&ctx); } + static void ClosePacket(Ctx& ctx) { barectf_hsa_api_close_packet(&ctx); } + }; + + // HIP API barectf platform descriptor. + struct HipApiPlatformDescr final { + using Ctx = barectf_hip_api_ctx; + + static void OpenPacket(Ctx& ctx) { barectf_hip_api_open_packet(&ctx); } + static void ClosePacket(Ctx& ctx) { barectf_hip_api_close_packet(&ctx); } + }; + + // HSA handles barectf platform descriptor. + struct HsaHandlesPlatformDescr final { + using Ctx = barectf_hsa_handles_ctx; + + static void OpenPacket(Ctx& ctx) { barectf_hsa_handles_open_packet(&ctx); } + static void ClosePacket(Ctx& ctx) { barectf_hsa_handles_close_packet(&ctx); } + }; + + // API operations barectf platform descriptor. 
+ struct ApiOpsPlatformDescr final { + using Ctx = barectf_api_ops_ctx; + + static void OpenPacket(Ctx& ctx) { barectf_api_ops_open_packet(&ctx); } + static void ClosePacket(Ctx& ctx) { barectf_api_ops_close_packet(&ctx); } + }; + + // Profiler barectf platform descriptor. + struct ProfilerPlatformDescr final { + using Ctx = barectf_profiler_ctx; + + static void OpenPacket(Ctx& ctx) { barectf_profiler_open_packet(&ctx); } + static void ClosePacket(Ctx& ctx) { barectf_profiler_close_packet(&ctx); } + }; + + // barectf tracer for HSA handle mappings. + using HsaHandlesTracer = BarectfTracer; + + // Writes the HSA handle type mappings to a dedicated data stream + // file. + void WriteHsaHandleTypes(); + + // Loads the existing metadata stream file `metadata_stream_path`, + // adjusts the `offset` property of its single clock class, and writes + // the result to the `metadata` file within the `trace_dir` directory. + void CopyAdjustedMetadataStreamFile( + const std::experimental::filesystem::path& metadata_stream_path, + const std::experimental::filesystem::path& trace_dir); + + // Dedicated tracers. + BarectfTracer roctx_tracer_; + BarectfTracer hsa_api_tracer_; + BarectfTracer hip_api_tracer_; + BarectfTracer api_ops_tracer_; + HsaHandlesTracer hsa_handles_tracer_; + BarectfTracer profiler_tracer_; + + // Locks any operation performed on the data of this. 
+ std::mutex lock_; +}; + +} // namespace rocm_ctf + +#endif // PLUGIN_CTF_PLUGIN_H diff --git a/plugin/exportmap b/plugin/exportmap new file mode 100644 index 00000000..e5ec3016 --- /dev/null +++ b/plugin/exportmap @@ -0,0 +1,7 @@ +{ +global: rocprofiler_plugin_initialize; + rocprofiler_plugin_finalize; + rocprofiler_plugin_write_buffer_records; + rocprofiler_plugin_write_record; +local: *; +}; \ No newline at end of file diff --git a/plugin/file/CMakeLists.txt b/plugin/file/CMakeLists.txt new file mode 100644 index 00000000..4cd41814 --- /dev/null +++ b/plugin/file/CMakeLists.txt @@ -0,0 +1,44 @@ +# ############################################################################### +# # Copyright (c) 2022 Advanced Micro Devices, Inc. +# # +# # Permission is hereby granted, free of charge, to any person obtaining a copy +# # of this software and associated documentation files (the "Software"), to +# # deal in the Software without restriction, including without limitation the +# # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# # sell copies of the Software, and to permit persons to whom the Software is +# # furnished to do so, subject to the following conditions: +# # +# # The above copyright notice and this permission notice shall be included in +# # all copies or substantial portions of the Software. +# # +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# # IN THE SOFTWARE. 
+# ############################################################################### + +file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp) + +file(GLOB FILE_SOURCES "*.cpp") +add_library(file_plugin SHARED ${FILE_SOURCES} ${ROCPROFILER_UTIL_SRC_FILES}) + +set_target_properties(file_plugin PROPERTIES + CXX_VISIBILITY_PRESET hidden + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/../exportmap + LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) + +target_compile_definitions(file_plugin + PRIVATE HIP_PROF_HIP_API_STRING=1 __HIP_PLATFORM_HCC__=1) + +target_include_directories(file_plugin PRIVATE ${PROJECT_SOURCE_DIR}/inc ${PROJECT_SOURCE_DIR}) + +target_link_options(file_plugin PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exportmap -Wl,--no-undefined) + +target_link_libraries(file_plugin PRIVATE ${ROCPROFILER_TARGET} hsa-runtime64::hsa-runtime64 systemd stdc++fs amd_comgr dl) + +install(TARGETS file_plugin LIBRARY + DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME} + COMPONENT runtime) \ No newline at end of file diff --git a/plugin/file/file.cpp b/plugin/file/file.cpp new file mode 100644 index 00000000..3443a0ae --- /dev/null +++ b/plugin/file/file.cpp @@ -0,0 +1,475 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocprofiler.h" +#include "rocprofiler_plugin.h" +#include "../utils.h" + +namespace fs = std::experimental::filesystem; + +namespace { + +static std::string output_file_name; +class file_plugin_t { + private: + enum class output_type_t { + COUNTER, + TRACER, + PC_SAMPLING + }; + + class output_file_t { + public: + output_file_t(std::string name) : name_(std::move(name)) {} + + std::string name() const { return name_; } + + template std::ostream& operator<<(T&& value) { + if (!is_open()) open(); + return stream_ << std::forward(value); + } + + std::ostream& operator<<(std::ostream& (*func)(std::ostream&)) { + if (!is_open()) open(); + return stream_ << func; + } + + void open() { + // If the stream is already in the failed state, there's no need to try + // to open the file. + if (fail()) return; + + const char* output_dir = getenv("OUTPUT_PATH"); + output_file_name = getenv("OUT_FILE_NAME") ? 
std::string(getenv("OUT_FILE_NAME")) + "_" : ""; + + if (output_dir == nullptr) { + stream_.copyfmt(std::cout); + stream_.clear(std::cout.rdstate()); + stream_.basic_ios::rdbuf(std::cout.rdbuf()); + return; + } + + fs::path output_prefix(output_dir); + if (!fs::is_directory(fs::status(output_prefix))) { + if (!stream_.fail()) rocmtools::warning("Cannot open output directory '%s'", output_dir); + stream_.setstate(std::ios_base::failbit); + return; + } + + std::stringstream ss; + ss << output_file_name << GetPid() << "_" << name_; + stream_.open(output_prefix / ss.str()); + } + + bool is_open() const { return stream_.is_open(); } + bool fail() const { return stream_.fail(); } + + private: + const std::string name_; + std::ofstream stream_; + }; + + output_file_t* get_output_file(output_type_t output_type, uint32_t domain = 0) { + switch (output_type) { + case output_type_t::COUNTER: + return &output_file_; + case output_type_t::TRACER: + switch (domain) { + case ACTIVITY_DOMAIN_ROCTX: + return &roctx_file_; + case ACTIVITY_DOMAIN_HSA_API: + return &hsa_api_file_; + case ACTIVITY_DOMAIN_HIP_API: + return &hip_api_file_; + case ACTIVITY_DOMAIN_HIP_OPS: + return &hip_activity_file_; + case ACTIVITY_DOMAIN_HSA_OPS: + return &hsa_async_copy_file_; + default: + assert(!"domain/op not supported!"); + break; + } + break; + case output_type_t::PC_SAMPLING: + return &pc_sample_file_; + } + return nullptr; + } + + public: + file_plugin_t() { + output_file_t hsa_handles("hsa_handles.txt"); + + [[maybe_unused]] hsa_status_t status = hsa_iterate_agents( + [](hsa_agent_t agent, void* user_data) { + auto* file = static_cast(user_data); + hsa_device_type_t type; + + if (hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &type) != HSA_STATUS_SUCCESS) + return HSA_STATUS_ERROR; + + *file << std::hex << std::showbase << agent.handle << " agent " + << ((type == HSA_DEVICE_TYPE_CPU) ? 
"cpu" : "gpu") << std::endl; + return HSA_STATUS_SUCCESS; + }, + &hsa_handles); + assert(status == HSA_STATUS_SUCCESS && "failed to iterate HSA agents"); + if (hsa_handles.fail()) { + rocmtools::warning("Cannot write to '%s'", hsa_handles.name().c_str()); + return; + } + + // App begin timestamp begin_ts_file.txt + output_file_t begin_ts("begin_ts_file.txt"); + + [[maybe_unused]] rocprofiler_timestamp_t app_begin_timestamp = {}; + CHECK_ROCPROFILER(rocprofiler_get_timestamp(&app_begin_timestamp)); + + begin_ts << std::dec << app_begin_timestamp.value << std::endl; + if (begin_ts.fail()) { + rocmtools::warning("Cannot write to '%s'", begin_ts.name().c_str()); + return; + } + + valid_ = true; + } + + std::mutex writing_lock; + + const char* GetDomainName(rocprofiler_tracer_activity_domain_t domain) { + switch (domain) { + case ACTIVITY_DOMAIN_ROCTX: + return "ROCTX_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HIP_API: + return "HIP_API_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HIP_OPS: + return "HIP_OPS_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HSA_API: + return "HSA_API_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HSA_OPS: + return "HSA_OPS_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HSA_EVT: + return "HSA_EVT_DOMAIN"; + break; + default: + return ""; + } + } + + void FlushTracerRecord(rocprofiler_record_tracer_t tracer_record, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id = rocprofiler_buffer_id_t{0}) { + std::lock_guard lock(writing_lock); + std::string kernel_name; + std::string function_name; + std::string roctx_message; + uint64_t roctx_id; + if ((tracer_record.operation_id.id == 0 && tracer_record.domain == ACTIVITY_DOMAIN_HIP_OPS)) { + if (tracer_record.api_data_handle.handle && + strlen(reinterpret_cast(tracer_record.api_data_handle.handle)) > 1) + kernel_name = rocmtools::cxx_demangle( + reinterpret_cast(tracer_record.api_data_handle.handle)); + } + if (tracer_record.domain == ACTIVITY_DOMAIN_HSA_API) { + size_t function_name_size = 0; + 
CHECK_ROCPROFILER(rocprofiler_query_hsa_tracer_api_data_info_size( + session_id, ROCPROFILER_HSA_FUNCTION_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &function_name_size)); + if (function_name_size > 1) { + char* function_name_c = (char*)malloc(function_name_size); + CHECK_ROCPROFILER(rocprofiler_query_hsa_tracer_api_data_info( + session_id, ROCPROFILER_HSA_FUNCTION_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &function_name_c)); + if (function_name_c) function_name = std::string(function_name_c); + } + } + if (tracer_record.domain == ACTIVITY_DOMAIN_HIP_API) { + size_t function_name_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info_size( + session_id, ROCPROFILER_HIP_FUNCTION_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &function_name_size)); + if (function_name_size > 1) { + char* function_name_c = (char*)malloc(function_name_size); + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info( + session_id, ROCPROFILER_HIP_FUNCTION_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &function_name_c)); + if (function_name_c) function_name = std::string(function_name_c); + } + size_t kernel_name_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info_size( + session_id, ROCPROFILER_HIP_KERNEL_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &kernel_name_size)); + if (kernel_name_size > 1) { + char* kernel_name_str = (char*)malloc(kernel_name_size * sizeof(char)); + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info( + session_id, ROCPROFILER_HIP_KERNEL_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &kernel_name_str)); + if (kernel_name_str) kernel_name = rocmtools::cxx_demangle(std::string(kernel_name_str)); + } + } + if (tracer_record.domain == ACTIVITY_DOMAIN_ROCTX) { + size_t roctx_message_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_roctx_tracer_api_data_info_size( + session_id, 
ROCPROFILER_ROCTX_MESSAGE, tracer_record.api_data_handle, + tracer_record.operation_id, &roctx_message_size)); + if (roctx_message_size > 1) { + [[maybe_unused]] char* roctx_message_str = + static_cast(malloc(roctx_message_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_roctx_tracer_api_data_info( + session_id, ROCPROFILER_ROCTX_MESSAGE, tracer_record.api_data_handle, + tracer_record.operation_id, &roctx_message_str)); + if (roctx_message_str) + roctx_message = rocmtools::cxx_demangle(std::string(strdup(roctx_message_str))); + } + size_t roctx_id_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_roctx_tracer_api_data_info_size( + session_id, ROCPROFILER_ROCTX_ID, tracer_record.api_data_handle, tracer_record.operation_id, + &roctx_id_size)); + if (roctx_id_size > 1) { + [[maybe_unused]] char* roctx_id_str = + static_cast(malloc(roctx_id_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_roctx_tracer_api_data_info( + session_id, ROCPROFILER_ROCTX_ID, tracer_record.api_data_handle, + tracer_record.operation_id, &roctx_id_str)); + if (roctx_id_str) { + roctx_id = std::stoll(std::string(strdup(roctx_id_str))); + free(roctx_id_str); + } + } + } + output_file_t* output_file = get_output_file(output_type_t::TRACER, tracer_record.domain); + *output_file << "Record [" << tracer_record.header.id.handle << "], Domain(" + << GetDomainName(tracer_record.domain) << "), Begin(" + << tracer_record.timestamps.begin.value << "), End(" + << tracer_record.timestamps.end.value << "), Correlation ID( " + << tracer_record.correlation_id.value << ")"; + if (roctx_id >= 0) *output_file << ", ROCTX ID(" << roctx_id << ")"; + if (roctx_message.size() > 1) *output_file << ", ROCTX Message(" << roctx_message << ")"; + if (function_name.size() > 1) *output_file << ", Function(" << function_name << ")"; + if (kernel_name.size() > 1) *output_file << ", Kernel Name(" << kernel_name.c_str() << ")"; + *output_file << std::endl; + } + + void FlushProfilerRecord(const 
rocprofiler_record_profiler_t* profiler_record, + rocprofiler_session_id_t session_id, rocprofiler_buffer_id_t buffer_id) { + std::lock_guard lock(writing_lock); + size_t name_length = 0; + output_file_t* output_file{nullptr}; + output_file = get_output_file(output_type_t::COUNTER); + CHECK_ROCPROFILER(rocprofiler_query_kernel_info_size(ROCPROFILER_KERNEL_NAME, + profiler_record->kernel_id, &name_length)); + // Taken from rocprofiler: The size hasn't changed in recent past + static const uint32_t lds_block_size = 128 * 4; + const char* kernel_name_c; + if (name_length > 1) { + kernel_name_c = static_cast(malloc(name_length * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_kernel_info(ROCPROFILER_KERNEL_NAME, profiler_record->kernel_id, + &kernel_name_c)); + } + *output_file << std::string("dispatch[") << std::to_string(profiler_record->header.id.handle) + << "], " << std::string("gpu_id(") + << std::to_string(profiler_record->gpu_id.handle) << "), " + << std::string("queue_id(") << std::to_string(profiler_record->queue_id.handle) + << "), " << std::string("queue_index(") + << std::to_string(profiler_record->queue_idx.value) << "), " << std::string("pid(") + << std::to_string(GetPid()) << "), " << std::string("tid(") + << std::to_string(profiler_record->thread_id.value) << ")"; + *output_file << ", " << std::string("grd(") + << std::to_string(profiler_record->kernel_properties.grid_size) << "), " + << std::string("wgr(") + << std::to_string(profiler_record->kernel_properties.workgroup_size) << "), " + << std::string("lds(") + << std::to_string( + ((profiler_record->kernel_properties.lds_size + (lds_block_size - 1)) & + ~(lds_block_size - 1))) + << "), " << std::string("scr(") + << std::to_string(profiler_record->kernel_properties.scratch_size) << "), " + << std::string("arch_vgpr(") + << std::to_string(profiler_record->kernel_properties.arch_vgpr_count) << "), " + << std::string("accum_vgpr(") + << 
std::to_string(profiler_record->kernel_properties.accum_vgpr_count) << "), " + << std::string("sgpr(") + << std::to_string(profiler_record->kernel_properties.sgpr_count) << "), " + << std::string("wave_size(") + << std::to_string(profiler_record->kernel_properties.wave_size) << "), " + << std::string("sig(") + << std::to_string(profiler_record->kernel_properties.signal_handle); + std::string kernel_name = ""; + if (name_length > 1) { + kernel_name = rocmtools::truncate_name(rocmtools::cxx_demangle(kernel_name_c)); + } + *output_file << "), " << std::string("obj(") + << std::to_string(profiler_record->kernel_id.handle) << "), " + << std::string("kernel-name(\"") << kernel_name << "\")" + << std::string(", start_time(") + << std::to_string(profiler_record->timestamps.begin.value) << ")" + << std::string(", end_time(") + << std::to_string(profiler_record->timestamps.end.value) << ")"; + + // For Counters + *output_file << std::endl; + if (profiler_record->counters) { + for (uint64_t i = 0; i < profiler_record->counters_count.value; i++) { + if (profiler_record->counters[i].counter_handler.handle > 0) { + size_t counter_name_length = 0; + CHECK_ROCPROFILER(rocprofiler_query_counter_info_size( + session_id, ROCPROFILER_COUNTER_NAME, profiler_record->counters[i].counter_handler, + &counter_name_length)); + if (counter_name_length > 1) { + const char* name_c = static_cast(malloc(name_length * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_counter_info( + session_id, ROCPROFILER_COUNTER_NAME, profiler_record->counters[i].counter_handler, + &name_c)); + *output_file << ", " << name_c << " (" + << std::to_string(profiler_record->counters[i].value.value) << ")" + << std::endl; + } + } + } + } + } + + void FlushPCSamplingRecord( + const rocprofiler_record_pc_sample_t *pc_sampling_record) { + output_file_t* output_file{nullptr}; + output_file = get_output_file(output_type_t::PC_SAMPLING); + const auto &sample = pc_sampling_record->pc_sample; + *output_file << "dispatch[" 
<< sample.dispatch_id.value << "], " + << "timestamp(" << sample.timestamp.value << "), " + << "gpu_id(" << sample.gpu_id.handle << "), " + << "pc-sample(" << std::hex << std::showbase << sample.pc << "), " + << "se(" << sample.se << ')' + << std::endl; + } + + int WriteBufferRecords(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* end, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + while (begin < end) { + if (!begin) return 0; + switch (begin->kind) { + case ROCPROFILER_PROFILER_RECORD: { + const rocprofiler_record_profiler_t* profiler_record = + reinterpret_cast(begin); + FlushProfilerRecord(profiler_record, session_id, buffer_id); + break; + } + case ROCPROFILER_TRACER_RECORD: { + rocprofiler_record_tracer_t* tracer_record = const_cast( + reinterpret_cast(begin)); + FlushTracerRecord(*tracer_record, session_id, buffer_id); + break; + } + case ROCPROFILER_ATT_TRACER_RECORD: { + break; + } + case ROCPROFILER_PC_SAMPLING_RECORD: { + const rocprofiler_record_pc_sample_t *pc_sampling_record = + reinterpret_cast(begin); + FlushPCSamplingRecord(pc_sampling_record); + break; + } + default: + break; + } + rocprofiler_next_record(begin, &begin, session_id, buffer_id); + } + return 0; + } + + bool is_valid() const { return valid_; } + + private: + bool valid_{false}; + + output_file_t roctx_file_{"roctx_trace.txt"}, hsa_api_file_{"hsa_api_trace.txt"}, + hip_api_file_{"hip_api_trace.txt"}, hip_activity_file_{"hcc_ops_trace.txt"}, + hsa_async_copy_file_{"async_copy_trace.txt"}, pc_sample_file_{"pcs_trace.txt"}, + output_file_{"results.txt"}; +}; + +file_plugin_t* file_plugin = nullptr; + +} // namespace + +ROCPROFILER_EXPORT int rocprofiler_plugin_initialize(uint32_t rocprofiler_major_version, + uint32_t rocprofiler_minor_version) { + if (rocprofiler_major_version != ROCPROFILER_VERSION_MAJOR || + rocprofiler_minor_version < ROCPROFILER_VERSION_MINOR) + return -1; + + if (file_plugin != nullptr) return -1; + + 
file_plugin = new file_plugin_t(); + if (file_plugin->is_valid()) return 0; + + // The plugin failed to initialied, destroy it and return an error. + delete file_plugin; + file_plugin = nullptr; + return -1; +} + +ROCPROFILER_EXPORT void rocprofiler_plugin_finalize() { + if (!file_plugin) return; + delete file_plugin; + file_plugin = nullptr; +} + +ROCPROFILER_EXPORT int rocprofiler_plugin_write_buffer_records(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* end, + rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + if (!file_plugin || !file_plugin->is_valid()) return -1; + return file_plugin->WriteBufferRecords(begin, end, session_id, buffer_id); +} + +ROCPROFILER_EXPORT int rocprofiler_plugin_write_record(rocprofiler_record_tracer_t record, + rocprofiler_session_id_t session_id) { + if (!file_plugin || !file_plugin->is_valid()) return -1; + if (record.header.id.handle == 0) return 0; + file_plugin->FlushTracerRecord(record, session_id); + return 0; +} diff --git a/plugin/perfetto/CMakeLists.txt b/plugin/perfetto/CMakeLists.txt new file mode 100644 index 00000000..669b7e81 --- /dev/null +++ b/plugin/perfetto/CMakeLists.txt @@ -0,0 +1,27 @@ +file(GLOB ROCPROFILER_UTIL_SRC_FILES ${PROJECT_SOURCE_DIR}/src/utils/helper.cpp) + +add_library(perfetto_plugin + ${LIBRARY_TYPE} ${ROCPROFILER_UTIL_SRC_FILES} + perfetto.cpp perfetto_sdk/sdk/perfetto.cc) + +set_target_properties(perfetto_plugin PROPERTIES + CXX_VISIBILITY_PRESET hidden + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/../exportmap + LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) + +target_compile_definitions(perfetto_plugin + PRIVATE HIP_PROF_HIP_API_STRING=1 + __HIP_PLATFORM_HCC__=1) + +target_include_directories(perfetto_plugin + PRIVATE ${PROJECT_SOURCE_DIR}/inc ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/plugin/perfetto/perfetto_sdk/sdk) + +target_link_options(perfetto_plugin + PRIVATE -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exportmap 
-Wl,--no-undefined) + +target_link_libraries(perfetto_plugin PRIVATE ${ROCPROFILER_TARGET} Threads::Threads systemd stdc++fs amd_comgr) + +install(TARGETS perfetto_plugin LIBRARY + DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME} + COMPONENT plugins) \ No newline at end of file diff --git a/plugin/perfetto/perfetto.cpp b/plugin/perfetto/perfetto.cpp new file mode 100644 index 00000000..6455f282 --- /dev/null +++ b/plugin/perfetto/perfetto.cpp @@ -0,0 +1,804 @@ +/* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
*/ + +#include "rocprofiler.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "perfetto_sdk/sdk/perfetto.h" +#include "rocprofiler_plugin.h" +#include "../utils.h" + +#define STREAM_CONSTANT 98736677 +#define QUEUE_CONSTANT 18746479 + +namespace fs = std::experimental::filesystem; + +PERFETTO_DEFINE_CATEGORIES( + perfetto::Category("GENERIC").SetDescription("GENERAL_CATEGORY"), + perfetto::Category("ROCTX_API").SetDescription("ACTIVITY_DOMAIN_ROCTX_API"), + perfetto::Category("HSA_API").SetDescription("ACTIVITY_DOMAIN_HSA_API"), + perfetto::Category("HIP_API").SetDescription("ACTIVITY_DOMAIN_HIP_API"), + perfetto::Category("External_API").SetDescription("ACTIVITY_DOMAIN_EXT_API"), + perfetto::Category("HIP_OPS").SetDescription("ACTIVITY_DOMAIN_HIP_OPS"), + perfetto::Category("HSA_OPS").SetDescription("ACTIVITY_DOMAIN_HSA_OPS"), + perfetto::Category("KERNELS").SetDescription("KERNEL_DISPATCHES"), + perfetto::Category("COUNTERS").SetDescription("PERFORMANCE_COUNTERS")); + +PERFETTO_TRACK_EVENT_STATIC_STORAGE(); + +namespace { + +std::string process_name; +static std::string output_file_name; + +std::string get_kernel_name(rocprofiler_record_profiler_t& profiler_record) { + std::string kernel_name = ""; + size_t name_length = 1; + CHECK_ROCPROFILER(rocprofiler_query_kernel_info_size(ROCPROFILER_KERNEL_NAME, profiler_record.kernel_id, + &name_length)); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#pragma GCC diagnostic ignored "-Wstringop-overread" + if (name_length > 1) { + const char* kernel_name_c = static_cast(malloc(name_length * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_kernel_info(ROCPROFILER_KERNEL_NAME, profiler_record.kernel_id, + &kernel_name_c)); + if (kernel_name_c && strlen(kernel_name_c) > 1) + kernel_name = 
rocmtools::cxx_demangle(strdup(kernel_name_c)); + } +#pragma GCC diagnostic pop + return kernel_name; +} + + +class perfetto_plugin_t { + public: + perfetto_plugin_t() { + const char* output_dir = getenv("OUTPUT_PATH"); + const char* temp_file_name = getenv("OUT_FILE_NAME"); + output_file_name = temp_file_name ? std::string(temp_file_name) + "_" : ""; + + if (output_dir == nullptr) { + stream_.copyfmt(std::cout); + stream_.clear(std::cout.rdstate()); + stream_.basic_ios::rdbuf(std::cout.rdbuf()); + return; + } + + output_prefix_ = output_dir; + if (!fs::is_directory(fs::status(output_prefix_))) { + if (!stream_.fail()) rocmtools::warning("Cannot open output directory '%s'", output_dir); + stream_.setstate(std::ios_base::failbit); + return; + } + + perfetto::TracingInitArgs args; + args.backends |= perfetto::kInProcessBackend; + + perfetto::Tracing::Initialize(args); + perfetto::TrackEvent::Register(); + + perfetto::protos::gen::TrackEventConfig track_event_cfg; + track_event_cfg.add_disabled_categories("*"); + track_event_cfg.add_enabled_categories("GENERIC"); + track_event_cfg.add_enabled_categories("ROCTX_API"); + track_event_cfg.add_enabled_categories("HSA_API"); + track_event_cfg.add_enabled_categories("HIP_API"); + track_event_cfg.add_enabled_categories("External_API"); + track_event_cfg.add_enabled_categories("HIP_OPS"); + track_event_cfg.add_enabled_categories("HSA_OPS"); + track_event_cfg.add_enabled_categories("KERNELS"); + track_event_cfg.add_enabled_categories("COUNTERS"); + + perfetto::TraceConfig trace_cfg; + + auto buffer_cfg = trace_cfg.add_buffers(); + uint32_t max_buffer_size = 10 * 1024 * 1024; // Default max buffer size is 10 GB + const char* max_buffer_size_str = getenv("rocprofiler_PERFETTO_MAX_BUFFER_SIZE_KIB"); + if (max_buffer_size_str && std::atol(max_buffer_size_str) > 0) + max_buffer_size = std::atol(max_buffer_size_str); + // Record up to max buffer size determined by user or the 10 GB (default value) + 
buffer_cfg->set_size_kb(max_buffer_size); + + auto* data_source_cfg = trace_cfg.add_data_sources()->mutable_config(); + data_source_cfg->set_name("track_event"); + data_source_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString()); + + output_prefix_.append(output_file_name + std::to_string(GetPid()) + "_output.pftrace"); + file_descriptor_ = open(output_prefix_.string().c_str(), O_RDWR | O_CREAT | O_TRUNC, 0600); + if (file_descriptor_ == -1) rocmtools::warning("Can't open output file\n"); + + tracing_session_ = perfetto::Tracing::NewTrace(); + tracing_session_->Setup(trace_cfg, file_descriptor_); + tracing_session_->StartBlocking(); + + + hostname_[1023] = '\0'; + gethostname(hostname_, 1023); + sd_id128_t ret; + char machine_id[SD_ID128_STRING_MAX]; + [[maybe_unused]] int status = sd_id128_get_machine(&ret); + assert(status == 0 && "Error: Couldn't get machine id!"); + if (sd_id128_to_string(ret, machine_id)) machine_id_ = std::hash{}(machine_id); + + { + std::lock_guard lock(thread_tracks_lock_); + process_name = + perfetto::ProcessTrack::Current().Serialize().mutable_process()->process_name(); + auto process_track_desc = perfetto::ProcessTrack::Current().Serialize(); + uint64_t track_id = + track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire); + for (uint64_t tid : track_ids_used_) { + while (track_id == tid) { + track_id = + track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire); + } + } + std::string thread_track_str = + rocmtools::string_printf("Node: %s Process ID: %lu Thread ID:", hostname_, GetPid()); + process_track_desc.mutable_process()->set_process_name(thread_track_str); + perfetto::TrackEvent::SetTrackDescriptor(perfetto::ProcessTrack::Current(), + process_track_desc); + perfetto::ProcessTrack::Current().Serialize().set_uuid(track_id); + thread_tracks_.emplace(GetPid(), perfetto::ProcessTrack::Current()); + } + + is_valid_ = true; + } + + ~perfetto_plugin_t() { + if (is_valid_) { 
+ tracing_session_->StopBlocking(); + close(file_descriptor_); + } + } + + const char* GetDomainName(rocprofiler_tracer_activity_domain_t domain) { + switch (domain) { + case ACTIVITY_DOMAIN_ROCTX: + return "ROCTX_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HIP_API: + return "HIP_API_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HIP_OPS: + return "HIP_OPS_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HSA_API: + return "HSA_API_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HSA_OPS: + return "HSA_OPS_DOMAIN"; + break; + case ACTIVITY_DOMAIN_HSA_EVT: + return "HSA_EVT_DOMAIN"; + break; + default: + return ""; + } + } + + std::mutex writing_lock; + + int FlushProfilerRecord(rocprofiler_record_profiler_t profiler_record, + rocprofiler_session_id_t session_id) { + std::lock_guard lock(writing_lock); + // ToDO: rename this variable? + if (!tracing_session_) rocmtools::warning("Tracing session is deleted!\n"); + + int device_id = profiler_record.gpu_id.handle; + std::unordered_map::iterator device_track_it; + { + std::lock_guard lock(device_tracks_lock_); + device_track_it = device_tracks_.find(device_id); + if (device_track_it == device_tracks_.end()) { + /* Create a new perfetto::Track (Sub-Track) */ + device_track_it = + device_tracks_ + .emplace(device_id, perfetto::ProcessTrack::Global(((device_id + 1) * machine_id_))) + .first; + auto gpu_desc = device_track_it->second.Serialize(); + gpu_desc.mutable_process()->set_pid(device_id); + std::string gpu_str = rocmtools::string_printf("Node: %s Device:", hostname_); + gpu_desc.mutable_process()->set_process_name(gpu_str); + perfetto::TrackEvent::SetTrackDescriptor(device_track_it->second, gpu_desc); + track_ids_used_.emplace_back(device_id + 1 + machine_id_); + } + } + auto& gpu_track = device_track_it->second; + std::pair gpu_queue_id = + std::make_pair(device_id, profiler_record.queue_id.handle); + auto queue_track_it = queue_tracks_.find(gpu_queue_id.first); + { + std::lock_guard lock(queue_tracks_lock_); + queue_track_it = 
queue_tracks_.find(gpu_queue_id.first); + if (queue_track_it == queue_tracks_.end()) { + /* Create a new perfetto::Track */ + queue_track_it = queue_tracks_ + .emplace(gpu_queue_id.first, + perfetto::Track((profiler_record.queue_id.handle + 1 + + profiler_record.gpu_id.handle) * + QUEUE_CONSTANT * machine_id_ * GetPid(), + gpu_track)) + .first; + + auto queue_desc = queue_track_it->second.Serialize(); + std::string queue_str = + rocmtools::string_printf("Process ID: %lu Queue %ld", GetPid(), gpu_queue_id.second); + queue_desc.set_name(queue_str); + perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc); + } + track_ids_used_.emplace_back(profiler_record.queue_id.handle + machine_id_ + 1 + + profiler_record.gpu_id.handle); + } + auto& queue_track = queue_track_it->second; + + // Taken from rocprofiler: The size hasn't changed in recent past + static const uint32_t lds_block_size = 128 * 4; + + std::string full_kernel_name = get_kernel_name(profiler_record); + // std::string truncated_kernel_name = rocmtools::truncate_name(full_kernel_name); + // perfetto::StaticString kernel_name(truncated_kernel_name.c_str()); + TRACE_EVENT_BEGIN("KERNELS", perfetto::StaticString(full_kernel_name.c_str()), queue_track, + profiler_record.timestamps.begin.value, "Full Kernel Name", + full_kernel_name.c_str(), "Agent ID", device_id, "Queue ID", + profiler_record.queue_id.handle, "GRD", + profiler_record.kernel_properties.grid_size, "WGR", + profiler_record.kernel_properties.workgroup_size, "LDS", + (((profiler_record.kernel_properties.lds_size + (lds_block_size - 1)) & + ~(lds_block_size - 1))), + "SCR", profiler_record.kernel_properties.scratch_size, "Arch. 
VGPR", + profiler_record.kernel_properties.arch_vgpr_count, "Accumilative Vgpr", + profiler_record.kernel_properties.accum_vgpr_count, "SGPR", + profiler_record.kernel_properties.sgpr_count, "Wave Size", + profiler_record.kernel_properties.wave_size, "Signal", + profiler_record.kernel_properties.signal_handle); + + TRACE_EVENT_END("KERNELS", queue_track, profiler_record.timestamps.end.value); + + auto get_counter_track_fn = [&](std::string counter_name) { + std ::string counter_track_id = + std::to_string(machine_id_) + std::to_string(GetPid()) + counter_name; + std::pair gpu_counter_track_id = std::make_pair(device_id, counter_name); + std::unordered_map::iterator counters_track_it; + { + std::lock_guard lock(counter_tracks_lock_); + counters_track_it = counter_tracks_.find(gpu_counter_track_id.second); + if (counters_track_it == counter_tracks_.end()) { + /* Create a new perfetto::Track */ + counters_track_it = + counter_tracks_ + .emplace(gpu_counter_track_id.second, + perfetto::CounterTrack(counter_track_id.c_str(), gpu_track)) + .first; + + auto counter_track_desc = counters_track_it->second.Serialize(); + std::string counter_track_str = "Process ID " + std::to_string(GetPid()) + " - Counter " + + gpu_counter_track_id.second; + counter_track_desc.set_name(counter_track_str); + perfetto::TrackEvent::SetTrackDescriptor(counters_track_it->second, counter_track_desc); + } + } + return counters_track_it->second; + }; + + // For Counters + if (profiler_record.counters) { + for (uint64_t i = 0; i < profiler_record.counters_count.value; i++) { + if (profiler_record.counters[i].counter_handler.handle > 0) { + size_t name_length = 0; + CHECK_ROCPROFILER(rocprofiler_query_counter_info_size( + session_id, ROCPROFILER_COUNTER_NAME, profiler_record.counters[i].counter_handler, + &name_length)); + if (name_length > 1) { + const char* name_c = static_cast(malloc(name_length * sizeof(char))); + CHECK_ROCPROFILER( + rocprofiler_query_counter_info(session_id, 
ROCPROFILER_COUNTER_NAME, + profiler_record.counters[i].counter_handler, &name_c)); + + perfetto::CounterTrack counters_track = get_counter_track_fn(std::string(name_c)); + TRACE_COUNTER("COUNTERS", counters_track, profiler_record.timestamps.begin.value, + profiler_record.counters[i].value.value); + // Added an extra zero event for maintaining start-end of the counter + TRACE_COUNTER("COUNTERS", counters_track, profiler_record.timestamps.end.value, 0.001); + } + } + } + } + + return 0; + } + + int FlushTracerRecord(rocprofiler_record_tracer_t tracer_record, + rocprofiler_session_id_t session_id) { + std::lock_guard lock(writing_lock); + if (!tracing_session_) rocmtools::warning("Tracing session is deleted!\n"); + std::string kernel_name; + char* function_name; + char* activity_name; + std::string roctx_message; + uint64_t roctx_id = 0; + uint64_t thread_id = tracer_record.thread_id.value; + std::unordered_map::iterator thread_track_it; + std::unordered_map::iterator device_track_it; + if (tracer_record.domain == ACTIVITY_DOMAIN_HIP_OPS || + tracer_record.domain == ACTIVITY_DOMAIN_HSA_OPS) { + int device_id = tracer_record.agent_id.handle; + if (tracer_record.domain == ACTIVITY_DOMAIN_HIP_OPS && device_id > 0) device_id--; + { + std::lock_guard lock(device_tracks_lock_); + device_track_it = device_tracks_.find(device_id); + if (device_track_it == device_tracks_.end()) { + /* Create a new perfetto::Track (Sub-Track) */ + device_track_it = + device_tracks_ + .emplace(device_id, + perfetto::ProcessTrack::Global(((device_id + 1) * machine_id_))) + .first; + auto gpu_desc = device_track_it->second.Serialize(); + gpu_desc.mutable_process()->set_pid(device_id); + std::string gpu_str = rocmtools::string_printf("Node: %s Device:", hostname_); + gpu_desc.mutable_process()->set_process_name(gpu_str); + perfetto::TrackEvent::SetTrackDescriptor(device_track_it->second, gpu_desc); + track_ids_used_.emplace_back(1 + machine_id_ + device_id); + } + } + } else { + std::lock_guard 
lock(thread_tracks_lock_); + thread_track_it = thread_tracks_.find(thread_id); + if (thread_track_it == thread_tracks_.end()) { + uint64_t track_id = + track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire); + for (uint64_t tid : track_ids_used_) { + while (track_id == tid) { + track_id = + track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire); + } + } + thread_track_it = + thread_tracks_.emplace(thread_id, perfetto::ProcessTrack::Global(track_id)).first; + auto thread_track_desc = thread_track_it->second.Serialize(); + std::string thread_track_str = + rocmtools::string_printf("Node: %s Process ID: %lu Thread ID:", hostname_, GetPid()); + thread_track_desc.mutable_process()->set_pid(thread_id); + thread_track_desc.mutable_process()->set_process_name(thread_track_str); + perfetto::TrackEvent::SetTrackDescriptor(thread_track_it->second, thread_track_desc); + } + } + auto& thread_track = thread_track_it->second; + auto& gpu_track = device_track_it->second; + switch (tracer_record.domain) { + case ACTIVITY_DOMAIN_ROCTX: { + std::unordered_map::iterator roctx_track_it; + { + std::lock_guard lock(roctx_tracks_lock_); + roctx_track_it = roctx_tracks_.find(thread_id); + if (roctx_track_it == roctx_tracks_.end()) { + /* Create a new perfetto::Track */ + uint64_t track_id = + track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire); + for (uint64_t tid : track_ids_used_) { + while (track_id == tid) { + track_id = track_counter_.fetch_add((1 + machine_id_) * GetPid(), + std::memory_order_acquire); + } + } + roctx_track_it = + roctx_tracks_.emplace(thread_id, perfetto::Track(track_id, thread_track)).first; + + auto roctx_track_desc = roctx_track_it->second.Serialize(); + std::string roctx_track_str = rocmtools::string_printf("ROCTX Markers"); + roctx_track_desc.set_name(roctx_track_str); + perfetto::TrackEvent::SetTrackDescriptor(roctx_track_it->second, roctx_track_desc); + } + } + auto& 
roctx_track = roctx_track_it->second; + + size_t roctx_message_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_roctx_tracer_api_data_info_size( + session_id, ROCPROFILER_ROCTX_MESSAGE, tracer_record.api_data_handle, + tracer_record.operation_id, &roctx_message_size)); + if (roctx_message_size > 1) { + char* roctx_message_str = static_cast(malloc(roctx_message_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_roctx_tracer_api_data_info( + session_id, ROCPROFILER_ROCTX_MESSAGE, tracer_record.api_data_handle, + tracer_record.operation_id, &roctx_message_str)); + if (roctx_message_str) + roctx_message = rocmtools::cxx_demangle(std::string(strdup(roctx_message_str))); + } + size_t roctx_id_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_roctx_tracer_api_data_info_size( + session_id, ROCPROFILER_ROCTX_ID, tracer_record.api_data_handle, + tracer_record.operation_id, &roctx_id_size)); + if (roctx_id_size > 1) { + char* roctx_id_str = static_cast(malloc(roctx_id_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_roctx_tracer_api_data_info( + session_id, ROCPROFILER_ROCTX_ID, tracer_record.api_data_handle, + tracer_record.operation_id, &roctx_id_str)); + if (roctx_id_str) { + roctx_id = std::stoll(std::string(strdup(roctx_id_str))); + free(roctx_id_str); + } + } + + if (tracer_record.operation_id.id == 1) { + perfetto::StaticString roctx_message_pft( + (!roctx_message.empty() ? 
roctx_message.c_str() : "")); + TRACE_EVENT_BEGIN("ROCTX_API", roctx_message_pft, roctx_track, + tracer_record.timestamps.begin.value, "Timestamp(ns)", + tracer_record.timestamps.begin.value, "RocTx ID", roctx_id); + roctx_track_entries_++; + } else { + TRACE_EVENT_END("ROCTX_API", roctx_track, tracer_record.timestamps.begin.value); + roctx_track_entries_--; + } + break; + } + case ACTIVITY_DOMAIN_HSA_API: { + std::unordered_map::iterator hsa_track_it; + { + std::lock_guard lock(hsa_tracks_lock_); + hsa_track_it = hsa_tracks_.find(thread_id); + if (hsa_track_it == hsa_tracks_.end()) { + /* Create a new perfetto::Track */ + uint64_t track_id = + track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire); + for (uint64_t tid : track_ids_used_) { + while (track_id == tid) { + track_id = track_counter_.fetch_add((1 + machine_id_) * GetPid(), + std::memory_order_acquire); + } + } + hsa_track_it = + hsa_tracks_.emplace(thread_id, perfetto::Track(track_id, thread_track)).first; + auto hsa_track_desc = hsa_track_it->second.Serialize(); + std::string hsa_track_str = rocmtools::string_printf("HSA API"); + hsa_track_desc.set_name(hsa_track_str); + perfetto::TrackEvent::SetTrackDescriptor(hsa_track_it->second, hsa_track_desc); + } + } + auto& hsa_track = hsa_track_it->second; + size_t function_name_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_hsa_tracer_api_data_info_size( + session_id, ROCPROFILER_HSA_FUNCTION_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &function_name_size)); + if (function_name_size > 1) { + function_name = static_cast(malloc(function_name_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_hsa_tracer_api_data_info( + session_id, ROCPROFILER_HSA_FUNCTION_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &function_name)); + } + TRACE_EVENT_BEGIN("HSA_API", perfetto::StaticString(function_name), hsa_track, + tracer_record.timestamps.begin.value, + 
perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value)); + TRACE_EVENT_END("HSA_API", hsa_track, tracer_record.timestamps.end.value); + break; + } + case ACTIVITY_DOMAIN_HIP_API: { + std::unordered_map::iterator hip_track_it; + { + std::lock_guard lock(hip_tracks_lock_); + hip_track_it = hip_tracks_.find(thread_id); + if (hip_track_it == hip_tracks_.end()) { + /* Create a new perfetto::Track */ + uint64_t track_id = + track_counter_.fetch_add((1 + machine_id_) * GetPid(), std::memory_order_acquire); + for (uint64_t tid : track_ids_used_) { + while (track_id == tid) { + track_id = track_counter_.fetch_add((1 + machine_id_) * GetPid(), + std::memory_order_acquire); + } + } + hip_track_it = + hip_tracks_.emplace(thread_id, perfetto::Track(track_id, thread_track)).first; + + auto hip_track_desc = hip_track_it->second.Serialize(); + std::string hip_track_str = rocmtools::string_printf("HIP API"); + hip_track_desc.set_name(hip_track_str); + perfetto::TrackEvent::SetTrackDescriptor(hip_track_it->second, hip_track_desc); + } + } + auto& hip_track = hip_track_it->second; + size_t function_name_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info_size( + session_id, ROCPROFILER_HIP_FUNCTION_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &function_name_size)); + if (function_name_size > 1) { + function_name = static_cast(malloc(function_name_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info( + session_id, ROCPROFILER_HIP_FUNCTION_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &function_name)); + } + size_t kernel_name_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info_size( + session_id, ROCPROFILER_HIP_KERNEL_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &kernel_name_size)); + char* kernel_name_str; + if (kernel_name_size > 1) { + kernel_name_str = static_cast(malloc(kernel_name_size * sizeof(char))); + 
CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info( + session_id, ROCPROFILER_HIP_KERNEL_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &kernel_name_str)); + if (kernel_name_str) { + kernel_name = rocmtools::cxx_demangle(std::string(kernel_name_str)); + free(kernel_name_str); + } + } + if (kernel_name.size() > 0) { + TRACE_EVENT_BEGIN("HIP_API", perfetto::StaticString(function_name), hip_track, + tracer_record.timestamps.begin.value, "Kernel Name", kernel_name, + perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value)); + } else { + TRACE_EVENT_BEGIN("HIP_API", perfetto::StaticString(function_name), hip_track, + tracer_record.timestamps.begin.value, + perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value)); + } + TRACE_EVENT_END("HIP_API", hip_track, tracer_record.timestamps.end.value); + break; + } + case ACTIVITY_DOMAIN_EXT_API: { + printf("Warning: External API is not supported!\n"); + break; + } + case ACTIVITY_DOMAIN_HIP_OPS: { + uint64_t stream_id = 0; + size_t stream_id_str_size = 0; + char* stream_id_str; + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info_size( + session_id, ROCPROFILER_HIP_STREAM_ID, rocprofiler_tracer_api_data_handle_t{nullptr, 0}, + rocprofiler_tracer_operation_id_t{(uint32_t)tracer_record.correlation_id.value}, + &stream_id_str_size)); + if (stream_id_str_size > 1) { + stream_id_str = static_cast(malloc(stream_id_str_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info( + session_id, ROCPROFILER_HIP_STREAM_ID, rocprofiler_tracer_api_data_handle_t{nullptr, 0}, + rocprofiler_tracer_operation_id_t{(uint32_t)tracer_record.correlation_id.value}, + &stream_id_str)); + if (stream_id_str != nullptr) stream_id = std::stoll(stream_id_str); + } + std::unordered_map::iterator stream_track_it; + { + std::lock_guard lock(stream_tracks_lock_); + stream_track_it = stream_tracks_.find(stream_id); + if (stream_track_it == stream_tracks_.end()) { + /* Create a 
new perfetto::Track */ + uint64_t track_id = ((1 + stream_id + tracer_record.agent_id.handle) * machine_id_ * + STREAM_CONSTANT * GetPid()); + stream_track_it = + stream_tracks_.emplace(stream_id, perfetto::Track(track_id, gpu_track)).first; + + auto stream_desc = stream_track_it->second.Serialize(); + std::string stream_str = + rocmtools::string_printf("Process ID: %lu Stream %d", GetPid(), stream_id); + stream_desc.set_name(stream_str); + perfetto::TrackEvent::SetTrackDescriptor(stream_track_it->second, stream_desc); + track_ids_used_.emplace_back(1 + machine_id_ + tracer_record.agent_id.handle); + } + } + auto& stream_track = stream_track_it->second; + if (tracer_record.api_data_handle.handle && tracer_record.api_data_handle.size > 1) { + kernel_name = rocmtools::cxx_demangle( + strdup(reinterpret_cast(tracer_record.api_data_handle.handle))); + TRACE_EVENT_BEGIN( + "HIP_OPS", + perfetto::StaticString(strdup(rocmtools::truncate_name(kernel_name).c_str())), + stream_track, tracer_record.timestamps.begin.value, "Agent ID", + tracer_record.agent_id.handle, "Process ID", GetPid(), "Kernel Name", kernel_name, + perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value)); + } else { + size_t activity_name_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info_size( + session_id, ROCPROFILER_HIP_ACTIVITY_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &activity_name_size)); + if (activity_name_size > 1) { + activity_name = static_cast(malloc(activity_name_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_hip_tracer_api_data_info( + session_id, ROCPROFILER_HIP_ACTIVITY_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &activity_name)); + } else { + activity_name = const_cast(std::string("N/A").c_str()); + } + TRACE_EVENT_BEGIN("HIP_OPS", perfetto::StaticString(activity_name), stream_track, + tracer_record.timestamps.begin.value, "Agent ID", + tracer_record.agent_id.handle, "Process ID", GetPid(), + 
perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value)); + } + TRACE_EVENT_END("HIP_OPS", stream_track, tracer_record.timestamps.end.value); + break; + } + case ACTIVITY_DOMAIN_HSA_OPS: { + std::pair gpu_queue_id = + std::make_pair(tracer_record.agent_id.handle, tracer_record.queue_id.handle); + std::unordered_map::iterator queue_track_it; + { + std::lock_guard lock(queue_tracks_lock_); + queue_track_it = queue_tracks_.find(gpu_queue_id.first); + if (queue_track_it == queue_tracks_.end()) { + /* Create a new perfetto::Track */ + uint64_t track_id = + ((1 + tracer_record.queue_id.handle + tracer_record.agent_id.handle) * machine_id_ * + QUEUE_CONSTANT * GetPid()); + queue_track_it = + queue_tracks_.emplace(gpu_queue_id.first, perfetto::Track(track_id, gpu_track)) + .first; + + auto queue_desc = queue_track_it->second.Serialize(); + std::string queue_str = rocmtools::string_printf("Process ID: %lu Queue %ld", GetPid(), + gpu_queue_id.second); + queue_desc.set_name(queue_str); + perfetto::TrackEvent::SetTrackDescriptor(queue_track_it->second, queue_desc); + } + track_ids_used_.emplace_back(tracer_record.queue_id.handle + machine_id_ + 1 + + tracer_record.agent_id.handle); + } + auto& queue_track = queue_track_it->second; + size_t activity_name_size = 0; + CHECK_ROCPROFILER(rocprofiler_query_hsa_tracer_api_data_info_size( + session_id, ROCPROFILER_HSA_ACTIVITY_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &activity_name_size)); + if (activity_name_size > 1) { + activity_name = static_cast(malloc(activity_name_size * sizeof(char))); + CHECK_ROCPROFILER(rocprofiler_query_hsa_tracer_api_data_info( + session_id, ROCPROFILER_HSA_ACTIVITY_NAME, tracer_record.api_data_handle, + tracer_record.operation_id, &activity_name)); + } + TRACE_EVENT_BEGIN("HSA_OPS", perfetto::StaticString(activity_name), queue_track, + tracer_record.timestamps.begin.value, "Agent ID", + tracer_record.agent_id.handle, "Queue ID", tracer_record.queue_id.handle, + "Process 
ID", GetPid(), + perfetto::Flow::ProcessScoped(tracer_record.correlation_id.value)); + TRACE_EVENT_END("HSA_OPS", queue_track, tracer_record.timestamps.end.value); + break; + } + default: { + rocmtools::warning("ignored record for domain %d", tracer_record.domain); + break; + } + } + return 0; + } + + int WriteBufferRecords(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* end, rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + if (!tracing_session_) rocmtools::warning("Tracing session is deleted!\n"); + while (begin < end) { + if (!begin) return 0; + switch (begin->kind) { + case ROCPROFILER_PROFILER_RECORD: { + rocprofiler_record_profiler_t* profiler_record = const_cast( + reinterpret_cast(begin)); + FlushProfilerRecord(*profiler_record, session_id); + break; + } + case ROCPROFILER_TRACER_RECORD: { + rocprofiler_record_tracer_t* tracer_record = const_cast( + reinterpret_cast(begin)); + FlushTracerRecord(*tracer_record, session_id); + break; + } + default: + break; + } + rocprofiler_next_record(begin, &begin, session_id, buffer_id); + } + return 0; + } + + bool IsValid() const { return is_valid_; } + + private: + fs::path output_prefix_; + std::unique_ptr tracing_session_; + int file_descriptor_; + bool is_valid_{false}; + size_t roctx_track_entries_{0}; + + // Correlate stream id(s) with correlation id(s) to identify the stream id of every HIP activity + std::unordered_map stream_ids_; + + // Callback Tracks + std::unordered_map thread_tracks_; + std::unordered_map roctx_tracks_, hsa_tracks_, hip_tracks_, + hip_ext_tracks_; + + // Activity Tracks + std::unordered_map device_tracks_; + std::unordered_map queue_tracks_, stream_tracks_; + + std::unordered_map counter_tracks_; + + std::atomic track_counter_{GetPid()}; + std::vector track_ids_used_; + + std::mutex stream_ids_lock_, thread_tracks_lock_, roctx_tracks_lock_, hsa_tracks_lock_, + hip_tracks_lock_, hip_ext_tracks_lock_, device_tracks_lock_, 
queue_tracks_lock_, + stream_tracks_lock_, counter_tracks_lock_; + + char hostname_[1024]; + uint64_t machine_id_; + + std::ofstream stream_; +}; + +perfetto_plugin_t* perfetto_plugin = nullptr; + +} // namespace + +int rocprofiler_plugin_initialize(uint32_t rocprofiler_major_version, + uint32_t rocprofiler_minor_version) { + if (rocprofiler_major_version != ROCPROFILER_VERSION_MAJOR || + rocprofiler_minor_version > ROCPROFILER_VERSION_MINOR) + return -1; + + if (perfetto_plugin != nullptr) return -1; + + perfetto_plugin = new perfetto_plugin_t(); + if (perfetto_plugin->IsValid()) return 0; + + delete perfetto_plugin; + perfetto_plugin = nullptr; + return -1; +} + +void rocprofiler_plugin_finalize() { + if (!perfetto_plugin) return; + delete perfetto_plugin; + perfetto_plugin = nullptr; +} + +ROCPROFILER_EXPORT int rocprofiler_plugin_write_buffer_records(const rocprofiler_record_header_t* begin, + const rocprofiler_record_header_t* end, + rocprofiler_session_id_t session_id, + rocprofiler_buffer_id_t buffer_id) { + if (!perfetto_plugin || !perfetto_plugin->IsValid()) return -1; + return perfetto_plugin->WriteBufferRecords(begin, end, session_id, buffer_id); +} + +ROCPROFILER_EXPORT int rocprofiler_plugin_write_record(rocprofiler_record_tracer_t record, + rocprofiler_session_id_t session_id) { + if (!perfetto_plugin || !perfetto_plugin->IsValid()) return -1; + if (record.header.id.handle == 0) return 0; + perfetto_plugin->FlushTracerRecord(record, session_id); + return 0; +} diff --git a/plugin/perfetto/perfetto_sdk/LICENSE b/plugin/perfetto/perfetto_sdk/LICENSE new file mode 100644 index 00000000..bb056df2 --- /dev/null +++ b/plugin/perfetto/perfetto_sdk/LICENSE @@ -0,0 +1,189 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + Copyright (c) 2017, The Android Open Source Project + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + diff --git a/plugin/perfetto/perfetto_sdk/OWNERS b/plugin/perfetto/perfetto_sdk/OWNERS new file mode 100644 index 00000000..0da623cf --- /dev/null +++ b/plugin/perfetto/perfetto_sdk/OWNERS @@ -0,0 +1,35 @@ +# Global OWNERS that can approve Perfetto changes. +# Please look first at OWNERS in the various subdirectories before falling back +# on this, as the former tend to be more brain-cache-hot. + +# Perfetto tracing internals and API/ABI boundaries. +primiano@google.com +skyostil@google.com + +# UI, Ftrace interop, traced_probes, protozero, Android internals. +hjd@google.com + +# Trace Processor, metrics, infra. +lalitm@google.com + +# Callstack / memory profilers, traced_probes & Linux internals. +ddiproietto@google.com +rsavitski@google.com + +# Chromium-related things and tracing SDK. +eseckler@google.com +nuskos@google.com +oysteine@google.com + +# Most Android-related metrics. +ilkos@google.com + +# fmayer@ left the team. Please try first rsavitski@, ddiproietto@ or primiano@ +# and leave fmayer@ as an emergency-only escalation on profilers. +fmayer@google.com + +# chromium.org aliases for adding DEPS entries from chromium subprojects to +# third_party/perfetto. 
+eseckler@chromium.org +nuskos@chromium.org +skyostil@chromium.org diff --git a/plugin/perfetto/perfetto_sdk/docs/tracing-sdk.md b/plugin/perfetto/perfetto_sdk/docs/tracing-sdk.md new file mode 100644 index 00000000..856dc378 --- /dev/null +++ b/plugin/perfetto/perfetto_sdk/docs/tracing-sdk.md @@ -0,0 +1,394 @@ +# Tracing SDK + +The Perfetto Tracing SDK is a C++11 library that allows userspace applications +to emit trace events and add more app-specific context to a Perfetto trace. + +When using the Tracing SDK there are two main aspects to consider: + +1. Whether you are interested only in tracing events coming from your own app + or want to collect full-stack traces that overlay app trace events with + system trace events like scheduler traces, syscalls or any other Perfetto + data source. + +2. For app-specific tracing, whether you need to trace simple types of timeline + events (e.g., slices, counters) or need to define complex data sources with a + custom strongly-typed schema (e.g., for dumping the state of a subsystem of + your app into the trace). + +For Android-only instrumentation, the advice is to keep using the existing +[android.os.Trace (SDK)][atrace-sdk] / [ATrace_* (NDK)][atrace-ndk] if they +are sufficient for your use cases. Atrace-based instrumentation is fully +supported in Perfetto. +See the [Data Sources -> Android System -> Atrace Instrumentation][atrace-ds] +for details. + +## Getting started + +TIP: The code from these examples is also available [in the +repository](/examples/sdk/README.md). + +To start using the Client API, first check out the latest SDK release: + +```bash +git clone https://android.googlesource.com/platform/external/perfetto -b v23.0 +``` + +The SDK consists of two files, `sdk/perfetto.h` and `sdk/perfetto.cc`. These are +an amalgamation of the Client API designed to be easy to integrate into existing +build systems. The sources are self-contained and require only a C++11 compliant +standard library.
+ +For example, to add the SDK to a CMake project, edit your CMakeLists.txt: + +```cmake +cmake_minimum_required(VERSION 3.13) +project(PerfettoExample) +find_package(Threads) + +# Define a static library for Perfetto. +include_directories(perfetto/sdk) +add_library(perfetto STATIC perfetto/sdk/perfetto.cc) + +# Link the library to your main executable. +add_executable(example example.cc) +target_link_libraries(example perfetto ${CMAKE_THREAD_LIBS_INIT}) +``` + +Next, initialize Perfetto in your program: + +```C++ +#include <perfetto.h> + +int main(int argc, char** argv) { + perfetto::TracingInitArgs args; + + // The backends determine where trace events are recorded. You may select one + // or more of: + + // 1) The in-process backend only records within the app itself. + args.backends |= perfetto::kInProcessBackend; + + // 2) The system backend writes events into a system Perfetto daemon, + // allowing merging app and system events (e.g., ftrace) on the same + // timeline. Requires the Perfetto `traced` daemon to be running (e.g., + // on Android Pie and newer). + args.backends |= perfetto::kSystemBackend; + + perfetto::Tracing::Initialize(args); +} +``` + +You are now ready to instrument your app with trace events. + +## Custom data sources vs Track events + +The SDK offers two abstraction layers to inject tracing data, built on top of +each other, which trade off code complexity vs expressive power: +[track events](#track-events) and [custom data sources](#custom-data-sources). + +### Track events + +Track events are the suggested option when dealing with app-specific tracing as +they take care of a number of subtleties (e.g., thread safety, flushing, string +interning).
+Track events are time bounded events (e.g., slices, counters) based on simple +`TRACE_EVENT` annotation tags in the codebase, like this: + +```c++ +#include <perfetto.h> + +PERFETTO_DEFINE_CATEGORIES( + perfetto::Category("rendering") + .SetDescription("Events from the graphics subsystem"), + perfetto::Category("network") + .SetDescription("Network upload and download statistics")); + +... + +int main(int argc, char** argv) { + ... + perfetto::Tracing::Initialize(args); + perfetto::TrackEvent::Register(); +} + +... + +void LayerTreeHost::DoUpdateLayers() { + TRACE_EVENT("rendering", "LayerTreeHost::DoUpdateLayers"); + ... + for (PictureLayer& pl : layers) { + TRACE_EVENT("rendering", "PictureLayer::Update"); + pl.Update(); + } +} +``` + +Which are rendered in the UI as follows: + +![Track event example](/docs/images/track-events.png) + +Track events are the best default option and serve most tracing use cases with +very little complexity. + +To include your new track events in the trace, ensure that the `track_event` +data source is included in the trace config. If you do not specify any +categories then all non-debug categories will be included by default. However, +you can also add just the categories you are interested in like so: + +```protobuf +data_sources { + config { + name: "track_event" + track_event_config { + enabled_categories: "rendering" + } + } +} +``` + +See the [Track events page](track-events.md) for full instructions. + +### Custom data sources + +For most uses, track events are the most straightforward way of instrumenting +apps for tracing. However, in some rare circumstances they are not +flexible enough, e.g., when the data doesn't fit the notion of a track or is +high volume enough that it needs a strongly typed schema to minimize the size of +each event. In this case, you can implement a *custom data source* for +Perfetto.
+ +Unlike track events, when working with custom data sources, you will also need +corresponding changes in [trace processor](/docs/analysis/trace-processor.md) +to enable importing your data format. + +A custom data source is a subclass of `perfetto::DataSource`. Perfetto will +automatically create one instance of the class for each tracing session it is +active in (usually just one). + +```C++ +class CustomDataSource : public perfetto::DataSource<CustomDataSource> { + public: + void OnSetup(const SetupArgs&) override { + // Use this callback to apply any custom configuration to your data source + // based on the TraceConfig in SetupArgs. + } + + void OnStart(const StartArgs&) override { + // This notification can be used to initialize the GPU driver, enable + // counters, etc. StartArgs will contain the DataSourceDescriptor, + // which can be extended. + } + + void OnStop(const StopArgs&) override { + // Undo any initialization done in OnStart. + } + + // Data sources can also have per-instance state. + int my_custom_state = 0; +}; + +PERFETTO_DECLARE_DATA_SOURCE_STATIC_MEMBERS(CustomDataSource); +``` + +The data source's static data should be defined in one source file like this: + +```C++ +PERFETTO_DEFINE_DATA_SOURCE_STATIC_MEMBERS(CustomDataSource); +``` + +Custom data sources need to be registered with Perfetto: + +```C++ +int main(int argc, char** argv) { + ... + perfetto::Tracing::Initialize(args); + // Add the following: + perfetto::DataSourceDescriptor dsd; + dsd.set_name("com.example.custom_data_source"); + CustomDataSource::Register(dsd); +} +``` + +As with all data sources, the custom data source needs to be specified in the +trace config to enable tracing: + +```C++ +perfetto::TraceConfig cfg; +auto* ds_cfg = cfg.add_data_sources()->mutable_config(); +ds_cfg->set_name("com.example.custom_data_source"); +``` + +Finally, call the `Trace()` method to record an event with your custom data +source.
The lambda function passed to that method will only be called if tracing +is enabled. It is always called synchronously and possibly multiple times if +multiple concurrent tracing sessions are active. + +```C++ +CustomDataSource::Trace([](CustomDataSource::TraceContext ctx) { + auto packet = ctx.NewTracePacket(); + packet->set_timestamp(perfetto::TrackEvent::GetTraceTimeNs()); + packet->set_for_testing()->set_str("Hello world!"); +}); +``` + +If necessary the `Trace()` method can access the custom data source state +(`my_custom_state` in the example above). Doing so will take a mutex to +ensure the data source isn't destroyed (e.g., because of stopping tracing) while +the `Trace()` method is called on another thread. For example: + +```C++ +CustomDataSource::Trace([](CustomDataSource::TraceContext ctx) { + auto safe_handle = ctx.GetDataSourceLocked(); // Holds a RAII lock. + DoSomethingWith(safe_handle->my_custom_state); +}); +``` + +## In-process vs System mode + +The two modes are not mutually exclusive. An app can be configured to work +in both modes and respond both to in-process tracing requests and system +tracing requests. Both modes generate the same trace file format. + +### In-process mode + +In this mode both the perfetto service and the app-defined data sources are +hosted fully in-process, in the same process of the profiled app. No connection +to the system `traced` daemon will be attempted. + +In-process mode can be enabled by setting +`TracingInitArgs.backends = perfetto::kInProcessBackend` when initializing the +SDK, see examples below. + +This mode is used to generate traces that contain only events emitted by +the app, but not other types of events (e.g. scheduler traces). + +The main advantage is that by running fully in-process, it doesn't require any +special OS privileges and the profiled process can control the lifecycle of +tracing sessions. + +This mode is supported on Android, Linux, MacOS and Windows.
+ +### System mode + +In this mode the app-defined data sources will connect to the external `traced` +service using the [IPC over UNIX socket][ipc]. + +System mode can be enabled by setting +`TracingInitArgs.backends = perfetto::kSystemBackend` when initializing the SDK, +see examples below. + +The main advantage of this mode is that it is possible to create fused traces where +app events are overlaid on the same timeline of OS events. This enables +full-stack performance investigations, looking all the way through syscalls and +kernel scheduling events. + +The main limitation of this mode is that it requires the external `traced` daemon +to be up and running and reachable through the UNIX socket connection. + +This is suggested for local debugging or lab testing scenarios where the user +(or the test harness) can control the OS deployment (e.g., sideload binaries on +Android). + +When using system mode, the tracing session must be controlled from the outside, +using the `perfetto` command-line client +(See [reference](/docs/reference/perfetto-cli)). This is because when collecting +system traces, tracing data producers are not allowed to read back the trace +data as it might disclose information about other processes and allow +side-channel attacks. + +* On Android 9 (Pie) and beyond, traced is shipped as part of the platform. +* On older versions of Android, traced can be built from sources using the + [standalone NDK-based workflow](/docs/contributing/build-instructions.md) + and sideloaded via adb shell. +* On Linux and MacOS `traced` must be built and run separately. See the + [Linux quickstart](/docs/quickstart/linux-tracing.md) for instructions. + +_System mode is not yet supported on Windows, due to the lack of an IPC +implementation_. + +## {#recording} Recording traces through the API + +_Tracing through the API is currently only supported with the in-process mode.
+When using system mode, use the `perfetto` cmdline client (see quickstart +guides)._ + +First initialize a [TraceConfig](/docs/reference/trace-config-proto.autogen) +message which specifies what type of data to record. + +If your app includes [track events](track-events.md) (i.e., `TRACE_EVENT`), you +typically want to choose the categories which are enabled for tracing. + +By default, all non-debug categories are enabled, but you can enable a specific +one like this: + +```C++ +perfetto::protos::gen::TrackEventConfig track_event_cfg; +track_event_cfg.add_disabled_categories("*"); +track_event_cfg.add_enabled_categories("rendering"); +``` + +Next, build the main trace config together with the track event part: + +```C++ +perfetto::TraceConfig cfg; +cfg.add_buffers()->set_size_kb(1024); // Record up to 1 MiB. +auto* ds_cfg = cfg.add_data_sources()->mutable_config(); +ds_cfg->set_name("track_event"); +ds_cfg->set_track_event_config_raw(track_event_cfg.SerializeAsString()); +``` + +If your app includes a custom data source, you can also enable it here: + +```C++ +ds_cfg = cfg.add_data_sources()->mutable_config(); +ds_cfg->set_name("my_data_source"); +``` + +After building the trace config, you can begin tracing: + +```C++ +std::unique_ptr<perfetto::TracingSession> tracing_session( + perfetto::Tracing::NewTrace()); +tracing_session->Setup(cfg); +tracing_session->StartBlocking(); +``` + +TIP: API methods with `Blocking` in their name will suspend the calling thread + until the respective operation is complete. There are also asynchronous + variants that don't have this limitation. + +Now that tracing is active, instruct your app to perform the operation you +want to record. After that, stop tracing and collect the +protobuf-formatted trace data: + +```C++ +tracing_session->StopBlocking(); +std::vector<char> trace_data(tracing_session->ReadTraceBlocking()); + +// Write the trace into a file.
+std::ofstream output; +output.open("example.perfetto-trace", std::ios::out | std::ios::binary); +output.write(&trace_data[0], trace_data.size()); +output.close(); +``` + +To save memory with longer traces, you can also tell Perfetto to write +directly into a file by passing a file descriptor into Setup(), remembering +to close the file after tracing is done: + +```C++ +int fd = open("example.perfetto-trace", O_RDWR | O_CREAT | O_TRUNC, 0600); +tracing_session->Setup(cfg, fd); +tracing_session->StartBlocking(); +// ... +tracing_session->StopBlocking(); +close(fd); +``` + +The resulting trace file can be directly opened in the [Perfetto +UI](https://ui.perfetto.dev) or the [Trace Processor](/docs/analysis/trace-processor.md). + +[ipc]: /docs/design-docs/api-and-abi.md#socket-protocol +[atrace-ds]: /docs/data-sources/atrace.md +[atrace-ndk]: https://developer.android.com/ndk/reference/group/tracing +[atrace-sdk]: https://developer.android.com/reference/android/os/Trace diff --git a/plugin/perfetto/perfetto_sdk/sdk/perfetto.cc b/plugin/perfetto/perfetto_sdk/sdk/perfetto.cc new file mode 100644 index 00000000..35ecdd6c --- /dev/null +++ b/plugin/perfetto/perfetto_sdk/sdk/perfetto.cc @@ -0,0 +1,74282 @@ +// Copyright (C) 2019 The Android Open Source Project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// This file is automatically generated by gen_amalgamated. Do not edit. 
+ +// gen_amalgamated: predefined macros +#if !defined(PERFETTO_IMPLEMENTATION) +#define PERFETTO_IMPLEMENTATION +#endif +#include "perfetto.h" +// gen_amalgamated begin source: src/base/android_utils.cc +// gen_amalgamated begin header: include/perfetto/ext/base/android_utils.h +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_ANDROID_UTILS_H_ +#define INCLUDE_PERFETTO_EXT_BASE_ANDROID_UTILS_H_ + +#include + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" + +namespace perfetto { +namespace base { + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) + +// Returns the value of the Android system property named `name`. If the +// property does not exist, returns an empty string (a non-existing property is +// the same as a property with an empty value for this API). +std::string GetAndroidProp(const char* name); + +#endif // PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_ANDROID_UTILS_H_ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// gen_amalgamated expanded: #include "perfetto/ext/base/android_utils.h" + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" + +#include + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) +#include +#endif + +// gen_amalgamated expanded: #include "perfetto/base/compiler.h" +// gen_amalgamated expanded: #include "perfetto/base/logging.h" + +namespace perfetto { +namespace base { + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) + +std::string GetAndroidProp(const char* name) { + std::string ret; +#if __ANDROID_API__ >= 26 + const prop_info* pi = __system_property_find(name); + if (!pi) { + return ret; + } + __system_property_read_callback( + pi, + [](void* dst_void, const char*, const char* value, uint32_t) { + std::string& dst = *static_cast(dst_void); + dst = value; + }, + &ret); +#else // __ANDROID_API__ < 26 + char value_buf[PROP_VALUE_MAX]; + int len = __system_property_get(name, value_buf); + if (len > 0 && static_cast(len) < sizeof(value_buf)) { + ret = std::string(value_buf, static_cast(len)); + } +#endif + return ret; +} + +#endif // PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) + +} // namespace base +} // namespace perfetto +// gen_amalgamated begin source: src/base/base64.cc +// gen_amalgamated begin header: include/perfetto/ext/base/base64.h +// gen_amalgamated begin header: include/perfetto/ext/base/optional.h +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_OPTIONAL_H_ +#define INCLUDE_PERFETTO_EXT_BASE_OPTIONAL_H_ + +#include +#include +#include + +// gen_amalgamated expanded: #include "perfetto/base/logging.h" + +namespace perfetto { +namespace base { + +// Specification: +// http://en.cppreference.com/w/cpp/utility/optional/in_place_t +struct in_place_t {}; + +// Specification: +// http://en.cppreference.com/w/cpp/utility/optional/nullopt_t +struct nullopt_t { + constexpr explicit nullopt_t(int) {} +}; + +// Specification: +// http://en.cppreference.com/w/cpp/utility/optional/in_place +constexpr in_place_t in_place = {}; + +// Specification: +// http://en.cppreference.com/w/cpp/utility/optional/nullopt +constexpr nullopt_t nullopt(0); + +// Forward declaration, which is referred by following helpers. +template +class Optional; + +namespace internal { + +template ::value> +struct OptionalStorageBase { + // Initializing |empty_| here instead of using default member initializing + // to avoid errors in g++ 4.8. + constexpr OptionalStorageBase() : empty_('\0') {} + + template + constexpr explicit OptionalStorageBase(in_place_t, Args&&... args) + : is_populated_(true), value_(std::forward(args)...) {} + + // When T is not trivially destructible we must call its + // destructor before deallocating its memory. + // Note that this hides the (implicitly declared) move constructor, which + // would be used for constexpr move constructor in OptionalStorage. + // It is needed iff T is trivially move constructible. 
However, the current + // is_trivially_{copy,move}_constructible implementation requires + // is_trivially_destructible (which looks a bug, cf: + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51452 and + // http://cplusplus.github.io/LWG/lwg-active.html#2116), so it is not + // necessary for this case at the moment. Please see also the destructor + // comment in "is_trivially_destructible = true" specialization below. + ~OptionalStorageBase() { + if (is_populated_) + value_.~T(); + } + + template + void Init(Args&&... args) { + PERFETTO_DCHECK(!is_populated_); + ::new (&value_) T(std::forward(args)...); + is_populated_ = true; + } + + bool is_populated_ = false; + union { + // |empty_| exists so that the union will always be initialized, even when + // it doesn't contain a value. Union members must be initialized for the + // constructor to be 'constexpr'. + char empty_; + T value_; + }; +}; + +template +struct OptionalStorageBase { + // Initializing |empty_| here instead of using default member initializing + // to avoid errors in g++ 4.8. + constexpr OptionalStorageBase() : empty_('\0') {} + + template + constexpr explicit OptionalStorageBase(in_place_t, Args&&... args) + : is_populated_(true), value_(std::forward(args)...) {} + + // When T is trivially destructible (i.e. its destructor does nothing) there + // is no need to call it. Implicitly defined destructor is trivial, because + // both members (bool and union containing only variants which are trivially + // destructible) are trivially destructible. + // Explicitly-defaulted destructor is also trivial, but do not use it here, + // because it hides the implicit move constructor. It is needed to implement + // constexpr move constructor in OptionalStorage iff T is trivially move + // constructible. Note that, if T is trivially move constructible, the move + // constructor of OptionalStorageBase is also implicitly defined and it is + // trivially move constructor. 
If T is not trivially move constructible, + // "not declaring move constructor without destructor declaration" here means + // "delete move constructor", which works because any move constructor of + // OptionalStorage will not refer to it in that case. + + template + void Init(Args&&... args) { + PERFETTO_DCHECK(!is_populated_); + ::new (&value_) T(std::forward(args)...); + is_populated_ = true; + } + + bool is_populated_ = false; + union { + // |empty_| exists so that the union will always be initialized, even when + // it doesn't contain a value. Union members must be initialized for the + // constructor to be 'constexpr'. + char empty_; + T value_; + }; +}; + +// Implement conditional constexpr copy and move constructors. These are +// constexpr if is_trivially_{copy,move}_constructible::value is true +// respectively. If each is true, the corresponding constructor is defined as +// "= default;", which generates a constexpr constructor (In this case, +// the condition of constexpr-ness is satisfied because the base class also has +// compiler generated constexpr {copy,move} constructors). Note that +// placement-new is prohibited in constexpr. +template ::value> +struct OptionalStorage : OptionalStorageBase { + // This is no trivially {copy,move} constructible case. Other cases are + // defined below as specializations. + + // Accessing the members of template base class requires explicit + // declaration. + using OptionalStorageBase::is_populated_; + using OptionalStorageBase::value_; + using OptionalStorageBase::Init; + + // Inherit constructors (specifically, the in_place constructor). + using OptionalStorageBase::OptionalStorageBase; + + // User defined constructor deletes the default constructor. + // Define it explicitly. 
+ OptionalStorage() = default; + + OptionalStorage(const OptionalStorage& other) : OptionalStorageBase() { + if (other.is_populated_) + Init(other.value_); + } + + OptionalStorage(OptionalStorage&& other) noexcept( + std::is_nothrow_move_constructible::value) { + if (other.is_populated_) + Init(std::move(other.value_)); + } +}; + +template +struct OptionalStorage + : OptionalStorageBase { + using OptionalStorageBase::is_populated_; + using OptionalStorageBase::value_; + using OptionalStorageBase::Init; + using OptionalStorageBase::OptionalStorageBase; + + OptionalStorage() = default; + OptionalStorage(const OptionalStorage& other) = default; + + OptionalStorage(OptionalStorage&& other) noexcept( + std::is_nothrow_move_constructible::value) { + if (other.is_populated_) + Init(std::move(other.value_)); + } +}; + +// Base class to support conditionally usable copy-/move- constructors +// and assign operators. +template +class OptionalBase { + // This class provides implementation rather than public API, so everything + // should be hidden. Often we use composition, but we cannot in this case + // because of C++ language restriction. + protected: + constexpr OptionalBase() = default; + constexpr OptionalBase(const OptionalBase& other) = default; + constexpr OptionalBase(OptionalBase&& other) = default; + + template + constexpr explicit OptionalBase(in_place_t, Args&&... args) + : storage_(in_place, std::forward(args)...) {} + + // Implementation of converting constructors. 
+ template + explicit OptionalBase(const OptionalBase& other) { + if (other.storage_.is_populated_) + storage_.Init(other.storage_.value_); + } + + template + explicit OptionalBase(OptionalBase&& other) { + if (other.storage_.is_populated_) + storage_.Init(std::move(other.storage_.value_)); + } + + ~OptionalBase() = default; + + OptionalBase& operator=(const OptionalBase& other) { + CopyAssign(other); + return *this; + } + + OptionalBase& operator=(OptionalBase&& other) noexcept( + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value) { + MoveAssign(std::move(other)); + return *this; + } + + template + void CopyAssign(const OptionalBase& other) { + if (other.storage_.is_populated_) + InitOrAssign(other.storage_.value_); + else + FreeIfNeeded(); + } + + template + void MoveAssign(OptionalBase&& other) { + if (other.storage_.is_populated_) + InitOrAssign(std::move(other.storage_.value_)); + else + FreeIfNeeded(); + } + + template + void InitOrAssign(U&& value) { + if (storage_.is_populated_) + storage_.value_ = std::forward(value); + else + storage_.Init(std::forward(value)); + } + + void FreeIfNeeded() { + if (!storage_.is_populated_) + return; + storage_.value_.~T(); + storage_.is_populated_ = false; + } + + // For implementing conversion, allow access to other typed OptionalBase + // class. + template + friend class OptionalBase; + + OptionalStorage storage_; +}; + +// The following {Copy,Move}{Constructible,Assignable} structs are helpers to +// implement constructor/assign-operator overloading. Specifically, if T is +// is not movable but copyable, Optional's move constructor should not +// participate in overload resolution. This inheritance trick implements that. 
+template +struct CopyConstructible {}; + +template <> +struct CopyConstructible { + constexpr CopyConstructible() = default; + constexpr CopyConstructible(const CopyConstructible&) = delete; + constexpr CopyConstructible(CopyConstructible&&) = default; + CopyConstructible& operator=(const CopyConstructible&) = default; + CopyConstructible& operator=(CopyConstructible&&) = default; +}; + +template +struct MoveConstructible {}; + +template <> +struct MoveConstructible { + constexpr MoveConstructible() = default; + constexpr MoveConstructible(const MoveConstructible&) = default; + constexpr MoveConstructible(MoveConstructible&&) = delete; + MoveConstructible& operator=(const MoveConstructible&) = default; + MoveConstructible& operator=(MoveConstructible&&) = default; +}; + +template +struct CopyAssignable {}; + +template <> +struct CopyAssignable { + constexpr CopyAssignable() = default; + constexpr CopyAssignable(const CopyAssignable&) = default; + constexpr CopyAssignable(CopyAssignable&&) = default; + CopyAssignable& operator=(const CopyAssignable&) = delete; + CopyAssignable& operator=(CopyAssignable&&) = default; +}; + +template +struct MoveAssignable {}; + +template <> +struct MoveAssignable { + constexpr MoveAssignable() = default; + constexpr MoveAssignable(const MoveAssignable&) = default; + constexpr MoveAssignable(MoveAssignable&&) = default; + MoveAssignable& operator=(const MoveAssignable&) = default; + MoveAssignable& operator=(MoveAssignable&&) = delete; +}; + +// Helper to conditionally enable converting constructors and assign operators. 
+template +struct IsConvertibleFromOptional + : std::integral_constant< + bool, + std::is_constructible&>::value || + std::is_constructible&>::value || + std::is_constructible&&>::value || + std::is_constructible&&>::value || + std::is_convertible&, T>::value || + std::is_convertible&, T>::value || + std::is_convertible&&, T>::value || + std::is_convertible&&, T>::value> {}; + +template +struct IsAssignableFromOptional + : std::integral_constant< + bool, + IsConvertibleFromOptional::value || + std::is_assignable&>::value || + std::is_assignable&>::value || + std::is_assignable&&>::value || + std::is_assignable&&>::value> {}; + +// Forward compatibility for C++17. +// Introduce one more deeper nested namespace to avoid leaking using std::swap. +namespace swappable_impl { +using std::swap; + +struct IsSwappableImpl { + // Tests if swap can be called. Check(0) returns true_type iff swap is + // available for T. Otherwise, Check's overload resolution falls back to + // Check(...) declared below thanks to SFINAE, so returns false_type. + template + static auto Check(int) + -> decltype(swap(std::declval(), std::declval()), std::true_type()); + + template + static std::false_type Check(...); +}; +} // namespace swappable_impl + +template +struct IsSwappable : decltype(swappable_impl::IsSwappableImpl::Check(0)) {}; + +// Forward compatibility for C++20. +template +using RemoveCvRefT = + typename std::remove_cv::type>::type; + +} // namespace internal + +// On Windows, by default, empty-base class optimization does not work, +// which means even if the base class is empty struct, it still consumes one +// byte for its body. __declspec(empty_bases) enables the optimization. 
+// cf) +// https://blogs.msdn.microsoft.com/vcblog/2016/03/30/optimizing-the-layout-of-empty-base-classes-in-vs2015-update-2-3/ +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) && \ + !PERFETTO_BUILDFLAG(PERFETTO_COMPILER_GCC) +#define OPTIONAL_DECLSPEC_EMPTY_BASES __declspec(empty_bases) +#else +#define OPTIONAL_DECLSPEC_EMPTY_BASES +#endif + +// base::Optional is a Chromium version of the C++17 optional class: +// std::optional documentation: +// http://en.cppreference.com/w/cpp/utility/optional +// Chromium documentation: +// https://chromium.googlesource.com/chromium/src/+/master/docs/optional.md +// +// These are the differences between the specification and the implementation: +// - Constructors do not use 'constexpr' as it is a C++14 extension. +// - 'constexpr' might be missing in some places for reasons specified locally. +// - No exceptions are thrown, because they are banned from Chromium. +// Marked noexcept for only move constructor and move assign operators. +// - All the non-members are in the 'base' namespace instead of 'std'. +// +// Note that T cannot have a constructor T(Optional<T>) etc. Optional<T> +// checks T's constructors (specifically via IsConvertibleFromOptional), and +// that check in turn asks whether T is constructible from Optional<T>, +// which is recursive so it does not work. As of Feb 2018, the std::optional +// C++17 implementations in both clang and gcc have the same limitation. MSVC +// SFINAE looks to have different behavior, but it reports an error anyway. +// +// This file is a modified version of optional.h from Chromium at revision +// 5e71bd454e60511c1293c0c686544aaa76094424. The changes remove C++14/C++17 +// specific code and replace it with C++11 counterparts. 
+template +class OPTIONAL_DECLSPEC_EMPTY_BASES Optional + : public internal::OptionalBase, + public internal::CopyConstructible::value>, + public internal::MoveConstructible::value>, + public internal::CopyAssignable::value && + std::is_copy_assignable::value>, + public internal::MoveAssignable::value && + std::is_move_assignable::value> { + public: +#undef OPTIONAL_DECLSPEC_EMPTY_BASES + using value_type = T; + + // Defer default/copy/move constructor implementation to OptionalBase. + constexpr Optional() = default; + constexpr Optional(const Optional& other) = default; + constexpr Optional(Optional&& other) noexcept( + std::is_nothrow_move_constructible::value) = default; + + constexpr Optional(nullopt_t) {} // NOLINT(runtime/explicit) + + // Converting copy constructor. "explicit" only if + // std::is_convertible::value is false. It is implemented by + // declaring two almost same constructors, but that condition in enable_if_t + // is different, so that either one is chosen, thanks to SFINAE. + template ::value && + !internal::IsConvertibleFromOptional::value && + std::is_convertible::value, + bool>::type = false> + Optional(const Optional& other) : internal::OptionalBase(other) {} + + template ::value && + !internal::IsConvertibleFromOptional::value && + !std::is_convertible::value, + bool>::type = false> + explicit Optional(const Optional& other) + : internal::OptionalBase(other) {} + + // Converting move constructor. Similar to converting copy constructor, + // declaring two (explicit and non-explicit) constructors. 
+ template ::value && + !internal::IsConvertibleFromOptional::value && + std::is_convertible::value, + bool>::type = false> + Optional(Optional&& other) : internal::OptionalBase(std::move(other)) {} + + template ::value && + !internal::IsConvertibleFromOptional::value && + !std::is_convertible::value, + bool>::type = false> + explicit Optional(Optional&& other) + : internal::OptionalBase(std::move(other)) {} + + template + constexpr explicit Optional(in_place_t, Args&&... args) + : internal::OptionalBase(in_place, std::forward(args)...) {} + + template &, + Args...>::value>::type> + constexpr explicit Optional(in_place_t, + std::initializer_list il, + Args&&... args) + : internal::OptionalBase(in_place, il, std::forward(args)...) {} + + // Forward value constructor. Similar to converting constructors, + // conditionally explicit. + template < + typename U = value_type, + typename std::enable_if< + std::is_constructible::value && + !std::is_same, in_place_t>::value && + !std::is_same, Optional>::value && + std::is_convertible::value, + bool>::type = false> + constexpr Optional(U&& value) + : internal::OptionalBase(in_place, std::forward(value)) {} + + template < + typename U = value_type, + typename std::enable_if< + std::is_constructible::value && + !std::is_same, in_place_t>::value && + !std::is_same, Optional>::value && + !std::is_convertible::value, + bool>::type = false> + constexpr explicit Optional(U&& value) + : internal::OptionalBase(in_place, std::forward(value)) {} + + ~Optional() = default; + + // Defer copy-/move- assign operator implementation to OptionalBase. + Optional& operator=(const Optional& other) = default; + Optional& operator=(Optional&& other) noexcept( + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_constructible::value) = default; + + Optional& operator=(nullopt_t) { + FreeIfNeeded(); + return *this; + } + + // Perfect-forwarded assignment. 
+ template + typename std::enable_if< + !std::is_same, Optional>::value && + std::is_constructible::value && + std::is_assignable::value && + (!std::is_scalar::value || + !std::is_same::type, T>::value), + Optional&>::type + operator=(U&& value) { + InitOrAssign(std::forward(value)); + return *this; + } + + // Copy assign the state of other. + template + typename std::enable_if::value && + std::is_constructible::value && + std::is_assignable::value, + Optional&>::type + operator=(const Optional& other) { + CopyAssign(other); + return *this; + } + + // Move assign the state of other. + template + typename std::enable_if::value && + std::is_constructible::value && + std::is_assignable::value, + Optional&>::type + operator=(Optional&& other) { + MoveAssign(std::move(other)); + return *this; + } + + const T* operator->() const { + PERFETTO_DCHECK(storage_.is_populated_); + return &storage_.value_; + } + + T* operator->() { + PERFETTO_DCHECK(storage_.is_populated_); + return &storage_.value_; + } + + const T& operator*() const& { + PERFETTO_DCHECK(storage_.is_populated_); + return storage_.value_; + } + + T& operator*() & { + PERFETTO_DCHECK(storage_.is_populated_); + return storage_.value_; + } + + const T&& operator*() const&& { + PERFETTO_DCHECK(storage_.is_populated_); + return std::move(storage_.value_); + } + + T&& operator*() && { + PERFETTO_DCHECK(storage_.is_populated_); + return std::move(storage_.value_); + } + + constexpr explicit operator bool() const { return storage_.is_populated_; } + + constexpr bool has_value() const { return storage_.is_populated_; } + + T& value() & { + PERFETTO_CHECK(storage_.is_populated_); + return storage_.value_; + } + + const T& value() const& { + PERFETTO_CHECK(storage_.is_populated_); + return storage_.value_; + } + + T&& value() && { + PERFETTO_CHECK(storage_.is_populated_); + return std::move(storage_.value_); + } + + const T&& value() const&& { + PERFETTO_CHECK(storage_.is_populated_); + return std::move(storage_.value_); 
+ } + + template + constexpr T value_or(U&& default_value) const& { + static_assert(std::is_convertible::value, + "U must be convertible to T"); + return storage_.is_populated_ + ? storage_.value_ + : static_cast(std::forward(default_value)); + } + + template + T value_or(U&& default_value) && { + static_assert(std::is_convertible::value, + "U must be convertible to T"); + return storage_.is_populated_ + ? std::move(storage_.value_) + : static_cast(std::forward(default_value)); + } + + void swap(Optional& other) { + if (!storage_.is_populated_ && !other.storage_.is_populated_) + return; + + if (storage_.is_populated_ != other.storage_.is_populated_) { + if (storage_.is_populated_) { + other.storage_.Init(std::move(storage_.value_)); + FreeIfNeeded(); + } else { + storage_.Init(std::move(other.storage_.value_)); + other.FreeIfNeeded(); + } + return; + } + + PERFETTO_DCHECK(storage_.is_populated_ && other.storage_.is_populated_); + using std::swap; + swap(**this, *other); + } + + void reset() { FreeIfNeeded(); } + + template + T& emplace(Args&&... args) { + FreeIfNeeded(); + storage_.Init(std::forward(args)...); + return storage_.value_; + } + + template + typename std::enable_if< + std::is_constructible&, Args&&...>::value, + T&>::type + emplace(std::initializer_list il, Args&&... args) { + FreeIfNeeded(); + storage_.Init(il, std::forward(args)...); + return storage_.value_; + } + + private: + // Accessing template base class's protected member needs explicit + // declaration to do so. + using internal::OptionalBase::CopyAssign; + using internal::OptionalBase::FreeIfNeeded; + using internal::OptionalBase::InitOrAssign; + using internal::OptionalBase::MoveAssign; + using internal::OptionalBase::storage_; +}; + +// Here after defines comparation operators. The definition follows +// http://en.cppreference.com/w/cpp/utility/optional/operator_cmp +// while bool() casting is replaced by has_value() to meet the chromium +// style guide. 
+template +bool operator==(const Optional& lhs, const Optional& rhs) { + if (lhs.has_value() != rhs.has_value()) + return false; + if (!lhs.has_value()) + return true; + return *lhs == *rhs; +} + +template +bool operator!=(const Optional& lhs, const Optional& rhs) { + if (lhs.has_value() != rhs.has_value()) + return true; + if (!lhs.has_value()) + return false; + return *lhs != *rhs; +} + +template +bool operator<(const Optional& lhs, const Optional& rhs) { + if (!rhs.has_value()) + return false; + if (!lhs.has_value()) + return true; + return *lhs < *rhs; +} + +template +bool operator<=(const Optional& lhs, const Optional& rhs) { + if (!lhs.has_value()) + return true; + if (!rhs.has_value()) + return false; + return *lhs <= *rhs; +} + +template +bool operator>(const Optional& lhs, const Optional& rhs) { + if (!lhs.has_value()) + return false; + if (!rhs.has_value()) + return true; + return *lhs > *rhs; +} + +template +bool operator>=(const Optional& lhs, const Optional& rhs) { + if (!rhs.has_value()) + return true; + if (!lhs.has_value()) + return false; + return *lhs >= *rhs; +} + +template +constexpr bool operator==(const Optional& opt, nullopt_t) { + return !opt; +} + +template +constexpr bool operator==(nullopt_t, const Optional& opt) { + return !opt; +} + +template +constexpr bool operator!=(const Optional& opt, nullopt_t) { + return opt.has_value(); +} + +template +constexpr bool operator!=(nullopt_t, const Optional& opt) { + return opt.has_value(); +} + +template +constexpr bool operator<(const Optional&, nullopt_t) { + return false; +} + +template +constexpr bool operator<(nullopt_t, const Optional& opt) { + return opt.has_value(); +} + +template +constexpr bool operator<=(const Optional& opt, nullopt_t) { + return !opt; +} + +template +constexpr bool operator<=(nullopt_t, const Optional&) { + return true; +} + +template +constexpr bool operator>(const Optional& opt, nullopt_t) { + return opt.has_value(); +} + +template +constexpr bool operator>(nullopt_t, 
const Optional&) { + return false; +} + +template +constexpr bool operator>=(const Optional&, nullopt_t) { + return true; +} + +template +constexpr bool operator>=(nullopt_t, const Optional& opt) { + return !opt; +} + +template +constexpr bool operator==(const Optional& opt, const U& value) { + return opt.has_value() ? *opt == value : false; +} + +template +constexpr bool operator==(const U& value, const Optional& opt) { + return opt.has_value() ? value == *opt : false; +} + +template +constexpr bool operator!=(const Optional& opt, const U& value) { + return opt.has_value() ? *opt != value : true; +} + +template +constexpr bool operator!=(const U& value, const Optional& opt) { + return opt.has_value() ? value != *opt : true; +} + +template +constexpr bool operator<(const Optional& opt, const U& value) { + return opt.has_value() ? *opt < value : true; +} + +template +constexpr bool operator<(const U& value, const Optional& opt) { + return opt.has_value() ? value < *opt : false; +} + +template +constexpr bool operator<=(const Optional& opt, const U& value) { + return opt.has_value() ? *opt <= value : true; +} + +template +constexpr bool operator<=(const U& value, const Optional& opt) { + return opt.has_value() ? value <= *opt : false; +} + +template +constexpr bool operator>(const Optional& opt, const U& value) { + return opt.has_value() ? *opt > value : false; +} + +template +constexpr bool operator>(const U& value, const Optional& opt) { + return opt.has_value() ? value > *opt : true; +} + +template +constexpr bool operator>=(const Optional& opt, const U& value) { + return opt.has_value() ? *opt >= value : false; +} + +template +constexpr bool operator>=(const U& value, const Optional& opt) { + return opt.has_value() ? value >= *opt : true; +} + +template +constexpr Optional::type> make_optional(T&& value) { + return Optional::type>(std::forward(value)); +} + +template +constexpr Optional make_optional(Args&&... 
args) { + return Optional(in_place, std::forward(args)...); +} + +template +constexpr Optional make_optional(std::initializer_list il, + Args&&... args) { + return Optional(in_place, il, std::forward(args)...); +} + +// Partial specialization for a function template is not allowed. Also, it is +// not allowed to add overload function to std namespace, while it is allowed +// to specialize the template in std. Thus, swap() (kind of) overloading is +// defined in base namespace, instead. +template +typename std::enable_if::value && + internal::IsSwappable::value>::type +swap(Optional& lhs, Optional& rhs) { + lhs.swap(rhs); +} + +} // namespace base +} // namespace perfetto + +template +struct std::hash> { + size_t operator()(const perfetto::base::Optional& opt) const { + return opt == perfetto::base::nullopt ? 0 : std::hash()(*opt); + } +}; + +#endif // INCLUDE_PERFETTO_EXT_BASE_OPTIONAL_H_ +// gen_amalgamated begin header: include/perfetto/ext/base/string_view.h +// gen_amalgamated begin header: include/perfetto/ext/base/hash.h +/* + * Copyright (C) 2019 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_HASH_H_ +#define INCLUDE_PERFETTO_EXT_BASE_HASH_H_ + +#include +#include +#include +#include + +namespace perfetto { +namespace base { + +// A helper class which computes a 64-bit hash of the input data. 
// The algorithm used is FNV-1a as it is fast and easy to implement and has
// relatively few collisions.
// WARNING: This hash function should not be used for any cryptographic purpose.
//
// The object is an accumulator: every Update*() call folds more bytes into the
// running state and digest() returns the state reached so far.
class Hash {
 public:
  // Creates an empty hash object
  Hash() {}

  // Hashes a numeric value by feeding the bytes of its object representation
  // (so the result depends on sizeof(T) and on the platform's byte order).
  template <
      typename T,
      typename std::enable_if<std::is_arithmetic<T>::value, bool>::type = true>
  void Update(T data) {
    Update(reinterpret_cast<const char*>(&data), sizeof(data));
  }

  // Hashes a null-terminated string; the terminator itself is not hashed.
  // Using the loop instead of "Update(str, strlen(str))" to avoid looping twice
  void Update(const char* str) {
    for (const auto* p = str; *p; ++p)
      Update(*p);
  }

  // Hashes a byte array.
  void Update(const char* data, size_t size) {
    for (size_t i = 0; i < size; i++) {
      // Go through uint8_t so that a negative char is not sign-extended into
      // the upper bits of the 64-bit state.
      result_ ^= static_cast<uint8_t>(data[i]);
      // Note: Arithmetic overflow of unsigned integers is well defined in C++
      // standard unlike signed integers.
      // https://stackoverflow.com/a/41280273
      result_ *= kFnv1a64Prime;
    }
  }

  // Returns the hash of everything fed so far.
  uint64_t digest() const { return result_; }

  // Usage:
  // uint64_t hashed_value = Hash::Combine(33, false, "ABC", 458L, 3u, 'x');
  template <typename... Ts>
  static uint64_t Combine(Ts&&... args) {
    Hash hasher;
    hasher.UpdateAll(std::forward<Ts>(args)...);
    return hasher.digest();
  }

  // `hasher.UpdateAll(33, false, "ABC")` is shorthand for:
  // `hasher.Update(33); hasher.Update(false); hasher.Update("ABC");`
  void UpdateAll() {}

  template <typename T, typename... Ts>
  void UpdateAll(T&& arg, Ts&&... args) {
    Update(arg);
    UpdateAll(std::forward<Ts>(args)...);
  }

 private:
  static constexpr uint64_t kFnv1a64OffsetBasis = 0xcbf29ce484222325;
  static constexpr uint64_t kFnv1a64Prime = 0x100000001b3;

  uint64_t result_ = kFnv1a64OffsetBasis;
};

// This is for using already-hashed key into std::unordered_map and avoid the
// cost of re-hashing. Example:
// unordered_map<uint64_t, Value, AlreadyHashed<uint64_t>> my_map.
+template +struct AlreadyHashed { + size_t operator()(const T& x) const { return static_cast(x); } +}; + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_HASH_H_ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_STRING_VIEW_H_ +#define INCLUDE_PERFETTO_EXT_BASE_STRING_VIEW_H_ + +#include + +#include +#include + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" +// gen_amalgamated expanded: #include "perfetto/base/logging.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/hash.h" + +namespace perfetto { +namespace base { + +// A string-like object that refers to a non-owned piece of memory. +// Strings are internally NOT null terminated. +class StringView { + public: + static constexpr size_t npos = static_cast(-1); + + StringView() : data_(nullptr), size_(0) {} + StringView(const StringView&) = default; + StringView& operator=(const StringView&) = default; + StringView(const char* data, size_t size) : data_(data), size_(size) { + PERFETTO_DCHECK(size == 0 || data != nullptr); + } + + // Allow implicit conversion from any class that has a |data| and |size| field + // and has the kConvertibleToStringView trait (e.g., protozero::ConstChars). 
+ template > + StringView(const T& x) : StringView(x.data, x.size) { + PERFETTO_DCHECK(x.size == 0 || x.data != nullptr); + } + + // Creates a StringView from a null-terminated C string. + // Deliberately not "explicit". + StringView(const char* cstr) : data_(cstr), size_(strlen(cstr)) { + PERFETTO_DCHECK(cstr != nullptr); + } + + // This instead has to be explicit, as creating a StringView out of a + // std::string can be subtle. + explicit StringView(const std::string& str) + : data_(str.data()), size_(str.size()) {} + + bool empty() const { return size_ == 0; } + size_t size() const { return size_; } + const char* data() const { return data_; } + const char* begin() const { return data_; } + const char* end() const { return data_ + size_; } + + char at(size_t pos) const { + PERFETTO_DCHECK(pos < size_); + return data_[pos]; + } + + size_t find(char c, size_t start_pos = 0) const { + for (size_t i = start_pos; i < size_; ++i) { + if (data_[i] == c) + return i; + } + return npos; + } + + size_t find(const StringView& str, size_t start_pos = 0) const { + if (start_pos > size()) + return npos; + auto it = std::search(begin() + start_pos, end(), str.begin(), str.end()); + size_t pos = static_cast(it - begin()); + return pos + str.size() <= size() ? 
pos : npos; + } + + size_t find(const char* str, size_t start_pos = 0) const { + return find(StringView(str), start_pos); + } + + size_t rfind(char c) const { + for (size_t i = size_; i > 0; --i) { + if (data_[i - 1] == c) + return i - 1; + } + return npos; + } + + StringView substr(size_t pos, size_t count = npos) const { + if (pos >= size_) + return StringView("", 0); + size_t rcount = std::min(count, size_ - pos); + return StringView(data_ + pos, rcount); + } + + bool CaseInsensitiveEq(const StringView& other) const { + if (size() != other.size()) + return false; + if (size() == 0) + return true; +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + return _strnicmp(data(), other.data(), size()) == 0; +#else + return strncasecmp(data(), other.data(), size()) == 0; +#endif + } + + bool StartsWith(const StringView& other) { + if (other.size() == 0) + return true; + if (size() == 0) + return false; + if (other.size() > size()) + return false; + for (uint32_t i = 0; i < other.size(); ++i) { + if (at(i) != other.at(i)) + return false; + } + return true; + } + + std::string ToStdString() const { + return size_ == 0 ? 
"" : std::string(data_, size_); + } + + uint64_t Hash() const { + base::Hash hasher; + hasher.Update(data_, size_); + return hasher.digest(); + } + + private: + const char* data_ = nullptr; + size_t size_ = 0; +}; + +inline bool operator==(const StringView& x, const StringView& y) { + if (x.size() != y.size()) + return false; + if (x.size() == 0) + return true; + return memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringView& x, const StringView& y) { + return !(x == y); +} + +inline bool operator<(const StringView& x, const StringView& y) { + auto size = std::min(x.size(), y.size()); + if (size == 0) + return x.size() < y.size(); + int result = memcmp(x.data(), y.data(), size); + return result < 0 || (result == 0 && x.size() < y.size()); +} + +inline bool operator>=(const StringView& x, const StringView& y) { + return !(x < y); +} + +inline bool operator>(const StringView& x, const StringView& y) { + return y < x; +} + +inline bool operator<=(const StringView& x, const StringView& y) { + return !(y < x); +} + +} // namespace base +} // namespace perfetto + +template <> +struct std::hash<::perfetto::base::StringView> { + size_t operator()(const ::perfetto::base::StringView& sv) const { + return static_cast(sv.Hash()); + } +}; + +#endif // INCLUDE_PERFETTO_EXT_BASE_STRING_VIEW_H_ +// gen_amalgamated begin header: include/perfetto/ext/base/utils.h +// gen_amalgamated begin header: include/perfetto/ext/base/sys_types.h +/* + * Copyright (C) 2022 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_SYS_TYPES_H_ +#define INCLUDE_PERFETTO_EXT_BASE_SYS_TYPES_H_ + +// This headers deals with sys types commonly used in the codebase that are +// missing on Windows. + +#include + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + +#if !PERFETTO_BUILDFLAG(PERFETTO_COMPILER_GCC) +// MinGW has these. clang-cl and MSVC, which use just the Windows SDK, don't. +using uid_t = unsigned int; +using pid_t = int; +#endif // !GCC + +#if defined(_WIN64) +using ssize_t = int64_t; +#else +using ssize_t = long; +#endif // _WIN64 + +#endif // OS_WIN + +namespace perfetto { +namespace base { + +constexpr uid_t kInvalidUid = static_cast(-1); +constexpr pid_t kInvalidPid = static_cast(-1); + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_SYS_TYPES_H_ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_UTILS_H_ +#define INCLUDE_PERFETTO_EXT_BASE_UTILS_H_ + +#include +#include +#include +#include + +#include +#include +#include +#include + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" +// gen_amalgamated expanded: #include "perfetto/base/compiler.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/sys_types.h" + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +// Even if Windows has errno.h, the all syscall-restart behavior does not apply. +// Trying to handle EINTR can cause more harm than good if errno is left stale. +// Chromium does the same. +#define PERFETTO_EINTR(x) (x) +#else +#define PERFETTO_EINTR(x) \ + ([&] { \ + decltype(x) eintr_wrapper_result; \ + do { \ + eintr_wrapper_result = (x); \ + } while (eintr_wrapper_result == -1 && errno == EINTR); \ + return eintr_wrapper_result; \ + }()) +#endif + +namespace perfetto { +namespace base { + +// Do not add new usages of kPageSize, consider using GetSysPageSize() below. +// TODO(primiano): over time the semantic of kPageSize became too ambiguous. +// Strictly speaking, this constant is incorrect on some new devices where the +// page size can be 16K (e.g., crbug.com/1116576). Unfortunately too much code +// ended up depending on kPageSize for purposes that are not strictly related +// with the kernel's mm subsystem. +constexpr size_t kPageSize = 4096; + +// Returns the system's page size. Use this when dealing with mmap, madvise and +// similar mm-related syscalls. +uint32_t GetSysPageSize(); + +template +constexpr size_t ArraySize(const T& array) { + return sizeof(array) / sizeof(array[0]); +} + +// Function object which invokes 'free' on its parameter, which must be +// a pointer. 
Can be used to store malloc-allocated pointers in std::unique_ptr: +// +// std::unique_ptr foo_ptr( +// static_cast(malloc(sizeof(int)))); +struct FreeDeleter { + inline void operator()(void* ptr) const { free(ptr); } +}; + +template +constexpr T AssumeLittleEndian(T value) { +#if !PERFETTO_IS_LITTLE_ENDIAN() + static_assert(false, "Unimplemented on big-endian archs"); +#endif + return value; +} + +// Round up |size| to a multiple of |alignment| (must be a power of two). +template +constexpr size_t AlignUp(size_t size) { + static_assert((alignment & (alignment - 1)) == 0, "alignment must be a pow2"); + return (size + alignment - 1) & ~(alignment - 1); +} + +inline bool IsAgain(int err) { + return err == EAGAIN || err == EWOULDBLOCK; +} + +// setenv(2)-equivalent. Deals with Windows vs Posix discrepancies. +void SetEnv(const std::string& key, const std::string& value); + +// Calls mallopt(M_PURGE, 0) on Android. Does nothing on other platforms. +// This forces the allocator to release freed memory. This is used to work +// around various Scudo inefficiencies. See b/170217718. +void MaybeReleaseAllocatorMemToOS(); + +// geteuid() on POSIX OSes, returns 0 on Windows (See comment in utils.cc). +uid_t GetCurrentUserId(); + +// Forks the process. +// Parent: prints the PID of the child, calls |parent_cb| and exits from the +// process with its return value. +// Child: redirects stdio onto /dev/null, chdirs into / and returns. +void Daemonize(std::function parent_cb); + +// Returns the path of the current executable, e.g. /foo/bar/exe. +std::string GetCurExecutablePath(); + +// Returns the directory where the current executable lives in, e.g. /foo/bar. +// This is independent of cwd(). +std::string GetCurExecutableDir(); + +// Memory returned by AlignedAlloc() must be freed via AlignedFree() not just +// free. It makes a difference on Windows where _aligned_malloc() and +// _aligned_free() must be paired. 
+// Prefer using the AlignedAllocTyped() below which takes care of the pairing. +void* AlignedAlloc(size_t alignment, size_t size); +void AlignedFree(void*); + +// A RAII version of the above, which takes care of pairing Aligned{Alloc,Free}. +template +struct AlignedDeleter { + inline void operator()(T* ptr) const { AlignedFree(ptr); } +}; + +// The remove_extent here and below is to allow defining unique_ptr. +// As per https://en.cppreference.com/w/cpp/memory/unique_ptr the Deleter takes +// always a T*, not a T[]*. +template +using AlignedUniquePtr = + std::unique_ptr::type>>; + +template +AlignedUniquePtr AlignedAllocTyped(size_t n_membs) { + using TU = typename std::remove_extent::type; + return AlignedUniquePtr( + static_cast(AlignedAlloc(alignof(TU), sizeof(TU) * n_membs))); +} + +// A RAII wrapper to invoke a function when leaving a function/scope. +template +class OnScopeExitWrapper { + public: + explicit OnScopeExitWrapper(Func f) : f_(std::move(f)), active_(true) {} + OnScopeExitWrapper(OnScopeExitWrapper&& other) noexcept + : f_(std::move(other.f_)), active_(other.active_) { + other.active_ = false; + } + ~OnScopeExitWrapper() { + if (active_) + f_(); + } + + private: + Func f_; + bool active_; +}; + +template +PERFETTO_WARN_UNUSED_RESULT OnScopeExitWrapper OnScopeExit(Func f) { + return OnScopeExitWrapper(std::move(f)); +} + +// Returns a xxd-style hex dump (hex + ascii chars) of the input data. +std::string HexDump(const void* data, size_t len, size_t bytes_per_line = 16); +inline std::string HexDump(const std::string& data, + size_t bytes_per_line = 16) { + return HexDump(data.data(), data.size(), bytes_per_line); +} + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_UTILS_H_ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_BASE64_H_ +#define INCLUDE_PERFETTO_EXT_BASE_BASE64_H_ + +#include + +// gen_amalgamated expanded: #include "perfetto/ext/base/optional.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/string_view.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/utils.h" // For ssize_t. + +namespace perfetto { +namespace base { + +// Returns the length of the destination string (included '=' padding). +// Does NOT include the size of the string null terminator. +inline size_t Base64EncSize(size_t src_size) { + return (src_size + 2) / 3 * 4; +} + +// Returns the upper bound on the length of the destination buffer. +// The actual decoded length might be <= the number returned here. +inline size_t Base64DecSize(size_t src_size) { + return (src_size + 3) / 4 * 3; +} + +// Does NOT null-terminate |dst|. +ssize_t Base64Encode(const void* src, + size_t src_size, + char* dst, + size_t dst_size); + +std::string Base64Encode(const void* src, size_t src_size); + +inline std::string Base64Encode(StringView sv) { + return Base64Encode(sv.data(), sv.size()); +} + +// Returns -1 in case of failure. 
+ssize_t Base64Decode(const char* src, + size_t src_size, + uint8_t* dst, + size_t dst_size); + +Optional Base64Decode(const char* src, size_t src_size); + +inline Optional Base64Decode(StringView sv) { + return Base64Decode(sv.data(), sv.size()); +} + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_BASE64_H_ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// gen_amalgamated expanded: #include "perfetto/ext/base/base64.h" + +namespace perfetto { +namespace base { + +namespace { + +constexpr char kPadding = '='; + +constexpr char kEncTable[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static_assert(sizeof(kEncTable) == (1u << 6) + sizeof('\0'), "Bad table size"); + +// Maps an ASCII character to its 6-bit value. It only contains translations +// from '+' to 'z'. Supports the standard (+/) and URL-safe (-_) alphabets. 
+constexpr uint8_t kX = 0xff; // Value used for invalid characters +constexpr uint8_t kDecTable[] = { + 62, kX, 62, kX, 63, 52, 53, 54, 55, 56, // 00 - 09 + 57, 58, 59, 60, 61, kX, kX, kX, 0, kX, // 10 - 19 + kX, kX, 0, 1, 2, 3, 4, 5, 6, 7, // 20 - 29 + 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, // 30 - 39 + 18, 19, 20, 21, 22, 23, 24, 25, kX, kX, // 40 - 49 + kX, kX, 63, kX, 26, 27, 28, 29, 30, 31, // 50 - 59 + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, // 60 - 69 + 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, // 70 - 79 +}; +constexpr char kMinDecChar = '+'; +constexpr char kMaxDecChar = 'z'; +static_assert(kMaxDecChar - kMinDecChar <= sizeof(kDecTable), "Bad table size"); + +inline uint8_t DecodeChar(char c) { + if (c < kMinDecChar || c > kMaxDecChar) + return kX; + return kDecTable[c - kMinDecChar]; +} + +} // namespace + +ssize_t Base64Encode(const void* src, + size_t src_size, + char* dst, + size_t dst_size) { + const size_t padded_dst_size = Base64EncSize(src_size); + if (dst_size < padded_dst_size) + return -1; // Not enough space in output. 
+ + const uint8_t* rd = static_cast(src); + const uint8_t* const end = rd + src_size; + size_t wr_size = 0; + while (rd < end) { + uint8_t s[3]{}; + s[0] = *(rd++); + dst[wr_size++] = kEncTable[s[0] >> 2]; + + uint8_t carry0 = static_cast((s[0] & 0x03) << 4); + if (PERFETTO_LIKELY(rd < end)) { + s[1] = *(rd++); + dst[wr_size++] = kEncTable[carry0 | (s[1] >> 4)]; + } else { + dst[wr_size++] = kEncTable[carry0]; + dst[wr_size++] = kPadding; + dst[wr_size++] = kPadding; + break; + } + + uint8_t carry1 = static_cast((s[1] & 0x0f) << 2); + if (PERFETTO_LIKELY(rd < end)) { + s[2] = *(rd++); + dst[wr_size++] = kEncTable[carry1 | (s[2] >> 6)]; + } else { + dst[wr_size++] = kEncTable[carry1]; + dst[wr_size++] = kPadding; + break; + } + + dst[wr_size++] = kEncTable[s[2] & 0x3f]; + } + PERFETTO_DCHECK(wr_size == padded_dst_size); + return static_cast(padded_dst_size); +} + +std::string Base64Encode(const void* src, size_t src_size) { + std::string dst; + dst.resize(Base64EncSize(src_size)); + auto res = Base64Encode(src, src_size, &dst[0], dst.size()); + PERFETTO_CHECK(res == static_cast(dst.size())); + return dst; +} + +ssize_t Base64Decode(const char* src, + size_t src_size, + uint8_t* dst, + size_t dst_size) { + const size_t min_dst_size = Base64DecSize(src_size); + if (dst_size < min_dst_size) + return -1; + + const char* rd = src; + const char* const end = src + src_size; + size_t wr_size = 0; + + char s[4]{}; + while (rd < end) { + uint8_t d[4]; + for (uint32_t j = 0; j < 4; j++) { + // Padding is only feasible for the last 2 chars of each group of 4. + s[j] = rd < end ? *(rd++) : (j < 2 ? '\0' : kPadding); + d[j] = DecodeChar(s[j]); + if (d[j] == kX) + return -1; // Invalid input char. + } + dst[wr_size] = static_cast((d[0] << 2) | (d[1] >> 4)); + dst[wr_size + 1] = static_cast((d[1] << 4) | (d[2] >> 2)); + dst[wr_size + 2] = static_cast((d[2] << 6) | (d[3])); + wr_size += 3; + } + + PERFETTO_CHECK(wr_size <= dst_size); + wr_size -= (s[3] == kPadding ? 
1 : 0) + (s[2] == kPadding ? 1 : 0); + return static_cast(wr_size); +} + +Optional Base64Decode(const char* src, size_t src_size) { + std::string dst; + dst.resize(Base64DecSize(src_size)); + auto res = Base64Decode(src, src_size, reinterpret_cast(&dst[0]), + dst.size()); + if (res < 0) + return nullopt; // Decoding error. + + PERFETTO_CHECK(res <= static_cast(dst.size())); + dst.resize(static_cast(res)); + return base::make_optional(dst); +} + +} // namespace base +} // namespace perfetto +// gen_amalgamated begin source: src/base/crash_keys.cc +// gen_amalgamated begin header: include/perfetto/ext/base/crash_keys.h +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_CRASH_KEYS_H_ +#define INCLUDE_PERFETTO_EXT_BASE_CRASH_KEYS_H_ + +#include +#include + +#include +#include + +// gen_amalgamated expanded: #include "perfetto/base/compiler.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/string_view.h" + +// Crash keys are very simple global variables with static-storage that +// are reported on crash time for managed crashes (CHECK/FATAL/Watchdog). +// - Translation units can define a CrashKey and register it at some point +// during initialization. +// - CrashKey instances must be long-lived. They should really be just global +// static variable in the anonymous namespace. 
+// Example: +// subsystem_1.cc +// CrashKey g_client_id("ipc_client_id"); +// ... +// OnIpcReceived(client_id) { +// g_client_id.Set(client_id); +// ... // Process the IPC +// g_client_id.Clear(); +// } +// Or equivalently: +// OnIpcReceived(client_id) { +// auto scoped_key = g_client_id.SetScoped(client_id); +// ... // Process the IPC +// } +// +// If a crash happens while processing the IPC, the crash report will +// have a line "ipc_client_id: 42". +// +// Thread safety considerations: +// CrashKeys can be registered and set/cleared from any thread. +// There is no compelling use-case to have full acquire/release consistency when +// setting a key. This means that if a thread crashes immediately after a +// crash key has been set on another thread, the value printed on the crash +// report could be incomplete. The code guarantees defined behavior and does +// not rely on null-terminated string (in the worst case 32 bytes of random +// garbage will be printed out). + +// The tests live in logging_unittest.cc. + +namespace perfetto { +namespace base { + +constexpr size_t kCrashKeyMaxStrSize = 32; + +// CrashKey instances must be long lived +class CrashKey { + public: + class ScopedClear { + public: + explicit ScopedClear(CrashKey* k) : key_(k) {} + ~ScopedClear() { + if (key_) + key_->Clear(); + } + ScopedClear(const ScopedClear&) = delete; + ScopedClear& operator=(const ScopedClear&) = delete; + ScopedClear& operator=(ScopedClear&&) = delete; + ScopedClear(ScopedClear&& other) noexcept : key_(other.key_) { + other.key_ = nullptr; + } + + private: + CrashKey* key_; + }; + + // constexpr so it can be used in the anon namespace without requiring a + // global constructor. + // |name| must be a long-lived string. 
+ constexpr explicit CrashKey(const char* name) + : registered_{}, type_(Type::kUnset), name_(name), str_value_{} {} + CrashKey(const CrashKey&) = delete; + CrashKey& operator=(const CrashKey&) = delete; + CrashKey(CrashKey&&) = delete; + CrashKey& operator=(CrashKey&&) = delete; + + enum class Type : uint8_t { kUnset = 0, kInt, kStr }; + + void Clear() { + int_value_.store(0, std::memory_order_relaxed); + type_.store(Type::kUnset, std::memory_order_relaxed); + } + + void Set(int64_t value) { + int_value_.store(value, std::memory_order_relaxed); + type_.store(Type::kInt, std::memory_order_relaxed); + if (PERFETTO_UNLIKELY(!registered_.load(std::memory_order_relaxed))) + Register(); + } + + void Set(StringView sv) { + size_t len = std::min(sv.size(), sizeof(str_value_) - 1); + for (size_t i = 0; i < len; ++i) + str_value_[i].store(sv.data()[i], std::memory_order_relaxed); + str_value_[len].store('\0', std::memory_order_relaxed); + type_.store(Type::kStr, std::memory_order_relaxed); + if (PERFETTO_UNLIKELY(!registered_.load(std::memory_order_relaxed))) + Register(); + } + + ScopedClear SetScoped(int64_t value) PERFETTO_WARN_UNUSED_RESULT { + Set(value); + return ScopedClear(this); + } + + ScopedClear SetScoped(StringView sv) PERFETTO_WARN_UNUSED_RESULT { + Set(sv); + return ScopedClear(this); + } + + void Register(); + + int64_t int_value() const { + return int_value_.load(std::memory_order_relaxed); + } + size_t ToString(char* dst, size_t len); + + private: + std::atomic registered_; + std::atomic type_; + const char* const name_; + union { + std::atomic str_value_[kCrashKeyMaxStrSize]; + std::atomic int_value_; + }; +}; + +// Fills |dst| with a string containing one line for each crash key +// (excluding the unset ones). +// Returns number of chars written, without counting the NUL terminator. +// This is used in logging.cc when emitting the crash report abort message. 
+size_t SerializeCrashKeys(char* dst, size_t len); + +void UnregisterAllCrashKeysForTesting(); + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_CRASH_KEYS_H_ +// gen_amalgamated begin header: include/perfetto/ext/base/string_utils.h +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_STRING_UTILS_H_ +#define INCLUDE_PERFETTO_EXT_BASE_STRING_UTILS_H_ + +#include +#include +#include + +#include +#include +#include + +// gen_amalgamated expanded: #include "perfetto/ext/base/optional.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/string_view.h" + +namespace perfetto { +namespace base { + +inline char Lowercase(char c) { + return ('A' <= c && c <= 'Z') ? static_cast(c - ('A' - 'a')) : c; +} + +inline char Uppercase(char c) { + return ('a' <= c && c <= 'z') ? static_cast(c + ('A' - 'a')) : c; +} + +inline Optional CStringToUInt32(const char* s, int base = 10) { + char* endptr = nullptr; + auto value = static_cast(strtoul(s, &endptr, base)); + return (*s && !*endptr) ? base::make_optional(value) : base::nullopt; +} + +inline Optional CStringToInt32(const char* s, int base = 10) { + char* endptr = nullptr; + auto value = static_cast(strtol(s, &endptr, base)); + return (*s && !*endptr) ? base::make_optional(value) : base::nullopt; +} + +// Note: it saturates to 7fffffffffffffff if parsing a hex number >= 0x8000... 
+inline Optional CStringToInt64(const char* s, int base = 10) { + char* endptr = nullptr; + auto value = static_cast(strtoll(s, &endptr, base)); + return (*s && !*endptr) ? base::make_optional(value) : base::nullopt; +} + +inline Optional CStringToUInt64(const char* s, int base = 10) { + char* endptr = nullptr; + auto value = static_cast(strtoull(s, &endptr, base)); + return (*s && !*endptr) ? base::make_optional(value) : base::nullopt; +} + +double StrToD(const char* nptr, char** endptr); + +inline Optional CStringToDouble(const char* s) { + char* endptr = nullptr; + double value = StrToD(s, &endptr); + Optional result(base::nullopt); + if (*s != '\0' && *endptr == '\0') + result = value; + return result; +} + +inline Optional StringToUInt32(const std::string& s, int base = 10) { + return CStringToUInt32(s.c_str(), base); +} + +inline Optional StringToInt32(const std::string& s, int base = 10) { + return CStringToInt32(s.c_str(), base); +} + +inline Optional StringToUInt64(const std::string& s, int base = 10) { + return CStringToUInt64(s.c_str(), base); +} + +inline Optional StringToInt64(const std::string& s, int base = 10) { + return CStringToInt64(s.c_str(), base); +} + +inline Optional StringToDouble(const std::string& s) { + return CStringToDouble(s.c_str()); +} + +bool StartsWith(const std::string& str, const std::string& prefix); +bool EndsWith(const std::string& str, const std::string& suffix); +bool StartsWithAny(const std::string& str, + const std::vector& prefixes); +bool Contains(const std::string& haystack, const std::string& needle); +bool Contains(const std::string& haystack, char needle); +size_t Find(const StringView& needle, const StringView& haystack); +bool CaseInsensitiveEqual(const std::string& first, const std::string& second); +std::string Join(const std::vector& parts, + const std::string& delim); +std::vector SplitString(const std::string& text, + const std::string& delimiter); +std::string StripPrefix(const std::string& str, const 
std::string& prefix); +std::string StripSuffix(const std::string& str, const std::string& suffix); +std::string ToLower(const std::string& str); +std::string ToUpper(const std::string& str); +std::string StripChars(const std::string& str, + const std::string& chars, + char replacement); +std::string ToHex(const char* data, size_t size); +inline std::string ToHex(const std::string& s) { + return ToHex(s.c_str(), s.size()); +} +std::string IntToHexString(uint32_t number); +std::string Uint64ToHexString(uint64_t number); +std::string Uint64ToHexStringNoPrefix(uint64_t number); +std::string ReplaceAll(std::string str, + const std::string& to_replace, + const std::string& replacement); + +// A BSD-style strlcpy without the return value. +// Copies at most |dst_size|-1 characters. Unlike strncpy, it always \0 +// terminates |dst|, as long as |dst_size| is not 0. +// Unlike strncpy and like strlcpy it does not zero-pad the rest of |dst|. +// Returns nothing. The BSD strlcpy returns the size of |src|, which might +// be > |dst_size|. Anecdotal experience suggests people assume the return value +// is the number of bytes written in |dst|. That assumption can lead to +// dangerous bugs. +// In order to avoid being subtly uncompliant with strlcpy AND avoid misuse, +// the choice here is to return nothing. +inline void StringCopy(char* dst, const char* src, size_t dst_size) { + for (size_t i = 0; i < dst_size; ++i) { + if ((dst[i] = src[i]) == '\0') { + return; // We hit and copied the null terminator. + } + } + + // We were left off at dst_size. We over copied 1 byte. Null terminate. + if (PERFETTO_LIKELY(dst_size > 0)) + dst[dst_size - 1] = 0; +} + +// Like snprintf() but returns the number of chars *actually* written (without +// counting the null terminator) NOT "the number of chars which would have been +// written to the final string if enough space had been available". +// This should be used in almost all cases when the caller uses the return value +// of snprintf(). 
If the return value is not used, there is no benefit in using +// this wrapper, as this just calls snprintf() and mangles the return value. +// It always null-terminates |dst| (even in case of errors), unless +// |dst_size| == 0. +// Examples: +// SprintfTrunc(x, 4, "123whatever"): returns 3 and writes "123\0". +// SprintfTrunc(x, 4, "123"): returns 3 and writes "123\0". +// SprintfTrunc(x, 3, "123"): returns 2 and writes "12\0". +// SprintfTrunc(x, 2, "123"): returns 1 and writes "1\0". +// SprintfTrunc(x, 1, "123"): returns 0 and writes "\0". +// SprintfTrunc(x, 0, "123"): returns 0 and writes nothing. +// NOTE: This means that the caller has no way to tell when truncation happens +// vs the edge case of *just* fitting in the buffer. +size_t SprintfTrunc(char* dst, size_t dst_size, const char* fmt, ...) + PERFETTO_PRINTF_FORMAT(3, 4); + +// A helper class to facilitate construction and usage of write-once stack +// strings. +// Example usage: +// StackString<32> x("format %d %s", 42, string_arg); +// TakeString(x.c_str() | x.string_view() | x.ToStdString()); +// Rather than char x[32] + sprintf. +// Advantages: +// - Avoids useless zero-fills caused by people doing `char buf[32] {}` (mainly +// by fearing unknown snprintf failure modes). +// - Makes the code more robust in case of snprintf truncations (len() and +// string_view() will return the truncated length, unlike snprintf). +template +class StackString { + public: + explicit PERFETTO_PRINTF_FORMAT(/* 1=this */ 2, 3) + StackString(const char* fmt, ...) { + buf_[0] = '\0'; + va_list args; + va_start(args, fmt); + int res = vsnprintf(buf_, sizeof(buf_), fmt, args); + va_end(args); + buf_[sizeof(buf_) - 1] = '\0'; + len_ = res < 0 ? 
0 : std::min(static_cast(res), sizeof(buf_) - 1); + } + + StringView string_view() const { return StringView(buf_, len_); } + std::string ToStdString() const { return std::string(buf_, len_); } + const char* c_str() const { return buf_; } + size_t len() const { return len_; } + + private: + char buf_[N]; + size_t len_ = 0; // Does not include the \0. +}; + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_STRING_UTILS_H_ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// gen_amalgamated expanded: #include "perfetto/ext/base/crash_keys.h" + +#include + +#include +#include + +// gen_amalgamated expanded: #include "perfetto/ext/base/string_utils.h" + +namespace perfetto { +namespace base { + +namespace { + +constexpr size_t kMaxKeys = 32; + +std::atomic g_keys[kMaxKeys]{}; +std::atomic g_num_keys{}; +} // namespace + +void CrashKey::Register() { + // If doesn't matter if we fail below. If there are no slots left, don't + // keep trying re-registering on every Set(), the outcome won't change. + + // If two threads raced on the Register(), avoid registering the key twice. + if (registered_.exchange(true)) + return; + + uint32_t slot = g_num_keys.fetch_add(1); + if (slot >= kMaxKeys) { + PERFETTO_LOG("Too many crash keys registered"); + return; + } + g_keys[slot].store(this); +} + +// Returns the number of chars written, without counting the \0. 
+size_t CrashKey::ToString(char* dst, size_t len) { + if (len > 0) + *dst = '\0'; + switch (type_.load(std::memory_order_relaxed)) { + case Type::kUnset: + break; + case Type::kInt: + return SprintfTrunc(dst, len, "%s: %" PRId64 "\n", name_, + int_value_.load(std::memory_order_relaxed)); + case Type::kStr: + char buf[sizeof(str_value_)]; + for (size_t i = 0; i < sizeof(str_value_); i++) + buf[i] = str_value_[i].load(std::memory_order_relaxed); + + // Don't assume |str_value_| is properly null-terminated. + return SprintfTrunc(dst, len, "%s: %.*s\n", name_, int(sizeof(buf)), buf); + } + return 0; +} + +void UnregisterAllCrashKeysForTesting() { + g_num_keys.store(0); + for (auto& key : g_keys) + key.store(nullptr); +} + +size_t SerializeCrashKeys(char* dst, size_t len) { + size_t written = 0; + uint32_t num_keys = g_num_keys.load(); + if (len > 0) + *dst = '\0'; + for (uint32_t i = 0; i < num_keys && written < len; i++) { + CrashKey* key = g_keys[i].load(); + if (!key) + continue; // Can happen if we hit this between the add and the store. + written += key->ToString(dst + written, len - written); + } + PERFETTO_DCHECK(written <= len); + PERFETTO_DCHECK(len == 0 || dst[written] == '\0'); + return written; +} + +} // namespace base +} // namespace perfetto +// gen_amalgamated begin source: src/base/ctrl_c_handler.cc +// gen_amalgamated begin header: include/perfetto/ext/base/ctrl_c_handler.h +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_CTRL_C_HANDLER_H_ +#define INCLUDE_PERFETTO_EXT_BASE_CTRL_C_HANDLER_H_ + +namespace perfetto { +namespace base { + +// On Linux/Android/Mac: installs SIGINT + SIGTERM signal handlers. +// On Windows: installs a SetConsoleCtrlHandler() handler. +// The passed handler must be async safe. +using CtrlCHandlerFunction = void (*)(); +void InstallCtrCHandler(CtrlCHandlerFunction); + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_CTRL_C_HANDLER_H_ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// gen_amalgamated expanded: #include "perfetto/ext/base/ctrl_c_handler.h" + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" +// gen_amalgamated expanded: #include "perfetto/base/compiler.h" +// gen_amalgamated expanded: #include "perfetto/base/logging.h" + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +#include +#include +#else +#include +#include +#endif + +namespace perfetto { +namespace base { + +namespace { +CtrlCHandlerFunction g_handler = nullptr; +} + +void InstallCtrCHandler(CtrlCHandlerFunction handler) { + PERFETTO_CHECK(g_handler == nullptr); + g_handler = handler; + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + auto trampoline = [](DWORD type) -> int { + if (type == CTRL_C_EVENT) { + g_handler(); + return true; + } + return false; + }; + ::SetConsoleCtrlHandler(trampoline, true); +#elif PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \ + PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) || \ + PERFETTO_BUILDFLAG(PERFETTO_OS_APPLE) + // Setup signal handler. + struct sigaction sa {}; + +// Glibc headers for sa_sigaction trigger this. +#pragma GCC diagnostic push +#if defined(__clang__) +#pragma GCC diagnostic ignored "-Wdisabled-macro-expansion" +#endif + sa.sa_handler = [](int) { g_handler(); }; + sa.sa_flags = static_cast(SA_RESETHAND | SA_RESTART); +#pragma GCC diagnostic pop + sigaction(SIGINT, &sa, nullptr); + sigaction(SIGTERM, &sa, nullptr); +#else + // Do nothing on NaCL and Fuchsia. + ignore_result(handler); +#endif +} + +} // namespace base +} // namespace perfetto +// gen_amalgamated begin source: src/base/event_fd.cc +// gen_amalgamated begin header: include/perfetto/ext/base/event_fd.h +// gen_amalgamated begin header: include/perfetto/base/platform_handle.h +/* + * Copyright (C) 2020 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_BASE_PLATFORM_HANDLE_H_ +#define INCLUDE_PERFETTO_BASE_PLATFORM_HANDLE_H_ + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" + +namespace perfetto { +namespace base { + +// PlatformHandle should be used only for types that are HANDLE(s) in Windows. +// It should NOT be used to blanket-replace "int fd" in the codebase. +// Windows has two types of "handles", which, in UNIX-land, both map to int: +// 1. File handles returned by the posix-compatibility API like _open(). +// These are just int(s) and should stay such, because all the posix-like API +// in Windows.h take an int, not a HANDLE. +// 2. Handles returned by old-school WINAPI like CreateFile, CreateEvent etc. +// These are proper HANDLE(s). PlatformHandle should be used here. +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +// Windows.h typedefs HANDLE to void*. We use void* here to avoid leaking +// Windows.h through our headers. +using PlatformHandle = void*; + +// On Windows both nullptr and 0xffff... (INVALID_HANDLE_VALUE) are invalid. +struct PlatformHandleChecker { + static inline bool IsValid(PlatformHandle h) { + return h && h != reinterpret_cast(-1); + } +}; +#else +using PlatformHandle = int; +struct PlatformHandleChecker { + static inline bool IsValid(PlatformHandle h) { return h >= 0; } +}; +#endif + +// The definition of this lives in base/file_utils.cc (to avoid creating an +// extra build edge for a one liner). This is really an alias for close() (UNIX) +// CloseHandle() (Windows). 
THe indirection layer is just to avoid leaking +// system headers like Windows.h through perfetto headers. +// Thre return value is always UNIX-style: 0 on success, -1 on failure. +int ClosePlatformHandle(PlatformHandle); + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_BASE_PLATFORM_HANDLE_H_ +// gen_amalgamated begin header: include/perfetto/ext/base/scoped_file.h +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_SCOPED_FILE_H_ +#define INCLUDE_PERFETTO_EXT_BASE_SCOPED_FILE_H_ + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" + +#include + +#if !PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +#include // For DIR* / opendir(). +#endif + +#include + +// gen_amalgamated expanded: #include "perfetto/base/export.h" +// gen_amalgamated expanded: #include "perfetto/base/logging.h" +// gen_amalgamated expanded: #include "perfetto/base/platform_handle.h" + +namespace perfetto { +namespace base { + +namespace internal { +// Used for the most common cases of ScopedResource where there is only one +// invalid value. +template +struct DefaultValidityChecker { + static bool IsValid(T t) { return t != InvalidValue; } +}; +} // namespace internal + +// RAII classes for auto-releasing fds and dirs. +// if T is a pointer type, InvalidValue must be nullptr. 
Doing otherwise +// causes weird unexpected behaviors (See https://godbolt.org/z/5nGMW4). +template > +class ScopedResource { + public: + using ValidityChecker = Checker; + static constexpr T kInvalid = InvalidValue; + + explicit ScopedResource(T t = InvalidValue) : t_(t) {} + ScopedResource(ScopedResource&& other) noexcept { + t_ = other.t_; + other.t_ = InvalidValue; + } + ScopedResource& operator=(ScopedResource&& other) { + reset(other.t_); + other.t_ = InvalidValue; + return *this; + } + T get() const { return t_; } + T operator*() const { return t_; } + explicit operator bool() const { return Checker::IsValid(t_); } + void reset(T r = InvalidValue) { + if (Checker::IsValid(t_)) { + int res = CloseFunction(t_); + if (CheckClose) + PERFETTO_CHECK(res == 0); + } + t_ = r; + } + T release() { + T t = t_; + t_ = InvalidValue; + return t; + } + ~ScopedResource() { reset(InvalidValue); } + + private: + ScopedResource(const ScopedResource&) = delete; + ScopedResource& operator=(const ScopedResource&) = delete; + T t_; +}; + +// Declared in file_utils.h. Forward declared to avoid #include cycles. +int PERFETTO_EXPORT_COMPONENT CloseFile(int fd); + +// Use this for file resources obtained via open() and similar APIs. +using ScopedFile = ScopedResource; +using ScopedFstream = ScopedResource; + +// Use this for resources that are HANDLE on Windows. See comments in +// platform_handle.h +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +using ScopedPlatformHandle = ScopedResource; +#else +// On non-windows systems we alias ScopedPlatformHandle to ScopedFile because +// they are really the same. This is to allow assignments between the two in +// Linux-specific code paths that predate ScopedPlatformHandle. +static_assert(std::is_same::value, ""); +using ScopedPlatformHandle = ScopedFile; + +// DIR* does not exist on Windows. 
+using ScopedDir = ScopedResource; +#endif + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_SCOPED_FILE_H_ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_EVENT_FD_H_ +#define INCLUDE_PERFETTO_EXT_BASE_EVENT_FD_H_ + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" +// gen_amalgamated expanded: #include "perfetto/base/platform_handle.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/scoped_file.h" + +namespace perfetto { +namespace base { + +// A waitable event that can be used with poll/select. +// This is really a wrapper around eventfd_create with a pipe-based fallback +// for other platforms where eventfd is not supported. +class EventFd { + public: + EventFd(); + ~EventFd(); + EventFd(EventFd&&) noexcept = default; + EventFd& operator=(EventFd&&) = default; + + // The non-blocking file descriptor that can be polled to wait for the event. + PlatformHandle fd() const { return event_handle_.get(); } + + // Can be called from any thread. + void Notify(); + + // Can be called from any thread. If more Notify() are queued a Clear() call + // can clear all of them (up to 16 per call). + void Clear(); + + private: + // The eventfd, when eventfd is supported, otherwise this is the read end of + // the pipe for fallback mode. 
+ ScopedPlatformHandle event_handle_; + +#if !PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) && \ + !PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) && \ + !PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + // On Mac and other non-Linux UNIX platforms a pipe-based fallback is used. + // The write end of the wakeup pipe. + ScopedFile write_fd_; +#endif +}; + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_EVENT_FD_H_ +// gen_amalgamated begin header: include/perfetto/ext/base/pipe.h +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_PIPE_H_ +#define INCLUDE_PERFETTO_EXT_BASE_PIPE_H_ + +// gen_amalgamated expanded: #include "perfetto/base/platform_handle.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/scoped_file.h" + +namespace perfetto { +namespace base { + +class Pipe { + public: + enum Flags { + kBothBlock = 0, +#if !PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + kBothNonBlock, + kRdNonBlock, + kWrNonBlock, +#endif + }; + + static Pipe Create(Flags = kBothBlock); + + Pipe(); + Pipe(Pipe&&) noexcept; + Pipe& operator=(Pipe&&); + + ScopedPlatformHandle rd; + ScopedPlatformHandle wr; +}; + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_PIPE_H_ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" + +#include +#include + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +#include +#include +#elif PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \ + PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) +#include +#include +#else // Mac, Fuchsia and other non-Linux UNIXes +#include +#endif + +// gen_amalgamated expanded: #include "perfetto/base/logging.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/event_fd.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/pipe.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/utils.h" + +namespace perfetto { +namespace base { + +EventFd::~EventFd() = default; + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +EventFd::EventFd() { + event_handle_.reset( + CreateEventA(/*lpEventAttributes=*/nullptr, /*bManualReset=*/true, + /*bInitialState=*/false, /*bInitialState=*/nullptr)); +} + +void EventFd::Notify() { + if (!SetEvent(event_handle_.get())) // 0: fail, !0: success, unlike UNIX. + PERFETTO_DFATAL("EventFd::Notify()"); +} + +void EventFd::Clear() { + if (!ResetEvent(event_handle_.get())) // 0: fail, !0: success, unlike UNIX. 
+ PERFETTO_DFATAL("EventFd::Clear()"); +} + +#elif PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \ + PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) + +EventFd::EventFd() { + event_handle_.reset(eventfd(/*initval=*/0, EFD_CLOEXEC | EFD_NONBLOCK)); + PERFETTO_CHECK(event_handle_); +} + +void EventFd::Notify() { + const uint64_t value = 1; + ssize_t ret = write(event_handle_.get(), &value, sizeof(value)); + if (ret <= 0 && errno != EAGAIN) + PERFETTO_DFATAL("EventFd::Notify()"); +} + +void EventFd::Clear() { + uint64_t value; + ssize_t ret = + PERFETTO_EINTR(read(event_handle_.get(), &value, sizeof(value))); + if (ret <= 0 && errno != EAGAIN) + PERFETTO_DFATAL("EventFd::Clear()"); +} + +#else + +EventFd::EventFd() { + // Make the pipe non-blocking so that we never block the waking thread (either + // the main thread or another one) when scheduling a wake-up. + Pipe pipe = Pipe::Create(Pipe::kBothNonBlock); + event_handle_ = ScopedPlatformHandle(std::move(pipe.rd).release()); + write_fd_ = std::move(pipe.wr); +} + +void EventFd::Notify() { + const uint64_t value = 1; + ssize_t ret = write(write_fd_.get(), &value, sizeof(uint8_t)); + if (ret <= 0 && errno != EAGAIN) + PERFETTO_DFATAL("EventFd::Notify()"); +} + +void EventFd::Clear() { + // Drain the byte(s) written to the wake-up pipe. We can potentially read + // more than one byte if several wake-ups have been scheduled. 
+ char buffer[16]; + ssize_t ret = + PERFETTO_EINTR(read(event_handle_.get(), &buffer[0], sizeof(buffer))); + if (ret <= 0 && errno != EAGAIN) + PERFETTO_DFATAL("EventFd::Clear()"); +} +#endif + +} // namespace base +} // namespace perfetto +// gen_amalgamated begin source: src/base/file_utils.cc +// gen_amalgamated begin header: include/perfetto/ext/base/file_utils.h +// gen_amalgamated begin header: include/perfetto/base/status.h +/* + * Copyright (C) 2019 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_BASE_STATUS_H_ +#define INCLUDE_PERFETTO_BASE_STATUS_H_ + +#include + +// gen_amalgamated expanded: #include "perfetto/base/compiler.h" +// gen_amalgamated expanded: #include "perfetto/base/export.h" +// gen_amalgamated expanded: #include "perfetto/base/logging.h" + +namespace perfetto { +namespace base { + +// Represents either the success or the failure message of a function. +// This can used as the return type of functions which would usually return an +// bool for success or int for errno but also wants to add some string context +// (ususally for logging). +class PERFETTO_EXPORT_COMPONENT Status { + public: + Status() : ok_(true) {} + explicit Status(std::string msg) : ok_(false), message_(std::move(msg)) { + PERFETTO_CHECK(!message_.empty()); + } + + // Copy operations. + Status(const Status&) = default; + Status& operator=(const Status&) = default; + + // Move operations. 
The moved-from state is valid but unspecified. + Status(Status&&) noexcept = default; + Status& operator=(Status&&) = default; + + bool ok() const { return ok_; } + + // When ok() is false this returns the error message. Returns the empty string + // otherwise. + const std::string& message() const { return message_; } + const char* c_message() const { return message_.c_str(); } + + private: + bool ok_ = false; + std::string message_; +}; + +// Returns a status object which represents the Ok status. +inline Status OkStatus() { + return Status(); +} + +PERFETTO_PRINTF_FORMAT(1, 2) Status ErrStatus(const char* format, ...); + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_BASE_STATUS_H_ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_FILE_UTILS_H_ +#define INCLUDE_PERFETTO_EXT_BASE_FILE_UTILS_H_ + +#include // For mode_t & O_RDONLY/RDWR. Exists also on Windows. 
+#include + +#include +#include + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" +// gen_amalgamated expanded: #include "perfetto/base/export.h" +// gen_amalgamated expanded: #include "perfetto/base/status.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/scoped_file.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/optional.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/utils.h" + +namespace perfetto { +namespace base { + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +using FileOpenMode = int; +#else +using FileOpenMode = mode_t; +#endif + +constexpr FileOpenMode kFileModeInvalid = static_cast(-1); + +bool ReadPlatformHandle(PlatformHandle, std::string* out); +bool ReadFileDescriptor(int fd, std::string* out); +bool ReadFileStream(FILE* f, std::string* out); +bool ReadFile(const std::string& path, std::string* out); + +// A wrapper around read(2). It deals with Linux vs Windows includes. It also +// deals with handling EINTR. Has the same semantics of UNIX's read(2). +ssize_t Read(int fd, void* dst, size_t dst_size); + +// Call write until all data is written or an error is detected. +// +// man 2 write: +// If a write() is interrupted by a signal handler before any bytes are +// written, then the call fails with the error EINTR; if it is +// interrupted after at least one byte has been written, the call +// succeeds, and returns the number of bytes written. +ssize_t WriteAll(int fd, const void* buf, size_t count); + +ssize_t WriteAllHandle(PlatformHandle, const void* buf, size_t count); + +ScopedFile OpenFile(const std::string& path, + int flags, + FileOpenMode = kFileModeInvalid); + +// This is an alias for close(). It's to avoid leaking Windows.h in headers. +// Exported because ScopedFile is used in the /include/ext API by Chromium +// component builds. 
+int PERFETTO_EXPORT_COMPONENT CloseFile(int fd); + +bool FlushFile(int fd); + +// Returns true if mkdir succeeds, false if it fails (see errno in that case). +bool Mkdir(const std::string& path); + +// Calls rmdir() on UNIX, _rmdir() on Windows. +bool Rmdir(const std::string& path); + +// Wrapper around access(path, F_OK). +bool FileExists(const std::string& path); + +// Gets the extension for a filename. If the file has two extensions, returns +// only the last one (foo.pb.gz => .gz). Returns empty string if there is no +// extension. +std::string GetFileExtension(const std::string& filename); + +// Puts the path to all files under |dir_path| in |output|, recursively walking +// subdirectories. File paths are relative to |dir_path|. Only files are +// included, not directories. Path separator is always '/', even on windows (not +// '\'). +base::Status ListFilesRecursive(const std::string& dir_path, + std::vector& output); + +// Returns the size of the file at `path` or nullopt in case of error. +Optional GetFileSize(const std::string& path); + +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_FILE_UTILS_H_ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// gen_amalgamated expanded: #include "perfetto/ext/base/file_utils.h" + +#include +#include + +#include +#include +#include +#include + +// gen_amalgamated expanded: #include "perfetto/base/build_config.h" +// gen_amalgamated expanded: #include "perfetto/base/logging.h" +// gen_amalgamated expanded: #include "perfetto/base/platform_handle.h" +// gen_amalgamated expanded: #include "perfetto/base/status.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/scoped_file.h" +// gen_amalgamated expanded: #include "perfetto/ext/base/utils.h" + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +#include +#include +#include +#else +#include +#include +#endif + +namespace perfetto { +namespace base { +namespace { +constexpr size_t kBufSize = 2048; + +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) +// Wrap FindClose to: (1) make the return unix-style; (2) deal with stdcall. +int CloseFindHandle(HANDLE h) { + return FindClose(h) ? 0 : -1; +} +#endif + +} // namespace + +ssize_t Read(int fd, void* dst, size_t dst_size) { +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + return _read(fd, dst, static_cast(dst_size)); +#else + return PERFETTO_EINTR(read(fd, dst, dst_size)); +#endif +} + +bool ReadFileDescriptor(int fd, std::string* out) { + // Do not override existing data in string. + size_t i = out->size(); + + struct stat buf {}; + if (fstat(fd, &buf) != -1) { + if (buf.st_size > 0) + out->resize(i + static_cast(buf.st_size)); + } + + ssize_t bytes_read; + for (;;) { + if (out->size() < i + kBufSize) + out->resize(out->size() + kBufSize); + + bytes_read = Read(fd, &((*out)[i]), kBufSize); + if (bytes_read > 0) { + i += static_cast(bytes_read); + } else { + out->resize(i); + return bytes_read == 0; + } + } +} + +bool ReadPlatformHandle(PlatformHandle h, std::string* out) { +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + // Do not override existing data in string. 
+ size_t i = out->size(); + + for (;;) { + if (out->size() < i + kBufSize) + out->resize(out->size() + kBufSize); + DWORD bytes_read = 0; + auto res = ::ReadFile(h, &((*out)[i]), kBufSize, &bytes_read, nullptr); + if (res && bytes_read > 0) { + i += static_cast(bytes_read); + } else { + out->resize(i); + const bool is_eof = res && bytes_read == 0; + auto err = res ? 0 : GetLastError(); + // The "Broken pipe" error on Windows is slighly different than Unix: + // On Unix: a "broken pipe" error can happen only on the writer side. On + // the reader there is no broken pipe, just a EOF. + // On windows: the reader also sees a broken pipe error. + // Here we normalize on the Unix behavior, treating broken pipe as EOF. + return is_eof || err == ERROR_BROKEN_PIPE; + } + } +#else + return ReadFileDescriptor(h, out); +#endif +} + +bool ReadFileStream(FILE* f, std::string* out) { + return ReadFileDescriptor(fileno(f), out); +} + +bool ReadFile(const std::string& path, std::string* out) { + base::ScopedFile fd = base::OpenFile(path, O_RDONLY); + if (!fd) + return false; + + return ReadFileDescriptor(*fd, out); +} + +ssize_t WriteAll(int fd, const void* buf, size_t count) { + size_t written = 0; + while (written < count) { + // write() on windows takes an unsigned int size. 
+ uint32_t bytes_left = static_cast( + std::min(count - written, static_cast(UINT32_MAX))); + ssize_t wr = PERFETTO_EINTR( + write(fd, static_cast(buf) + written, bytes_left)); + if (wr == 0) + break; + if (wr < 0) + return wr; + written += static_cast(wr); + } + return static_cast(written); +} + +ssize_t WriteAllHandle(PlatformHandle h, const void* buf, size_t count) { +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + DWORD wsize = 0; + if (::WriteFile(h, buf, static_cast(count), &wsize, nullptr)) { + return wsize; + } else { + return -1; + } +#else + return WriteAll(h, buf, count); +#endif +} + +bool FlushFile(int fd) { + PERFETTO_DCHECK(fd != 0); +#if PERFETTO_BUILDFLAG(PERFETTO_OS_LINUX) || \ + PERFETTO_BUILDFLAG(PERFETTO_OS_ANDROID) + return !PERFETTO_EINTR(fdatasync(fd)); +#elif PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + return !PERFETTO_EINTR(_commit(fd)); +#else + return !PERFETTO_EINTR(fsync(fd)); +#endif +} + +bool Mkdir(const std::string& path) { +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + return _mkdir(path.c_str()) == 0; +#else + return mkdir(path.c_str(), 0755) == 0; +#endif +} + +bool Rmdir(const std::string& path) { +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + return _rmdir(path.c_str()) == 0; +#else + return rmdir(path.c_str()) == 0; +#endif +} + +int CloseFile(int fd) { + return close(fd); +} + +ScopedFile OpenFile(const std::string& path, int flags, FileOpenMode mode) { + PERFETTO_DCHECK((flags & O_CREAT) == 0 || mode != kFileModeInvalid); +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + // Always use O_BINARY on Windows, to avoid silly EOL translations. + ScopedFile fd(_open(path.c_str(), flags | O_BINARY, mode)); +#else + // Always open a ScopedFile with O_CLOEXEC so we can safely fork and exec. 
+ ScopedFile fd(open(path.c_str(), flags | O_CLOEXEC, mode)); +#endif + return fd; +} + +bool FileExists(const std::string& path) { +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + return _access(path.c_str(), 0) == 0; +#else + return access(path.c_str(), F_OK) == 0; +#endif +} + +// Declared in base/platform_handle.h. +int ClosePlatformHandle(PlatformHandle handle) { +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + // Make the return value UNIX-style. + return CloseHandle(handle) ? 0 : -1; +#else + return close(handle); +#endif +} + +base::Status ListFilesRecursive(const std::string& dir_path, + std::vector& output) { + std::string root_dir_path = dir_path; + if (root_dir_path.back() == '\\') { + root_dir_path.back() = '/'; + } else if (root_dir_path.back() != '/') { + root_dir_path.push_back('/'); + } + + // dir_queue contains full paths to the directories. The paths include the + // root_dir_path at the beginning and the trailing slash at the end. + std::deque dir_queue; + dir_queue.push_back(root_dir_path); + + while (!dir_queue.empty()) { + const std::string cur_dir = std::move(dir_queue.front()); + dir_queue.pop_front(); +#if PERFETTO_BUILDFLAG(PERFETTO_OS_NACL) + return base::ErrStatus("ListFilesRecursive not supported yet"); +#elif PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + std::string glob_path = cur_dir + "*"; + // + 1 because we also have to count the NULL terminator. + if (glob_path.length() + 1 > MAX_PATH) + return base::ErrStatus("Directory path %s is too long", dir_path.c_str()); + WIN32_FIND_DATAA ffd; + + base::ScopedResource + hFind(FindFirstFileA(glob_path.c_str(), &ffd)); + if (!hFind) { + // For empty directories, there should be at least one entry '.'. + // If FindFirstFileA returns INVALID_HANDLE_VALUE, this means directory + // couldn't be accessed. 
+ return base::ErrStatus("Failed to open directory %s", cur_dir.c_str()); + } + do { + if (strcmp(ffd.cFileName, ".") == 0 || strcmp(ffd.cFileName, "..") == 0) + continue; + if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) { + std::string subdir_path = cur_dir + ffd.cFileName + '/'; + dir_queue.push_back(subdir_path); + } else { + const std::string full_path = cur_dir + ffd.cFileName; + PERFETTO_CHECK(full_path.length() > root_dir_path.length()); + output.push_back(full_path.substr(root_dir_path.length())); + } + } while (FindNextFileA(*hFind, &ffd)); +#else + ScopedDir dir = ScopedDir(opendir(cur_dir.c_str())); + if (!dir) { + return base::ErrStatus("Failed to open directory %s", cur_dir.c_str()); + } + for (auto* dirent = readdir(dir.get()); dirent != nullptr; + dirent = readdir(dir.get())) { + if (strcmp(dirent->d_name, ".") == 0 || + strcmp(dirent->d_name, "..") == 0) { + continue; + } + if (dirent->d_type == DT_DIR) { + dir_queue.push_back(cur_dir + dirent->d_name + '/'); + } else if (dirent->d_type == DT_REG) { + const std::string full_path = cur_dir + dirent->d_name; + PERFETTO_CHECK(full_path.length() > root_dir_path.length()); + output.push_back(full_path.substr(root_dir_path.length())); + } + } +#endif + } + return base::OkStatus(); +} + +std::string GetFileExtension(const std::string& filename) { + auto ext_idx = filename.rfind('.'); + if (ext_idx == std::string::npos) + return std::string(); + return filename.substr(ext_idx); +} + +base::Optional GetFileSize(const std::string& file_path) { +#if PERFETTO_BUILDFLAG(PERFETTO_OS_WIN) + HANDLE file = + CreateFileA(file_path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); + if (file == INVALID_HANDLE_VALUE) { + return nullopt; + } + LARGE_INTEGER file_size; + file_size.QuadPart = 0; + BOOL ok = GetFileSizeEx(file, &file_size); + CloseHandle(file); + if (!ok) { + return nullopt; + } + return static_cast(file_size.QuadPart); +#else + base::ScopedFile 
fd(base::OpenFile(file_path, O_RDONLY | O_CLOEXEC)); + if (!fd) { + return nullopt; + } + struct stat buf {}; + if (fstat(*fd, &buf) == -1) { + return nullopt; + } + return static_cast(buf.st_size); +#endif +} + +} // namespace base +} // namespace perfetto +// gen_amalgamated begin source: src/base/getopt_compat.cc +// gen_amalgamated begin header: include/perfetto/ext/base/getopt_compat.h +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef INCLUDE_PERFETTO_EXT_BASE_GETOPT_COMPAT_H_ +#define INCLUDE_PERFETTO_EXT_BASE_GETOPT_COMPAT_H_ + +#include // For std::nullptr_t + +// No translation units other than base/getopt.h and getopt_compat_unittest.cc +// should directly include this file. Use base/getopt.h instead. + +namespace perfetto { +namespace base { +namespace getopt_compat { + +// A tiny getopt() replacement for Windows, which doesn't have . +// This implementation is based on the subset of features that we use in the +// Perfetto codebase. It doesn't even try to deal with the full surface of GNU's +// getopt(). +// Limitations: +// - getopt_long_only() is not supported. +// - optional_argument is not supported. That is extremely subtle and caused us +// problems in the past with GNU's getopt. +// - It does not reorder non-option arguments. It behaves like MacOS getopt, or +// GNU's when POSIXLY_CORRECT=1. +// - Doesn't expose optopt or opterr. 
+// - option.flag and longindex are not supported and must be nullptr. + +enum { + no_argument = 0, + required_argument = 1, +}; + +struct option { + const char* name; + int has_arg; + std::nullptr_t flag; // Only nullptr is supported. + int val; +}; + +extern char* optarg; +extern int optind; +extern int optopt; +extern int opterr; + +int getopt_long(int argc, + char** argv, + const char* shortopts, + const option* longopts, + std::nullptr_t /*longindex is not supported*/); + +int getopt(int argc, char** argv, const char* shortopts); + +} // namespace getopt_compat +} // namespace base +} // namespace perfetto + +#endif // INCLUDE_PERFETTO_EXT_BASE_GETOPT_COMPAT_H_ +/* + * Copyright (C) 2021 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// gen_amalgamated expanded: #include "perfetto/ext/base/getopt_compat.h" + +#include +#include +#include + +#include + +// gen_amalgamated expanded: #include "perfetto/base/logging.h" + +namespace perfetto { +namespace base { +namespace getopt_compat { + +char* optarg = nullptr; +int optind = 0; +int optopt = 0; +int opterr = 1; + +namespace { + +char* nextchar = nullptr; + +const option* LookupLongOpt(const std::vector