From 111c162e31c056e0aca3cdd7c6f4f88a5350e24c Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Fri, 17 Nov 2023 04:43:51 +0000 Subject: [PATCH 1/5] libhsakmt: Handle HW_EXCEPTION events Add new structures for HW Exception events and copy data from KFD to expose to upper layers. Change-Id: Icd5eb98997c47620e3b86277ab6d3abb7ed7d56f --- include/hsakmttypes.h | 17 +++++++++++++++++ src/events.c | 10 ++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index 2937c35..9f1b2cc 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -1044,6 +1044,21 @@ typedef struct _HsaMemoryAccessFault HSA_EVENTID_MEMORYFLAGS Flags; // event flags } HsaMemoryAccessFault; +typedef enum _HSA_EVENTID_HW_EXCEPTION_CAUSE +{ + HSA_EVENTID_HW_EXCEPTION_GPU_HANG = 0, // GPU Hang + HSA_EVENTID_HW_EXCEPTION_ECC = 1, // SRAM ECC error +} HSA_EVENTID_HW_EXCEPTION_CAUSE; + +// data associated with HSA_EVENTTYPE_HW_EXCEPTION +typedef struct _HsaHwException +{ + HSAuint32 NodeId; // Node Id where the HW exception occurred + HSAuint32 ResetType; // reset type as reported by KFD + HSAuint32 MemoryLost; // memory-lost indication as reported by KFD + HSA_EVENTID_HW_EXCEPTION_CAUSE ResetCause; // reset cause as reported by KFD +} HsaHwException; + typedef struct _HsaEventData { HSA_EVENTTYPE EventType; //event type @@ -1062,6 +1077,8 @@ typedef struct _HsaEventData // data associated with HSA_EVENTTYPE_MEMORY HsaMemoryAccessFault MemoryAccessFault; + // data associated with HSA_EVENTTYPE_HW_EXCEPTION + HsaHwException HwException; } EventData; // the following data entries are internal to the KFD & thunk itself. diff --git a/src/events.c b/src/events.c index 9ec199a..60e9f6d 100644 --- a/src/events.c +++ b/src/events.c @@ -428,6 +428,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], ((event_data[i].memory_exception_data.ErrorType == 1) || (event_data[i].memory_exception_data.ErrorType == 2)) ? 
1 : 0; Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS; analysis_memory_exception(&event_data[i].memory_exception_data); + } else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION && + event_data[i].hw_exception_data.gpu_id) { + + result = gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId); + if (result != HSAKMT_STATUS_SUCCESS) + goto out; + + Events[i]->EventData.EventData.HwException.ResetType = event_data[i].hw_exception_data.reset_type; + Events[i]->EventData.EventData.HwException.ResetCause = event_data[i].hw_exception_data.reset_cause; + Events[i]->EventData.EventData.HwException.MemoryLost = event_data[i].hw_exception_data.memory_lost; } } } From c245f8016e05436c857720159a510b7e7ee80e6b Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Mon, 23 Oct 2023 21:58:22 +0800 Subject: [PATCH 2/5] kfdtest: Change SetGetAttributesTest range granularity granularity check is added in kfd w/ below patch: commit 270c7a8375a91fec2fb4e2c253e3955d9b7540b4 Author: Jesse Zhang Date: Fri Oct 20 09:43:51 2023 +0800 drm/amdkfd: Fix shift out-of-bounds issue diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index a690dced6860..f2b33fb2afcf 100644 Change-Id: I8cb037e3bf5db0a85661494b77e59984eca4d98d --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -781,7 +781,7 @@ svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, prange->flags &= ~attrs[i].value; break; case KFD_IOCTL_SVM_ATTR_GRANULARITY: - prange->granularity = attrs[i].value; + prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F); break; default: WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); Test cases have to be modified accordingly, otherwise KFDSVMRangeTest.SetGetAttributesTest fails. 
Signed-off-by: Yifan Zhang Change-Id: Ifff47556bc398da6b18ad26ac545d139b63b0c92 (cherry picked from commit d36c28cd8f0ac8f024dee662f2c79e9237f61689) --- tests/kfdtest/src/KFDSVMRangeTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kfdtest/src/KFDSVMRangeTest.cpp b/tests/kfdtest/src/KFDSVMRangeTest.cpp index a84ab50..b38eb70 100644 --- a/tests/kfdtest/src/KFDSVMRangeTest.cpp +++ b/tests/kfdtest/src/KFDSVMRangeTest.cpp @@ -118,7 +118,7 @@ TEST_P(KFDSVMRangeTest, SetGetAttributesTest) { {HSA_SVM_ATTR_PREFERRED_LOC, (HSAuint32)defaultGPUNode}, {HSA_SVM_ATTR_SET_FLAGS, HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_GPU_EXEC | HSA_SVM_FLAG_COHERENT}, - {HSA_SVM_ATTR_GRANULARITY, 0xFF}, + {HSA_SVM_ATTR_GRANULARITY, 0x3F}, {HSA_SVM_ATTR_ACCESS, (HSAuint32)defaultGPUNode}, }; From 640d5e2602103cd59fbcfc4cd3545d936b940bb5 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Mon, 24 Jul 2023 15:16:28 -0400 Subject: [PATCH 3/5] libhsakmt: workaround fragment size mismatch on migration for gfx11 Due to a kernel bug where large unified memory ends up with an incorrect fragment size during migration, the debugger ends up triggering save area corruption. Since the debugger does not work in virtualization and the performance requirement to allocate the save area as unified memory is only for GFX1101 virtualized devices, allocate the save area in system memory for all other GFX11 devices. Note that devices prior to GFX11 have not exhibited issues so keep the save area in unified memory for these devices. 
Signed-off-by: Jonathan Kim Change-Id: I9c09a9af232a26aaece1c1663fdfad756fbbcdf8 --- src/queues.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/queues.c b/src/queues.c index f62e862..f72f6d6 100644 --- a/src/queues.c +++ b/src/queues.c @@ -509,7 +509,8 @@ static int handle_concrete_asic(struct queue *q, q->total_mem_alloc_size = (q->ctx_save_restore_size + q->debug_memory_size) * node.NumXcc; - svm_api = node.Capability.ui32.SVMAPISupported; + svm_api = node.Capability.ui32.SVMAPISupported && + (node.EngineId.ui32.Major < 11 || node.DeviceId == 0x7461); /* Allocate unified memory for context save restore * area on dGPU. From c346db7a060ceabe71ca9bc37d6f797091211fd0 Mon Sep 17 00:00:00 2001 From: David Belanger Date: Tue, 8 Aug 2023 14:54:37 -0400 Subject: [PATCH 4/5] kfdtest: Re-blacklist KFDQMTest.MultipleCpQueuesStressDispatch on GFX11. This test is still causing issues on GFX11. Change-Id: I4c12e2a410598a7f820bee96eccac6fae9642208 Signed-off-by: David Belanger --- tests/kfdtest/scripts/kfdtest.exclude | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index 2e3249a..22c6193 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -124,10 +124,12 @@ BLACKLIST_GFX10_NV2X=\ "$BLACKLIST_GFX10:"\ "KFDPerfCountersTest.*" +# KFDQMTest.MultipleCpQueuesStressDispatch - SWDEV-409990 # KFDMemoryTest.FlatScratchAccess - SWDEV-329877 # KFDGWSTest.*: GFX11 will no longer use global wave sync BLACKLIST_GFX11=\ "KFDQMTest.CreateAqlCpQueue:"\ +"KFDQMTest.MultipleCpQueuesStressDispatch:"\ "KFDCWSRTest.InterruptRestore:"\ "KFDPerfCountersTest.*:"\ "KFDMemoryTest.FlatScratchAccess:"\ From b6e680ae00bff5f4f989c14bbbcd6a9d4a5be583 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Tue, 12 Mar 2024 13:26:41 -0700 Subject: [PATCH 5/5] Properly nest build time headers to match arrangement at install time. 
The build tree was missing a level of nesting, causing diversions based on in-tree/out-of-tree use. The install rule keeps a trailing slash on the source directory so that the headers still land directly in <includedir>/hsakmt rather than in <includedir>/hsakmt/hsakmt. --- CMakeLists.txt | 2 +- include/{ => hsakmt}/hsakmt.h | 0 include/{ => hsakmt}/hsakmttypes.h | 0 include/{ => hsakmt}/linux/kfd_ioctl.h | 0 src/debug.c | 2 +- src/events.c | 2 +- src/fmm.c | 2 +- src/fmm.h | 2 +- src/libhsakmt.h | 4 ++-- src/memory.c | 2 +- src/perfctr.c | 2 +- src/queues.c | 2 +- src/spm.c | 2 +- src/time.c | 2 +- src/version.c | 2 +- 15 files changed, 13 insertions(+), 13 deletions(-) rename include/{ => hsakmt}/hsakmt.h (100%) rename include/{ => hsakmt}/hsakmttypes.h (100%) rename include/{ => hsakmt}/linux/kfd_ioctl.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1263efb..4341ac9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,7 +208,7 @@ install ( TARGETS ${HSAKMT_TARGET} #install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT devel ) # Install public headers -install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${HSAKMT_TARGET} +install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/${HSAKMT_TARGET}/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${HSAKMT_TARGET} COMPONENT devel PATTERN "linux" EXCLUDE ) # Option to build header path migration helpers. 
diff --git a/include/hsakmt.h b/include/hsakmt/hsakmt.h similarity index 100% rename from include/hsakmt.h rename to include/hsakmt/hsakmt.h diff --git a/include/hsakmttypes.h b/include/hsakmt/hsakmttypes.h similarity index 100% rename from include/hsakmttypes.h rename to include/hsakmt/hsakmttypes.h diff --git a/include/linux/kfd_ioctl.h b/include/hsakmt/linux/kfd_ioctl.h similarity index 100% rename from include/linux/kfd_ioctl.h rename to include/hsakmt/linux/kfd_ioctl.h diff --git a/src/debug.c b/src/debug.c index d438a28..932e829 100644 --- a/src/debug.c +++ b/src/debug.c @@ -24,7 +24,7 @@ */ #include "libhsakmt.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/events.c b/src/events.c index 60e9f6d..a9e8ccc 100644 --- a/src/events.c +++ b/src/events.c @@ -31,7 +31,7 @@ #include #include #include -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include "fmm.h" static HSAuint64 *events_page = NULL; diff --git a/src/fmm.c b/src/fmm.c index b87ee12..1db6984 100644 --- a/src/fmm.c +++ b/src/fmm.c @@ -25,7 +25,7 @@ #include "libhsakmt.h" #include "fmm.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/fmm.h b/src/fmm.h index b8c9b84..d414b91 100644 --- a/src/fmm.h +++ b/src/fmm.h @@ -26,7 +26,7 @@ #ifndef FMM_H_ #define FMM_H_ -#include "hsakmttypes.h" +#include "hsakmt/hsakmttypes.h" #include typedef enum { diff --git a/src/libhsakmt.h b/src/libhsakmt.h index 2d8da2c..348cc46 100644 --- a/src/libhsakmt.h +++ b/src/libhsakmt.h @@ -26,8 +26,8 @@ #ifndef LIBHSAKMT_H_INCLUDED #define LIBHSAKMT_H_INCLUDED -#include "linux/kfd_ioctl.h" -#include "hsakmt.h" +#include "hsakmt/linux/kfd_ioctl.h" +#include "hsakmt/hsakmt.h" #include #include #include diff --git a/src/memory.c b/src/memory.c index ba33abd..5825db9 100644 --- a/src/memory.c +++ b/src/memory.c @@ -24,7 +24,7 @@ */ #include "libhsakmt.h" -#include "linux/kfd_ioctl.h" 
+#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/perfctr.c b/src/perfctr.c index ed051be..32df25a 100644 --- a/src/perfctr.c +++ b/src/perfctr.c @@ -30,7 +30,7 @@ #include #include "libhsakmt.h" #include "pmc_table.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/queues.c b/src/queues.c index f72f6d6..2cee2db 100644 --- a/src/queues.c +++ b/src/queues.c @@ -25,7 +25,7 @@ #include "libhsakmt.h" #include "fmm.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/spm.c b/src/spm.c index c4eb33e..9571197 100644 --- a/src/spm.c +++ b/src/spm.c @@ -24,7 +24,7 @@ */ #include "libhsakmt.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include diff --git a/src/time.c b/src/time.c index 96729a1..286350b 100644 --- a/src/time.c +++ b/src/time.c @@ -24,7 +24,7 @@ */ #include "libhsakmt.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, HsaClockCounters *Counters) diff --git a/src/version.c b/src/version.c index 34e51e0..c6e2e8d 100644 --- a/src/version.c +++ b/src/version.c @@ -26,7 +26,7 @@ #include "libhsakmt.h" #include #include -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" HsaVersionInfo kfd_version_info;