From 111c162e31c056e0aca3cdd7c6f4f88a5350e24c Mon Sep 17 00:00:00 2001 From: David Yat Sin Date: Fri, 17 Nov 2023 04:43:51 +0000 Subject: [PATCH 1/5] libhsakmt: Handle HW_EXCEPTION events Add new structures for HW Exception events and copy data from KFD to expose to upper layers. Change-Id: Icd5eb98997c47620e3b86277ab6d3abb7ed7d56f --- include/hsakmttypes.h | 17 +++++++++++++++++ src/events.c | 10 ++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h index 2937c35..9f1b2cc 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmttypes.h @@ -1044,6 +1044,21 @@ typedef struct _HsaMemoryAccessFault HSA_EVENTID_MEMORYFLAGS Flags; // event flags } HsaMemoryAccessFault; +typedef enum _HSA_EVENTID_HW_EXCEPTION_CAUSE +{ + HSA_EVENTID_HW_EXCEPTION_GPU_HANG = 0, // GPU Hang + HSA_EVENTID_HW_EXCEPTION_ECC = 1, // SRAM ECC error +} HSA_EVENTID_HW_EXCEPTION_CAUSE; + +// data associated with HSA_EVENTID_HW_EXCEPTION +typedef struct _HsaHwException +{ + HSAuint32 NodeId; // Node Id where the hardware exception occurred + HSAuint32 ResetType; + HSAuint32 MemoryLost; + HSA_EVENTID_HW_EXCEPTION_CAUSE ResetCause; +} HsaHwException; + typedef struct _HsaEventData { HSA_EVENTTYPE EventType; //event type @@ -1062,6 +1077,8 @@ typedef struct _HsaEventData // data associated with HSA_EVENTTYPE_MEMORY HsaMemoryAccessFault MemoryAccessFault; + // data associated with HSA_EVENTTYPE_HW_EXCEPTION + HsaHwException HwException; } EventData; // the following data entries are internal to the KFD & thunk itself. diff --git a/src/events.c b/src/events.c index 9ec199a..60e9f6d 100644 --- a/src/events.c +++ b/src/events.c @@ -428,6 +428,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], ((event_data[i].memory_exception_data.ErrorType == 1) || (event_data[i].memory_exception_data.ErrorType == 2)) ? 
1 : 0; Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS; analysis_memory_exception(&event_data[i].memory_exception_data); + } else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION && + event_data[i].hw_exception_data.gpu_id) { + + result = gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId); + if (result != HSAKMT_STATUS_SUCCESS) + goto out; + + Events[i]->EventData.EventData.HwException.ResetType = event_data[i].hw_exception_data.reset_type; + Events[i]->EventData.EventData.HwException.ResetCause = event_data[i].hw_exception_data.reset_cause; + Events[i]->EventData.EventData.HwException.MemoryLost = event_data[i].hw_exception_data.memory_lost; } } } From c245f8016e05436c857720159a510b7e7ee80e6b Mon Sep 17 00:00:00 2001 From: Yifan Zhang Date: Mon, 23 Oct 2023 21:58:22 +0800 Subject: [PATCH 2/5] kfdtest: Change SetGetAttributesTest range granularity granularity check is added in kfd w/ below patch: commit 270c7a8375a91fec2fb4e2c253e3955d9b7540b4 Author: Jesse Zhang Date: Fri Oct 20 09:43:51 2023 +0800 drm/amdkfd: Fix shift out-of-bounds issue diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index a690dced6860..f2b33fb2afcf 100644 Change-Id: I8cb037e3bf5db0a85661494b77e59984eca4d98d --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -781,7 +781,7 @@ svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, prange->flags &= ~attrs[i].value; break; case KFD_IOCTL_SVM_ATTR_GRANULARITY: - prange->granularity = attrs[i].value; + prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F); break; default: WARN_ONCE(1, "svm_range_check_attrs wasn't called?"); Test cases have to be modified accordingly, otherwise KFDSVMRangeTest.SetGetAttributesTest fails. 
Signed-off-by: Yifan Zhang Change-Id: Ifff47556bc398da6b18ad26ac545d139b63b0c92 (cherry picked from commit d36c28cd8f0ac8f024dee662f2c79e9237f61689) --- tests/kfdtest/src/KFDSVMRangeTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kfdtest/src/KFDSVMRangeTest.cpp b/tests/kfdtest/src/KFDSVMRangeTest.cpp index a84ab50..b38eb70 100644 --- a/tests/kfdtest/src/KFDSVMRangeTest.cpp +++ b/tests/kfdtest/src/KFDSVMRangeTest.cpp @@ -118,7 +118,7 @@ TEST_P(KFDSVMRangeTest, SetGetAttributesTest) { {HSA_SVM_ATTR_PREFERRED_LOC, (HSAuint32)defaultGPUNode}, {HSA_SVM_ATTR_SET_FLAGS, HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_GPU_EXEC | HSA_SVM_FLAG_COHERENT}, - {HSA_SVM_ATTR_GRANULARITY, 0xFF}, + {HSA_SVM_ATTR_GRANULARITY, 0x3F}, {HSA_SVM_ATTR_ACCESS, (HSAuint32)defaultGPUNode}, }; From 640d5e2602103cd59fbcfc4cd3545d936b940bb5 Mon Sep 17 00:00:00 2001 From: Jonathan Kim Date: Mon, 24 Jul 2023 15:16:28 -0400 Subject: [PATCH 3/5] libhsakmt: workaround fragment size mismatch on migration for gfx11 Due to a kernel bug where large unified memory ends up with an incorrect fragment size during migration, the debugger ends up triggering save area corruption. Since the debugger does not work in virtualization and the performance requirement to allocate the save area as unified memory is only for GFX1101 virtualized devices, allocate the save area in system memory for all other GFX11 devices. Note that devices prior to GFX11 have not exhibited issues so keep the save area in unified memory for these devices. 
Signed-off-by: Jonathan Kim Change-Id: I9c09a9af232a26aaece1c1663fdfad756fbbcdf8 --- src/queues.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/queues.c b/src/queues.c index f62e862..f72f6d6 100644 --- a/src/queues.c +++ b/src/queues.c @@ -509,7 +509,8 @@ static int handle_concrete_asic(struct queue *q, q->total_mem_alloc_size = (q->ctx_save_restore_size + q->debug_memory_size) * node.NumXcc; - svm_api = node.Capability.ui32.SVMAPISupported; + svm_api = node.Capability.ui32.SVMAPISupported && + (node.EngineId.ui32.Major < 11 || node.DeviceId == 0x7461); /* Allocate unified memory for context save restore * area on dGPU. From c346db7a060ceabe71ca9bc37d6f797091211fd0 Mon Sep 17 00:00:00 2001 From: David Belanger Date: Tue, 8 Aug 2023 14:54:37 -0400 Subject: [PATCH 4/5] kfdtest: Re-blacklist KFDQMTest.MultipleCpQueuesStressDispatch on GFX11. This test is still causing issues on GFX11. Change-Id: I4c12e2a410598a7f820bee96eccac6fae9642208 Signed-off-by: David Belanger --- tests/kfdtest/scripts/kfdtest.exclude | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index 2e3249a..22c6193 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -124,10 +124,12 @@ BLACKLIST_GFX10_NV2X=\ "$BLACKLIST_GFX10:"\ "KFDPerfCountersTest.*" +# KFDQMTest.MultipleCpQueuesStressDispatch - SWDEV-409990 # KFDMemoryTest.FlatScratchAccess - SWDEV-329877 # KFDGWSTest.*: GFX11 will no longer use global wave sync BLACKLIST_GFX11=\ "KFDQMTest.CreateAqlCpQueue:"\ +"KFDQMTest.MultipleCpQueuesStressDispatch:"\ "KFDCWSRTest.InterruptRestore:"\ "KFDPerfCountersTest.*:"\ "KFDMemoryTest.FlatScratchAccess:"\ From 9455708297d5eade99731d36df18685fcd140cb6 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Tue, 12 Mar 2024 13:29:02 -0700 Subject: [PATCH 5/5] Link CMAKE_DL_LIBS. Was failing to link on AlmaLinux8. 
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1263efb..1db2b27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,7 +176,7 @@ include_directories(${DRM_AMDGPU_INCLUDE_DIRS}) include_directories(${DRM_INCLUDE_DIRS}) target_link_libraries ( ${HSAKMT_TARGET} - PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt c numa + PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt c numa ${CMAKE_DL_LIBS} ) target_compile_options(${HSAKMT_TARGET} PRIVATE ${DRM_CFLAGS} ${HSAKMT_C_FLAGS})