diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1263efb..1db2b27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -176,7 +176,7 @@ include_directories(${DRM_AMDGPU_INCLUDE_DIRS})
 include_directories(${DRM_INCLUDE_DIRS})
 
 target_link_libraries ( ${HSAKMT_TARGET}
-  PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt c numa
+  PRIVATE ${DRM_LDFLAGS} ${DRM_AMDGPU_LDFLAGS} pthread rt c numa ${CMAKE_DL_LIBS}
 )
 
 target_compile_options(${HSAKMT_TARGET} PRIVATE ${DRM_CFLAGS} ${HSAKMT_C_FLAGS})
diff --git a/include/hsakmttypes.h b/include/hsakmttypes.h
index 2937c35..9f1b2cc 100644
--- a/include/hsakmttypes.h
+++ b/include/hsakmttypes.h
@@ -1044,6 +1044,21 @@ typedef struct _HsaMemoryAccessFault
     HSA_EVENTID_MEMORYFLAGS Flags; // event flags
 } HsaMemoryAccessFault;
 
+typedef enum _HSA_EVENTID_HW_EXCEPTION_CAUSE
+{
+    HSA_EVENTID_HW_EXCEPTION_GPU_HANG = 0, // GPU Hang
+    HSA_EVENTID_HW_EXCEPTION_ECC = 1, // SRAM ECC error
+} HSA_EVENTID_HW_EXCEPTION_CAUSE;
+
+// data associated with HSA_EVENTTYPE_HW_EXCEPTION
+typedef struct _HsaHwException
+{
+    HSAuint32 NodeId; // Node Id where the hardware exception occurred
+    HSAuint32 ResetType; // reset_type value reported with the event
+    HSAuint32 MemoryLost; // memory_lost value reported with the event
+    HSA_EVENTID_HW_EXCEPTION_CAUSE ResetCause; // reset_cause value reported with the event
+} HsaHwException;
+
 typedef struct _HsaEventData
 {
     HSA_EVENTTYPE EventType; //event type
@@ -1062,6 +1077,8 @@ typedef struct _HsaEventData
 
         // data associated with HSA_EVENTTYPE_MEMORY
         HsaMemoryAccessFault MemoryAccessFault;
+        // data associated with HSA_EVENTTYPE_HW_EXCEPTION
+        HsaHwException HwException;
     } EventData;
 
     // the following data entries are internal to the KFD & thunk itself.
diff --git a/src/events.c b/src/events.c
index 9ec199a..60e9f6d 100644
--- a/src/events.c
+++ b/src/events.c
@@ -428,6 +428,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[],
 					((event_data[i].memory_exception_data.ErrorType == 1) || (event_data[i].memory_exception_data.ErrorType == 2)) ? 1 : 0;
 				Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS;
 				analysis_memory_exception(&event_data[i].memory_exception_data);
+			} else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION &&
+				event_data[i].hw_exception_data.gpu_id) {
+
+				result = gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId);
+				if (result != HSAKMT_STATUS_SUCCESS)
+					goto out;
+
+				Events[i]->EventData.EventData.HwException.ResetType = event_data[i].hw_exception_data.reset_type;
+				Events[i]->EventData.EventData.HwException.ResetCause = event_data[i].hw_exception_data.reset_cause;
+				Events[i]->EventData.EventData.HwException.MemoryLost = event_data[i].hw_exception_data.memory_lost;
 			}
 		}
 	}
diff --git a/src/queues.c b/src/queues.c
index f62e862..f72f6d6 100644
--- a/src/queues.c
+++ b/src/queues.c
@@ -509,7 +509,8 @@ static int handle_concrete_asic(struct queue *q,
 		q->total_mem_alloc_size = (q->ctx_save_restore_size +
 					q->debug_memory_size) * node.NumXcc;
 
-		svm_api = node.Capability.ui32.SVMAPISupported;
+		svm_api = node.Capability.ui32.SVMAPISupported &&
+			(node.EngineId.ui32.Major < 11 || node.DeviceId == 0x7461);
 
 		/* Allocate unified memory for context save restore
 		 * area on dGPU.
diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude
index 2e3249a..22c6193 100644
--- a/tests/kfdtest/scripts/kfdtest.exclude
+++ b/tests/kfdtest/scripts/kfdtest.exclude
@@ -124,10 +124,12 @@ BLACKLIST_GFX10_NV2X=\
 "$BLACKLIST_GFX10:"\
 "KFDPerfCountersTest.*"
 
+# KFDQMTest.MultipleCpQueuesStressDispatch - SWDEV-409990
 # KFDMemoryTest.FlatScratchAccess - SWDEV-329877
 # KFDGWSTest.*: GFX11 will no longer use global wave sync
 BLACKLIST_GFX11=\
 "KFDQMTest.CreateAqlCpQueue:"\
+"KFDQMTest.MultipleCpQueuesStressDispatch:"\
 "KFDCWSRTest.InterruptRestore:"\
 "KFDPerfCountersTest.*:"\
 "KFDMemoryTest.FlatScratchAccess:"\
diff --git a/tests/kfdtest/src/KFDSVMRangeTest.cpp b/tests/kfdtest/src/KFDSVMRangeTest.cpp
index a84ab50..b38eb70 100644
--- a/tests/kfdtest/src/KFDSVMRangeTest.cpp
+++ b/tests/kfdtest/src/KFDSVMRangeTest.cpp
@@ -118,7 +118,7 @@ TEST_P(KFDSVMRangeTest, SetGetAttributesTest) {
                                     {HSA_SVM_ATTR_PREFERRED_LOC, (HSAuint32)defaultGPUNode},
                                     {HSA_SVM_ATTR_SET_FLAGS,
                                      HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_GPU_EXEC | HSA_SVM_FLAG_COHERENT},
-                                    {HSA_SVM_ATTR_GRANULARITY, 0xFF},
+                                    {HSA_SVM_ATTR_GRANULARITY, 0x3F},
                                     {HSA_SVM_ATTR_ACCESS, (HSAuint32)defaultGPUNode},
                                    };
 