diff --git a/CMakeLists.txt b/CMakeLists.txt index 1263efb..4341ac9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,7 +208,7 @@ install ( TARGETS ${HSAKMT_TARGET} #install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT devel ) # Install public headers -install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${HSAKMT_TARGET} +install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/${HSAKMT_TARGET} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${HSAKMT_TARGET} COMPONENT devel PATTERN "linux" EXCLUDE ) # Option to build header path migration helpers. diff --git a/include/hsakmt.h b/include/hsakmt/hsakmt.h similarity index 100% rename from include/hsakmt.h rename to include/hsakmt/hsakmt.h diff --git a/include/hsakmttypes.h b/include/hsakmt/hsakmttypes.h similarity index 99% rename from include/hsakmttypes.h rename to include/hsakmt/hsakmttypes.h index 2937c35..9f1b2cc 100644 --- a/include/hsakmttypes.h +++ b/include/hsakmt/hsakmttypes.h @@ -1044,6 +1044,21 @@ typedef struct _HsaMemoryAccessFault HSA_EVENTID_MEMORYFLAGS Flags; // event flags } HsaMemoryAccessFault; +typedef enum _HSA_EVENTID_HW_EXCEPTION_CAUSE +{ + HSA_EVENTID_HW_EXCEPTION_GPU_HANG = 0, // GPU Hang + HSA_EVENTID_HW_EXCEPTION_ECC = 1, // SRAM ECC error +} HSA_EVENTID_HW_EXCEPTION_CAUSE; + +// data associated with HSA_EVENTID_HW_EXCEPTION +typedef struct _HsaHwException +{ + HSAuint32 NodeId; // Node Id where the hardware exception occurred + HSAuint32 ResetType; + HSAuint32 MemoryLost; + HSA_EVENTID_HW_EXCEPTION_CAUSE ResetCause; +} HsaHwException; + typedef struct _HsaEventData { HSA_EVENTTYPE EventType; //event type @@ -1062,6 +1077,8 @@ typedef struct _HsaEventData // data associated with HSA_EVENTTYPE_MEMORY HsaMemoryAccessFault MemoryAccessFault; + // data associated with HSA_EVENTTYPE_HW_EXCEPTION + HsaHwException HwException; } EventData; // the following data entries are internal to the KFD & thunk itself. 
diff --git a/include/linux/kfd_ioctl.h b/include/hsakmt/linux/kfd_ioctl.h similarity index 100% rename from include/linux/kfd_ioctl.h rename to include/hsakmt/linux/kfd_ioctl.h diff --git a/src/debug.c b/src/debug.c index d438a28..932e829 100644 --- a/src/debug.c +++ b/src/debug.c @@ -24,7 +24,7 @@ */ #include "libhsakmt.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/events.c b/src/events.c index 9ec199a..a9e8ccc 100644 --- a/src/events.c +++ b/src/events.c @@ -31,7 +31,7 @@ #include #include #include -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include "fmm.h" static HSAuint64 *events_page = NULL; @@ -428,6 +428,16 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtWaitOnMultipleEvents_Ext(HsaEvent *Events[], ((event_data[i].memory_exception_data.ErrorType == 1) || (event_data[i].memory_exception_data.ErrorType == 2)) ? 1 : 0; Events[i]->EventData.EventData.MemoryAccessFault.Flags = HSA_EVENTID_MEMORY_FATAL_PROCESS; analysis_memory_exception(&event_data[i].memory_exception_data); + } else if (Events[i]->EventData.EventType == HSA_EVENTTYPE_HW_EXCEPTION && + event_data[i].hw_exception_data.gpu_id) { + + result = gpuid_to_nodeid(event_data[i].hw_exception_data.gpu_id, &Events[i]->EventData.EventData.HwException.NodeId); + if (result != HSAKMT_STATUS_SUCCESS) + goto out; + + Events[i]->EventData.EventData.HwException.ResetType = event_data[i].hw_exception_data.reset_type; + Events[i]->EventData.EventData.HwException.ResetCause = event_data[i].hw_exception_data.reset_cause; + Events[i]->EventData.EventData.HwException.MemoryLost = event_data[i].hw_exception_data.memory_lost; } } } diff --git a/src/fmm.c b/src/fmm.c index b87ee12..1db6984 100644 --- a/src/fmm.c +++ b/src/fmm.c @@ -25,7 +25,7 @@ #include "libhsakmt.h" #include "fmm.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/fmm.h b/src/fmm.h index b8c9b84..d414b91 100644 --- 
a/src/fmm.h +++ b/src/fmm.h @@ -26,7 +26,7 @@ #ifndef FMM_H_ #define FMM_H_ -#include "hsakmttypes.h" +#include "hsakmt/hsakmttypes.h" #include typedef enum { diff --git a/src/libhsakmt.h b/src/libhsakmt.h index 2d8da2c..348cc46 100644 --- a/src/libhsakmt.h +++ b/src/libhsakmt.h @@ -26,8 +26,8 @@ #ifndef LIBHSAKMT_H_INCLUDED #define LIBHSAKMT_H_INCLUDED -#include "linux/kfd_ioctl.h" -#include "hsakmt.h" +#include "hsakmt/linux/kfd_ioctl.h" +#include "hsakmt/hsakmt.h" #include #include #include diff --git a/src/memory.c b/src/memory.c index ba33abd..5825db9 100644 --- a/src/memory.c +++ b/src/memory.c @@ -24,7 +24,7 @@ */ #include "libhsakmt.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/perfctr.c b/src/perfctr.c index ed051be..32df25a 100644 --- a/src/perfctr.c +++ b/src/perfctr.c @@ -30,7 +30,7 @@ #include #include "libhsakmt.h" #include "pmc_table.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include diff --git a/src/queues.c b/src/queues.c index f62e862..2cee2db 100644 --- a/src/queues.c +++ b/src/queues.c @@ -25,7 +25,7 @@ #include "libhsakmt.h" #include "fmm.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include #include @@ -509,7 +509,8 @@ static int handle_concrete_asic(struct queue *q, q->total_mem_alloc_size = (q->ctx_save_restore_size + q->debug_memory_size) * node.NumXcc; - svm_api = node.Capability.ui32.SVMAPISupported; + svm_api = node.Capability.ui32.SVMAPISupported && + (node.EngineId.ui32.Major < 11 || node.DeviceId == 0x7461); /* Allocate unified memory for context save restore * area on dGPU. 
diff --git a/src/spm.c b/src/spm.c index c4eb33e..9571197 100644 --- a/src/spm.c +++ b/src/spm.c @@ -24,7 +24,7 @@ */ #include "libhsakmt.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" #include #include diff --git a/src/time.c b/src/time.c index 96729a1..286350b 100644 --- a/src/time.c +++ b/src/time.c @@ -24,7 +24,7 @@ */ #include "libhsakmt.h" -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" HSAKMT_STATUS HSAKMTAPI hsaKmtGetClockCounters(HSAuint32 NodeId, HsaClockCounters *Counters) diff --git a/src/version.c b/src/version.c index 34e51e0..c6e2e8d 100644 --- a/src/version.c +++ b/src/version.c @@ -26,7 +26,7 @@ #include "libhsakmt.h" #include #include -#include "linux/kfd_ioctl.h" +#include "hsakmt/linux/kfd_ioctl.h" HsaVersionInfo kfd_version_info; diff --git a/tests/kfdtest/scripts/kfdtest.exclude b/tests/kfdtest/scripts/kfdtest.exclude index 2e3249a..22c6193 100644 --- a/tests/kfdtest/scripts/kfdtest.exclude +++ b/tests/kfdtest/scripts/kfdtest.exclude @@ -124,10 +124,12 @@ BLACKLIST_GFX10_NV2X=\ "$BLACKLIST_GFX10:"\ "KFDPerfCountersTest.*" +# KFDQMTest.MultipleCpQueuesStressDispatch - SWDEV-409990 # KFDMemoryTest.FlatScratchAccess - SWDEV-329877 # KFDGWSTest.*: GFX11 will no longer use global wave sync BLACKLIST_GFX11=\ "KFDQMTest.CreateAqlCpQueue:"\ +"KFDQMTest.MultipleCpQueuesStressDispatch:"\ "KFDCWSRTest.InterruptRestore:"\ "KFDPerfCountersTest.*:"\ "KFDMemoryTest.FlatScratchAccess:"\ diff --git a/tests/kfdtest/src/KFDSVMRangeTest.cpp b/tests/kfdtest/src/KFDSVMRangeTest.cpp index a84ab50..b38eb70 100644 --- a/tests/kfdtest/src/KFDSVMRangeTest.cpp +++ b/tests/kfdtest/src/KFDSVMRangeTest.cpp @@ -118,7 +118,7 @@ TEST_P(KFDSVMRangeTest, SetGetAttributesTest) { {HSA_SVM_ATTR_PREFERRED_LOC, (HSAuint32)defaultGPUNode}, {HSA_SVM_ATTR_SET_FLAGS, HSA_SVM_FLAG_HOST_ACCESS | HSA_SVM_FLAG_GPU_EXEC | HSA_SVM_FLAG_COHERENT}, - {HSA_SVM_ATTR_GRANULARITY, 0xFF}, + {HSA_SVM_ATTR_GRANULARITY, 0x3F}, {HSA_SVM_ATTR_ACCESS, 
(HSAuint32)defaultGPUNode}, };