diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index a10812606e..67a815d1de 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ finally: del bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core import utils # noqa: E402 +from cuda.core import system, utils # noqa: E402 from cuda.core._device import Device # noqa: E402 from cuda.core._event import Event, EventOptions # noqa: E402 from cuda.core._graph import ( # noqa: E402 @@ -62,8 +62,3 @@ from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 from cuda.core._stream import Stream, StreamOptions # noqa: E402 -from cuda.core._system import System # noqa: E402 - -system = System() -__import__("sys").modules[__spec__.name + ".system"] = system -del System diff --git a/cuda_core/cuda/core/_system.py b/cuda_core/cuda/core/_system.py deleted file mode 100644 index 6f06587b46..0000000000 --- a/cuda_core/cuda/core/_system.py +++ /dev/null @@ -1,114 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import warnings - -from cuda.core._device import Device -from cuda.core._utils.cuda_utils import driver, handle_return, runtime - - -class System: - """Provide information about the cuda system. - This class is a singleton and should not be instantiated directly. - """ - - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __init__(self): - if hasattr(self, "_initialized") and self._initialized: - return - self._initialized = True - - def get_driver_version(self) -> tuple[int, int]: - """ - Query the CUDA driver version. - - Returns - ------- - tuple of int - A 2-tuple of (major, minor) version numbers. 
- """ - version = handle_return(driver.cuDriverGetVersion()) - major = version // 1000 - minor = (version % 1000) // 10 - return (major, minor) - - @property - def driver_version(self) -> tuple[int, int]: - """ - Query the CUDA driver version. - - Returns - ------- - tuple of int - A 2-tuple of (major, minor) version numbers. - - .. deprecated:: 0.5.0 - `cuda.core.system.driver_version` will be removed in 0.6.0. - Use `cuda.core.system.get_driver_version()` instead. - """ - warnings.warn( - "cuda.core.system.driver_version is deprecated. Use cuda.core.system.get_driver_version() instead.", - DeprecationWarning, - stacklevel=1, - ) - return self.get_driver_version() - - def get_num_devices(self) -> int: - """ - Query the number of available GPUs. - - Returns - ------- - int - The number of available GPU devices. - """ - return handle_return(runtime.cudaGetDeviceCount()) - - @property - def num_devices(self) -> int: - """ - Query the number of available GPUs. - - Returns - ------- - int - The number of available GPU devices. - - .. deprecated:: 0.5.0 - `cuda.core.system.num_devices` will be removed in 0.6.0. - Use `cuda.core.system.get_num_devices()` instead. - """ - warnings.warn( - "cuda.core.system.num_devices is deprecated. Use cuda.core.system.get_num_devices() instead.", - DeprecationWarning, - stacklevel=1, - ) - return self.get_num_devices() - - @property - def devices(self) -> tuple: - """ - Query the available device instances. - - Returns - ------- - tuple of Device - A tuple containing instances of available devices. - - .. deprecated:: 0.5.0 - `cuda.core.system.devices` will be removed in 0.6.0. - Use `cuda.core.Device.get_all_devices()` instead. - """ - warnings.warn( - "cuda.core.system.devices is deprecated. 
Use cuda.core.Device.get_all_devices() instead.", - DeprecationWarning, - stacklevel=1, - ) - return Device.get_all_devices() diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 3dbf3b7440..7f5c5caf21 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,7 +38,7 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import utils # noqa: E402 +from cuda.core import system, utils # noqa: E402 # Make utils accessible as a submodule for backward compatibility __import__("sys").modules[__spec__.name + ".utils"] = utils @@ -73,8 +73,3 @@ def _warn_deprecated(): from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 from cuda.core._stream import Stream, StreamOptions # noqa: E402 -from cuda.core._system import System # noqa: E402 - -system = System() -__import__("sys").modules[__spec__.name + ".system"] = system -del System diff --git a/cuda_core/cuda/core/system/__init__.py b/cuda_core/cuda/core/system/__init__.py new file mode 100644 index 0000000000..9eaa79a6f7 --- /dev/null +++ b/cuda_core/cuda/core/system/__init__.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ruff: noqa: F403, F405 + + +# NOTE: We must maintain that it is always possible to import this module +# without CUDA being installed, and without CUDA being initialized or any +# contexts created, so that a user can use NVML to explore things about their +# system without loading CUDA. 
+ + +__all__ = [ + "get_driver_version", + "get_driver_version_full", + "get_num_devices", + "get_process_name", + "CUDA_BINDINGS_NVML_IS_COMPATIBLE", +] + + +from ._system import * + +if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + from ._device import Device, DeviceArchitecture + from .exceptions import * + from .exceptions import __all__ as _exceptions_all + + __all__.extend( + [ + "get_nvml_version", + "Device", + "DeviceArchitecture", + ] + ) + + __all__.extend(_exceptions_all) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx new file mode 100644 index 0000000000..faea1296b7 --- /dev/null +++ b/cuda_core/cuda/core/system/_device.pyx @@ -0,0 +1,314 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from libc.math cimport ceil + +from multiprocessing import cpu_count +from typing import Iterable + +from cuda.bindings import _nvml as nvml + +from ._nvml_context cimport initialize + +include "_device_utils.pxi" + + +class DeviceArchitecture: + """ + Device architecture enumeration. + """ + + def __init__(self, architecture: int): + try: + self._architecture = nvml.DeviceArch(architecture) + except ValueError: + self._architecture = None + + @property + def id(self) -> int: + """ + The numeric id of the device architecture. + + Returns -1 if the device is unknown. + """ + if self._architecture is None: + return -1 + return int(self._architecture) + + @property + def name(self) -> str: + """ + The name of the device architecture. + + Returns "Unlisted" if the device is unknown. + """ + if self._architecture is None: + return "Unlisted" + name = self._architecture.name + return name[name.rfind("_") + 1 :].title() + + +cdef class MemoryInfo: + """ + Memory allocation information for a device. 
+ """ + cdef object _memory_info + + def __init__(self, memory_info: nvml.Memory_v2): + self._memory_info = memory_info + + @property + def free(self) -> int: + """ + Unallocated device memory (in bytes) + """ + return self._memory_info.free + + @property + def total(self) -> int: + """ + Total physical device memory (in bytes) + """ + return self._memory_info.total + + @property + def used(self) -> int: + """ + Allocated device memory (in bytes) + """ + return self._memory_info.used + + @property + def reserved(self) -> int: + """ + Device memory (in bytes) reserved for system use (driver or firmware) + """ + return self._memory_info.reserved + + +cdef class BAR1MemoryInfo(MemoryInfo): + """ + BAR1 Memory allocation information for a device. + """ + cdef object _memory_info + + def __init__(self, memory_info: nvml.BAR1Memory): + self._memory_info = memory_info + + @property + def free(self) -> int: + """ + Unallocated BAR1 memory (in bytes) + """ + return self._memory_info.bar1_free + + @property + def total(self) -> int: + """ + Total BAR1 memory (in bytes) + """ + return self._memory_info.bar1_total + + @property + def used(self) -> int: + """ + Allocated used memory (in bytes) + """ + return self._memory_info.bar1_used + + +cdef class PciInfo: + """ + PCI information about a GPU device. 
+ """ + cdef object _pci_info + + def __init__(self, pci_info: nvml.PciInfo): + self._pci_info = pci_info + + @property + def bus(self) -> int: + """ + The bus on which the device resides, 0 to 255 + """ + return self._pci_info.bus + + @property + def bus_id(self) -> str: + """ + The tuple domain:bus:device.function PCI identifier string + """ + return self._pci_info.bus_id + + @property + def device(self) -> int: + """ + The device's id on the bus, 0 to 31 + """ + return self._pci_info.device_ + + @property + def domain(self) -> int: + """ + The PCI domain on which the device's bus resides, 0 to 0xffffffff + """ + return self._pci_info.domain + + @property + def vendor_id(self) -> int: + """ + The PCI vendor id of the device + """ + return self._pci_info.pci_device_id & 0xFFFF + + @property + def device_id(self) -> int: + """ + The PCI device id of the device + """ + return self._pci_info.pci_device_id >> 16 + + +cdef class Device: + """ + Representation of a device. + + ``cuda.core.system.Device`` provides access to various pieces of metadata + about devices and their topology, as provided by the NVIDIA Management + Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`. + + Parameters + ---------- + index: int, optional + Integer representing the CUDA device index to get a handle to. + uuid: bytes or str, optional + UUID of a CUDA device to get a handle to. + + Raises + ------ + ValueError + If neither `index` nor `uuid` are specified or if both are specified. 
+ """ + + cdef intptr_t _handle + + def __init__(self, index: int | None = None, uuid: bytes | str | None = None): + initialize() + + if index is not None and uuid is not None: + raise ValueError("Handle requires only one of either device `index` or `uuid`.") + if index is None and uuid is None: + raise ValueError("Handle requires either a device `index` or `uuid`.") + + if index is not None: + self._handle = nvml.device_get_handle_by_index_v2(index) + else: + if isinstance(uuid, bytes): + uuid = uuid.decode("ascii") + self._handle = nvml.device_get_handle_by_uuid(uuid) + + @property + def handle(self) -> int: + return self._handle + + @classmethod + def get_all_devices(cls) -> Iterable[Device]: + """ + Query the available device instances. + + Returns + ------- + Iterator of Device + An iterator over available devices. + """ + total = nvml.device_get_count_v2() + for device_id in range(total): + yield cls(device_id) + + @property + def architecture(self) -> DeviceArchitecture: + """ + Device architecture. For example, a Tesla V100 will report + ``DeviceArchitecture.name == "Volta"``, and RTX A6000 will report + ``DeviceArchitecture.name == "Ampere"``. If the device returns an + architecture that is unknown to NVML then ``DeviceArchitecture.name == + "Unknown"`` is reported, whereas an architecture that is unknown to + cuda.core.system is reported as ``DeviceArchitecture.name == "Unlisted"``. + """ + return DeviceArchitecture(nvml.device_get_architecture(self._handle)) + + @property + def bar1_memory_info(self) -> BAR1MemoryInfo: + """ + Get information about BAR1 memory. + + BAR1 is used to map the FB (device memory) so that it can be directly + accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE + bus). + """ + return BAR1MemoryInfo(nvml.device_get_bar1_memory_info(self._handle)) + + @property + def cpu_affinity(self) -> list[int]: + """ + Get a list containing the CPU indices to which the GPU is directly connected. 
+ + Examples + -------- + >>> Device(index=0).cpu_affinity + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59] + """ + return _unpack_bitmask(nvml.device_get_cpu_affinity( + self._handle, + ceil(cpu_count() / 64), + )) + + @property + def cuda_compute_capability(self) -> tuple[int, int]: + """ + CUDA compute capability of the device, e.g.: `(7, 0)` for a Tesla V100. + + Returns a tuple `(major, minor)`. + """ + return nvml.device_get_cuda_compute_capability(self._handle) + + @property + def memory_info(self) -> MemoryInfo: + """ + Object with memory information. + """ + return MemoryInfo(nvml.device_get_memory_info_v2(self._handle)) + + @property + def name(self) -> str: + """ + Name of the device, e.g.: `"Tesla V100-SXM2-32GB"` + """ + return nvml.device_get_name(self._handle) + + @property + def pci_info(self) -> PciInfo: + """ + The PCI attributes of this device. + """ + return PciInfo(nvml.device_get_pci_info_v3(self._handle)) + + @property + def serial(self) -> str: + """ + Retrieves the globally unique board serial number associated with this + device's board. + """ + return nvml.device_get_serial(self._handle) + + @property + def uuid(self) -> str: + """ + Retrieves the globally unique immutable UUID associated with this + device, as a 5 part hexadecimal string, that augments the immutable, + board serial identifier. + """ + return nvml.device_get_uuid(self._handle) diff --git a/cuda_core/cuda/core/system/_device_utils.pxi b/cuda_core/cuda/core/system/_device_utils.pxi new file mode 100644 index 0000000000..6d7a150e8f --- /dev/null +++ b/cuda_core/cuda/core/system/_device_utils.pxi @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cpython cimport array
+from libc.stdint cimport uint64_t
+
+
+cpdef inline list[int] _unpack_bitmask(uint64_t[:] arr):
+    """
+    Unpack a list of integers containing bitmasks.
+
+    Parameters
+    ----------
+    x: list of int
+        A list of integers
+
+    Examples
+    --------
+    >>> from cuda.core.system.utils import unpack_bitmask
+    >>> unpack_bitmask([1 + 2 + 8])
+    [0, 1, 3]
+    >>> unpack_bitmask([1 + 2 + 16])
+    [0, 1, 4]
+    >>> unpack_bitmask([1 + 2 + 16, 2 + 4])
+    [0, 1, 4, 65, 66]
+    """
+    cdef uint64_t i, j, idx
+    cdef int mask_bits = 64
+
+    res = []
+
+    for i in range(len(arr)):
+        cpu_offset = i * mask_bits
+        idx = 1
+        for j in range(mask_bits):
+            if arr[i] & idx:
+                res.append(cpu_offset + j)
+            idx <<= 1
+    return res
diff --git a/cuda_core/cuda/core/system/_nvml_context.pxd b/cuda_core/cuda/core/system/_nvml_context.pxd
new file mode 100644
index 0000000000..64dbe705f5
--- /dev/null
+++ b/cuda_core/cuda/core/system/_nvml_context.pxd
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+cdef extern from *:
+    """
+    #if defined(_WIN32) || defined(_WIN64)
+    #include <process.h>
+    #else
+    #include <unistd.h>
+    #endif
+    """
+    int getpid() nogil
+
+
+ctypedef enum _NVMLState:
+    UNINITIALIZED = 0
+    INITIALIZED = 1
+    DISABLED_LIBRARY_NOT_FOUND = 2
+
+
+# Initialisation must occur per-process, so an initialised state is a
+# (state, pid) pair
+cdef _NVMLState _NVML_STATE
+
+
+cdef int _NVML_OWNER_PID
+
+
+cpdef _initialize()
+
+
+cpdef inline initialize():
+    """
+    Initializes Nvidia Management Library (NVML), ensuring it only happens once per process.
+    """
+    if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND or (
+        _NVML_STATE == _NVMLState.INITIALIZED and getpid() == _NVML_OWNER_PID
+    ):
+        return
+
+    _initialize()
+
+
+cpdef inline bint is_initialized():
+    """
+    Check whether the NVML context is initialized on this process.
+ + Returns + ------- + result: bool + Whether the NVML context is initialized on this process. + """ + return _NVML_STATE == _NVMLState.INITIALIZED and getpid() == _NVML_OWNER_PID + + +cpdef validate() diff --git a/cuda_core/cuda/core/system/_nvml_context.pyx b/cuda_core/cuda/core/system/_nvml_context.pyx new file mode 100644 index 0000000000..d6d9c46060 --- /dev/null +++ b/cuda_core/cuda/core/system/_nvml_context.pyx @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import threading + +from cuda.bindings import _nvml as nvml + +from . import exceptions + + +_NVML_STATE = _NVMLState.UNINITIALIZED + + +_NVML_OWNER_PID = 0 + + +_lock = threading.Lock() + + +# For testing +def _get_nvml_state(): + return _NVML_STATE + + +cpdef _initialize(): + """ + Initializes Nvidia Management Library (NVML), ensuring it only happens once per process. + """ + global _NVML_STATE, _NVML_OWNER_PID + + with _lock: + # Double-check to make sure nothing has changed since acquiring the lock + if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND or ( + _NVML_STATE == _NVMLState.INITIALIZED and getpid() == _NVML_OWNER_PID + ): + return + elif ( + _NVML_STATE == _NVMLState.INITIALIZED and getpid() != _NVML_OWNER_PID + ) or _NVML_STATE == _NVMLState.UNINITIALIZED: + try: + nvml.init_v2() + except ( + exceptions.LibraryNotFoundError, + exceptions.DriverNotLoadedError, + exceptions.UnknownError, + ): + _NVML_STATE = _NVMLState.DISABLED_LIBRARY_NOT_FOUND + return + + # initialization was successful + _NVML_STATE = _NVMLState.INITIALIZED + _NVML_OWNER_PID = getpid() + else: + raise RuntimeError(f"Unhandled initialisation state ({_NVML_STATE=}, {_NVML_OWNER_PID=})") + + +cpdef validate(): + """ + Validate NVML state. + + Validate that NVML is initialized, functional and that the system has at + least one GPU available. 
+ + Raises + ------ + nvml.UninitializedError + If NVML hasn't been initialized. + nvml.LibraryNotFoundError + If the NVML library could not be found. + nvml.GpuNotFoundError + If no GPUs are available. + """ + if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND: + raise exceptions.LibraryNotFoundError() + elif not is_initialized(): + raise exceptions.UninitializedError() + elif nvml.device_get_count_v2() == 0: + raise exceptions.GpuNotFoundError() diff --git a/cuda_core/cuda/core/system/_system.pyx b/cuda_core/cuda/core/system/_system.pyx new file mode 100644 index 0000000000..e6163b94fd --- /dev/null +++ b/cuda_core/cuda/core/system/_system.pyx @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +# This file needs to either use NVML exclusively, or when `cuda.bindings._nvml` +# isn't available, fall back to non-NVML-based methods for backward +# compatibility. + + +CUDA_BINDINGS_NVML_IS_COMPATIBLE: bool + +try: + from cuda.bindings._version import __version_tuple__ as _BINDINGS_VERSION +except ImportError: + CUDA_BINDINGS_NVML_IS_COMPATIBLE = False +else: + CUDA_BINDINGS_NVML_IS_COMPATIBLE = _BINDINGS_VERSION >= (13, 1, 2) or (_BINDINGS_VERSION[0] == 12 and _BINDINGS_VERSION[1:3] >= (9, 6)) + + +if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + from cuda.bindings import _nvml as nvml + from ._nvml_context import initialize +else: + from cuda.core._utils.cuda_utils import driver, handle_return, runtime + + +def get_driver_version(kernel_mode: bool = False) -> tuple[int, int]: + """ + Get the driver version. + + Parameters + ---------- + kernel_mode: bool + When `True`, return the kernel-mode driver version, e.g. 580.65.06. + Otherwise, return the user-mode driver version, e.g. 13.0.1. + + Returns + ------- + version: tuple[int, int] + Tuple in the format `(MAJOR, MINOR)`. 
+ """ + return get_driver_version_full(kernel_mode)[:2] + + +def get_driver_version_full(kernel_mode: bool = False) -> tuple[int, int, int]: + """ + Get the full driver version. + + Parameters + ---------- + kernel_mode: bool + When `True`, return the kernel-mode driver version, e.g. 580.65.06. + Otherwise, return the user-mode driver version, e.g. 13.0.1. + + Returns + ------- + version: tuple[int, int, int] + Tuple in the format `(MAJOR, MINOR, PATCH)`. + """ + cdef int v + if kernel_mode: + if not CUDA_BINDINGS_NVML_IS_COMPATIBLE: + raise ValueError("Kernel-mode driver version requires NVML support") + initialize() + return tuple(int(v) for v in nvml.system_get_driver_version().split(".")) + else: + if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + initialize() + v = nvml.system_get_cuda_driver_version() + else: + v = handle_return(driver.cuDriverGetVersion()) + return (v // 1000, (v // 10) % 100, v % 10) + + +def get_nvml_version() -> tuple[int, ...]: + """ + The version of the NVML library. + """ + if not CUDA_BINDINGS_NVML_IS_COMPATIBLE: + raise RuntimeError("NVML library is not available") + return tuple(int(v) for v in nvml.system_get_nvml_version().split(".")) + + +def get_num_devices() -> int: + """ + Return the number of devices in the system. + """ + if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + initialize() + return nvml.device_get_count_v2() + else: + return handle_return(runtime.cudaGetDeviceCount()) + + +def get_process_name(pid: int) -> str: + """ + The name of process with given PID. + + Parameters + ---------- + pid: int + The PID of the process for which to get the name. + + Returns + ------- + name: str + The process name. 
+ """ + initialize() + return nvml.system_get_process_name(pid) + + +__all__ = [ + "get_driver_version", + "get_driver_version_full", + "get_nvml_version", + "get_num_devices", + "get_process_name", + "CUDA_BINDINGS_NVML_IS_COMPATIBLE", +] diff --git a/cuda_core/cuda/core/system/exceptions.py b/cuda_core/cuda/core/system/exceptions.py new file mode 100644 index 0000000000..65bcdd27b5 --- /dev/null +++ b/cuda_core/cuda/core/system/exceptions.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +from cuda.bindings import _nvml as nvml + +NvmlError = nvml.NvmlError +UninitializedError = nvml.UninitializedError +InvalidArgumentError = nvml.InvalidArgumentError +NotSupportedError = nvml.NotSupportedError +NoPermissionError = nvml.NoPermissionError +AlreadyInitializedError = nvml.AlreadyInitializedError +NotFoundError = nvml.NotFoundError +InsufficientSizeError = nvml.InsufficientSizeError +InsufficientPowerError = nvml.InsufficientPowerError +DriverNotLoadedError = nvml.DriverNotLoadedError +TimeoutError = nvml.TimeoutError +IrqIssueError = nvml.IrqIssueError +LibraryNotFoundError = nvml.LibraryNotFoundError +FunctionNotFoundError = nvml.FunctionNotFoundError +CorruptedInforomError = nvml.CorruptedInforomError +GpuIsLostError = nvml.GpuIsLostError +ResetRequiredError = nvml.ResetRequiredError +OperatingSystemError = nvml.OperatingSystemError +LibRmVersionMismatchError = nvml.LibRmVersionMismatchError +InUseError = nvml.InUseError +MemoryError = nvml.MemoryError +NoDataError = nvml.NoDataError +VgpuEccNotSupportedError = nvml.VgpuEccNotSupportedError +InsufficientResourcesError = nvml.InsufficientResourcesError +FreqNotSupportedError = nvml.FreqNotSupportedError +ArgumentVersionMismatchError = nvml.ArgumentVersionMismatchError +DeprecatedError = nvml.DeprecatedError +NotReadyError = nvml.NotReadyError +GpuNotFoundError = nvml.GpuNotFoundError 
+InvalidStateError = nvml.InvalidStateError +ResetTypeNotSupportedError = nvml.ResetTypeNotSupportedError +UnknownError = nvml.UnknownError + + +__all__ = [ + "NvmlError", + "UninitializedError", + "InvalidArgumentError", + "NotSupportedError", + "NoPermissionError", + "AlreadyInitializedError", + "NotFoundError", + "InsufficientSizeError", + "InsufficientPowerError", + "DriverNotLoadedError", + "TimeoutError", + "IrqIssueError", + "LibraryNotFoundError", + "FunctionNotFoundError", + "CorruptedInforomError", + "GpuIsLostError", + "ResetRequiredError", + "OperatingSystemError", + "LibRmVersionMismatchError", + "InUseError", + "MemoryError", + "NoDataError", + "VgpuEccNotSupportedError", + "InsufficientResourcesError", + "FreqNotSupportedError", + "ArgumentVersionMismatchError", + "DeprecatedError", + "NotReadyError", + "GpuNotFoundError", + "InvalidStateError", + "ResetTypeNotSupportedError", + "UnknownError", +] diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 5bd47a4ed2..ac46fa2fa5 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -64,11 +64,21 @@ CUDA compilation toolchain LinkerOptions -CUDA system information ------------------------ +CUDA system information and NVIDIA Management Library (NVML) +------------------------------------------------------------ -.. automethod:: cuda.core._system.System.get_driver_version -.. automethod:: cuda.core._system.System.get_num_devices +.. autosummary:: + :toctree: generated/ + + system.get_driver_version + system.get_driver_version_full + system.get_num_devices + system.get_nvml_version + system.get_process_name + + :template: autosummary/cyclass.rst + + system.Device .. 
module:: cuda.core.utils diff --git a/cuda_core/tests/system/__init__.py b/cuda_core/tests/system/__init__.py new file mode 100644 index 0000000000..79599c77db --- /dev/null +++ b/cuda_core/tests/system/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/tests/system/conftest.py b/cuda_core/tests/system/conftest.py new file mode 100644 index 0000000000..ad2f06bfdb --- /dev/null +++ b/cuda_core/tests/system/conftest.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +import pytest +from cuda.core import system + +skip_if_nvml_unsupported = pytest.mark.skipif( + not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE, reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+" +) diff --git a/cuda_core/tests/system/test_nvml_context.py b/cuda_core/tests/system/test_nvml_context.py new file mode 100644 index 0000000000..199b4a67ad --- /dev/null +++ b/cuda_core/tests/system/test_nvml_context.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# ruff: noqa: E402 + +from .conftest import skip_if_nvml_unsupported + +pytestmark = skip_if_nvml_unsupported + +import multiprocessing as mp +from platform import uname + +import pytest + +UNINITIALIZED = 0 +INITIALIZED = 1 +DISABLED_LIBRARY_NOT_FOUND = 2 + + +def _run_process(target): + p = mp.get_context("spawn").Process(target=target) + p.start() + p.join() + assert not p.exitcode + + +def _test_uninitialized(): + from cuda.core.system import _nvml_context + + assert _nvml_context._get_nvml_state() == UNINITIALIZED + + +def test_uninitialized(): + _run_process(_test_uninitialized) + + +def _test_is_initialized(): + from cuda.core.system import _nvml_context + + _nvml_context.initialize() + assert _nvml_context._get_nvml_state() == INITIALIZED + assert _nvml_context.is_initialized() is True + + +def test_is_initialized(): + _run_process(_test_is_initialized) + + +@pytest.mark.skipif("microsoft-standard" in uname().release, reason="Probably a WSL system") +def test_no_wsl(): + assert "microsoft-standard" not in uname().release + + +@pytest.mark.skipif("microsoft-standard" not in uname().release, reason="Probably a non-WSL system") +def test_wsl(): + assert "microsoft-standard" in uname().release + + +def _test_validate(): + from cuda.core.system import _nvml_context + + _nvml_context.initialize() + + assert _nvml_context.validate() is None + + +def test_validate(): + _run_process(_test_validate) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py new file mode 100644 index 0000000000..134ea7cbbe --- /dev/null +++ b/cuda_core/tests/system/test_system_device.py @@ -0,0 +1,191 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: E402

from .conftest import skip_if_nvml_unsupported

# Applies the NVML-support skip marker to every test in this module.
pytestmark = skip_if_nvml_unsupported

import array
import os
import re
import sys

import pytest
from cuda.core import system
from cuda.core.system import _device as system_device

if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
    from cuda.bindings import _nvml as nvml


@pytest.fixture(autouse=True, scope="module")
def check_gpu_available():
    """Skip the whole module when NVML is unusable or no GPU is present."""
    if not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE or system.get_num_devices() == 0:
        pytest.skip("No GPUs available to run device tests", allow_module_level=True)


def test_device_index_handle():
    # Every enumerated device must expose an integer handle.
    for device in system.Device.get_all_devices():
        assert isinstance(device.handle, int)


def test_device_architecture():
    for device in system.Device.get_all_devices():
        device_arch = device.architecture

        assert isinstance(device_arch, system_device.DeviceArchitecture)
        # Python < 3.12 does not support `value in IntEnum` containment for
        # raw values, so fall back to checking the member values directly.
        if sys.version_info < (3, 12):
            assert device_arch.id in nvml.DeviceArch.__members__.values()
        else:
            assert device_arch.id in nvml.DeviceArch


def test_device_bar1_memory():
    for device in system.Device.get_all_devices():
        bar1_memory_info = device.bar1_memory_info
        free, total, used = (
            bar1_memory_info.free,
            bar1_memory_info.total,
            bar1_memory_info.used,
        )

        assert isinstance(bar1_memory_info, system_device.BAR1MemoryInfo)
        assert isinstance(free, int)
        assert isinstance(total, int)
        assert isinstance(used, int)

        assert free >= 0
        assert total >= 0
        assert used >= 0
        # BAR1 accounting must be self-consistent.
        assert free + used == total


def test_device_cpu_affinity():
    skip_reasons = set()
    for device in system.Device.get_all_devices():
        try:
            affinity = device.cpu_affinity
        except system.NotSupportedError:
            skip_reasons.add(f"CPU affinity not supported on '{device.name}'")
        else:
            assert isinstance(affinity, list)
            # Round-trip the mask through the OS scheduler to prove it is a
            # valid CPU set for this process.
            os.sched_setaffinity(0, affinity)
            assert os.sched_getaffinity(0) == set(affinity)
    if skip_reasons:
        pytest.skip(" ; ".join(skip_reasons))


def test_device_cuda_compute_capability():
    for device in system.Device.get_all_devices():
        cuda_compute_capability = device.cuda_compute_capability
        assert isinstance(cuda_compute_capability, tuple)
        assert len(cuda_compute_capability) == 2
        # Generator form avoids building a throwaway list (ruff C419).
        assert all(isinstance(i, int) for i in cuda_compute_capability)
        assert 3 <= cuda_compute_capability[0] <= 99
        assert 0 <= cuda_compute_capability[1] <= 9


def test_device_memory():
    for device in system.Device.get_all_devices():
        memory_info = device.memory_info
        free, total, used, reserved = memory_info.free, memory_info.total, memory_info.used, memory_info.reserved

        assert isinstance(memory_info, system_device.MemoryInfo)
        assert isinstance(free, int)
        assert isinstance(total, int)
        assert isinstance(used, int)
        assert isinstance(reserved, int)

        assert free >= 0
        assert total >= 0
        assert used >= 0
        assert reserved >= 0
        # Framebuffer accounting must be self-consistent.
        assert free + used + reserved == total


def test_device_name():
    for device in system.Device.get_all_devices():
        name = device.name
        assert isinstance(name, str)
        assert len(name) > 0


def test_device_pci_info():
    for device in system.Device.get_all_devices():
        pci_info = device.pci_info
        assert isinstance(pci_info, system_device.PciInfo)

        assert isinstance(pci_info.bus_id, str)
        # domain:bus:device.function — the dot before the function digit is a
        # literal, so it must be escaped (the original pattern's bare "."
        # matched any character). Raw string per regex convention.
        assert re.match(r"[a-f0-9]{8}:[a-f0-9]{2}:[a-f0-9]{2}\.[a-f0-9]", pci_info.bus_id.lower())
        # Split once instead of three times.
        domain_str, bus_str, dev_func_str = pci_info.bus_id.split(":")
        bus_id_domain = int(domain_str, 16)
        bus_id_bus = int(bus_str, 16)
        bus_id_device = int(dev_func_str[:2], 16)

        assert isinstance(pci_info.domain, int)
        assert 0x00 <= pci_info.domain <= 0xFFFFFFFF
        assert pci_info.domain == bus_id_domain

        assert isinstance(pci_info.bus, int)
        assert 0x00 <= pci_info.bus <= 0xFF
        assert pci_info.bus == bus_id_bus

        assert isinstance(pci_info.device, int)
        assert 0x00 <= pci_info.device <= 0xFF
        assert pci_info.device == bus_id_device

        assert isinstance(pci_info.vendor_id, int)
        assert 0x0000 <= pci_info.vendor_id <= 0xFFFF

        assert isinstance(pci_info.device_id, int)
        assert 0x0000 <= pci_info.device_id <= 0xFFFF


def test_device_serial():
    skip_reasons = set()
    for device in system.Device.get_all_devices():
        try:
            serial = device.serial
        except system.NotSupportedError:
            skip_reasons.add(f"Device serial not supported by device '{device.name}'")
        else:
            assert isinstance(serial, str)
            assert len(serial) > 0

    if skip_reasons:
        pytest.skip(" ; ".join(skip_reasons))


def test_device_uuid():
    for device in system.Device.get_all_devices():
        uuid = device.uuid
        assert isinstance(uuid, str)

        # Expands to GPU-8hex-4hex-4hex-4hex-12hex, where 8hex means 8 consecutive
        # hex characters, e.g.: "GPU-abcdef12-abcd-0123-4567-1234567890ab".
        # The original test described this format but never asserted it.
        # NOTE(review): MIG instances report a "MIG-" prefix via NVML, so both
        # prefixes are accepted here — confirm whether MIG UUIDs can appear.
        assert re.fullmatch(
            r"(GPU|MIG)-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
            uuid,
        )


@pytest.mark.parametrize(
    "params",
    [
        {
            "input": [1152920405096267775, 0],
            "output": list(range(20)) + list(range(40, 60)),
        },
        {
            "input": [17293823668613283840, 65535],
            "output": list(range(20, 40)) + list(range(60, 80)),
        },
        {"input": [18446744073709551615, 0], "output": list(range(64))},
        {"input": [0, 18446744073709551615], "output": list(range(64, 128))},
    ],
)
def test_unpack_bitmask(params):
    # 64-bit words, LSB-first: word k contributes bit indices k*64 .. k*64+63.
    assert system_device._unpack_bitmask(array.array("Q", params["input"])) == params["output"]


def test_unpack_bitmask_single_value():
    # A bare int is not an acceptable bitmask container.
    with pytest.raises(TypeError):
        system_device._unpack_bitmask(1)


# --- next diff hunk: cuda_core/tests/system/test_system_system.py (new file) ---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: E402

import os

import pytest

try:
    from cuda.bindings import driver, runtime
except ImportError:
    from cuda import cuda as driver
    from cuda import cudart as runtime

from cuda.core import Device, system
from cuda.core._utils.cuda_utils import handle_return

from .conftest import skip_if_nvml_unsupported


def test_driver_version():
    # Cross-check system.get_driver_version() against the raw CUDA driver API.
    reported = system.get_driver_version()
    raw = handle_return(driver.cuDriverGetVersion())
    major, remainder = divmod(raw, 1000)
    assert reported == (major, remainder // 10), "Driver version does not match expected value"


def test_num_devices():
    # The reported device count must agree with the CUDA runtime's answer.
    counted = system.get_num_devices()
    assert counted == handle_return(runtime.cudaGetDeviceCount()), "Number of devices does not match expected value"


def test_devices():
    # Device.get_all_devices() must enumerate exactly the runtime's devices, in order.
    enumerated = Device.get_all_devices()
    expected_count = handle_return(runtime.cudaGetDeviceCount())
    reference = tuple(Device(ordinal) for ordinal in range(expected_count))
    assert len(enumerated) == len(reference), "Number of devices does not match expected value"
    for actual, expected in zip(enumerated, reference):
        assert actual.device_id == expected.device_id, "Device ID does not match expected value"


def test_cuda_driver_version():
    # The full driver version is a (major, minor, patch) triple.
    full_version = system.get_driver_version_full()
    assert isinstance(full_version, tuple)
    assert len(full_version) == 3

    major, minor, patch = full_version
    assert major >= 10
    assert 0 <= minor <= 99
    assert 0 <= patch <= 9


@skip_if_nvml_unsupported
def test_gpu_driver_version():
    # Kernel-mode driver version: (major, minor) with an optional patch level.
    kernel_version = system.get_driver_version(kernel_mode=True)
    assert isinstance(kernel_version, tuple)
    assert len(kernel_version) in (2, 3)

    major, minor, *patch = kernel_version
    assert 400 <= major < 1000
    assert minor >= 0
    if patch:
        assert 0 <= patch[0] <= 99


@skip_if_nvml_unsupported
def test_nvml_version():
    # NVML version: (cuda_major, driver_major, driver_minor[, patch]).
    reported = system.get_nvml_version()
    assert isinstance(reported, tuple)
    assert len(reported) in (3, 4)

    cuda_major, major, minor, *patch = reported
    assert cuda_major >= 10
    assert 400 <= major < 1000
    assert minor >= 0
    if patch:
        assert 0 <= patch[0] <= 99


@skip_if_nvml_unsupported
def test_get_process_name():
    # Looking up our own PID should yield the interpreter's process name.
    try:
        own_name = system.get_process_name(os.getpid())
    except system.NotFoundError:
        pytest.skip("Process not found")

    assert isinstance(own_name, str)
    assert "python" in own_name


def test_device_count():
    # get_num_devices() returns a non-negative int even on GPU-less hosts.
    reported_count = system.get_num_devices()
    assert isinstance(reported_count, int)
    assert reported_count >= 0


# --- next diff hunk: cuda_core/tests/test_system.py (deleted file) ---
-# SPDX-License-Identifier: Apache-2.0 - -try: - from cuda.bindings import driver, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - -from cuda.core import Device, system -from cuda.core._utils.cuda_utils import handle_return - - -def test_system_singleton(): - system1 = system - system2 = system - assert id(system1) == id(system2), "system is not a singleton" - - -def test_driver_version(): - driver_version = system.get_driver_version() - version = handle_return(driver.cuDriverGetVersion()) - expected_driver_version = (version // 1000, (version % 1000) // 10) - assert driver_version == expected_driver_version, "Driver version does not match expected value" - - -def test_num_devices(): - num_devices = system.get_num_devices() - expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) - assert num_devices == expected_num_devices, "Number of devices does not match expected value" - - -def test_devices(): - devices = Device.get_all_devices() - expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) - expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) - assert len(devices) == len(expected_devices), "Number of devices does not match expected value" - for device, expected_device in zip(devices, expected_devices): - assert device.device_id == expected_device.device_id, "Device ID does not match expected value"