Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
finally:
del bindings, importlib, subdir, cuda_major, cuda_minor

from cuda.core import utils # noqa: E402
from cuda.core import system, utils # noqa: E402
from cuda.core._device import Device # noqa: E402
from cuda.core._event import Event, EventOptions # noqa: E402
from cuda.core._graph import ( # noqa: E402
Expand Down Expand Up @@ -62,8 +62,3 @@
from cuda.core._module import Kernel, ObjectCode # noqa: E402
from cuda.core._program import Program, ProgramOptions # noqa: E402
from cuda.core._stream import Stream, StreamOptions # noqa: E402
from cuda.core._system import System # noqa: E402

system = System()
__import__("sys").modules[__spec__.name + ".system"] = system
del System
114 changes: 0 additions & 114 deletions cuda_core/cuda/core/_system.py

This file was deleted.

7 changes: 1 addition & 6 deletions cuda_core/cuda/core/experimental/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _warn_deprecated():
_warn_deprecated()


from cuda.core import utils # noqa: E402
from cuda.core import system, utils # noqa: E402

# Make utils accessible as a submodule for backward compatibility
__import__("sys").modules[__spec__.name + ".utils"] = utils
Expand Down Expand Up @@ -73,8 +73,3 @@ def _warn_deprecated():
from cuda.core._module import Kernel, ObjectCode # noqa: E402
from cuda.core._program import Program, ProgramOptions # noqa: E402
from cuda.core._stream import Stream, StreamOptions # noqa: E402
from cuda.core._system import System # noqa: E402

system = System()
__import__("sys").modules[__spec__.name + ".system"] = system
del System
63 changes: 63 additions & 0 deletions cuda_core/cuda/core/system/__init__.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, cuda.core supports any cuda-bindings/cuda-python 12.x and 13.x, many of which do not have the NVML bindings available. So, we need a version guard here before importing anything that would expect the bindings to exist, and raise an exception in such cases.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, good reminder. I guess that precludes cimport'ing anything from cuda.bindings._nvml, since _nvml is a moving target. Will just take that out for now...

Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: F403, F405


# Public names of this package that are exported unconditionally.
__all__ = [
"get_driver_version",
"get_driver_version_full",
"get_gpu_driver_version",
"get_num_devices",
"get_process_name",
"HAS_WORKING_NVML",
]


# NOTE(review): HAS_WORKING_NVML is not defined in this file; presumably it
# (and the functions listed above) arrive through this star import — confirm.
from .system import *

# The NVML-backed API is imported only when HAS_WORKING_NVML is truthy.
if HAS_WORKING_NVML:
from ._nvml_context import initialize
from .device import Device, DeviceArchitecture
from .exceptions import *

# Additional public names appended to __all__.
# NOTE(review): "get_nvml_version" and the *Error names are not imported
# explicitly here; presumably they come from the `.system` and `.exceptions`
# star imports respectively — confirm.
__all__.extend(
[
"initialize",
"get_nvml_version",
"Device",
"DeviceArchitecture",
"UninitializedError",
"InvalidArgumentError",
"NotSupportedError",
"NoPermissionError",
"AlreadyInitializedError",
"NotFoundError",
"InsufficientSizeError",
"InsufficientPowerError",
"DriverNotLoadedError",
"TimeoutError",
"IrqIssueError",
"LibraryNotFoundError",
"FunctionNotFoundError",
"CorruptedInforomError",
"GpuIsLostError",
"ResetRequiredError",
"OperatingSystemError",
"LibRmVersionMismatchError",
"InUseError",
"MemoryError",
"NoDataError",
"VgpuEccNotSupportedError",
"InsufficientResourcesError",
"FreqNotSupportedError",
"ArgumentVersionMismatchError",
"DeprecatedError",
"NotReadyError",
"GpuNotFoundError",
"InvalidStateError",
"ResetTypeNotSupportedError",
"UnknownError",
]
)
95 changes: 95 additions & 0 deletions cuda_core/cuda/core/system/_nvml_context.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

import os
import threading

from cuda.bindings import _nvml as nvml

from . import exceptions


ctypedef enum _NVMLState:
UNINITIALIZED = 0
INITIALIZED = 1
DISABLED_LIBRARY_NOT_FOUND = 2


# Initialisation must occur per-process, so an initialised state is a
# (state, pid) pair
_NVML_STATE = _NVMLState.UNINITIALIZED
# """Current initialization state"""

_NVML_OWNER_PID = 0
# """PID of the process that successfully called nvml.init_v2"""


_lock = threading.Lock()


cpdef initialize():
    """
    Initialize the NVIDIA Management Library (NVML) at most once per process.

    The call is a no-op when NVML was already initialized by the current
    process, or when an earlier attempt put the module into the
    ``DISABLED_LIBRARY_NOT_FOUND`` state. If the recorded owner PID differs
    from the current PID, ``nvml.init_v2()`` is called again for this process.

    ``LibraryNotFoundError``, ``DriverNotLoadedError`` and ``UnknownError``
    raised by ``nvml.init_v2()`` are not propagated; they switch the module
    into the ``DISABLED_LIBRARY_NOT_FOUND`` state instead.

    Raises
    ------
    RuntimeError
        If the recorded state is none of the handled ``_NVMLState`` values.
    """
    global _NVML_STATE, _NVML_OWNER_PID

    with _lock:
        # An earlier attempt already marked NVML as unavailable: nothing to do.
        if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND:
            return

        # Already initialized by this very process: nothing to do.
        if _NVML_STATE == _NVMLState.INITIALIZED and os.getpid() == _NVML_OWNER_PID:
            return

        # Only "initialized under another PID" and "uninitialized" may proceed.
        if _NVML_STATE != _NVMLState.INITIALIZED and _NVML_STATE != _NVMLState.UNINITIALIZED:
            raise RuntimeError(f"Unhandled initialisation state ({_NVML_STATE=}, {_NVML_OWNER_PID=})")

        try:
            nvml.init_v2()
        except (
            exceptions.LibraryNotFoundError,
            exceptions.DriverNotLoadedError,
            exceptions.UnknownError,
        ):
            _NVML_STATE = _NVMLState.DISABLED_LIBRARY_NOT_FOUND
            return

        # Success: record the new state together with the owning process.
        _NVML_STATE = _NVMLState.INITIALIZED
        _NVML_OWNER_PID = os.getpid()


cpdef bint is_initialized():
    """
    Report whether NVML is initialized for the current process.

    Returns
    -------
    result: bool
        True only if the module state is ``INITIALIZED`` and the recorded
        owner PID equals the PID of the calling process.
    """
    if _NVML_STATE != _NVMLState.INITIALIZED:
        return False
    # The state alone is not enough: it must have been set by this process.
    return os.getpid() == _NVML_OWNER_PID


cpdef validate():
    """
    Check that NVML is usable from the current process.

    The checks run in order: the library must not have been marked as
    unavailable, NVML must be initialized on this process, and the device
    count reported by NVML must be non-zero.

    Raises
    ------
    exceptions.LibraryNotFoundError
        If the module is in the ``DISABLED_LIBRARY_NOT_FOUND`` state.
    exceptions.UninitializedError
        If NVML is not initialized on this process.
    exceptions.GpuNotFoundError
        If ``nvml.device_get_count_v2()`` reports zero devices.
    """
    if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND:
        raise exceptions.LibraryNotFoundError()

    if not is_initialized():
        raise exceptions.UninitializedError()

    # Only query the device count once NVML is known to be initialized.
    if nvml.device_get_count_v2() == 0:
        raise exceptions.GpuNotFoundError()
Loading
Loading