diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index a10812606e..67a815d1de 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -28,7 +28,7 @@ finally: del bindings, importlib, subdir, cuda_major, cuda_minor -from cuda.core import utils # noqa: E402 +from cuda.core import system, utils # noqa: E402 from cuda.core._device import Device # noqa: E402 from cuda.core._event import Event, EventOptions # noqa: E402 from cuda.core._graph import ( # noqa: E402 @@ -62,8 +62,3 @@ from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 from cuda.core._stream import Stream, StreamOptions # noqa: E402 -from cuda.core._system import System # noqa: E402 - -system = System() -__import__("sys").modules[__spec__.name + ".system"] = system -del System diff --git a/cuda_core/cuda/core/_system.py b/cuda_core/cuda/core/_system.py deleted file mode 100644 index 6f06587b46..0000000000 --- a/cuda_core/cuda/core/_system.py +++ /dev/null @@ -1,114 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 - -import warnings - -from cuda.core._device import Device -from cuda.core._utils.cuda_utils import driver, handle_return, runtime - - -class System: - """Provide information about the cuda system. - This class is a singleton and should not be instantiated directly. - """ - - _instance = None - - def __new__(cls): - if cls._instance is None: - cls._instance = super().__new__(cls) - return cls._instance - - def __init__(self): - if hasattr(self, "_initialized") and self._initialized: - return - self._initialized = True - - def get_driver_version(self) -> tuple[int, int]: - """ - Query the CUDA driver version. - - Returns - ------- - tuple of int - A 2-tuple of (major, minor) version numbers. 
- """ - version = handle_return(driver.cuDriverGetVersion()) - major = version // 1000 - minor = (version % 1000) // 10 - return (major, minor) - - @property - def driver_version(self) -> tuple[int, int]: - """ - Query the CUDA driver version. - - Returns - ------- - tuple of int - A 2-tuple of (major, minor) version numbers. - - .. deprecated:: 0.5.0 - `cuda.core.system.driver_version` will be removed in 0.6.0. - Use `cuda.core.system.get_driver_version()` instead. - """ - warnings.warn( - "cuda.core.system.driver_version is deprecated. Use cuda.core.system.get_driver_version() instead.", - DeprecationWarning, - stacklevel=1, - ) - return self.get_driver_version() - - def get_num_devices(self) -> int: - """ - Query the number of available GPUs. - - Returns - ------- - int - The number of available GPU devices. - """ - return handle_return(runtime.cudaGetDeviceCount()) - - @property - def num_devices(self) -> int: - """ - Query the number of available GPUs. - - Returns - ------- - int - The number of available GPU devices. - - .. deprecated:: 0.5.0 - `cuda.core.system.num_devices` will be removed in 0.6.0. - Use `cuda.core.system.get_num_devices()` instead. - """ - warnings.warn( - "cuda.core.system.num_devices is deprecated. Use cuda.core.system.get_num_devices() instead.", - DeprecationWarning, - stacklevel=1, - ) - return self.get_num_devices() - - @property - def devices(self) -> tuple: - """ - Query the available device instances. - - Returns - ------- - tuple of Device - A tuple containing instances of available devices. - - .. deprecated:: 0.5.0 - `cuda.core.system.devices` will be removed in 0.6.0. - Use `cuda.core.Device.get_all_devices()` instead. - """ - warnings.warn( - "cuda.core.system.devices is deprecated. 
Use cuda.core.Device.get_all_devices() instead.", - DeprecationWarning, - stacklevel=1, - ) - return Device.get_all_devices() diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 3dbf3b7440..7f5c5caf21 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -38,7 +38,7 @@ def _warn_deprecated(): _warn_deprecated() -from cuda.core import utils # noqa: E402 +from cuda.core import system, utils # noqa: E402 # Make utils accessible as a submodule for backward compatibility __import__("sys").modules[__spec__.name + ".utils"] = utils @@ -73,8 +73,3 @@ def _warn_deprecated(): from cuda.core._module import Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 from cuda.core._stream import Stream, StreamOptions # noqa: E402 -from cuda.core._system import System # noqa: E402 - -system = System() -__import__("sys").modules[__spec__.name + ".system"] = system -del System diff --git a/cuda_core/cuda/core/system/__init__.py b/cuda_core/cuda/core/system/__init__.py new file mode 100644 index 0000000000..9eaa79a6f7 --- /dev/null +++ b/cuda_core/cuda/core/system/__init__.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ruff: noqa: F403, F405 + + +# NOTE: We must maintain that it is always possible to import this module +# without CUDA being installed, and without CUDA being initialized or any +# contexts created, so that a user can use NVML to explore things about their +# system without loading CUDA. 
+ + +__all__ = [ + "get_driver_version", + "get_driver_version_full", + "get_num_devices", + "get_process_name", + "CUDA_BINDINGS_NVML_IS_COMPATIBLE", +] + + +from ._system import * + +if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + from ._device import Device, DeviceArchitecture + from .exceptions import * + from .exceptions import __all__ as _exceptions_all + + __all__.extend( + [ + "get_nvml_version", + "Device", + "DeviceArchitecture", + ] + ) + + __all__.extend(_exceptions_all) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx new file mode 100644 index 0000000000..faea1296b7 --- /dev/null +++ b/cuda_core/cuda/core/system/_device.pyx @@ -0,0 +1,314 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t +from libc.math cimport ceil + +from multiprocessing import cpu_count +from typing import Iterable + +from cuda.bindings import _nvml as nvml + +from ._nvml_context cimport initialize + +include "_device_utils.pxi" + + +class DeviceArchitecture: + """ + Device architecture enumeration. + """ + + def __init__(self, architecture: int): + try: + self._architecture = nvml.DeviceArch(architecture) + except ValueError: + self._architecture = None + + @property + def id(self) -> int: + """ + The numeric id of the device architecture. + + Returns -1 if the device is unknown. + """ + if self._architecture is None: + return -1 + return int(self._architecture) + + @property + def name(self) -> str: + """ + The name of the device architecture. + + Returns "Unlisted" if the device is unknown. + """ + if self._architecture is None: + return "Unlisted" + name = self._architecture.name + return name[name.rfind("_") + 1 :].title() + + +cdef class MemoryInfo: + """ + Memory allocation information for a device. 
+ """ + cdef object _memory_info + + def __init__(self, memory_info: nvml.Memory_v2): + self._memory_info = memory_info + + @property + def free(self) -> int: + """ + Unallocated device memory (in bytes) + """ + return self._memory_info.free + + @property + def total(self) -> int: + """ + Total physical device memory (in bytes) + """ + return self._memory_info.total + + @property + def used(self) -> int: + """ + Allocated device memory (in bytes) + """ + return self._memory_info.used + + @property + def reserved(self) -> int: + """ + Device memory (in bytes) reserved for system use (driver or firmware) + """ + return self._memory_info.reserved + + +cdef class BAR1MemoryInfo(MemoryInfo): + """ + BAR1 Memory allocation information for a device. + """ + cdef object _memory_info + + def __init__(self, memory_info: nvml.BAR1Memory): + self._memory_info = memory_info + + @property + def free(self) -> int: + """ + Unallocated BAR1 memory (in bytes) + """ + return self._memory_info.bar1_free + + @property + def total(self) -> int: + """ + Total BAR1 memory (in bytes) + """ + return self._memory_info.bar1_total + + @property + def used(self) -> int: + """ + Allocated used memory (in bytes) + """ + return self._memory_info.bar1_used + + +cdef class PciInfo: + """ + PCI information about a GPU device. 
+ """ + cdef object _pci_info + + def __init__(self, pci_info: nvml.PciInfo): + self._pci_info = pci_info + + @property + def bus(self) -> int: + """ + The bus on which the device resides, 0 to 255 + """ + return self._pci_info.bus + + @property + def bus_id(self) -> str: + """ + The tuple domain:bus:device.function PCI identifier string + """ + return self._pci_info.bus_id + + @property + def device(self) -> int: + """ + The device's id on the bus, 0 to 31 + """ + return self._pci_info.device_ + + @property + def domain(self) -> int: + """ + The PCI domain on which the device's bus resides, 0 to 0xffffffff + """ + return self._pci_info.domain + + @property + def vendor_id(self) -> int: + """ + The PCI vendor id of the device + """ + return self._pci_info.pci_device_id & 0xFFFF + + @property + def device_id(self) -> int: + """ + The PCI device id of the device + """ + return self._pci_info.pci_device_id >> 16 + + +cdef class Device: + """ + Representation of a device. + + ``cuda.core.system.Device`` provides access to various pieces of metadata + about devices and their topology, as provided by the NVIDIA Management + Library (NVML). To use CUDA with a device, use :class:`cuda.core.Device`. + + Parameters + ---------- + index: int, optional + Integer representing the CUDA device index to get a handle to. + uuid: bytes or str, optional + UUID of a CUDA device to get a handle to. + + Raises + ------ + ValueError + If neither `index` nor `uuid` are specified or if both are specified. 
+ """ + + cdef intptr_t _handle + + def __init__(self, index: int | None = None, uuid: bytes | str | None = None): + initialize() + + if index is not None and uuid is not None: + raise ValueError("Handle requires only one of either device `index` or `uuid`.") + if index is None and uuid is None: + raise ValueError("Handle requires either a device `index` or `uuid`.") + + if index is not None: + self._handle = nvml.device_get_handle_by_index_v2(index) + else: + if isinstance(uuid, bytes): + uuid = uuid.decode("ascii") + self._handle = nvml.device_get_handle_by_uuid(uuid) + + @property + def handle(self) -> int: + return self._handle + + @classmethod + def get_all_devices(cls) -> Iterable[Device]: + """ + Query the available device instances. + + Returns + ------- + Iterator of Device + An iterator over available devices. + """ + total = nvml.device_get_count_v2() + for device_id in range(total): + yield cls(device_id) + + @property + def architecture(self) -> DeviceArchitecture: + """ + Device architecture. For example, a Tesla V100 will report + ``DeviceArchitecture.name == "Volta"``, and RTX A6000 will report + ``DeviceArchitecture.name == "Ampere"``. If the device returns an + architecture that is unknown to NVML then ``DeviceArchitecture.name == + "Unknown"`` is reported, whereas an architecture that is unknown to + cuda.core.system is reported as ``DeviceArchitecture.name == "Unlisted"``. + """ + return DeviceArchitecture(nvml.device_get_architecture(self._handle)) + + @property + def bar1_memory_info(self) -> BAR1MemoryInfo: + """ + Get information about BAR1 memory. + + BAR1 is used to map the FB (device memory) so that it can be directly + accessed by the CPU or by 3rd party devices (peer-to-peer on the PCIE + bus). + """ + return BAR1MemoryInfo(nvml.device_get_bar1_memory_info(self._handle)) + + @property + def cpu_affinity(self) -> list[int]: + """ + Get a list containing the CPU indices to which the GPU is directly connected. 
+ + Examples + -------- + >>> Device(index=0).cpu_affinity + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59] + """ + return _unpack_bitmask(nvml.device_get_cpu_affinity( + self._handle, + ceil(cpu_count() / 64), + )) + + @property + def cuda_compute_capability(self) -> tuple[int, int]: + """ + CUDA compute capability of the device, e.g.: `(7, 0)` for a Tesla V100. + + Returns a tuple `(major, minor)`. + """ + return nvml.device_get_cuda_compute_capability(self._handle) + + @property + def memory_info(self) -> MemoryInfo: + """ + Object with memory information. + """ + return MemoryInfo(nvml.device_get_memory_info_v2(self._handle)) + + @property + def name(self) -> str: + """ + Name of the device, e.g.: `"Tesla V100-SXM2-32GB"` + """ + return nvml.device_get_name(self._handle) + + @property + def pci_info(self) -> PciInfo: + """ + The PCI attributes of this device. + """ + return PciInfo(nvml.device_get_pci_info_v3(self._handle)) + + @property + def serial(self) -> str: + """ + Retrieves the globally unique board serial number associated with this + device's board. + """ + return nvml.device_get_serial(self._handle) + + @property + def uuid(self) -> str: + """ + Retrieves the globally unique immutable UUID associated with this + device, as a 5 part hexadecimal string, that augments the immutable, + board serial identifier. + """ + return nvml.device_get_uuid(self._handle) diff --git a/cuda_core/cuda/core/system/_device_utils.pxi b/cuda_core/cuda/core/system/_device_utils.pxi new file mode 100644 index 0000000000..6d7a150e8f --- /dev/null +++ b/cuda_core/cuda/core/system/_device_utils.pxi @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cpython cimport array
+from libc.stdint cimport uint64_t
+
+
+cpdef inline list[int] _unpack_bitmask(uint64_t[:] arr):
+    """
+    Unpack a list of integers containing bitmasks.
+
+    Parameters
+    ----------
+    x: list of int
+        A list of integers
+
+    Examples
+    --------
+    >>> from cuda.core.system.utils import unpack_bitmask
+    >>> unpack_bitmask([1 + 2 + 8])
+    [0, 1, 3]
+    >>> unpack_bitmask([1 + 2 + 16])
+    [0, 1, 4]
+    >>> unpack_bitmask([1 + 2 + 16, 2 + 4])
+    [0, 1, 4, 65, 66]
+    """
+    cdef uint64_t i, j, idx
+    cdef int mask_bits = 64
+
+    res = []
+
+    for i in range(len(arr)):
+        cpu_offset = i * mask_bits
+        idx = 1
+        for j in range(mask_bits):
+            if arr[i] & idx:
+                res.append(cpu_offset + j)
+            idx <<= 1
+    return res
diff --git a/cuda_core/cuda/core/system/_nvml_context.pxd b/cuda_core/cuda/core/system/_nvml_context.pxd
new file mode 100644
index 0000000000..64dbe705f5
--- /dev/null
+++ b/cuda_core/cuda/core/system/_nvml_context.pxd
@@ -0,0 +1,58 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+cdef extern from *:
+    """
+    #if defined(_WIN32) || defined(_WIN64)
+    #include <process.h>
+    #else
+    #include <unistd.h>
+    #endif
+    """
+    int getpid() nogil
+
+
+ctypedef enum _NVMLState:
+    UNINITIALIZED = 0
+    INITIALIZED = 1
+    DISABLED_LIBRARY_NOT_FOUND = 2
+
+
+# Initialisation must occur per-process, so an initialised state is a
+# (state, pid) pair
+cdef _NVMLState _NVML_STATE
+
+
+cdef int _NVML_OWNER_PID
+
+
+cpdef _initialize()
+
+
+cpdef inline initialize():
+    """
+    Initializes Nvidia Management Library (NVML), ensuring it only happens once per process.
+    """
+    if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND or (
+        _NVML_STATE == _NVMLState.INITIALIZED and getpid() == _NVML_OWNER_PID
+    ):
+        return
+
+    _initialize()
+
+
+cpdef inline bint is_initialized():
+    """
+    Check whether the NVML context is initialized on this process.
+ + Returns + ------- + result: bool + Whether the NVML context is initialized on this process. + """ + return _NVML_STATE == _NVMLState.INITIALIZED and getpid() == _NVML_OWNER_PID + + +cpdef validate() diff --git a/cuda_core/cuda/core/system/_nvml_context.pyx b/cuda_core/cuda/core/system/_nvml_context.pyx new file mode 100644 index 0000000000..d6d9c46060 --- /dev/null +++ b/cuda_core/cuda/core/system/_nvml_context.pyx @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +import threading + +from cuda.bindings import _nvml as nvml + +from . import exceptions + + +_NVML_STATE = _NVMLState.UNINITIALIZED + + +_NVML_OWNER_PID = 0 + + +_lock = threading.Lock() + + +# For testing +def _get_nvml_state(): + return _NVML_STATE + + +cpdef _initialize(): + """ + Initializes Nvidia Management Library (NVML), ensuring it only happens once per process. + """ + global _NVML_STATE, _NVML_OWNER_PID + + with _lock: + # Double-check to make sure nothing has changed since acquiring the lock + if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND or ( + _NVML_STATE == _NVMLState.INITIALIZED and getpid() == _NVML_OWNER_PID + ): + return + elif ( + _NVML_STATE == _NVMLState.INITIALIZED and getpid() != _NVML_OWNER_PID + ) or _NVML_STATE == _NVMLState.UNINITIALIZED: + try: + nvml.init_v2() + except ( + exceptions.LibraryNotFoundError, + exceptions.DriverNotLoadedError, + exceptions.UnknownError, + ): + _NVML_STATE = _NVMLState.DISABLED_LIBRARY_NOT_FOUND + return + + # initialization was successful + _NVML_STATE = _NVMLState.INITIALIZED + _NVML_OWNER_PID = getpid() + else: + raise RuntimeError(f"Unhandled initialisation state ({_NVML_STATE=}, {_NVML_OWNER_PID=})") + + +cpdef validate(): + """ + Validate NVML state. + + Validate that NVML is initialized, functional and that the system has at + least one GPU available. 
+ + Raises + ------ + nvml.UninitializedError + If NVML hasn't been initialized. + nvml.LibraryNotFoundError + If the NVML library could not be found. + nvml.GpuNotFoundError + If no GPUs are available. + """ + if _NVML_STATE == _NVMLState.DISABLED_LIBRARY_NOT_FOUND: + raise exceptions.LibraryNotFoundError() + elif not is_initialized(): + raise exceptions.UninitializedError() + elif nvml.device_get_count_v2() == 0: + raise exceptions.GpuNotFoundError() diff --git a/cuda_core/cuda/core/system/_system.pyx b/cuda_core/cuda/core/system/_system.pyx new file mode 100644 index 0000000000..e6163b94fd --- /dev/null +++ b/cuda_core/cuda/core/system/_system.pyx @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +# This file needs to either use NVML exclusively, or when `cuda.bindings._nvml` +# isn't available, fall back to non-NVML-based methods for backward +# compatibility. + + +CUDA_BINDINGS_NVML_IS_COMPATIBLE: bool + +try: + from cuda.bindings._version import __version_tuple__ as _BINDINGS_VERSION +except ImportError: + CUDA_BINDINGS_NVML_IS_COMPATIBLE = False +else: + CUDA_BINDINGS_NVML_IS_COMPATIBLE = _BINDINGS_VERSION >= (13, 1, 2) or (_BINDINGS_VERSION[0] == 12 and _BINDINGS_VERSION[1:3] >= (9, 6)) + + +if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + from cuda.bindings import _nvml as nvml + from ._nvml_context import initialize +else: + from cuda.core._utils.cuda_utils import driver, handle_return, runtime + + +def get_driver_version(kernel_mode: bool = False) -> tuple[int, int]: + """ + Get the driver version. + + Parameters + ---------- + kernel_mode: bool + When `True`, return the kernel-mode driver version, e.g. 580.65.06. + Otherwise, return the user-mode driver version, e.g. 13.0.1. + + Returns + ------- + version: tuple[int, int] + Tuple in the format `(MAJOR, MINOR)`. 
+ """ + return get_driver_version_full(kernel_mode)[:2] + + +def get_driver_version_full(kernel_mode: bool = False) -> tuple[int, int, int]: + """ + Get the full driver version. + + Parameters + ---------- + kernel_mode: bool + When `True`, return the kernel-mode driver version, e.g. 580.65.06. + Otherwise, return the user-mode driver version, e.g. 13.0.1. + + Returns + ------- + version: tuple[int, int, int] + Tuple in the format `(MAJOR, MINOR, PATCH)`. + """ + cdef int v + if kernel_mode: + if not CUDA_BINDINGS_NVML_IS_COMPATIBLE: + raise ValueError("Kernel-mode driver version requires NVML support") + initialize() + return tuple(int(v) for v in nvml.system_get_driver_version().split(".")) + else: + if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + initialize() + v = nvml.system_get_cuda_driver_version() + else: + v = handle_return(driver.cuDriverGetVersion()) + return (v // 1000, (v // 10) % 100, v % 10) + + +def get_nvml_version() -> tuple[int, ...]: + """ + The version of the NVML library. + """ + if not CUDA_BINDINGS_NVML_IS_COMPATIBLE: + raise RuntimeError("NVML library is not available") + return tuple(int(v) for v in nvml.system_get_nvml_version().split(".")) + + +def get_num_devices() -> int: + """ + Return the number of devices in the system. + """ + if CUDA_BINDINGS_NVML_IS_COMPATIBLE: + initialize() + return nvml.device_get_count_v2() + else: + return handle_return(runtime.cudaGetDeviceCount()) + + +def get_process_name(pid: int) -> str: + """ + The name of process with given PID. + + Parameters + ---------- + pid: int + The PID of the process for which to get the name. + + Returns + ------- + name: str + The process name. 
+ """ + initialize() + return nvml.system_get_process_name(pid) + + +__all__ = [ + "get_driver_version", + "get_driver_version_full", + "get_nvml_version", + "get_num_devices", + "get_process_name", + "CUDA_BINDINGS_NVML_IS_COMPATIBLE", +] diff --git a/cuda_core/cuda/core/system/exceptions.py b/cuda_core/cuda/core/system/exceptions.py new file mode 100644 index 0000000000..65bcdd27b5 --- /dev/null +++ b/cuda_core/cuda/core/system/exceptions.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +from cuda.bindings import _nvml as nvml + +NvmlError = nvml.NvmlError +UninitializedError = nvml.UninitializedError +InvalidArgumentError = nvml.InvalidArgumentError +NotSupportedError = nvml.NotSupportedError +NoPermissionError = nvml.NoPermissionError +AlreadyInitializedError = nvml.AlreadyInitializedError +NotFoundError = nvml.NotFoundError +InsufficientSizeError = nvml.InsufficientSizeError +InsufficientPowerError = nvml.InsufficientPowerError +DriverNotLoadedError = nvml.DriverNotLoadedError +TimeoutError = nvml.TimeoutError +IrqIssueError = nvml.IrqIssueError +LibraryNotFoundError = nvml.LibraryNotFoundError +FunctionNotFoundError = nvml.FunctionNotFoundError +CorruptedInforomError = nvml.CorruptedInforomError +GpuIsLostError = nvml.GpuIsLostError +ResetRequiredError = nvml.ResetRequiredError +OperatingSystemError = nvml.OperatingSystemError +LibRmVersionMismatchError = nvml.LibRmVersionMismatchError +InUseError = nvml.InUseError +MemoryError = nvml.MemoryError +NoDataError = nvml.NoDataError +VgpuEccNotSupportedError = nvml.VgpuEccNotSupportedError +InsufficientResourcesError = nvml.InsufficientResourcesError +FreqNotSupportedError = nvml.FreqNotSupportedError +ArgumentVersionMismatchError = nvml.ArgumentVersionMismatchError +DeprecatedError = nvml.DeprecatedError +NotReadyError = nvml.NotReadyError +GpuNotFoundError = nvml.GpuNotFoundError 
+InvalidStateError = nvml.InvalidStateError +ResetTypeNotSupportedError = nvml.ResetTypeNotSupportedError +UnknownError = nvml.UnknownError + + +__all__ = [ + "NvmlError", + "UninitializedError", + "InvalidArgumentError", + "NotSupportedError", + "NoPermissionError", + "AlreadyInitializedError", + "NotFoundError", + "InsufficientSizeError", + "InsufficientPowerError", + "DriverNotLoadedError", + "TimeoutError", + "IrqIssueError", + "LibraryNotFoundError", + "FunctionNotFoundError", + "CorruptedInforomError", + "GpuIsLostError", + "ResetRequiredError", + "OperatingSystemError", + "LibRmVersionMismatchError", + "InUseError", + "MemoryError", + "NoDataError", + "VgpuEccNotSupportedError", + "InsufficientResourcesError", + "FreqNotSupportedError", + "ArgumentVersionMismatchError", + "DeprecatedError", + "NotReadyError", + "GpuNotFoundError", + "InvalidStateError", + "ResetTypeNotSupportedError", + "UnknownError", +] diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 5bd47a4ed2..ac46fa2fa5 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -64,11 +64,21 @@ CUDA compilation toolchain LinkerOptions -CUDA system information ------------------------ +CUDA system information and NVIDIA Management Library (NVML) +------------------------------------------------------------ -.. automethod:: cuda.core._system.System.get_driver_version -.. automethod:: cuda.core._system.System.get_num_devices +.. autosummary:: + :toctree: generated/ + + system.get_driver_version + system.get_driver_version_full + system.get_num_devices + system.get_nvml_version + system.get_process_name + + :template: autosummary/cyclass.rst + + system.Device .. 
module:: cuda.core.utils diff --git a/cuda_core/tests/system/__init__.py b/cuda_core/tests/system/__init__.py new file mode 100644 index 0000000000..79599c77db --- /dev/null +++ b/cuda_core/tests/system/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/cuda_core/tests/system/conftest.py b/cuda_core/tests/system/conftest.py new file mode 100644 index 0000000000..ad2f06bfdb --- /dev/null +++ b/cuda_core/tests/system/conftest.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +import pytest +from cuda.core import system + +skip_if_nvml_unsupported = pytest.mark.skipif( + not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE, reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+" +) diff --git a/cuda_core/tests/system/test_nvml_context.py b/cuda_core/tests/system/test_nvml_context.py new file mode 100644 index 0000000000..199b4a67ad --- /dev/null +++ b/cuda_core/tests/system/test_nvml_context.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +# ruff: noqa: E402 + +from .conftest import skip_if_nvml_unsupported + +pytestmark = skip_if_nvml_unsupported + +import multiprocessing as mp +from platform import uname + +import pytest + +UNINITIALIZED = 0 +INITIALIZED = 1 +DISABLED_LIBRARY_NOT_FOUND = 2 + + +def _run_process(target): + p = mp.get_context("spawn").Process(target=target) + p.start() + p.join() + assert not p.exitcode + + +def _test_uninitialized(): + from cuda.core.system import _nvml_context + + assert _nvml_context._get_nvml_state() == UNINITIALIZED + + +def test_uninitialized(): + _run_process(_test_uninitialized) + + +def _test_is_initialized(): + from cuda.core.system import _nvml_context + + _nvml_context.initialize() + assert _nvml_context._get_nvml_state() == INITIALIZED + assert _nvml_context.is_initialized() is True + + +def test_is_initialized(): + _run_process(_test_is_initialized) + + +@pytest.mark.skipif("microsoft-standard" in uname().release, reason="Probably a WSL system") +def test_no_wsl(): + assert "microsoft-standard" not in uname().release + + +@pytest.mark.skipif("microsoft-standard" not in uname().release, reason="Probably a non-WSL system") +def test_wsl(): + assert "microsoft-standard" in uname().release + + +def _test_validate(): + from cuda.core.system import _nvml_context + + _nvml_context.initialize() + + assert _nvml_context.validate() is None + + +def test_validate(): + _run_process(_test_validate) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py new file mode 100644 index 0000000000..134ea7cbbe --- /dev/null +++ b/cuda_core/tests/system/test_system_device.py @@ -0,0 +1,191 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: E402

from .conftest import skip_if_nvml_unsupported

# Applies the NVML-support skip marker to every test in this module.
pytestmark = skip_if_nvml_unsupported

import array
import os
import re
import sys

import pytest
from cuda.core import system
from cuda.core.system import _device as system_device

if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE:
    from cuda.bindings import _nvml as nvml


@pytest.fixture(autouse=True, scope="module")
def check_gpu_available():
    """Skip the whole module when NVML is unusable or no GPU is present."""
    if not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE or system.get_num_devices() == 0:
        pytest.skip("No GPUs available to run device tests", allow_module_level=True)


def test_device_index_handle():
    # Every enumerated device must expose an integer handle.
    for device in system.Device.get_all_devices():
        assert isinstance(device.handle, int)


def test_device_architecture():
    for device in system.Device.get_all_devices():
        device_arch = device.architecture

        assert isinstance(device_arch, system_device.DeviceArchitecture)
        # Python < 3.12 does not support `value in IntEnum` containment for
        # raw values, so fall back to checking the member values directly.
        if sys.version_info < (3, 12):
            assert device_arch.id in nvml.DeviceArch.__members__.values()
        else:
            assert device_arch.id in nvml.DeviceArch


def test_device_bar1_memory():
    for device in system.Device.get_all_devices():
        bar1_memory_info = device.bar1_memory_info
        free, total, used = (
            bar1_memory_info.free,
            bar1_memory_info.total,
            bar1_memory_info.used,
        )

        assert isinstance(bar1_memory_info, system_device.BAR1MemoryInfo)
        assert isinstance(free, int)
        assert isinstance(total, int)
        assert isinstance(used, int)

        assert free >= 0
        assert total >= 0
        assert used >= 0
        # BAR1 accounting must be self-consistent.
        assert free + used == total


def test_device_cpu_affinity():
    skip_reasons = set()
    for device in system.Device.get_all_devices():
        try:
            affinity = device.cpu_affinity
        except system.NotSupportedError:
            skip_reasons.add(f"CPU affinity not supported on '{device.name}'")
        else:
            assert isinstance(affinity, list)
            # Round-trip the mask through the OS scheduler to prove it is a
            # valid CPU set for this process.
            os.sched_setaffinity(0, affinity)
            assert os.sched_getaffinity(0) == set(affinity)
    if skip_reasons:
        pytest.skip(" ; ".join(skip_reasons))


def test_device_cuda_compute_capability():
    for device in system.Device.get_all_devices():
        cuda_compute_capability = device.cuda_compute_capability
        assert isinstance(cuda_compute_capability, tuple)
        assert len(cuda_compute_capability) == 2
        # Generator form avoids building a throwaway list (ruff C419).
        assert all(isinstance(i, int) for i in cuda_compute_capability)
        assert 3 <= cuda_compute_capability[0] <= 99
        assert 0 <= cuda_compute_capability[1] <= 9


def test_device_memory():
    for device in system.Device.get_all_devices():
        memory_info = device.memory_info
        free, total, used, reserved = memory_info.free, memory_info.total, memory_info.used, memory_info.reserved

        assert isinstance(memory_info, system_device.MemoryInfo)
        assert isinstance(free, int)
        assert isinstance(total, int)
        assert isinstance(used, int)
        assert isinstance(reserved, int)

        assert free >= 0
        assert total >= 0
        assert used >= 0
        assert reserved >= 0
        # Framebuffer accounting must be self-consistent.
        assert free + used + reserved == total


def test_device_name():
    for device in system.Device.get_all_devices():
        name = device.name
        assert isinstance(name, str)
        assert len(name) > 0


def test_device_pci_info():
    for device in system.Device.get_all_devices():
        pci_info = device.pci_info
        assert isinstance(pci_info, system_device.PciInfo)

        assert isinstance(pci_info.bus_id, str)
        # domain:bus:device.function — the dot before the function digit is a
        # literal, so it must be escaped (the original pattern's bare "."
        # matched any character). Raw string per regex convention.
        assert re.match(r"[a-f0-9]{8}:[a-f0-9]{2}:[a-f0-9]{2}\.[a-f0-9]", pci_info.bus_id.lower())
        # Split once instead of three times.
        domain_str, bus_str, dev_func_str = pci_info.bus_id.split(":")
        bus_id_domain = int(domain_str, 16)
        bus_id_bus = int(bus_str, 16)
        bus_id_device = int(dev_func_str[:2], 16)

        assert isinstance(pci_info.domain, int)
        assert 0x00 <= pci_info.domain <= 0xFFFFFFFF
        assert pci_info.domain == bus_id_domain

        assert isinstance(pci_info.bus, int)
        assert 0x00 <= pci_info.bus <= 0xFF
        assert pci_info.bus == bus_id_bus

        assert isinstance(pci_info.device, int)
        assert 0x00 <= pci_info.device <= 0xFF
        assert pci_info.device == bus_id_device

        assert isinstance(pci_info.vendor_id, int)
        assert 0x0000 <= pci_info.vendor_id <= 0xFFFF

        assert isinstance(pci_info.device_id, int)
        assert 0x0000 <= pci_info.device_id <= 0xFFFF


def test_device_serial():
    skip_reasons = set()
    for device in system.Device.get_all_devices():
        try:
            serial = device.serial
        except system.NotSupportedError:
            skip_reasons.add(f"Device serial not supported by device '{device.name}'")
        else:
            assert isinstance(serial, str)
            assert len(serial) > 0

    if skip_reasons:
        pytest.skip(" ; ".join(skip_reasons))


def test_device_uuid():
    for device in system.Device.get_all_devices():
        uuid = device.uuid
        assert isinstance(uuid, str)

        # Expands to GPU-8hex-4hex-4hex-4hex-12hex, where 8hex means 8 consecutive
        # hex characters, e.g.: "GPU-abcdef12-abcd-0123-4567-1234567890ab".
        # The original test described this format but never asserted it.
        # NOTE(review): MIG instances report a "MIG-" prefix via NVML, so both
        # prefixes are accepted here — confirm whether MIG UUIDs can appear.
        assert re.fullmatch(
            r"(GPU|MIG)-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
            uuid,
        )


@pytest.mark.parametrize(
    "params",
    [
        {
            "input": [1152920405096267775, 0],
            "output": list(range(20)) + list(range(40, 60)),
        },
        {
            "input": [17293823668613283840, 65535],
            "output": list(range(20, 40)) + list(range(60, 80)),
        },
        {"input": [18446744073709551615, 0], "output": list(range(64))},
        {"input": [0, 18446744073709551615], "output": list(range(64, 128))},
    ],
)
def test_unpack_bitmask(params):
    # 64-bit words, LSB-first: word k contributes bit indices k*64 .. k*64+63.
    assert system_device._unpack_bitmask(array.array("Q", params["input"])) == params["output"]


def test_unpack_bitmask_single_value():
    # A bare int is not an acceptable bitmask container.
    with pytest.raises(TypeError):
        system_device._unpack_bitmask(1)


# --- next diff hunk: cuda_core/tests/system/test_system_system.py (new file) ---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

# ruff: noqa: E402

import os

import pytest

try:
    from cuda.bindings import driver, runtime
except ImportError:
    from cuda import cuda as driver
    from cuda import cudart as runtime

from cuda.core import Device, system
from cuda.core._utils.cuda_utils import handle_return

from .conftest import skip_if_nvml_unsupported


def test_driver_version():
    # Cross-check system.get_driver_version() against the raw CUDA driver API.
    reported = system.get_driver_version()
    raw = handle_return(driver.cuDriverGetVersion())
    major, remainder = divmod(raw, 1000)
    assert reported == (major, remainder // 10), "Driver version does not match expected value"


def test_num_devices():
    # The reported device count must agree with the CUDA runtime's answer.
    counted = system.get_num_devices()
    assert counted == handle_return(runtime.cudaGetDeviceCount()), "Number of devices does not match expected value"


def test_devices():
    # Device.get_all_devices() must enumerate exactly the runtime's devices, in order.
    enumerated = Device.get_all_devices()
    expected_count = handle_return(runtime.cudaGetDeviceCount())
    reference = tuple(Device(ordinal) for ordinal in range(expected_count))
    assert len(enumerated) == len(reference), "Number of devices does not match expected value"
    for actual, expected in zip(enumerated, reference):
        assert actual.device_id == expected.device_id, "Device ID does not match expected value"


def test_cuda_driver_version():
    # The full driver version is a (major, minor, patch) triple.
    full_version = system.get_driver_version_full()
    assert isinstance(full_version, tuple)
    assert len(full_version) == 3

    major, minor, patch = full_version
    assert major >= 10
    assert 0 <= minor <= 99
    assert 0 <= patch <= 9


@skip_if_nvml_unsupported
def test_gpu_driver_version():
    # Kernel-mode driver version: (major, minor) with an optional patch level.
    kernel_version = system.get_driver_version(kernel_mode=True)
    assert isinstance(kernel_version, tuple)
    assert len(kernel_version) in (2, 3)

    major, minor, *patch = kernel_version
    assert 400 <= major < 1000
    assert minor >= 0
    if patch:
        assert 0 <= patch[0] <= 99


@skip_if_nvml_unsupported
def test_nvml_version():
    # NVML version: (cuda_major, driver_major, driver_minor[, patch]).
    reported = system.get_nvml_version()
    assert isinstance(reported, tuple)
    assert len(reported) in (3, 4)

    cuda_major, major, minor, *patch = reported
    assert cuda_major >= 10
    assert 400 <= major < 1000
    assert minor >= 0
    if patch:
        assert 0 <= patch[0] <= 99


@skip_if_nvml_unsupported
def test_get_process_name():
    # Looking up our own PID should yield the interpreter's process name.
    try:
        own_name = system.get_process_name(os.getpid())
    except system.NotFoundError:
        pytest.skip("Process not found")

    assert isinstance(own_name, str)
    assert "python" in own_name


def test_device_count():
    # get_num_devices() returns a non-negative int even on GPU-less hosts.
    reported_count = system.get_num_devices()
    assert isinstance(reported_count, int)
    assert reported_count >= 0


# --- next diff hunk: cuda_core/tests/test_system.py (deleted file) ---
-# SPDX-License-Identifier: Apache-2.0 - -try: - from cuda.bindings import driver, runtime -except ImportError: - from cuda import cuda as driver - from cuda import cudart as runtime - -from cuda.core import Device, system -from cuda.core._utils.cuda_utils import handle_return - - -def test_system_singleton(): - system1 = system - system2 = system - assert id(system1) == id(system2), "system is not a singleton" - - -def test_driver_version(): - driver_version = system.get_driver_version() - version = handle_return(driver.cuDriverGetVersion()) - expected_driver_version = (version // 1000, (version % 1000) // 10) - assert driver_version == expected_driver_version, "Driver version does not match expected value" - - -def test_num_devices(): - num_devices = system.get_num_devices() - expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) - assert num_devices == expected_num_devices, "Number of devices does not match expected value" - - -def test_devices(): - devices = Device.get_all_devices() - expected_num_devices = handle_return(runtime.cudaGetDeviceCount()) - expected_devices = tuple(Device(device_id) for device_id in range(expected_num_devices)) - assert len(devices) == len(expected_devices), "Number of devices does not match expected value" - for device, expected_device in zip(devices, expected_devices): - assert device.device_id == expected_device.device_id, "Device ID does not match expected value"