Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 48 additions & 9 deletions internal/pkg/nvmlprovider/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"log/slog"
"strconv"
"strings"
"sync"

"github.com/NVIDIA/go-nvml/pkg/nvml"
)
Expand Down Expand Up @@ -62,13 +63,18 @@ func SetClient(n NVML) {
// nvmlProvider implements the NVML interface.
//
// It holds a sync.RWMutex, so a value must not be copied after first use;
// share it through a pointer.
type nvmlProvider struct {
// initialized is set to true by newNVMLProvider on its success path.
// preCheck returns an error while it is false.
initialized bool
// migCache is keyed by the uuid passed to GetMIGDeviceInfoByID. A nil entry
// records a lookup that previously failed (negative cache), so the failing
// query is not repeated.
migCache map[string]*MIGDeviceInfo
// lock protects migCache
lock sync.RWMutex
}

func newNVMLProvider() (NVML, error) {
// Check if an NVML client already exists and return it if so.
if Client() != nil && Client().(nvmlProvider).initialized {
slog.Info("NVML already initialized.")
return Client(), nil
if Client() != nil {
if p, ok := Client().(*nvmlProvider); ok && p.initialized {
slog.Info("NVML already initialized.")
return Client(), nil
}
}

slog.Info("Attempting to initialize NVML library.")
Expand All @@ -79,10 +85,10 @@ func newNVMLProvider() (NVML, error) {
return nvmlProvider{initialized: false}, err
}

return nvmlProvider{initialized: true}, nil
return &nvmlProvider{initialized: true, migCache: make(map[string]*MIGDeviceInfo)}, nil
}

func (n nvmlProvider) preCheck() error {
func (n *nvmlProvider) preCheck() error {
if !n.initialized {
return fmt.Errorf("NVML library not initialized")
}
Expand All @@ -91,18 +97,51 @@ func (n nvmlProvider) preCheck() error {
}

// GetMIGDeviceInfoByID returns information about the MIG device identified by the given UUID.
func (n nvmlProvider) GetMIGDeviceInfoByID(uuid string) (*MIGDeviceInfo, error) {
func (n *nvmlProvider) GetMIGDeviceInfoByID(uuid string) (*MIGDeviceInfo, error) {
if err := n.preCheck(); err != nil {
slog.Error(fmt.Sprintf("failed to get MIG Device Info; err: %v", err))
return nil, err
}

// Check cache first (including negative cache)
n.lock.RLock()
if info, ok := n.migCache[uuid]; ok {
n.lock.RUnlock()
if info == nil {
return nil, fmt.Errorf("previously failed to get MIG device info")
}
return info, nil
}
n.lock.RUnlock()

device, ret := nvml.DeviceGetHandleByUUID(uuid)
if ret == nvml.SUCCESS {
return getMIGDeviceInfoForNewDriver(device)
info, err := getMIGDeviceInfoForNewDriver(device)
if err == nil {
n.lock.Lock()
n.migCache[uuid] = info
n.lock.Unlock()
return info, nil
}
// Negative cache on failure
n.lock.Lock()
n.migCache[uuid] = nil
n.lock.Unlock()
return nil, err
}

return getMIGDeviceInfoForOldDriver(uuid)
info, err := getMIGDeviceInfoForOldDriver(uuid)
if err == nil {
n.lock.Lock()
n.migCache[uuid] = info
n.lock.Unlock()
return info, nil
}
// Negative cache on failure
n.lock.Lock()
n.migCache[uuid] = nil
n.lock.Unlock()
return nil, err
}

// getMIGDeviceInfoForNewDriver identifies MIG Device Information for drivers >= R470 (470.42.01+),
Expand Down Expand Up @@ -167,7 +206,7 @@ func getMIGDeviceInfoForOldDriver(uuid string) (*MIGDeviceInfo, error) {
}

// Cleanup performs cleanup operations for the NVML provider
func (n nvmlProvider) Cleanup() {
func (n *nvmlProvider) Cleanup() {
if err := n.preCheck(); err == nil {
reset()
}
Expand Down
15 changes: 15 additions & 0 deletions internal/pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ import (
"github.com/gorilla/mux"
"github.com/prometheus/exporter-toolkit/web"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"

"github.com/NVIDIA/dcgm-exporter/internal/pkg/appconfig"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/debug"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/devicewatchlistmanager"
Expand Down Expand Up @@ -101,7 +103,20 @@ func NewMetricsServer(
}
}

if podMapper != nil {
if wl, exists := deviceWatchListManager.EntityWatchList(dcgm.FE_GPU); exists {
podMapper.DeviceInfo = wl.DeviceInfo()
} else {
slog.Warn("Could not find FE_GPU watchlist to configure PodMapper")
}
go podMapper.Run()
}

cleanup := func() {
if podMapper != nil {
slog.Info("Stopping PodMapper")
podMapper.Stop()
}
if podMapper != nil && c.KubernetesEnableDRA && podMapper.ResourceSliceManager != nil {
slog.Info("Stopping ResourceSliceManager")
podMapper.ResourceSliceManager.Stop()
Expand Down
Loading
Loading