diff --git a/internal/rm/health.go b/internal/rm/health.go
index 46f036bc6..afcec3053 100644
--- a/internal/rm/health.go
+++ b/internal/rm/health.go
@@ -21,6 +21,8 @@ import (
 	"os"
 	"strconv"
 	"strings"
+	"sync"
+	"time"
 
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
 	"k8s.io/klog/v2"
@@ -38,9 +40,16 @@ const (
 	// Note that this also allows individual XIDs to be selected when ALL XIDs
 	// are disabled.
 	envEnableHealthChecks = "DP_ENABLE_HEALTHCHECKS"
+
+	// polledHealthCheckInterval defines how frequently the polled health checks
+	// run. These checks cover conditions not detectable via NVML events, such as
+	// remapped rows, retired pages pending status, and GPU temperature.
+	polledHealthCheckInterval = 30 * time.Second
 )
 
-// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
+// checkHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices.
+// It combines event-based monitoring (XID errors, ECC errors) with periodic polled checks
+// (remapped rows, retired pages, temperature).
 func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error {
 	xids := getDisabledHealthCheckXids()
 	if xids.IsAllDisabled() {
@@ -72,6 +81,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 	}()
 
 	parentToDeviceMap := make(map[string]*Device)
+	parentToDevicesMap := make(map[string][]*Device)
 	deviceIDToGiMap := make(map[string]uint32)
 	deviceIDToCiMap := make(map[string]uint32)
 
@@ -86,6 +96,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 		deviceIDToGiMap[d.ID] = gi
 		deviceIDToCiMap[d.ID] = ci
 		parentToDeviceMap[uuid] = d
+		parentToDevicesMap[uuid] = append(parentToDevicesMap[uuid], d)
 
 		gpu, ret := r.nvml.DeviceGetHandleByUUID(uuid)
 		if ret != nvml.SUCCESS {
@@ -111,9 +122,20 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 		}
 	}
 
+	// Launch polled health checks (remapped rows, retired pages, temperature)
+	// in parallel with the event-based health check loop.
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		r.polledHealthChecks(stop, parentToDevicesMap, unhealthy)
+	}()
+
+	// Run the event-based health check loop.
 	for {
 		select {
 		case <-stop:
+			wg.Wait()
 			return nil
 		default:
 		}
@@ -130,8 +152,36 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 			continue
 		}
 
+		// Handle double-bit (uncorrectable) ECC errors.
+		if e.EventType == nvml.EventTypeDoubleBitEccError {
+			eventUUID, ret := e.Device.GetUUID()
+			if ret != nvml.SUCCESS {
+				klog.Infof("Failed to determine uuid for DoubleBitEccError event: %v; Marking all devices as unhealthy.", ret)
+				for _, d := range devices {
+					unhealthy <- d
+				}
+				continue
+			}
+			klog.Infof("DoubleBitEccError on Device=%s; marking device(s) as unhealthy.", eventUUID)
+			for _, d := range parentToDevicesMap[eventUUID] {
+				unhealthy <- d
+			}
+			continue
+		}
+
+		// Log single-bit (correctable) ECC errors but do not mark unhealthy.
+		if e.EventType == nvml.EventTypeSingleBitEccError {
+			eventUUID, ret := e.Device.GetUUID()
+			if ret != nvml.SUCCESS {
+				klog.Warningf("Failed to determine uuid for SingleBitEccError event: %v", ret)
+			} else {
+				klog.Warningf("SingleBitEccError on Device=%s (correctable; not marking unhealthy).", eventUUID)
+			}
+			continue
+		}
+
 		if e.EventType != nvml.EventTypeXidCriticalError {
-			klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", e)
+			klog.Infof("Skipping unhandled event type: %+v", e)
 			continue
 		}
 
@@ -171,6 +221,146 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 	}
 }
 
+// polledHealthChecks runs periodic health checks that cannot be detected via
+// NVML events. These cover hardware conditions such as remapped memory rows,
+// pending retired pages, and GPU temperature reaching the shutdown threshold.
+func (r *nvmlResourceManager) polledHealthChecks(stop <-chan interface{}, parentToDevicesMap map[string][]*Device, unhealthy chan<- *Device) {
+	ticker := time.NewTicker(polledHealthCheckInterval)
+	defer ticker.Stop()
+
+	// Track devices already reported unhealthy to avoid duplicate reports.
+	reported := make(map[string]bool)
+
+	for {
+		select {
+		case <-stop:
+			return
+		case <-ticker.C:
+			for uuid, devices := range parentToDevicesMap {
+				allReported := true
+				for _, d := range devices {
+					if !reported[d.ID] {
+						allReported = false
+						break
+					}
+				}
+				if allReported {
+					continue
+				}
+
+				gpu, ret := r.nvml.DeviceGetHandleByUUID(uuid)
+				if ret != nvml.SUCCESS {
+					klog.Warningf("Unable to get device handle for %v during polled health check: %v", uuid, ret)
+					continue
+				}
+
+				checks := []struct {
+					name  string
+					check func(nvml.Device) (string, bool)
+				}{
+					{"RemappedRows", r.checkRemappedRows},
+					{"RetiredPages", r.checkRetiredPages},
+					{"Temperature", r.checkTemperature},
+				}
+
+				for _, hc := range checks {
+					reason, failed := hc.check(gpu)
+					if !failed {
+						continue
+					}
+					klog.Infof("%s health check failed for %v: %s; marking device(s) as unhealthy.", hc.name, uuid, reason)
+					for _, d := range devices {
+						if !reported[d.ID] {
+							reported[d.ID] = true
+							unhealthy <- d
+						}
+					}
+					break
+				}
+			}
+		}
+	}
+}
+
+// checkRemappedRows checks whether the GPU has experienced a row remapping
+// failure or has a pending row remap that requires a GPU reset.
+// See: https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html
+func (r *nvmlResourceManager) checkRemappedRows(gpu nvml.Device) (string, bool) {
+	_, uncRows, isPending, failureOccurred, ret := gpu.GetRemappedRows()
+	if ret == nvml.ERROR_NOT_SUPPORTED {
+		return "", false
+	}
+	if ret != nvml.SUCCESS {
+		klog.Warningf("Failed to get remapped rows: %v", ret)
+		return "", false
+	}
+
+	if failureOccurred {
+		return "row remapping failure occurred (uncorrectable memory error)", true
+	}
+	if isPending {
+		return "row remapping is pending (GPU reset required)", true
+	}
+	if uncRows > 0 {
+		klog.Warningf("GPU has %d uncorrectable remapped row(s); rows were successfully remapped", uncRows)
+	}
+	return "", false
+}
+
+// checkRetiredPages checks whether the GPU has pages pending retirement.
+// Pending page retirements indicate that the GPU requires a reboot to complete
+// the retirement of faulty memory pages.
+func (r *nvmlResourceManager) checkRetiredPages(gpu nvml.Device) (string, bool) {
+	status, ret := gpu.GetRetiredPagesPendingStatus()
+	if ret == nvml.ERROR_NOT_SUPPORTED {
+		return "", false
+	}
+	if ret != nvml.SUCCESS {
+		klog.Warningf("Failed to get retired pages pending status: %v", ret)
+		return "", false
+	}
+
+	if status == nvml.FEATURE_ENABLED {
+		return "pages are pending retirement (reboot required)", true
+	}
+	return "", false
+}
+
+// checkTemperature checks whether the GPU temperature has reached or exceeded
+// the hardware shutdown threshold. A GPU at this temperature will be shut down
+// by the hardware to prevent damage. If the temperature has reached the
+// slowdown threshold, a warning is logged but the device is not marked unhealthy.
+func (r *nvmlResourceManager) checkTemperature(gpu nvml.Device) (string, bool) {
+	shutdownTemp, ret := gpu.GetTemperatureThreshold(nvml.TEMPERATURE_THRESHOLD_SHUTDOWN)
+	if ret == nvml.ERROR_NOT_SUPPORTED {
+		return "", false
+	}
+	if ret != nvml.SUCCESS {
+		klog.Warningf("Failed to get shutdown temperature threshold: %v", ret)
+		return "", false
+	}
+
+	currentTemp, ret := gpu.GetTemperature(nvml.TEMPERATURE_GPU)
+	if ret == nvml.ERROR_NOT_SUPPORTED {
+		return "", false
+	}
+	if ret != nvml.SUCCESS {
+		klog.Warningf("Failed to get current GPU temperature: %v", ret)
+		return "", false
+	}
+
+	if currentTemp >= shutdownTemp {
+		return fmt.Sprintf("GPU temperature (%d°C) has reached shutdown threshold (%d°C)", currentTemp, shutdownTemp), true
+	}
+
+	slowdownTemp, ret := gpu.GetTemperatureThreshold(nvml.TEMPERATURE_THRESHOLD_SLOWDOWN)
+	if ret == nvml.SUCCESS && currentTemp >= slowdownTemp {
+		klog.Warningf("GPU temperature (%d°C) has reached slowdown threshold (%d°C); GPU is thermally throttling", currentTemp, slowdownTemp)
+	}
+
+	return "", false
+}
+
 const allXIDs = 0
 
 // disabledXIDs stores a map of explicitly disabled XIDs.
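Reviewer note (not part of the patch): the three polled checks wrap plain go-nvml queries, so they are easy to exercise outside the plugin when validating behavior on real hardware. The sketch below is illustrative only; it uses device index 0 arbitrarily, whereas the plugin resolves handles by UUID.

```go
// Standalone sketch of the NVML queries the new polled checks are built on.
package main

import (
	"fmt"

	"github.com/NVIDIA/go-nvml/pkg/nvml"
)

func main() {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		panic(nvml.ErrorString(ret))
	}
	defer nvml.Shutdown()

	// Index 0 is an arbitrary choice for illustration.
	dev, ret := nvml.DeviceGetHandleByIndex(0)
	if ret != nvml.SUCCESS {
		panic(nvml.ErrorString(ret))
	}

	// Row remapping state: isPending means a GPU reset is required;
	// failureOccurred means remapping failed (uncorrectable memory error).
	if corrRows, uncRows, isPending, failureOccurred, ret := dev.GetRemappedRows(); ret == nvml.SUCCESS {
		fmt.Printf("remapped rows: corr=%d unc=%d pending=%v failed=%v\n",
			corrRows, uncRows, isPending, failureOccurred)
	}

	// Page retirement: FEATURE_ENABLED means pages are pending retirement
	// and the GPU needs a reboot to complete it.
	if pending, ret := dev.GetRetiredPagesPendingStatus(); ret == nvml.SUCCESS {
		fmt.Printf("retired pages pending: %v\n", pending == nvml.FEATURE_ENABLED)
	}

	// Current temperature versus the hardware shutdown threshold.
	temp, ret1 := dev.GetTemperature(nvml.TEMPERATURE_GPU)
	limit, ret2 := dev.GetTemperatureThreshold(nvml.TEMPERATURE_THRESHOLD_SHUTDOWN)
	if ret1 == nvml.SUCCESS && ret2 == nvml.SUCCESS {
		fmt.Printf("temperature: %d°C (shutdown threshold %d°C)\n", temp, limit)
	}
}
```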
diff --git a/internal/rm/health_test.go b/internal/rm/health_test.go
index 6f50dccb8..964012185 100644
--- a/internal/rm/health_test.go
+++ b/internal/rm/health_test.go
@@ -21,6 +21,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
 	"github.com/stretchr/testify/require"
 )
 
@@ -221,3 +222,300 @@ func TestGetDisabledHealthCheckXids(t *testing.T) {
 		})
 	}
 }
+
+func TestCheckRemappedRows(t *testing.T) {
+	rm := &nvmlResourceManager{}
+
+	testCases := []struct {
+		description     string
+		device          *DeviceMock
+		expectedReason  string
+		expectedFailure bool
+	}{
+		{
+			description: "not supported returns healthy",
+			device: &DeviceMock{
+				GetRemappedRowsFunc: func() (int, int, bool, bool, nvml.Return) {
+					return 0, 0, false, false, nvml.ERROR_NOT_SUPPORTED
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "nvml error returns healthy",
+			device: &DeviceMock{
+				GetRemappedRowsFunc: func() (int, int, bool, bool, nvml.Return) {
+					return 0, 0, false, false, nvml.ERROR_UNKNOWN
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "no issues returns healthy",
+			device: &DeviceMock{
+				GetRemappedRowsFunc: func() (int, int, bool, bool, nvml.Return) {
+					return 0, 0, false, false, nvml.SUCCESS
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "uncorrectable rows remapped successfully returns healthy",
+			device: &DeviceMock{
+				GetRemappedRowsFunc: func() (int, int, bool, bool, nvml.Return) {
+					return 2, 3, false, false, nvml.SUCCESS
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "row remapping failure returns unhealthy",
+			device: &DeviceMock{
+				GetRemappedRowsFunc: func() (int, int, bool, bool, nvml.Return) {
+					return 0, 0, false, true, nvml.SUCCESS
+				},
+			},
+			expectedReason:  "row remapping failure occurred (uncorrectable memory error)",
+			expectedFailure: true,
+		},
+		{
+			description: "pending row remap returns unhealthy",
+			device: &DeviceMock{
+				GetRemappedRowsFunc: func() (int, int, bool, bool, nvml.Return) {
+					return 0, 0, true, false, nvml.SUCCESS
+				},
+			},
+			expectedReason:  "row remapping is pending (GPU reset required)",
+			expectedFailure: true,
+		},
+		{
+			description: "failure takes precedence over pending",
+			device: &DeviceMock{
+				GetRemappedRowsFunc: func() (int, int, bool, bool, nvml.Return) {
+					return 0, 1, true, true, nvml.SUCCESS
+				},
+			},
+			expectedReason:  "row remapping failure occurred (uncorrectable memory error)",
+			expectedFailure: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			reason, failed := rm.checkRemappedRows(tc.device)
+			require.Equal(t, tc.expectedFailure, failed)
+			require.Equal(t, tc.expectedReason, reason)
+		})
+	}
+}
+
+func TestCheckRetiredPages(t *testing.T) {
+	rm := &nvmlResourceManager{}
+
+	testCases := []struct {
+		description     string
+		device          *DeviceMock
+		expectedReason  string
+		expectedFailure bool
+	}{
+		{
+			description: "not supported returns healthy",
+			device: &DeviceMock{
+				GetRetiredPagesPendingStatusFunc: func() (nvml.EnableState, nvml.Return) {
+					return nvml.FEATURE_DISABLED, nvml.ERROR_NOT_SUPPORTED
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "nvml error returns healthy",
+			device: &DeviceMock{
+				GetRetiredPagesPendingStatusFunc: func() (nvml.EnableState, nvml.Return) {
+					return nvml.FEATURE_DISABLED, nvml.ERROR_UNKNOWN
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "no pending retired pages returns healthy",
+			device: &DeviceMock{
+				GetRetiredPagesPendingStatusFunc: func() (nvml.EnableState, nvml.Return) {
+					return nvml.FEATURE_DISABLED, nvml.SUCCESS
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "pending retired pages returns unhealthy",
+			device: &DeviceMock{
+				GetRetiredPagesPendingStatusFunc: func() (nvml.EnableState, nvml.Return) {
+					return nvml.FEATURE_ENABLED, nvml.SUCCESS
+				},
+			},
+			expectedReason:  "pages are pending retirement (reboot required)",
+			expectedFailure: true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			reason, failed := rm.checkRetiredPages(tc.device)
+			require.Equal(t, tc.expectedFailure, failed)
+			require.Equal(t, tc.expectedReason, reason)
+		})
+	}
+}
+
+func TestCheckTemperature(t *testing.T) {
+	rm := &nvmlResourceManager{}
+
+	testCases := []struct {
+		description     string
+		device          *DeviceMock
+		expectedReason  string
+		expectedFailure bool
+	}{
+		{
+			description: "shutdown threshold not supported returns healthy",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					return 0, nvml.ERROR_NOT_SUPPORTED
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "shutdown threshold error returns healthy",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					return 0, nvml.ERROR_UNKNOWN
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "temperature sensor not supported returns healthy",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					return 100, nvml.SUCCESS
+				},
+				GetTemperatureFunc: func(ts nvml.TemperatureSensors) (uint32, nvml.Return) {
+					return 0, nvml.ERROR_NOT_SUPPORTED
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "temperature sensor error returns healthy",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					return 100, nvml.SUCCESS
+				},
+				GetTemperatureFunc: func(ts nvml.TemperatureSensors) (uint32, nvml.Return) {
+					return 0, nvml.ERROR_UNKNOWN
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "temperature below shutdown returns healthy",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					switch tt {
+					case nvml.TEMPERATURE_THRESHOLD_SHUTDOWN:
+						return 100, nvml.SUCCESS
+					case nvml.TEMPERATURE_THRESHOLD_SLOWDOWN:
+						return 90, nvml.SUCCESS
+					}
+					return 0, nvml.ERROR_NOT_SUPPORTED
+				},
+				GetTemperatureFunc: func(ts nvml.TemperatureSensors) (uint32, nvml.Return) {
+					return 50, nvml.SUCCESS
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "temperature at slowdown but below shutdown returns healthy",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					switch tt {
+					case nvml.TEMPERATURE_THRESHOLD_SHUTDOWN:
+						return 100, nvml.SUCCESS
+					case nvml.TEMPERATURE_THRESHOLD_SLOWDOWN:
+						return 90, nvml.SUCCESS
+					}
+					return 0, nvml.ERROR_NOT_SUPPORTED
+				},
+				GetTemperatureFunc: func(ts nvml.TemperatureSensors) (uint32, nvml.Return) {
+					return 92, nvml.SUCCESS
+				},
+			},
+			expectedFailure: false,
+		},
+		{
+			description: "temperature at shutdown threshold returns unhealthy",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					switch tt {
+					case nvml.TEMPERATURE_THRESHOLD_SHUTDOWN:
+						return 100, nvml.SUCCESS
+					case nvml.TEMPERATURE_THRESHOLD_SLOWDOWN:
+						return 90, nvml.SUCCESS
+					}
+					return 0, nvml.ERROR_NOT_SUPPORTED
+				},
+				GetTemperatureFunc: func(ts nvml.TemperatureSensors) (uint32, nvml.Return) {
+					return 100, nvml.SUCCESS
+				},
+			},
+			expectedReason:  "GPU temperature (100°C) has reached shutdown threshold (100°C)",
+			expectedFailure: true,
+		},
+		{
+			description: "temperature above shutdown threshold returns unhealthy",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					switch tt {
+					case nvml.TEMPERATURE_THRESHOLD_SHUTDOWN:
+						return 100, nvml.SUCCESS
+					case nvml.TEMPERATURE_THRESHOLD_SLOWDOWN:
+						return 90, nvml.SUCCESS
+					}
+					return 0, nvml.ERROR_NOT_SUPPORTED
+				},
+				GetTemperatureFunc: func(ts nvml.TemperatureSensors) (uint32, nvml.Return) {
+					return 105, nvml.SUCCESS
+				},
+			},
+			expectedReason:  "GPU temperature (105°C) has reached shutdown threshold (100°C)",
+			expectedFailure: true,
+		},
+		{
+			description: "slowdown threshold error does not affect result",
+			device: &DeviceMock{
+				GetTemperatureThresholdFunc: func(tt nvml.TemperatureThresholds) (uint32, nvml.Return) {
+					switch tt {
+					case nvml.TEMPERATURE_THRESHOLD_SHUTDOWN:
+						return 100, nvml.SUCCESS
+					case nvml.TEMPERATURE_THRESHOLD_SLOWDOWN:
+						return 0, nvml.ERROR_NOT_SUPPORTED
+					}
+					return 0, nvml.ERROR_NOT_SUPPORTED
+				},
+				GetTemperatureFunc: func(ts nvml.TemperatureSensors) (uint32, nvml.Return) {
+					return 85, nvml.SUCCESS
+				},
+			},
+			expectedFailure: false,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			reason, failed := rm.checkTemperature(tc.device)
+			require.Equal(t, tc.expectedFailure, failed)
+			require.Equal(t, tc.expectedReason, reason)
+		})
+	}
+}
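Reviewer note (not part of the patch): because polledHealthChecks dispatches through a small name/check table, further polled conditions can follow the same `func(nvml.Device) (string, bool)` shape and be registered by appending an entry such as `{"FanSpeed", r.checkFanSpeed}` to the `checks` slice. The sketch below is hypothetical: `checkFanSpeed` and its 95% threshold are invented for illustration, and the test assumes `DeviceMock` mocks the full `nvml.Device` interface so that a `GetFanSpeedFunc` field exists. It would live in `internal/rm` with the same imports as health.go and health_test.go.

```go
// Hypothetical polled check, same shape as checkRemappedRows/checkRetiredPages/
// checkTemperature. The 95% threshold is an invented example value, not an
// NVML-defined limit.
func (r *nvmlResourceManager) checkFanSpeed(gpu nvml.Device) (string, bool) {
	speed, ret := gpu.GetFanSpeed()
	if ret == nvml.ERROR_NOT_SUPPORTED {
		return "", false
	}
	if ret != nvml.SUCCESS {
		klog.Warningf("Failed to get fan speed: %v", ret)
		return "", false
	}
	if speed >= 95 {
		return fmt.Sprintf("fan running at %d%% of its maximum speed", speed), true
	}
	return "", false
}

// A matching test in the same table-driven style as those above.
func TestCheckFanSpeed(t *testing.T) {
	rm := &nvmlResourceManager{}
	device := &DeviceMock{
		GetFanSpeedFunc: func() (uint32, nvml.Return) {
			return 97, nvml.SUCCESS
		},
	}
	reason, failed := rm.checkFanSpeed(device)
	require.True(t, failed)
	require.Equal(t, "fan running at 97% of its maximum speed", reason)
}
```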