Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/config/v1/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ const (
DeviceIDStrategyIndex = "index"
)

// Constants to represent the various allocation policies for replicated
// and MIG resources.
const (
	// AllocationPolicyDistributed spreads allocated replicas evenly across
	// all physical GPUs.
	AllocationPolicyDistributed = "distributed"
	// AllocationPolicyPacked consolidates allocated replicas onto as few
	// physical GPUs as possible.
	AllocationPolicyPacked = "packed"
)

// Constants related to generating CDI specifications
const (
DefaultCDIAnnotationPrefix = cdiapi.AnnotationPrefix
Expand Down
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ type PluginCommandLineFlags struct {
CDIAnnotationPrefix *string `json:"cdiAnnotationPrefix" yaml:"cdiAnnotationPrefix"`
NvidiaCTKPath *string `json:"nvidiaCTKPath" yaml:"nvidiaCTKPath"`
ContainerDriverRoot *string `json:"containerDriverRoot" yaml:"containerDriverRoot"`
AllocationPolicy *string `json:"allocationPolicy" yaml:"allocationPolicy"`
}

// deviceListStrategyFlag is a custom type for parsing the deviceListStrategy flag.
Expand Down Expand Up @@ -157,6 +158,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.Plugin.NvidiaCTKPath, c, n)
case "container-driver-root":
updateFromCLIFlag(&f.Plugin.ContainerDriverRoot, c, n)
case "allocation-policy":
updateFromCLIFlag(&f.Plugin.AllocationPolicy, c, n)
}
// GFD specific flags
if f.GFD == nil {
Expand Down
15 changes: 15 additions & 0 deletions cmd/nvidia-device-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ func main() {
Usage: "the path on the host where MPS-specific mounts and files are created by the MPS control daemon manager",
EnvVars: []string{"MPS_ROOT"},
},
&cli.StringFlag{
Name: "allocation-policy",
Value: spec.AllocationPolicyDistributed,
Usage: "the allocation policy for replicated and MIG resources:\n\t\t[distributed | packed]",
EnvVars: []string{"ALLOCATION_POLICY"},
},
&cli.StringFlag{
Name: "device-discovery-strategy",
Value: "auto",
Expand Down Expand Up @@ -205,6 +211,15 @@ func validateFlags(infolib nvinfo.Interface, config *spec.Config) error {
return fmt.Errorf("invalid --device-id-strategy option: %v", *config.Flags.Plugin.DeviceIDStrategy)
}

if config.Flags.Plugin.AllocationPolicy != nil {
switch *config.Flags.Plugin.AllocationPolicy {
case spec.AllocationPolicyDistributed:
case spec.AllocationPolicyPacked:
default:
return fmt.Errorf("invalid --allocation-policy option: %v", *config.Flags.Plugin.AllocationPolicy)
}
}

if config.Sharing.SharingStrategy() == spec.SharingStrategyMPS {
if *config.Flags.MigStrategy == spec.MigStrategyMixed {
return fmt.Errorf("using --mig-strategy=mixed is not supported with MPS")
Expand Down
75 changes: 55 additions & 20 deletions internal/rm/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,26 @@ import (
"sort"
)

// distributedAlloc returns a list of devices such that any replicated
// devices are distributed across all replicated GPUs equally. It takes into
// account already allocated replicas to ensure a proper balance across them.
func (r *resourceManager) distributedAlloc(available, required []string, size int) ([]string, error) {
// Get the set of candidate devices as the difference between available and required.
// replicaCount tracks the total and available replica counts for a physical GPU.
// total is the number of replicas that exist for the GPU, while available is
// how many of those replicas have not yet been allocated; the difference
// (total - available) is the number of replicas already in use.
type replicaCount struct {
	total, available int
}

// prepareCandidates filters candidates from available devices (excluding required),
// validates there are enough, and builds a per-GPU replica count map.
func (r *resourceManager) prepareCandidates(available, required []string, size int) ([]string, map[string]*replicaCount, int, error) {
candidates := r.devices.Subset(available).Difference(r.devices.Subset(required)).GetIDs()
needed := size - len(required)

if len(candidates) < needed {
return nil, fmt.Errorf("not enough available devices to satisfy allocation")
return nil, nil, 0, fmt.Errorf("not enough available devices to satisfy allocation")
}

// For each candidate device, build a mapping of (stripped) device ID to
// total / available replicas for that device.
replicas := make(map[string]*struct{ total, available int })
replicas := make(map[string]*replicaCount)
for _, c := range candidates {
id := AnnotatedID(c).GetID()
if _, exists := replicas[id]; !exists {
replicas[id] = &struct{ total, available int }{}
replicas[id] = &replicaCount{}
}
replicas[id].available++
}
Expand All @@ -51,13 +52,20 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in
replicas[id].total++
}

// Grab the set of 'needed' devices one-by-one from the candidates list.
// Before selecting each candidate, first sort the candidate list using the
// replicas map above. After sorting, the first element in the list will
// contain the device with the least difference between total and available
// replications (based on what's already been allocated). Add this device
// to the list of devices to allocate, remove it from the candidate list,
// down its available count in the replicas map, and repeat.
return candidates, replicas, needed, nil
}

// distributedAlloc returns a list of devices such that any replicated
// devices are distributed across all replicated GPUs equally. It takes into
// account already allocated replicas to ensure a proper balance across them.
func (r *resourceManager) distributedAlloc(available, required []string, size int) ([]string, error) {
candidates, replicas, needed, err := r.prepareCandidates(available, required, size)
if err != nil {
return nil, err
}

// Select devices one-by-one, preferring GPUs with the fewest allocated
// replicas to spread workload evenly across physical GPUs.
var devices []string
for i := 0; i < needed; i++ {
sort.Slice(candidates, func(i, j int) bool {
Expand All @@ -73,8 +81,35 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in
candidates = candidates[1:]
}

// Add the set of required devices to this list and return it.
devices = append(required, devices...)
return append(required, devices...), nil
}

// packedAlloc returns a list of devices such that any replicated devices are
// packed onto as few physical GPUs as possible. It preferentially allocates
// replicas from GPUs that already have the most allocated replicas, which
// helps consolidate workloads and free up entire GPUs for other uses.
func (r *resourceManager) packedAlloc(available, required []string, size int) ([]string, error) {
candidates, replicas, needed, err := r.prepareCandidates(available, required, size)
if err != nil {
return nil, err
}

// Select devices one-by-one, preferring GPUs with the most allocated
// replicas to consolidate onto fewer physical GPUs.
var devices []string
for i := 0; i < needed; i++ {
sort.Slice(candidates, func(i, j int) bool {
iid := AnnotatedID(candidates[i]).GetID()
jid := AnnotatedID(candidates[j]).GetID()
idiff := replicas[iid].total - replicas[iid].available
jdiff := replicas[jid].total - replicas[jid].available
return idiff > jdiff

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was the only change in this function and distributedAlloc the comparator between idiff and jdiff? Can we just pass in an enum and then have a condition on the enum to dictate this behavior? There is a lot of code duplication at the moment.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review!

The duplication is intentional — I wanted distributedAlloc to stay completely untouched so reviewers can verify there's zero behavioral change just by looking at the diff.

On merging them: the two functions share the same structure because packed is just the inverse of distributed (flipped comparator). But as you mentioned with topology-aware bin-packing, a future policy would need NVML topology data and different sorting logic entirely — so I'd rather keep them separate now than merge and split again later.

That said, happy to extract the common setup (candidate filtering + replica counting) into a shared helper if that feels cleaner.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem :) to be honest I think it would actually be easier to review/maintain if you generalised the allocation function slightly.

It could be generalised to fractionalAlloc, taking an enum that allows distributedFractionPlacement or packedFractionPlacement policies. Then internally you just pick the comparator based on the policy.

What you suggested with pulling out the common helpers such as candidate filtering and replica counting is probably even better than the generalisation I suggested, as it enables the policies to be kept completely separate, and is more extensible in that it enables easier implementation of new policies again such as the topology-aware one.

When I was reviewing the code, I pulled up both distributedAlloc and packedAlloc side by side and was cross-checking for a while before I saw the flipped comparator. I expect other reviewers and contributors to go through a similar process. This is just my two cents though.

You've also improved the state of testing so that's an additional signal that this is functionally equivalent to what there was before.

In terms of the different topology-aware placement policy, I think it makes sense to keep it separate for now. But cool that you're opening this up and making it more configurable! I may build off of this myself.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ottowhite I've extracted prepareCandidates in the second commit.
Each policy function now only has its sorting logic, so the difference should be immediately visible

Copy link

@ottowhite ottowhite Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking better! There's still quite a bit of duplication across both of the Alloc functions. What is common is that they're both greedy allocation algorithms (repeatedly evaluating and selecting the next best option, without looking further ahead). Could you extract this into another helper greedyDeviceAlloc or something similar that makes the difference between these two functions even clearer? Could pass in a candidate comparator. It will also be much clearer to extend with other greedy allocation policies. For example the one that I spoke about is another greedy allocation algorithm that happens to use topology-awareness in it's comparator.

})
id := AnnotatedID(candidates[0]).GetID()
replicas[id].available--
devices = append(devices, candidates[0])
candidates = candidates[1:]
}

return devices, nil
return append(required, devices...), nil
}
Loading