Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/config/v1/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,12 @@ const (
DeviceIDStrategyIndex = "index"
)

// Constants to represent the various allocation policies for replicated
// and MIG resources.
const (
	// AllocationPolicyDistributed spreads allocated replicas evenly across
	// all physical GPUs.
	AllocationPolicyDistributed = "distributed"
	// AllocationPolicyPacked consolidates allocated replicas onto as few
	// physical GPUs as possible.
	AllocationPolicyPacked = "packed"
)

// Constants related to generating CDI specifications
const (
DefaultCDIAnnotationPrefix = cdiapi.AnnotationPrefix
Expand Down
3 changes: 3 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ type PluginCommandLineFlags struct {
CDIAnnotationPrefix *string `json:"cdiAnnotationPrefix" yaml:"cdiAnnotationPrefix"`
NvidiaCTKPath *string `json:"nvidiaCTKPath" yaml:"nvidiaCTKPath"`
ContainerDriverRoot *string `json:"containerDriverRoot" yaml:"containerDriverRoot"`
AllocationPolicy *string `json:"allocationPolicy" yaml:"allocationPolicy"`
}

// deviceListStrategyFlag is a custom type for parsing the deviceListStrategy flag.
Expand Down Expand Up @@ -157,6 +158,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.Plugin.NvidiaCTKPath, c, n)
case "container-driver-root":
updateFromCLIFlag(&f.Plugin.ContainerDriverRoot, c, n)
case "allocation-policy":
updateFromCLIFlag(&f.Plugin.AllocationPolicy, c, n)
}
// GFD specific flags
if f.GFD == nil {
Expand Down
15 changes: 15 additions & 0 deletions cmd/nvidia-device-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ func main() {
Usage: "the path on the host where MPS-specific mounts and files are created by the MPS control daemon manager",
EnvVars: []string{"MPS_ROOT"},
},
&cli.StringFlag{
Name: "allocation-policy",
Value: spec.AllocationPolicyDistributed,
Usage: "the allocation policy for replicated and MIG resources:\n\t\t[distributed | packed]",
EnvVars: []string{"ALLOCATION_POLICY"},
},
&cli.StringFlag{
Name: "device-discovery-strategy",
Value: "auto",
Expand Down Expand Up @@ -205,6 +211,15 @@ func validateFlags(infolib nvinfo.Interface, config *spec.Config) error {
return fmt.Errorf("invalid --device-id-strategy option: %v", *config.Flags.Plugin.DeviceIDStrategy)
}

if config.Flags.Plugin.AllocationPolicy != nil {
switch *config.Flags.Plugin.AllocationPolicy {
case spec.AllocationPolicyDistributed:
case spec.AllocationPolicyPacked:
default:
return fmt.Errorf("invalid --allocation-policy option: %v", *config.Flags.Plugin.AllocationPolicy)
}
}

if config.Sharing.SharingStrategy() == spec.SharingStrategyMPS {
if *config.Flags.MigStrategy == spec.MigStrategyMixed {
return fmt.Errorf("using --mig-strategy=mixed is not supported with MPS")
Expand Down
75 changes: 55 additions & 20 deletions internal/rm/allocate.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,26 @@ import (
"sort"
)

// distributedAlloc returns a list of devices such that any replicated
// devices are distributed across all replicated GPUs equally. It takes into
// account already allocated replicas to ensure a proper balance across them.
func (r *resourceManager) distributedAlloc(available, required []string, size int) ([]string, error) {
// Get the set of candidate devices as the difference between available and required.
// replicaCount tracks the total and available replica counts for a physical GPU.
// total is the number of replicas that exist for the GPU, while available is
// how many of those replicas have not yet been allocated; the difference
// (total - available) is the number of replicas already in use.
type replicaCount struct {
	total, available int
}

// prepareCandidates filters candidates from available devices (excluding required),
// validates there are enough, and builds a per-GPU replica count map.
func (r *resourceManager) prepareCandidates(available, required []string, size int) ([]string, map[string]*replicaCount, int, error) {
candidates := r.devices.Subset(available).Difference(r.devices.Subset(required)).GetIDs()
needed := size - len(required)

if len(candidates) < needed {
return nil, fmt.Errorf("not enough available devices to satisfy allocation")
return nil, nil, 0, fmt.Errorf("not enough available devices to satisfy allocation")
}

// For each candidate device, build a mapping of (stripped) device ID to
// total / available replicas for that device.
replicas := make(map[string]*struct{ total, available int })
replicas := make(map[string]*replicaCount)
for _, c := range candidates {
id := AnnotatedID(c).GetID()
if _, exists := replicas[id]; !exists {
replicas[id] = &struct{ total, available int }{}
replicas[id] = &replicaCount{}
}
replicas[id].available++
}
Expand All @@ -51,13 +52,20 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in
replicas[id].total++
}

// Grab the set of 'needed' devices one-by-one from the candidates list.
// Before selecting each candidate, first sort the candidate list using the
// replicas map above. After sorting, the first element in the list will
// contain the device with the least difference between total and available
// replications (based on what's already been allocated). Add this device
// to the list of devices to allocate, remove it from the candidate list,
// down its available count in the replicas map, and repeat.
return candidates, replicas, needed, nil
}

// distributedAlloc returns a list of devices such that any replicated
// devices are distributed across all replicated GPUs equally. It takes into
// account already allocated replicas to ensure a proper balance across them.
func (r *resourceManager) distributedAlloc(available, required []string, size int) ([]string, error) {
candidates, replicas, needed, err := r.prepareCandidates(available, required, size)
if err != nil {
return nil, err
}

// Select devices one-by-one, preferring GPUs with the fewest allocated
// replicas to spread workload evenly across physical GPUs.
var devices []string
for i := 0; i < needed; i++ {
sort.Slice(candidates, func(i, j int) bool {
Expand All @@ -73,8 +81,35 @@ func (r *resourceManager) distributedAlloc(available, required []string, size in
candidates = candidates[1:]
}

// Add the set of required devices to this list and return it.
devices = append(required, devices...)
return append(required, devices...), nil
}

// packedAlloc returns a list of devices such that any replicated devices are
// packed onto as few physical GPUs as possible. It preferentially allocates
// replicas from GPUs that already have the most allocated replicas, which
// helps consolidate workloads and free up entire GPUs for other uses.
func (r *resourceManager) packedAlloc(available, required []string, size int) ([]string, error) {
candidates, replicas, needed, err := r.prepareCandidates(available, required, size)
if err != nil {
return nil, err
}

// Select devices one-by-one, preferring GPUs with the most allocated
// replicas to consolidate onto fewer physical GPUs.
var devices []string
for i := 0; i < needed; i++ {
sort.Slice(candidates, func(i, j int) bool {
iid := AnnotatedID(candidates[i]).GetID()
jid := AnnotatedID(candidates[j]).GetID()
idiff := replicas[iid].total - replicas[iid].available
jdiff := replicas[jid].total - replicas[jid].available
return idiff > jdiff

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was the only change in this function and distributedAlloc the comparator between idiff and jdiff? Can we just pass in an enum and then have a condition on the enum to dictate this behavior? There is a lot of code duplication at the moment.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review!

The duplication is intentional — I wanted distributedAlloc to stay completely untouched so reviewers can verify there's zero behavioral change just by looking at the diff.

On merging them: the two functions share the same structure because packed is just the inverse of distributed (flipped comparator). But as you mentioned with topology-aware bin-packing, a future policy would need NVML topology data and different sorting logic entirely — so I'd rather keep them separate now than merge and split again later.

That said, happy to extract the common setup (candidate filtering + replica counting) into a shared helper if that feels cleaner.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem :) to be honest I think it would actually be easier to review/maintain if you generalised the allocation function slightly.

It could be generalised to fractionalAlloc, taking an enum that allows distributedFractionPlacement or packedFractionPlacement policies. Then internally you just pick the comparator based on the policy.

What you suggested with pulling out the common helpers such as candidate filtering and replica counting is probably even better than the generalisation I suggested, as it enables the policies to be kept completely separate, and is more extensible in that it enables easier implementation of new policies again such as the topology-aware one.

When I was reviewing the code, I pulled up both distributedAlloc and packedAlloc side by side and was cross-checking for a while before I saw the flipped comparator. I expect other reviewers and contributors to go through a similar process. This is just my two cents though.

You've also improved the state of testing so that's an additional signal that this is functionally equivalent to what there was before.

In terms of the different topology-aware placement policy, I think it makes sense to keep it separate for now. But cool that you're opening this up and making it more configurable! I may build off of this myself.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ottowhite I've extracted prepareCandidates in the second commit.
Each policy function now only has its sorting logic, so the difference should be immediately visible

Copy link

@ottowhite ottowhite Feb 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking better! There's still quite a bit of duplication across both of the Alloc functions. What is common is that they're both greedy allocation algorithms (repeatedly evaluating and selecting the next best option, without looking further ahead). Could you extract this into another helper greedyDeviceAlloc or something similar that makes the difference between these two functions even clearer? Could pass in a candidate comparator. It will also be much clearer to extend with other greedy allocation policies. For example the one that I spoke about is another greedy allocation algorithm that happens to use topology-awareness in it's comparator.

})
id := AnnotatedID(candidates[0]).GetID()
replicas[id].available--
devices = append(devices, candidates[0])
candidates = candidates[1:]
}

return devices, nil
return append(required, devices...), nil
}
Loading