From 8224fde3d6c2f69ca1d9d469cf84dabc01a65216 Mon Sep 17 00:00:00 2001 From: xdkaine <55013938+xdkaine@users.noreply.github.com> Date: Tue, 17 Feb 2026 16:02:15 -0800 Subject: [PATCH 1/2] fix clone race condition: WaitForDisk also requires the VM name in config --- internal/proxmox/types.go | 1 + internal/proxmox/vms.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/proxmox/types.go b/internal/proxmox/types.go index 245e505..6a98323 100644 --- a/internal/proxmox/types.go +++ b/internal/proxmox/types.go @@ -108,6 +108,7 @@ type ProxmoxNodeStatus struct { } type VirtualResourceConfig struct { + Name string `json:"name"` HardDisk string `json:"scsi0"` Lock string `json:"lock"` Net0 string `json:"net0"` diff --git a/internal/proxmox/vms.go b/internal/proxmox/vms.go index 18fb991..53d95bc 100644 --- a/internal/proxmox/vms.go +++ b/internal/proxmox/vms.go @@ -181,7 +181,7 @@ func (s *ProxmoxService) WaitForDisk(node string, vmID int, maxWait time.Duratio log.Printf("%+v", configResp) - if configResp.HardDisk != "" { + if configResp.HardDisk != "" && configResp.Name != "" { log.Printf("/nodes/%s/storage/%s/content?vmid=%d", s.Config.Nodes[0], s.Config.StorageID, vmID) pendingReq := tools.ProxmoxAPIRequest{ From 5b4e2f10d669613620b120e405a8caaeee80c136 Mon Sep 17 00:00:00 2001 From: xdkaine <55013938+xdkaine@users.noreply.github.com> Date: Tue, 17 Feb 2026 16:20:38 -0800 Subject: [PATCH 2/2] fix clone race condition, part 2: wait for router disks before configuring VNets --- internal/cloning/cloning_service.go | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/internal/cloning/cloning_service.go b/internal/cloning/cloning_service.go index 23b1342..c97a8b4 100644 --- a/internal/cloning/cloning_service.go +++ b/internal/cloning/cloning_service.go @@ -270,7 +270,23 @@ func (cs *CloningService) CloneTemplate(req CloneRequest) error { // Release the vmid allocation mutex now that all of the VMs are cloned on proxmox cs.vmidMutex.Unlock() - // 9. 
Configure VNet of all VMs + // 9. Wait for all router disks to be fully available before configuring VNets. + // Proxmox clone is two-phase: the clone lock (Phase 1) releases before the storage + // backend finishes writing the disk (Phase 2). If SetPodVnet runs before Phase 2 + // completes, Proxmox's disk finalization can overwrite the net1 config change, + // leaving the router connected to the wrong vnet. + log.Printf("Waiting for router disks to be available before configuring VNets") + routerDiskReady := make(map[int]bool) + for _, routerInfo := range clonedRouters { + log.Printf("Waiting for router disk to be available for %s (VMID: %d)", routerInfo.TargetName, routerInfo.VMID) + if err := cs.ProxmoxService.WaitForDisk(routerInfo.Node, routerInfo.VMID, cs.Config.RouterWaitTimeout); err != nil { + errors = append(errors, fmt.Sprintf("router disk unavailable for %s: %v", routerInfo.TargetName, err)) + } else { + routerDiskReady[routerInfo.VMID] = true + } + } + + // 10. Configure VNet of all VMs log.Printf("Configuring VNets for %d targets", len(req.Targets)) for _, target := range req.Targets { vnetName := fmt.Sprintf("kamino%d", target.PodNumber) @@ -281,7 +297,7 @@ func (cs *CloningService) CloneTemplate(req CloneRequest) error { } } - // 10. Start all routers and wait for them to be running + // 11. 
Start all routers and wait for them to be running req.SSE.Send( ProgressMessage{ Message: "Starting routers", @@ -290,11 +306,7 @@ func (cs *CloningService) CloneTemplate(req CloneRequest) error { ) log.Printf("Starting %d routers", len(clonedRouters)) for _, routerInfo := range clonedRouters { - // Wait for router disk to be available - log.Printf("Waiting for router disk to be available for %s (VMID: %d)", routerInfo.TargetName, routerInfo.VMID) - err = cs.ProxmoxService.WaitForDisk(routerInfo.Node, routerInfo.VMID, cs.Config.RouterWaitTimeout) - if err != nil { - errors = append(errors, fmt.Sprintf("router disk unavailable for %s: %v", routerInfo.TargetName, err)) + if !routerDiskReady[routerInfo.VMID] { continue } @@ -314,7 +326,7 @@ func (cs *CloningService) CloneTemplate(req CloneRequest) error { } } - // 11. Configure all pod routers (separate step after all routers are running) + // 12. Configure all pod routers (separate step after all routers are running) req.SSE.Send( ProgressMessage{ Message: "Configuring pod routers",