From 5f7c1e19ae841674f282061c735397e1efa20b2f Mon Sep 17 00:00:00 2001 From: Xinwei Liu Date: Mon, 28 Jul 2025 15:37:16 +1000 Subject: [PATCH 1/5] feat: enable batch deployment to improve performance Added support for batch deployment of multiple deployments. Previously, all deployments were loaded into a single Helm chart file, causing timeouts and performance degradation during large-scale installs. This update splits the process into manageable batches to avoid overload and improve reliability. --- .../runkperf/commands/bench/node100_pod10k.go | 14 +++++-- .../deployments/templates/deployments.tpl | 2 +- .../workload/deployments/values.yaml | 1 + contrib/utils/utils.go | 3 +- contrib/utils/utils_common.go | 41 +++++++++++++++++++ 5 files changed, 55 insertions(+), 6 deletions(-) diff --git a/contrib/cmd/runkperf/commands/bench/node100_pod10k.go b/contrib/cmd/runkperf/commands/bench/node100_pod10k.go index 46d98fa3..8871341a 100644 --- a/contrib/cmd/runkperf/commands/bench/node100_pod10k.go +++ b/contrib/cmd/runkperf/commands/bench/node100_pod10k.go @@ -102,13 +102,19 @@ func benchNode100DeploymentNPod10KRun(cliCtx *cli.Context) (*internaltypes.Bench // NOTE: The name pattern should be aligned with ../../../../internal/manifests/loadprofile/node100_pod10k.yaml. deploymentNamePattern := "benchmark" - // TODO(xinwei): Implement batching support for deploying deployments after decoupling it from rolling update logic. 
- ruCleanupFn, err := utils.DeployDeployments(dpCtx, - kubeCfgPath, deploymentNamePattern, total, replica, paddingBytes, 10*time.Minute) + bm := utils.DeploymentBatchManager{ + KubeCfgPath: kubeCfgPath, + DeploymentNamePattern: deploymentNamePattern, + DeploymentReplica: replica, + PaddingBytes: paddingBytes, + } + + err = bm.Add(dpCtx, total) + defer bm.CleanAll() + if err != nil { return nil, fmt.Errorf("failed to setup workload: %w", err) } - defer ruCleanupFn() err = dumpDeploymentReplicas(ctx, kubeCfgPath, deploymentNamePattern, total) if err != nil { diff --git a/contrib/internal/manifests/workload/deployments/templates/deployments.tpl b/contrib/internal/manifests/workload/deployments/templates/deployments.tpl index 38df56c9..a0b1e780 100644 --- a/contrib/internal/manifests/workload/deployments/templates/deployments.tpl +++ b/contrib/internal/manifests/workload/deployments/templates/deployments.tpl @@ -1,7 +1,7 @@ {{- $pattern := .Values.namePattern }} {{- $replica := int .Values.replica }} {{- $paddingBytes := int .Values.paddingBytes }} -{{- range $index := (untilStep 0 (int .Values.total) 1) }} +{{- range $index := (untilStep (int .Values.start) (int .Values.total) 1) }} apiVersion: v1 kind: Namespace metadata: diff --git a/contrib/internal/manifests/workload/deployments/values.yaml b/contrib/internal/manifests/workload/deployments/values.yaml index bab75055..75f7d4fe 100644 --- a/contrib/internal/manifests/workload/deployments/values.yaml +++ b/contrib/internal/manifests/workload/deployments/values.yaml @@ -2,3 +2,4 @@ namePattern: "benchmark" total: 5 replica: 2000 paddingBytes: 0 +start: 0 \ No newline at end of file diff --git a/contrib/utils/utils.go b/contrib/utils/utils.go index 9eb3fdbb..3d9403db 100644 --- a/contrib/utils/utils.go +++ b/contrib/utils/utils.go @@ -121,7 +121,7 @@ func DeployDeployments( ctx context.Context, kubeCfgPath string, releaseName string, - total, replica, paddingBytes int, + total, replica, paddingBytes int, start int, 
deployTimeout time.Duration, ) (cleanupFn func(), retErr error) { infoLogger := log.GetLogger(ctx).WithKeyValues("level", "info") @@ -148,6 +148,7 @@ func DeployDeployments( fmt.Sprintf("total=%d", total), fmt.Sprintf("replica=%d", replica), fmt.Sprintf("paddingBytes=%d", paddingBytes), + fmt.Sprintf("start=%d", start), ), ) if err != nil { diff --git a/contrib/utils/utils_common.go b/contrib/utils/utils_common.go index 845d8ea0..fda12836 100644 --- a/contrib/utils/utils_common.go +++ b/contrib/utils/utils_common.go @@ -4,6 +4,8 @@ package utils import ( + "context" + "fmt" "time" ) @@ -61,5 +63,44 @@ func WithJobWaitTimeoutOpt(to time.Duration) JobTimeoutOpt { func WithJobDeleteTimeoutOpt(to time.Duration) JobTimeoutOpt { return func(jto *jobsTimeoutOption) { jto.deleteTimeout = to + + } +} + +var deploymentBatchSize int = 20 + +type DeploymentBatchManager struct { + KubeCfgPath string + DeploymentNamePattern string + DeploymentReplica int + PaddingBytes int + cleanups []func() +} + +func (bm *DeploymentBatchManager) Add(ctx context.Context, total int) error { + for start := 0; start < total; start += deploymentBatchSize { + // Create a unique name for each deployment batch + namePattern := fmt.Sprintf("%s-%d", bm.DeploymentNamePattern, start/deploymentBatchSize) + + // Calculate the current batch size, ensuring it does not exceed the total + currentBatchSize := deploymentBatchSize + if start+currentBatchSize > total { + currentBatchSize = total - start + } + + cleanup, err := DeployDeployments(ctx, bm.KubeCfgPath, namePattern, start+currentBatchSize, bm.DeploymentReplica, + bm.PaddingBytes, start, 10*time.Minute) + if err != nil { + return err + } + // Store the cleanup function to be called later + bm.cleanups = append(bm.cleanups, cleanup) + } + return nil +} + +func (bm *DeploymentBatchManager) CleanAll() { + for i := len(bm.cleanups) - 1; i >= 0; i-- { + bm.cleanups[i]() } } From b8f418f725f90902fb78a91d1532288d166005da Mon Sep 17 00:00:00 2001 From: Xinwei 
Liu Date: Mon, 28 Jul 2025 15:48:15 +1000 Subject: [PATCH 2/5] fix the linter --- contrib/utils/utils_common.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/utils/utils_common.go b/contrib/utils/utils_common.go index fda12836..fcc79804 100644 --- a/contrib/utils/utils_common.go +++ b/contrib/utils/utils_common.go @@ -67,7 +67,7 @@ func WithJobDeleteTimeoutOpt(to time.Duration) JobTimeoutOpt { } } -var deploymentBatchSize int = 20 +var deploymentBatchSize = 20 type DeploymentBatchManager struct { KubeCfgPath string From 43f1cbf3c24849e56376fdb54ee08b648efa3baf Mon Sep 17 00:00:00 2001 From: Xinwei Liu Date: Tue, 29 Jul 2025 14:32:10 +1000 Subject: [PATCH 3/5] Fix: group start with the other int parameters of DeployDeployments --- contrib/utils/utils.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/utils/utils.go b/contrib/utils/utils.go index 3d9403db..71985cb0 100644 --- a/contrib/utils/utils.go +++ b/contrib/utils/utils.go @@ -121,7 +121,7 @@ func DeployDeployments( ctx context.Context, kubeCfgPath string, releaseName string, - total, replica, paddingBytes int, start int, + total, replica, paddingBytes, start int, deployTimeout time.Duration, ) (cleanupFn func(), retErr error) { infoLogger := log.GetLogger(ctx).WithKeyValues("level", "info") From b38cbb1ad2ec432c8139333b10e700b9db07f129 Mon Sep 17 00:00:00 2001 From: Xinwei Liu Date: Tue, 29 Jul 2025 14:54:23 +1000 Subject: [PATCH 4/5] Fix: correct loop range and batch size logic for deployment --- .../manifests/workload/deployments/templates/deployments.tpl | 2 +- contrib/utils/utils_common.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/internal/manifests/workload/deployments/templates/deployments.tpl b/contrib/internal/manifests/workload/deployments/templates/deployments.tpl index a0b1e780..66a21507 100644 --- a/contrib/internal/manifests/workload/deployments/templates/deployments.tpl +++ 
b/contrib/internal/manifests/workload/deployments/templates/deployments.tpl @@ -1,7 +1,7 @@ {{- $pattern := .Values.namePattern }} {{- $replica := int .Values.replica }} {{- $paddingBytes := int .Values.paddingBytes }} -{{- range $index := (untilStep (int .Values.start) (int .Values.total) 1) }} +{{- range $index := (untilStep (int .Values.start) (int (add (int .Values.start) (int .Values.total))) 1) }} apiVersion: v1 kind: Namespace metadata: diff --git a/contrib/utils/utils_common.go b/contrib/utils/utils_common.go index fcc79804..197ff2c0 100644 --- a/contrib/utils/utils_common.go +++ b/contrib/utils/utils_common.go @@ -88,7 +88,7 @@ func (bm *DeploymentBatchManager) Add(ctx context.Context, total int) error { currentBatchSize = total - start } - cleanup, err := DeployDeployments(ctx, bm.KubeCfgPath, namePattern, start+currentBatchSize, bm.DeploymentReplica, + cleanup, err := DeployDeployments(ctx, bm.KubeCfgPath, namePattern, currentBatchSize, bm.DeploymentReplica, bm.PaddingBytes, start, 10*time.Minute) if err != nil { return err From 220957ac428c5f2380cda61995f0646f241c2278 Mon Sep 17 00:00:00 2001 From: Xinwei Liu Date: Tue, 29 Jul 2025 15:42:31 +1000 Subject: [PATCH 5/5] Add deploymentBatchSize to DeploymentBatchManager struct --- contrib/cmd/runkperf/commands/bench/node100_pod10k.go | 1 + contrib/utils/utils_common.go | 9 ++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/contrib/cmd/runkperf/commands/bench/node100_pod10k.go b/contrib/cmd/runkperf/commands/bench/node100_pod10k.go index 8871341a..a0747885 100644 --- a/contrib/cmd/runkperf/commands/bench/node100_pod10k.go +++ b/contrib/cmd/runkperf/commands/bench/node100_pod10k.go @@ -107,6 +107,7 @@ func benchNode100DeploymentNPod10KRun(cliCtx *cli.Context) (*internaltypes.Bench DeploymentNamePattern: deploymentNamePattern, DeploymentReplica: replica, PaddingBytes: paddingBytes, + DeploymentBatchSize: 20, } err = bm.Add(dpCtx, total) diff --git a/contrib/utils/utils_common.go 
b/contrib/utils/utils_common.go index 197ff2c0..331292b6 100644 --- a/contrib/utils/utils_common.go +++ b/contrib/utils/utils_common.go @@ -67,23 +67,22 @@ func WithJobDeleteTimeoutOpt(to time.Duration) JobTimeoutOpt { } } -var deploymentBatchSize = 20 - type DeploymentBatchManager struct { KubeCfgPath string DeploymentNamePattern string DeploymentReplica int PaddingBytes int + DeploymentBatchSize int cleanups []func() } func (bm *DeploymentBatchManager) Add(ctx context.Context, total int) error { - for start := 0; start < total; start += deploymentBatchSize { + for start := 0; start < total; start += bm.DeploymentBatchSize { // Create a unique name for each deployment batch - namePattern := fmt.Sprintf("%s-%d", bm.DeploymentNamePattern, start/deploymentBatchSize) + namePattern := fmt.Sprintf("%s-%d", bm.DeploymentNamePattern, start/bm.DeploymentBatchSize) // Calculate the current batch size, ensuring it does not exceed the total - currentBatchSize := deploymentBatchSize + currentBatchSize := bm.DeploymentBatchSize if start+currentBatchSize > total { currentBatchSize = total - start }