From df8524ec3246dccaf7839ce06c6538c0b9d04c8c Mon Sep 17 00:00:00 2001
From: Syeda Anjum
Date: Tue, 10 Feb 2026 14:57:30 -0600
Subject: [PATCH 01/12] update new structure

---
 .../{ => base}/configure_benchmark.sh              |  0
 .../inference-perf-bench/{ => base}/job.yaml       |  0
 .../{ => base}/kustomization.yaml                  |  0
 .../{ => base}/pod-monitoring.yaml                 |  0
 .../{ => base}/set-compute-class.yaml              |  0
 .../{ => base}/templates/benchmarking.tpl.env      |  0
 .../templates/configmap-benchmark.tpl.yaml         |  0
 ...ecretproviderclass-huggingface-tokens.tpl.yaml  |  0
 .../sd-eagle/kustomization.yaml                    |  0
 .../vllm-spec-decoding/sd-eagle/patch-data.yaml    |  0
 .../vllm-spec-decoding/sd-eagle/patch-load.yaml    |  0
 .../sd-ngram/kustomization.yaml                    |  0
 .../vllm-spec-decoding/sd-ngram/patch-data.yaml    |  0
 .../vllm-spec-decoding/sd-ngram/patch-load.yaml    |  0
 .../inference-perf-bench/vllm/kustomization.yaml   | 15 +++++++++++++++
 .../inference-perf-bench/vllm/patch-data.yaml      |  0
 .../inference-perf-bench/vllm/patch-load.yaml      |  0
 17 files changed, 15 insertions(+)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{ => base}/configure_benchmark.sh (100%)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{ => base}/job.yaml (100%)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{ => base}/kustomization.yaml (100%)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{ => base}/pod-monitoring.yaml (100%)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{ => base}/set-compute-class.yaml (100%)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{ => base}/templates/benchmarking.tpl.env (100%)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{ => base}/templates/configmap-benchmark.tpl.yaml (100%)
 rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{ => base}/templates/secretproviderclass-huggingface-tokens.tpl.yaml (100%)
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml

diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configure_benchmark.sh
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/configure_benchmark.sh
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configure_benchmark.sh
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/job.yaml
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/job.yaml
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/job.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/kustomization.yaml
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/pod-monitoring.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/pod-monitoring.yaml
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/pod-monitoring.yaml
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/pod-monitoring.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/set-compute-class.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/set-compute-class.yaml
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/set-compute-class.yaml
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/set-compute-class.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/templates/benchmarking.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/benchmarking.tpl.env
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/templates/benchmarking.tpl.env
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/benchmarking.tpl.env
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/templates/configmap-benchmark.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/configmap-benchmark.tpl.yaml
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/templates/configmap-benchmark.tpl.yaml
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/configmap-benchmark.tpl.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/templates/secretproviderclass-huggingface-tokens.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/secretproviderclass-huggingface-tokens.tpl.yaml
similarity index 100%
rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/templates/secretproviderclass-huggingface-tokens.tpl.yaml
rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/secretproviderclass-huggingface-tokens.tpl.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
new file mode 100644
index 000000000..831f4e0ad
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
@@ -0,0 +1,15 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - runtime.env
+    name: runtime
+    namespace: replaced-by-kustomize
+
+nameSuffix: -vllm
+
+patches:
+  - path: patch-load.yaml
+  - path: patch-data.yaml
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml
new file mode 100644
index 000000000..e69de29bb
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml
new file mode 100644
index 000000000..e69de29bb
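[PATCH 01 only moves the shared manifests under base/ and stubs out empty overlay files; the overlays become buildable once the later patches fill them in. As a rough sketch of how a base-plus-overlay layout like this is normally consumed (paths are the ones introduced in this patch; nothing here is prescribed by the series itself):

    # Hypothetical usage once the overlays are populated:
    cd platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench
    kubectl kustomize vllm/    # render the merged manifests to stdout
    kubectl apply -k vllm/     # or build and apply in one step
]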
From f183a41ad95ee8f6467e9a2523b0a494c14655f3 Mon Sep 17 00:00:00 2001
From: Syeda Anjum
Date: Tue, 10 Feb 2026 15:50:01 -0600
Subject: [PATCH 02/12] update kustomize

---
 .../sd-eagle/kustomization.yaml               | 30 +++++++++++++++++++
 .../sd-eagle/patch-data.yaml                  |  2 ++
 .../sd-eagle/patch-load.yaml                  | 11 +++++++
 .../sd-ngram/kustomization.yaml               | 30 +++++++++++++++++++
 .../sd-ngram/patch-data.yaml                  |  2 ++
 .../sd-ngram/patch-load.yaml                  | 11 +++++++
 .../vllm/kustomization.yaml                   | 19 ++++++++++--
 .../inference-perf-bench/vllm/patch-data.yaml | 14 +++++++++
 .../inference-perf-bench/vllm/patch-load.yaml | 11 +++++++
 9 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
index e69de29bb..ba8f5ddf5 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
@@ -0,0 +1,30 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - benchmarking-spec.env
+    name: benchmarking-spec
+    namespace: replaced-by-kustomize
+
+resources:
+  - ../base
+nameSuffix: -vllm-sd-eagle
+
+patches:
+  - path: patch-load.yaml
+  - path: patch-data.yaml
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml
index e69de29bb..a05091c9a 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml
@@ -0,0 +1,2 @@
+data:
+  type: cnn_dailymail
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml
index e69de29bb..0ac67fea8 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml
@@ -0,0 +1,11 @@
+load:
+  type: constant
+  interval: 1.0
+  sweep:
+    type: linear
+    timeout: 250
+    num_stages: 7
+    stage_duration: 30
+  num_workers: 20
+  worker_max_concurrency: 15
+  worker_max_tcp_connections: 2500
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
index e69de29bb..88d0d54e9 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
@@ -0,0 +1,30 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+configMapGenerator:
+  - envs:
+      - benchmarking-spec.env
+    name: benchmarking-spec
+    namespace: replaced-by-kustomize
+
+resources:
+  - ../base
+nameSuffix: -vllm-sd-ngram
+
+patches:
+  - path: patch-load.yaml
+  - path: patch-data.yaml
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml
index e69de29bb..009564643 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml
@@ -0,0 +1,2 @@
+data:
+  type: shareGPT
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml
index e69de29bb..0ac67fea8 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml
@@ -0,0 +1,11 @@
+load:
+  type: constant
+  interval: 1.0
+  sweep:
+    type: linear
+    timeout: 250
+    num_stages: 7
+    stage_duration: 30
+  num_workers: 20
+  worker_max_concurrency: 15
+  worker_max_tcp_connections: 2500
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
index 831f4e0ad..496f047d4 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
@@ -1,13 +1,28 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 ---
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 configMapGenerator:
   - envs:
-      - runtime.env
-    name: runtime
+      - benchmarking-spec.env
+    name: benchmarking-spec
     namespace: replaced-by-kustomize
 
+resources:
+  - ../base
 nameSuffix: -vllm
 
 patches:
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml
index e69de29bb..23e23d835 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml
@@ -0,0 +1,14 @@
+data:
+  type: synthetic
+  input_distribution: # For synthetic/random types
+    min: 10 # Minimum prompt length (tokens)
+    max: 100 # Maximum prompt length
+    mean: 50 # Average length
+    std: 10 # Standard deviation
+    total_count: 7000 # Total prompts to generate. Choose a large number for sweep so that you don't run out of prompts as the QPS increases to saturation
+  output_distribution: # Same structure as input_distribution
+    min: 40
+    max: 400
+    mean: 200
+    std: 50
+    total_count: 7000
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml
index e69de29bb..0ac67fea8 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml
@@ -0,0 +1,11 @@
+load:
+  type: constant
+  interval: 1.0
+  sweep:
+    type: linear
+    timeout: 250
+    num_stages: 7
+    stage_duration: 30
+  num_workers: 20
+  worker_max_concurrency: 15
+  worker_max_tcp_connections: 2500
\ No newline at end of file
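[The patch-load.yaml and patch-data.yaml fragments above are not Kubernetes objects but pieces of the inference-perf benchmark configuration (its load and data sections), so a kustomize strategic-merge patch, which needs a full object with apiVersion/kind/metadata to target, cannot apply them; PATCH 03 below switches to string replacements for exactly this reason. As a minimal sketch of where the merged fragments end up, assuming the inference-perf CLI packaged in the quay.io/inference-perf image used later in this series:

    # Hypothetical local run; config.yaml is the merged api/server/load/data config.
    inference-perf --config_file config.yaml
]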
From f09d760305ed0a20190be8b77007fd6433ed129a Mon Sep 17 00:00:00 2001
From: Syeda Anjum
Date: Wed, 11 Feb 2026 12:37:56 -0600
Subject: [PATCH 03/12] update kustomize

---
 .../base/configmap-benchmark.yaml             |  36 ++++++
 .../base/configure_benchmark.sh               |   2 +-
 .../base/kustomization.yaml                   | 114 +++++++++---------
 .../sd-eagle/kustomization.yaml               |  44 +++++--
 .../sd-ngram/kustomization.yaml               |  44 +++++--
 .../vllm/kustomization.yaml                   |  43 +++++--
 6 files changed, 193 insertions(+), 90 deletions(-)
 create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml

diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml
new file mode 100644
index 000000000..d677f07f4
--- /dev/null
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml
@@ -0,0 +1,36 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+LOAD_BLOCK_HERE
+api:
+  type: completion
+  streaming: true
+server:
+  type: vllm
+  model_name: /gcs/${HF_MODEL_ID}
+  base_url: http://${APP_LABEL}.${BENCHMARKING_KUBERNETES_NAMESPACE}.svc.cluster.local:8000
+  ignore_eos: true
+tokenizer:
+  pretrained_model_name_or_path: ${HF_MODEL_ID}
+DATA_BLOCK_HERE
+metrics:
+  type: prometheus
+  prometheus:
+    scrape_interval: 15
+    google_managed: true
+report:
+  request_lifecycle:
+    summary: true
+storage:
+  google_cloud_storage:
+    bucket_name: ${hub_models_bucket_bench_results_name}
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configure_benchmark.sh
index 3beba4ef6..a05fd0f3b 100755
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configure_benchmark.sh
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configure_benchmark.sh
@@ -49,7 +49,7 @@ source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variab
 
 envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env"
 
-envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml"
+# envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml"
 
 envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/secretproviderclass-huggingface-tokens.yaml"
 
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml
index c4b0e1dc0..a09fd62f1 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml
@@ -16,70 +16,64 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 configMapGenerator:
-- envs:
-  - benchmarking.env
-  name: benchmark
-  namespace: replaced-by-kustomize
+  - name: benchmark
+    envs:
+      - benchmarking.env
+  - name: inference-perf-config
+    namespace: replaced-by-kustomize
+    envs:
+      - benchmarking.env
+    files:
+      - config.yaml=configmap-benchmark.yaml
 
 patches:
-- path: set-compute-class.yaml
+  - path: set-compute-class.yaml
 
 replacements:
-- source:
-    fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE
-    kind: ConfigMap
-    name: benchmark
-  targets:
-  - fieldPaths:
-    - metadata.namespace
-    select:
+  # Metadata & Namespace Sync
+  - source:
       kind: ConfigMap
-  - fieldPaths:
-    - metadata.namespace
-    select:
-      kind: Job
-  - fieldPaths:
-    - metadata.namespace
-    select:
-      kind: SecretProviderClass
-  - fieldPaths:
-    - metadata.namespace
-    select:
-      kind: PodMonitoring
-- source:
-    fieldPath: data.BENCHMARKING_KUBERNETES_SERVICE_ACCOUNT
-    kind: ConfigMap
-    name: benchmark
-  targets:
-  - fieldPaths:
-    - spec.template.spec.serviceAccountName
-    select:
-      kind: Job
-      name: inference-perf
-- source:
-    fieldPath: metadata.name
-    kind: SecretProviderClass
-    name: huggingface-token-read
-  targets:
-  - fieldPaths:
-    - spec.template.spec.volumes.[name=huggingface-token].csi.volumeAttributes.secretProviderClass
-    select:
-      kind: Job
-      name: inference-perf
-- source:
-    kind: ConfigMap
-    name: benchmark
-    fieldPath: data.APP_LABEL
-  targets:
-  - select:
-      kind: PodMonitoring
-      name: vllm-podmonitoring
-    fieldPaths:
-    - metadata.labels.app
-    - spec.selector.matchLabels.app
+      name: benchmark
+      fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE
+    targets:
+      - select: {kind: ConfigMap}
+        fieldPaths: [metadata.namespace]
+      - select: {kind: Job}
+        fieldPaths: [metadata.namespace]
+      - select: {kind: SecretProviderClass}
+        fieldPaths: [metadata.namespace]
+      - select: {kind: PodMonitoring}
+        fieldPaths: [metadata.namespace]
+
+  # Config.yaml String Interpolation
+  - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.HF_MODEL_ID}
+    targets:
+      - select: {kind: ConfigMap, name: inference-perf-config}
+        fieldPaths: ["data.config\\.yaml"]
+        options: {delimiter: "${HF_MODEL_ID}"}
+
+  - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.APP_LABEL}
+    targets:
+      - select: {kind: ConfigMap, name: inference-perf-config}
+        fieldPaths: ["data.config\\.yaml"]
+        options: {delimiter: "${APP_LABEL}"}
+
+  - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE}
+    targets:
+      - select: {kind: ConfigMap, name: inference-perf-config}
+        fieldPaths: ["data.config\\.yaml"]
+        options: {delimiter: "${BENCHMARKING_KUBERNETES_NAMESPACE}"}
+
+  - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.hub_models_bucket_bench_results_name}
+    targets:
+      - select: {kind: ConfigMap, name: inference-perf-config}
+        fieldPaths: ["data.config\\.yaml"]
+        options: {delimiter: "${hub_models_bucket_bench_results_name}"}
 
 resources:
-- configmap-benchmark.yaml
-- job.yaml
-- secretproviderclass-huggingface-tokens.yaml
-- pod-monitoring.yaml
+  - job.yaml
+  - secretproviderclass-huggingface-tokens.yaml
+  - pod-monitoring.yaml
+
+generatorOptions:
+  disableNameSuffixHash: true
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
index ba8f5ddf5..d53cfeaae 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
@@ -16,15 +16,39 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 configMapGenerator:
-  - envs:
-      - benchmarking-spec.env
-    name: benchmarking-spec
-    namespace: replaced-by-kustomize
+  - name: load-patch-source
+    files:
+      - content=patch-load.yaml
+  - name: data-patch-source
+    files:
+      - content=patch-data.yaml
 
-resources:
-  - ../base
-nameSuffix: -vllm-sd-eagle
+replacements:
+  - source:
+      kind: ConfigMap
+      name: load-patch-source
+      fieldPath: data.content
+    targets:
+      - select:
+          kind: ConfigMap
+          name: inference-perf-config
+        fieldPaths:
+          - data.config\\.yaml
+        options:
+          delimiter: "LOAD_BLOCK_HERE"
+
+  - source:
+      kind: ConfigMap
+      name: data-patch-source
+      fieldPath: data.content
+    targets:
+      - select:
+          kind: ConfigMap
+          name: inference-perf-config
+        fieldPaths:
+          - data.config\\.yaml
+        options:
+          delimiter: "DATA_BLOCK_HERE"
 
-patches:
-  - path: patch-load.yaml
-  - path: patch-data.yaml
\ No newline at end of file
+resources:
+  - ../base
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
index 88d0d54e9..d53cfeaae 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
@@ -16,15 +16,39 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 configMapGenerator:
-  - envs:
-      - benchmarking-spec.env
-    name: benchmarking-spec
-    namespace: replaced-by-kustomize
+  - name: load-patch-source
+    files:
+      - content=patch-load.yaml
+  - name: data-patch-source
+    files:
+      - content=patch-data.yaml
 
-resources:
-  - ../base
-nameSuffix: -vllm-sd-ngram
+replacements:
+  - source:
+      kind: ConfigMap
+      name: load-patch-source
+      fieldPath: data.content
+    targets:
+      - select:
+          kind: ConfigMap
+          name: inference-perf-config
+        fieldPaths:
+          - data.config\\.yaml
+        options:
+          delimiter: "LOAD_BLOCK_HERE"
+
+  - source:
+      kind: ConfigMap
+      name: data-patch-source
+      fieldPath: data.content
+    targets:
+      - select:
+          kind: ConfigMap
+          name: inference-perf-config
+        fieldPaths:
+          - data.config\\.yaml
+        options:
+          delimiter: "DATA_BLOCK_HERE"
 
-patches:
-  - path: patch-load.yaml
-  - path: patch-data.yaml
+resources:
+  - ../base
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
index 496f047d4..ee6ce1dd2 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
@@ -16,15 +16,40 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 
 configMapGenerator:
-  - envs:
-      - benchmarking-spec.env
-    name: benchmarking-spec
-    namespace: replaced-by-kustomize
+  - name: load-patch-source
+    files:
+      - content=patch-load.yaml
+  - name: data-patch-source
+    files:
+      - content=patch-data.yaml
+
+replacements:
+  - source:
+      kind: ConfigMap
+      name: load-patch-source
+      fieldPath: data.content
+    targets:
+      - select:
+          kind: ConfigMap
+          name: inference-perf-config
+        fieldPaths:
+          - data.config\\.yaml
+        options:
+          delimiter: "LOAD_BLOCK_HERE"
+
+  - source:
+      kind: ConfigMap
+      name: data-patch-source
+      fieldPath: data.content
+    targets:
+      - select:
+          kind: ConfigMap
+          name: inference-perf-config
+        fieldPaths:
+          - data.config\\.yaml
+        options:
+          delimiter: "DATA_BLOCK_HERE"
 
 resources:
   - ../base
-nameSuffix: -vllm
-
-patches:
-  - path: patch-load.yaml
-  - path: patch-data.yaml
\ No newline at end of file
+nameSuffix: -vllm
\ No newline at end of file
From 5c5bc50a7870654f63d5f69f9f609503f6f4d9c3 Mon Sep 17 00:00:00 2001
From: Syeda Anjum
Date: Wed, 11 Feb 2026 12:45:01 -0600
Subject: [PATCH 04/12] spell check

---
 .github/workflows/dictionary/vllm.txt              |  1 +
 .../base/configmap-benchmark.yaml                  |  1 +
 .../inference-perf-bench/base/job.yaml             |  1 +
 .../inference-perf-bench/base/kustomization.yaml   |  3 ++-
 .../base/set-compute-class.yaml                    |  1 +
 .../sd-eagle/kustomization.yaml                    |  3 ++-
 .../vllm-spec-decoding/sd-eagle/patch-data.yaml    | 15 ++++++++++++++-
 .../vllm-spec-decoding/sd-eagle/patch-load.yaml    | 15 ++++++++++++++-
 .../sd-ngram/kustomization.yaml                    |  3 ++-
 .../vllm-spec-decoding/sd-ngram/patch-data.yaml    | 15 ++++++++++++++-
 .../vllm-spec-decoding/sd-ngram/patch-load.yaml    | 15 ++++++++++++++-
 .../inference-perf-bench/vllm/kustomization.yaml   |  2 +-
 .../inference-perf-bench/vllm/patch-data.yaml      | 15 ++++++++++++++-
 .../inference-perf-bench/vllm/patch-load.yaml      | 15 ++++++++++++++-
 14 files changed, 95 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/dictionary/vllm.txt b/.github/workflows/dictionary/vllm.txt
index d88569bf3..1cb5e4d73 100644
--- a/.github/workflows/dictionary/vllm.txt
+++ b/.github/workflows/dictionary/vllm.txt
@@ -1,3 +1,4 @@
+dailymail
 dtype
 flashinfer
 matplot
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml
index d677f07f4..2351ecd84 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml
@@ -34,3 +34,4 @@ report:
 storage:
   google_cloud_storage:
     bucket_name: ${hub_models_bucket_bench_results_name}
+
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/job.yaml
index 3d5176a6f..f90eb0759 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/job.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/job.yaml
@@ -57,3 +57,4 @@ spec:
             volumeAttributes:
               secretProviderClass: huggingface-token-read
           name: huggingface-token
+
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml
index a09fd62f1..18c65f2a1 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml
@@ -76,4 +76,5 @@ resources:
 
 generatorOptions:
-  disableNameSuffixHash: true
\ No newline at end of file
+  disableNameSuffixHash: true
+
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/set-compute-class.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/set-compute-class.yaml
index da44e23ba..fc36cc817 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/set-compute-class.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/set-compute-class.yaml
@@ -22,3 +22,4 @@ spec:
     spec:
       nodeSelector:
         cloud.google.com/compute-class: model-download
+
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
index d53cfeaae..cd1593eb2 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml
@@ -51,4 +51,5 @@ replacements:
 
 resources:
-  - ../base
\ No newline at end of file
+  - ../base
+
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml
index a05091c9a..eda5d0c11 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml
@@ -1,2 +1,15 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 data:
-  type: cnn_dailymail
\ No newline at end of file
+  type: cnn_dailymail
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml
index 0ac67fea8..1c1ca33f3 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml
@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 load:
   type: constant
   interval: 1.0
@@ -8,4 +21,4 @@ load:
     stage_duration: 30
   num_workers: 20
   worker_max_concurrency: 15
-  worker_max_tcp_connections: 2500
\ No newline at end of file
+  worker_max_tcp_connections: 2500
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
index d53cfeaae..cd1593eb2 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml
@@ -51,4 +51,5 @@ replacements:
 
 resources:
-  - ../base
\ No newline at end of file
+  - ../base
+
\ No newline at end of file
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml
index 009564643..e4c3ce55f 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml
@@ -1,2 +1,15 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 data:
-  type: shareGPT
\ No newline at end of file
+  type: shareGPT
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml
index 0ac67fea8..1c1ca33f3 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml
@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 load:
   type: constant
   interval: 1.0
@@ -8,4 +21,4 @@ load:
     stage_duration: 30
   num_workers: 20
   worker_max_concurrency: 15
-  worker_max_tcp_connections: 2500
\ No newline at end of file
+  worker_max_tcp_connections: 2500
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
index ee6ce1dd2..c6059d674 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml
@@ -52,4 +52,4 @@ replacements:
 
 resources:
   - ../base
-nameSuffix: -vllm
\ No newline at end of file
+nameSuffix: -vllm
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml
index 23e23d835..84ecedcb4 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml
@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 data:
   type: synthetic
   input_distribution: # For synthetic/random types
@@ -11,4 +24,4 @@ data:
     max: 400
     mean: 200
     std: 50
-    total_count: 7000
\ No newline at end of file
+    total_count: 7000
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml
index 0ac67fea8..1c1ca33f3 100644
--- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml
+++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml
@@ -1,3 +1,16 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 load:
   type: constant
   interval: 1.0
@@ -8,4 +21,4 @@ load:
     stage_duration: 30
   num_workers: 20
   worker_max_concurrency: 15
-  worker_max_tcp_connections: 2500
\ No newline at end of file
+  worker_max_tcp_connections: 2500
load: type: constant interval: 1.0 @@ -8,4 +21,4 @@ load: stage_duration: 30 num_workers: 20 worker_max_concurrency: 15 - worker_max_tcp_connections: 2500 \ No newline at end of file + worker_max_tcp_connections: 2500 From 837c2e78aa30bff34785054f394919929b9b4838 Mon Sep 17 00:00:00 2001 From: Syeda Anjum Date: Wed, 11 Feb 2026 16:56:51 -0600 Subject: [PATCH 05/12] simplify --- .../base/kustomization.yaml | 80 ------------------ .../sd-eagle}/configure_benchmark.sh | 0 .../sd-eagle}/job.yaml | 0 .../sd-eagle}/pod-monitoring.yaml | 0 .../sd-eagle}/set-compute-class.yaml | 0 .../sd-eagle}/templates/benchmarking.tpl.env | 0 .../templates/configmap-benchmark.tpl.yaml | 0 ...tproviderclass-huggingface-tokens.tpl.yaml | 0 .../sd-ngram/configure_benchmark.sh | 59 ++++++++++++++ .../vllm-spec-decoding/sd-ngram/job.yaml | 60 ++++++++++++++ .../sd-ngram/pod-monitoring.yaml | 34 ++++++++ .../set-compute-class.yaml} | 16 +++- .../sd-ngram/templates/benchmarking.tpl.env | 8 ++ .../templates/configmap-benchmark.tpl.yaml} | 47 +++++++++++ ...providerclass-huggingface-tokens.tpl.yaml} | 23 +++--- .../{base => vllm}/configmap-benchmark.yaml | 0 .../vllm/configure_benchmark.sh | 59 ++++++++++++++ .../inference-perf-bench/vllm/job.yaml | 60 ++++++++++++++ .../vllm/kustomization.yaml | 81 ++++++++++++------- .../inference-perf-bench/vllm/patch-load.yaml | 24 ------ .../vllm/pod-monitoring.yaml | 34 ++++++++ .../set-compute-class.yaml} | 16 +++- .../vllm/templates/benchmarking.tpl.env | 8 ++ .../templates/configmap-benchmark.tpl.yaml | 74 +++++++++++++++++ ...providerclass-huggingface-tokens.tpl.yaml} | 23 +++--- 25 files changed, 546 insertions(+), 160 deletions(-) delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{base => vllm-spec-decoding/sd-eagle}/configure_benchmark.sh (100%) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{base => vllm-spec-decoding/sd-eagle}/job.yaml (100%) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{base => vllm-spec-decoding/sd-eagle}/pod-monitoring.yaml (100%) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{base => vllm-spec-decoding/sd-eagle}/set-compute-class.yaml (100%) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{base => vllm-spec-decoding/sd-eagle}/templates/benchmarking.tpl.env (100%) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{base => vllm-spec-decoding/sd-eagle}/templates/configmap-benchmark.tpl.yaml (100%) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{base => vllm-spec-decoding/sd-eagle}/templates/secretproviderclass-huggingface-tokens.tpl.yaml (100%) create mode 100755 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/job.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/pod-monitoring.yaml rename 
platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/{sd-eagle/patch-data.yaml => sd-ngram/set-compute-class.yaml} (66%) create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/benchmarking.tpl.env rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{vllm/patch-data.yaml => vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml} (52%) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/{patch-load.yaml => templates/secretproviderclass-huggingface-tokens.tpl.yaml} (59%) rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{base => vllm}/configmap-benchmark.yaml (100%) create mode 100755 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/job.yaml delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/pod-monitoring.yaml rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{vllm-spec-decoding/sd-ngram/patch-data.yaml => vllm/set-compute-class.yaml} (66%) create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/benchmarking.tpl.env create mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/configmap-benchmark.tpl.yaml rename platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/{vllm-spec-decoding/sd-eagle/patch-load.yaml => vllm/templates/secretproviderclass-huggingface-tokens.tpl.yaml} (59%) diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml deleted file mode 100644 index 18c65f2a1..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/kustomization.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -configMapGenerator: - - name: benchmark - envs: - - benchmarking.env - - name: inference-perf-config - namespace: replaced-by-kustomize - envs: - - benchmarking.env - files: - - config.yaml=configmap-benchmark.yaml - -patches: - - path: set-compute-class.yaml - -replacements: - # Metadata & Namespace Sync - - source: - kind: ConfigMap - name: benchmark - fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE - targets: - - select: {kind: ConfigMap} - fieldPaths: [metadata.namespace] - - select: {kind: Job} - fieldPaths: [metadata.namespace] - - select: {kind: SecretProviderClass} - fieldPaths: [metadata.namespace] - - select: {kind: PodMonitoring} - fieldPaths: [metadata.namespace] - - # Config.yaml String Interpolation - - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.HF_MODEL_ID} - targets: - - select: {kind: ConfigMap, name: inference-perf-config} - fieldPaths: ["data.config\\.yaml"] - options: {delimiter: "${HF_MODEL_ID}"} - - - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.APP_LABEL} - targets: - - select: {kind: ConfigMap, name: inference-perf-config} - fieldPaths: ["data.config\\.yaml"] - options: {delimiter: "${APP_LABEL}"} - - - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE} - targets: - - select: {kind: ConfigMap, name: inference-perf-config} - fieldPaths: ["data.config\\.yaml"] - options: {delimiter: "${BENCHMARKING_KUBERNETES_NAMESPACE}"} - - - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.hub_models_bucket_bench_results_name} - targets: - - select: {kind: ConfigMap, name: inference-perf-config} - fieldPaths: ["data.config\\.yaml"] - options: {delimiter: "${hub_models_bucket_bench_results_name}"} - -resources: - - job.yaml - - secretproviderclass-huggingface-tokens.yaml - - pod-monitoring.yaml - -generatorOptions: - disableNameSuffixHash: true - \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configure_benchmark.sh rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/job.yaml similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/job.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/job.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/pod-monitoring.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/pod-monitoring.yaml similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/pod-monitoring.yaml rename to 
platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/pod-monitoring.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/set-compute-class.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/set-compute-class.yaml similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/set-compute-class.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/set-compute-class.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/benchmarking.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/benchmarking.tpl.env similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/benchmarking.tpl.env rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/benchmarking.tpl.env diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/configmap-benchmark.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/configmap-benchmark.tpl.yaml similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/configmap-benchmark.tpl.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/configmap-benchmark.tpl.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/secretproviderclass-huggingface-tokens.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/secretproviderclass-huggingface-tokens.tpl.yaml similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/templates/secretproviderclass-huggingface-tokens.tpl.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/secretproviderclass-huggingface-tokens.tpl.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh new file mode 100755 index 000000000..a05fd0f3b --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -o errexit +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +# Update benchmarking namespace depending on TPU or GPU selection +TARGET_FILE="${MY_PATH}/templates/benchmarking.tpl.env" +GPU_NS="${ira_online_gpu_kubernetes_namespace_name}" +TPU_NS="${ira_online_tpu_kubernetes_namespace_name}" + +# Determine the correct namespace; default ACCELERATOR to empty so the error below fires under nounset +if [[ "${ACCELERATOR:-}" == "GPU" ]]; then + export BENCHMARKING_KUBERNETES_NAMESPACE=$GPU_NS +elif [[ "${ACCELERATOR:-}" == "TPU" ]]; then + export BENCHMARKING_KUBERNETES_NAMESPACE=$TPU_NS +else + echo "Error: ACCELERATOR must be set to 'GPU' or 'TPU'" + exit 1 +fi + +# Use sed to update the value +if grep -q "BENCHMARKING_KUBERNETES_NAMESPACE=" "$TARGET_FILE"; then + sed -i "s/^BENCHMARKING_KUBERNETES_NAMESPACE=.*/BENCHMARKING_KUBERNETES_NAMESPACE=$BENCHMARKING_KUBERNETES_NAMESPACE/" "$TARGET_FILE" + echo "Successfully updated $TARGET_FILE: BENCHMARKING_KUBERNETES_NAMESPACE=$BENCHMARKING_KUBERNETES_NAMESPACE" +else + echo "Error: BENCHMARKING_KUBERNETES_NAMESPACE not found in $TARGET_FILE" +fi + +source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" + +envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env" + +# envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml" + +envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/secretproviderclass-huggingface-tokens.yaml" + + +cd "${MY_PATH}" +kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/job.yaml new file mode 100644 index 000000000..f90eb0759 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/job.yaml @@ -0,0 +1,60 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+--- +apiVersion: batch/v1 +kind: Job +metadata: + name: inference-perf + namespace: replaced-by-kustomize + labels: + app: inference-perf +spec: + template: + metadata: + labels: + app: inference-perf + spec: + serviceAccountName: replaced-by-kustomize + containers: + - name: inference-perf + env: + - name: HF_TOKEN_PATH + value: /var/run/secrets/huggingface.co/token + image: quay.io/inference-perf/inference-perf:latest + imagePullPolicy: Always + command: ["inference-perf"] + args: ["--config_file", "/etc/config/config.yaml"] + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + - name: huggingface-token + mountPath: /var/run/secrets/huggingface.co + resources: + requests: + cpu: 200m + ephemeral-storage: 10Gi + memory: 10Gi + restartPolicy: Never + volumes: + - name: config-volume + configMap: + name: inference-perf-config + - csi: + driver: secrets-store-gke.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: huggingface-token-read + name: huggingface-token + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/pod-monitoring.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/pod-monitoring.yaml new file mode 100644 index 000000000..c21d7e4f6 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/pod-monitoring.yaml @@ -0,0 +1,34 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: monitoring.googleapis.com/v1 +kind: PodMonitoring +metadata: + labels: + app: APP_LABEL + name: vllm-podmonitoring + namespace: replaced-by-kustomize +spec: + endpoints: + - interval: 15s + path: "/metrics" + port: metrics + selector: + matchLabels: + app: APP_LABEL + targetLabels: + metadata: + - pod + - container + - node diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/set-compute-class.yaml similarity index 66% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/set-compute-class.yaml index eda5d0c11..fc36cc817 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-data.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/set-compute-class.yaml @@ -4,12 +4,22 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -data: - type: cnn_dailymail +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: inference-perf + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: model-download + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/benchmarking.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/benchmarking.tpl.env new file mode 100644 index 000000000..42363b9a8 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/benchmarking.tpl.env @@ -0,0 +1,8 @@ +BENCHMARKING_KUBERNETES_SERVICE_ACCOUNT=${ira_inference_perf_bench_kubernetes_service_account_name} +BENCHMARKING_KUBERNETES_NAMESPACE=benchmarking_ns +HUGGINGFACE_TOKEN_READ_SECRET_PROVIDER_CLASS_NAME=huggingface-token-read +RESULTS_BUCKET_NAME=${hub_models_bucket_bench_results_name} +DATASET_BUCKET_NAME=${hub_models_bucket_bench_dataset_name} +MODEL_ID=${HF_MODEL_ID} +APP_LABEL=${APP_LABEL} + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml similarity index 52% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml index 84ecedcb4..2d80035db 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-data.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml @@ -11,7 +11,36 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: inference-perf-config + namespace: ${BENCHMARKING_KUBERNETES_NAMESPACE} data: + config.yaml: | + load: + type: constant + interval: 1.0 + sweep: + type: linear + timeout: 250 + num_stages: 7 + stage_duration: 30 + num_workers: 20 + worker_max_concurrency: 15 + worker_max_tcp_connections: 2500 + api: + type: completion + streaming: true + server: + type: vllm + model_name: /gcs/${HF_MODEL_ID} + base_url: http://${APP_LABEL}.${BENCHMARKING_KUBERNETES_NAMESPACE}.svc.cluster.local:8000 + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: ${HF_MODEL_ID} + data: type: synthetic input_distribution: # For synthetic/random types min: 10 # Minimum prompt length (tokens) @@ -25,3 +54,21 @@ data: mean: 200 std: 50 total_count: 7000 + metrics: + type: prometheus + prometheus: + scrape_interval: 15 + google_managed: true # Whether using Google Managed Prometheus + filters: [] + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + prometheus: + summary: true + per_stage: true + storage: + google_cloud_storage: + bucket_name: ${hub_models_bucket_bench_results_name} # Required GCS bucket + report_file_prefix: null # Optional filename prefix diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/secretproviderclass-huggingface-tokens.tpl.yaml similarity index 59% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/secretproviderclass-huggingface-tokens.tpl.yaml index 1c1ca33f3..00392a5a3 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-load.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/secretproviderclass-huggingface-tokens.tpl.yaml @@ -11,14 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-load: - type: constant - interval: 1.0 - sweep: - type: linear - timeout: 250 - num_stages: 7 - stage_duration: 30 - num_workers: 20 - worker_max_concurrency: 15 - worker_max_tcp_connections: 2500 +--- +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: huggingface-token-read + namespace: replaced-by-kustomize +spec: + parameters: + secrets: | + - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_read_secret_manager_secret_name}/versions/latest" + path: "token" + provider: gke diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configmap-benchmark.yaml similarity index 100% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/base/configmap-benchmark.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configmap-benchmark.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh new file mode 100755 index 000000000..a05fd0f3b --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -o errexit +set -o nounset +set -o pipefail + +MY_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +# Update benchmarking namespace depending on TPU or GPU selection +TARGET_FILE="${MY_PATH}/templates/benchmarking.tpl.env" +GPU_NS="${ira_online_gpu_kubernetes_namespace_name}" +TPU_NS="${ira_online_tpu_kubernetes_namespace_name}" + +# Determine the correct namespace; default ACCELERATOR to empty so the error below fires under nounset +if [[ "${ACCELERATOR:-}" == "GPU" ]]; then + export BENCHMARKING_KUBERNETES_NAMESPACE=$GPU_NS +elif [[ "${ACCELERATOR:-}" == "TPU" ]]; then + export BENCHMARKING_KUBERNETES_NAMESPACE=$TPU_NS +else + echo "Error: ACCELERATOR must be set to 'GPU' or 'TPU'" + exit 1 +fi + +# Use sed to update the value +if grep -q "BENCHMARKING_KUBERNETES_NAMESPACE=" "$TARGET_FILE"; then + sed -i "s/^BENCHMARKING_KUBERNETES_NAMESPACE=.*/BENCHMARKING_KUBERNETES_NAMESPACE=$BENCHMARKING_KUBERNETES_NAMESPACE/" "$TARGET_FILE" + echo "Successfully updated $TARGET_FILE: BENCHMARKING_KUBERNETES_NAMESPACE=$BENCHMARKING_KUBERNETES_NAMESPACE" +else + echo "Error: BENCHMARKING_KUBERNETES_NAMESPACE not found in $TARGET_FILE" +fi + +source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" + +envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env" + +# envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml" + +envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/secretproviderclass-huggingface-tokens.yaml" + + +cd "${MY_PATH}" +kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/job.yaml new file mode 100644 index 000000000..f90eb0759 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/job.yaml @@ -0,0 +1,60 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+--- +apiVersion: batch/v1 +kind: Job +metadata: + name: inference-perf + namespace: replaced-by-kustomize + labels: + app: inference-perf +spec: + template: + metadata: + labels: + app: inference-perf + spec: + serviceAccountName: replaced-by-kustomize + containers: + - name: inference-perf + env: + - name: HF_TOKEN_PATH + value: /var/run/secrets/huggingface.co/token + image: quay.io/inference-perf/inference-perf:latest + imagePullPolicy: Always + command: ["inference-perf"] + args: ["--config_file", "/etc/config/config.yaml"] + volumeMounts: + - name: config-volume + mountPath: /etc/config + readOnly: true + - name: huggingface-token + mountPath: /var/run/secrets/huggingface.co + resources: + requests: + cpu: 200m + ephemeral-storage: 10Gi + memory: 10Gi + restartPolicy: Never + volumes: + - name: config-volume + configMap: + name: inference-perf-config + - csi: + driver: secrets-store-gke.csi.k8s.io + readOnly: true + volumeAttributes: + secretProviderClass: huggingface-token-read + name: huggingface-token + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml index c6059d674..18c65f2a1 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -16,40 +16,65 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization configMapGenerator: - - name: load-patch-source + - name: benchmark + envs: + - benchmarking.env + - name: inference-perf-config + namespace: replaced-by-kustomize + envs: + - benchmarking.env files: - - content=patch-load.yaml - - name: data-patch-source - files: - - content=patch-data.yaml + - config.yaml=configmap-benchmark.yaml + +patches: + - path: set-compute-class.yaml replacements: + # Metadata & Namespace Sync - source: kind: ConfigMap - name: load-patch-source - fieldPath: data.content + name: benchmark + fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE targets: - - select: - kind: ConfigMap - name: inference-perf-config - fieldPaths: - - data.config\\.yaml - options: - delimiter: "LOAD_BLOCK_HERE" + - select: {kind: ConfigMap} + fieldPaths: [metadata.namespace] + - select: {kind: Job} + fieldPaths: [metadata.namespace] + - select: {kind: SecretProviderClass} + fieldPaths: [metadata.namespace] + - select: {kind: PodMonitoring} + fieldPaths: [metadata.namespace] - - source: - kind: ConfigMap - name: data-patch-source - fieldPath: data.content + # Config.yaml String Interpolation + - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.HF_MODEL_ID} + targets: + - select: {kind: ConfigMap, name: inference-perf-config} + fieldPaths: ["data.config\\.yaml"] + options: {delimiter: "${HF_MODEL_ID}"} + + - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.APP_LABEL} targets: - - select: - kind: ConfigMap - name: inference-perf-config - fieldPaths: - - data.config\\.yaml - options: - delimiter: 
"DATA_BLOCK_HERE" + - select: {kind: ConfigMap, name: inference-perf-config} + fieldPaths: ["data.config\\.yaml"] + options: {delimiter: "${APP_LABEL}"} + + - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE} + targets: + - select: {kind: ConfigMap, name: inference-perf-config} + fieldPaths: ["data.config\\.yaml"] + options: {delimiter: "${BENCHMARKING_KUBERNETES_NAMESPACE}"} + + - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.hub_models_bucket_bench_results_name} + targets: + - select: {kind: ConfigMap, name: inference-perf-config} + fieldPaths: ["data.config\\.yaml"] + options: {delimiter: "${hub_models_bucket_bench_results_name}"} resources: - - ../base -nameSuffix: -vllm + - job.yaml + - secretproviderclass-huggingface-tokens.yaml + - pod-monitoring.yaml + +generatorOptions: + disableNameSuffixHash: true + \ No newline at end of file diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml deleted file mode 100644 index 1c1ca33f3..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/patch-load.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -load: - type: constant - interval: 1.0 - sweep: - type: linear - timeout: 250 - num_stages: 7 - stage_duration: 30 - num_workers: 20 - worker_max_concurrency: 15 - worker_max_tcp_connections: 2500 diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/pod-monitoring.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/pod-monitoring.yaml new file mode 100644 index 000000000..c21d7e4f6 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/pod-monitoring.yaml @@ -0,0 +1,34 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +apiVersion: monitoring.googleapis.com/v1 +kind: PodMonitoring +metadata: + labels: + app: APP_LABEL + name: vllm-podmonitoring + namespace: replaced-by-kustomize +spec: + endpoints: + - interval: 15s + path: "/metrics" + port: metrics + selector: + matchLabels: + app: APP_LABEL + targetLabels: + metadata: + - pod + - container + - node diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/set-compute-class.yaml similarity index 66% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/set-compute-class.yaml index e4c3ce55f..fc36cc817 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/patch-data.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/set-compute-class.yaml @@ -4,12 +4,22 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -data: - type: shareGPT +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: inference-perf + namespace: replaced-by-kustomize +spec: + template: + spec: + nodeSelector: + cloud.google.com/compute-class: model-download + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/benchmarking.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/benchmarking.tpl.env new file mode 100644 index 000000000..42363b9a8 --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/benchmarking.tpl.env @@ -0,0 +1,8 @@ +BENCHMARKING_KUBERNETES_SERVICE_ACCOUNT=${ira_inference_perf_bench_kubernetes_service_account_name} +BENCHMARKING_KUBERNETES_NAMESPACE=benchmarking_ns +HUGGINGFACE_TOKEN_READ_SECRET_PROVIDER_CLASS_NAME=huggingface-token-read +RESULTS_BUCKET_NAME=${hub_models_bucket_bench_results_name} +DATASET_BUCKET_NAME=${hub_models_bucket_bench_dataset_name} +MODEL_ID=${HF_MODEL_ID} +APP_LABEL=${APP_LABEL} + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/configmap-benchmark.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/configmap-benchmark.tpl.yaml new file mode 100644 index 000000000..2d80035db --- /dev/null +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/configmap-benchmark.tpl.yaml @@ -0,0 +1,74 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: inference-perf-config + namespace: ${BENCHMARKING_KUBERNETES_NAMESPACE} +data: + config.yaml: | + load: + type: constant + interval: 1.0 + sweep: + type: linear + timeout: 250 + num_stages: 7 + stage_duration: 30 + num_workers: 20 + worker_max_concurrency: 15 + worker_max_tcp_connections: 2500 + api: + type: completion + streaming: true + server: + type: vllm + model_name: /gcs/${HF_MODEL_ID} + base_url: http://${APP_LABEL}.${BENCHMARKING_KUBERNETES_NAMESPACE}.svc.cluster.local:8000 + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: ${HF_MODEL_ID} + data: + type: synthetic + input_distribution: # For synthetic/random types + min: 10 # Minimum prompt length (tokens) + max: 100 # Maximum prompt length + mean: 50 # Average length + std: 10 # Standard deviation + total_count: 7000 # Total prompts to generate. Choose a large number for sweep so that you don't run out of prompts as the QPS increases to saturation + output_distribution: # Same structure as input_distribution + min: 40 + max: 400 + mean: 200 + std: 50 + total_count: 7000 + metrics: + type: prometheus + prometheus: + scrape_interval: 15 + google_managed: true # Whether using Google Managed Prometheus + filters: [] + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + prometheus: + summary: true + per_stage: true + storage: + google_cloud_storage: + bucket_name: ${hub_models_bucket_bench_results_name} # Required GCS bucket + report_file_prefix: null # Optional filename prefix diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/secretproviderclass-huggingface-tokens.tpl.yaml similarity index 59% rename from platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml rename to platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/secretproviderclass-huggingface-tokens.tpl.yaml index 1c1ca33f3..00392a5a3 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/patch-load.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/secretproviderclass-huggingface-tokens.tpl.yaml @@ -11,14 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-load: - type: constant - interval: 1.0 - sweep: - type: linear - timeout: 250 - num_stages: 7 - stage_duration: 30 - num_workers: 20 - worker_max_concurrency: 15 - worker_max_tcp_connections: 2500 +--- +apiVersion: secrets-store.csi.x-k8s.io/v1 +kind: SecretProviderClass +metadata: + name: huggingface-token-read + namespace: replaced-by-kustomize +spec: + parameters: + secrets: | + - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_read_secret_manager_secret_name}/versions/latest" + path: "token" + provider: gke From ee9d4d1e15cabeba0309f1f2ec8e847301709534 Mon Sep 17 00:00:00 2001 From: Syeda Anjum Date: Wed, 11 Feb 2026 17:14:26 -0600 Subject: [PATCH 06/12] Update to speculative decoding readme --- .../vllm-spec-decoding-with-hf-model.md | 116 +++++++++++++++++- .../templates/configmap-benchmark.tpl.yaml | 14 +-- .../vllm/configmap-benchmark.yaml | 37 ------ 3 files changed, 116 insertions(+), 51 deletions(-) delete mode 100644 platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configmap-benchmark.yaml diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md index e1d5dcf66..c8a78f12d 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md @@ -240,7 +240,121 @@ This example is built on top of the kill -9 ${PF_PID} ``` -- Measuring performance with inference-perf +## Measuring performance with inference-perf + +Inference-perf lets you run your own benchmarks, simulating production +traffic while keeping the load generation external to the model server pods. + +This implementation deploys the inference-perf tool as a Kubernetes Job and can +be customized with different load scenarios and datasets. + +Stay up to date with the official +[inference-perf tool](https://github.com/kubernetes-sigs/inference-perf) to +learn more about all the supported features for metrics, load scenarios, and +datasets. + +Optional: install the inference-perf and matplotlib libraries to be able to create +throughput vs. latency curves. + +```shell +pip install inference-perf +pip install matplotlib +``` + +### Workflow + +This example runs through the following steps: + +1. Apply the inference_perf_bench Terraform, which will: + + - Create the GCS bucket for storing inference-perf results + - Create the GCS bucket for storing a custom benchmarking dataset + - Create the Kubernetes service account for the inference-perf workload + - Grant the required IAM permissions to the Workload Identity KSA + +2. Create the custom Kubernetes manifest for the benchmarking job +3. Run the benchmarking job for a load test on the vLLM service +4. Collect the Google Managed Prometheus metrics to generate reports +5.
Push the results from the benchmark run to the results GCS bucket + +#### Run the inference-perf Terraform + +```shell +export TF_VAR_enable_gpu=true +export ACCELERATOR="GPU" +export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache" +cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench && \ +rm -rf .terraform/ terraform.tfstate* && \ +terraform init && \ +terraform plan -input=false -out=tfplan && \ +terraform apply -input=false tfplan && \ +rm tfplan +``` - Source the environment configuration. + + ```shell + source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" + ``` - Export the app label of the vLLM service to benchmark. + + ```shell + export APP_LABEL="vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" + ``` > Verify the APP_LABEL: > > ```shell > echo $APP_LABEL > ``` + +#### Run the benchmarking job + +- Generate the benchmark configuration. + + ```shell + "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}/configure_benchmark.sh" + ``` - Deploy the benchmarking job. + +```shell +kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}" +``` + +- Check the status of the job. + +The job can take an estimated 15 minutes to run through all of the stages. + +```shell + watch --color --interval 5 --no-title \ + "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' + echo '\nLogs (last 10 lines):' + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-inference-perf --all-containers --tail 10" +``` +When the job is complete, you will see the following: + +```text +NAME STATUS COMPLETIONS DURATION AGE +XXXXXX-inference-perf Complete 1/1 15m 25m +``` + +#### Analyze and interpret the results + +The output reports (JSON files) are written to the benchmarking results bucket, +with metrics for each load stage. + +Download the reports and run inference-perf to create the throughput vs. latency +curves (a plotting sketch also follows this section): + +```shell + gsutil -m cp -r gs://${hub_models_bucket_bench_results_name}/ . + inference-perf --analyze ${hub_models_bucket_bench_results_name}/* + +``` +#### Clean up + +- Delete the benchmarking job. + + ```shell + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}" + ``` - Delete the workload.
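If you want to build the curves yourself rather than relying on `inference-perf --analyze`, a short matplotlib script along the following lines can work. This is a minimal sketch, not part of the reference architecture: the per-stage report filenames and JSON keys it reads (`stage_*.json`, `throughput`, `latency`) are assumptions about the inference-perf report layout, so inspect the files you downloaded and adjust the keys to match.

```python
# plot_curves.py - hedged sketch: plot throughput vs. latency from downloaded
# inference-perf per-stage reports. The glob pattern and JSON keys below are
# ASSUMPTIONS; check your own report files and adjust them before use.
import glob
import json

import matplotlib.pyplot as plt

points = []
# Assumed layout: one JSON report per load stage in the downloaded bucket copy.
for path in sorted(glob.glob("*/stage_*.json")):
    with open(path) as f:
        report = json.load(f)
    # Assumed field names; replace with the actual keys from your reports.
    throughput = report["throughput"]["output_tokens_per_sec"]
    latency = report["latency"]["request_latency"]["mean"]
    points.append((throughput, latency))

if not points:
    raise SystemExit("no stage reports matched; adjust the glob pattern")

points.sort()
throughputs, latencies = zip(*points)
plt.plot(throughputs, latencies, marker="o")
plt.xlabel("output tokens / s")
plt.ylabel("mean request latency (s)")
plt.title("inference-perf: throughput vs. latency per stage")
plt.savefig("throughput_vs_latency.png")
```

Plotting the same curve for the baseline `vllm` overlay and for each `sd-*` overlay on one figure makes it easy to see whether speculative decoding lowers latency at a given throughput.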
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml index 2d80035db..3a06090a9 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml @@ -41,19 +41,7 @@ data: tokenizer: pretrained_model_name_or_path: ${HF_MODEL_ID} data: - type: synthetic - input_distribution: # For synthetic/random types - min: 10 # Minimum prompt length (tokens) - max: 100 # Maximum prompt length - mean: 50 # Average length - std: 10 # Standard deviation - total_count: 7000 # Total prompts to generate. Choose a large number for sweep so that you don't run out of prompts as the QPS increases to saturation - output_distribution: # Same structure as input_distribution - min: 40 - max: 400 - mean: 200 - std: 50 - total_count: 7000 + type: shareGPT metrics: type: prometheus prometheus: diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configmap-benchmark.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configmap-benchmark.yaml deleted file mode 100644 index 2351ecd84..000000000 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configmap-benchmark.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-LOAD_BLOCK_HERE -api: - type: completion - streaming: true -server: - type: vllm - model_name: /gcs/${HF_MODEL_ID} - base_url: http://${APP_LABEL}.${BENCHMARKING_KUBERNETES_NAMESPACE}.svc.cluster.local:8000 - ignore_eos: true -tokenizer: - pretrained_model_name_or_path: ${HF_MODEL_ID} -DATA_BLOCK_HERE -metrics: - type: prometheus - prometheus: - scrape_interval: 15 - google_managed: true -report: - request_lifecycle: - summary: true -storage: - google_cloud_storage: - bucket_name: ${hub_models_bucket_bench_results_name} - From 961041dd533e7ee157cbf02bddc7407b3cf01666 Mon Sep 17 00:00:00 2001 From: Syeda Anjum Date: Wed, 11 Feb 2026 17:26:38 -0600 Subject: [PATCH 07/12] new line --- .../sd-eagle/configure_benchmark.sh | 2 +- .../vllm-spec-decoding/sd-eagle/job.yaml | 1 + .../sd-eagle/kustomization.yaml | 96 ++++++++++----- .../sd-eagle/pod-monitoring.yaml | 1 + .../sd-eagle/set-compute-class.yaml | 1 + .../templates/configmap-benchmark.tpl.yaml | 15 +-- ...tproviderclass-huggingface-tokens.tpl.yaml | 1 + .../sd-ngram/configure_benchmark.sh | 3 +- .../vllm-spec-decoding/sd-ngram/job.yaml | 1 + .../sd-ngram/kustomization.yaml | 96 ++++++++++----- .../sd-ngram/pod-monitoring.yaml | 1 + .../sd-ngram/set-compute-class.yaml | 1 + .../sd-ngram/templates/benchmarking.tpl.env | 1 + .../templates/configmap-benchmark.tpl.yaml | 1 + ...tproviderclass-huggingface-tokens.tpl.yaml | 1 + .../vllm/configure_benchmark.sh | 3 +- .../inference-perf-bench/vllm/job.yaml | 1 + .../vllm/kustomization.yaml | 115 +++++++++--------- .../vllm/pod-monitoring.yaml | 1 + .../vllm/set-compute-class.yaml | 1 + .../vllm/templates/benchmarking.tpl.env | 1 + .../templates/configmap-benchmark.tpl.yaml | 1 + ...tproviderclass-huggingface-tokens.tpl.yaml | 1 + 23 files changed, 209 insertions(+), 137 deletions(-) diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh index a05fd0f3b..3beba4ef6 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh @@ -49,7 +49,7 @@ source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variab envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env" -# envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml" +envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml" envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/secretproviderclass-huggingface-tokens.yaml" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/job.yaml index f90eb0759..35ea1119f 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/job.yaml +++ 
b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/job.yaml @@ -58,3 +58,4 @@ spec: secretProviderClass: huggingface-token-read name: huggingface-token + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml index cd1593eb2..c4b0e1dc0 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/kustomization.yaml @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -16,40 +16,70 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization configMapGenerator: - - name: load-patch-source - files: - - content=patch-load.yaml - - name: data-patch-source - files: - - content=patch-data.yaml +- envs: + - benchmarking.env + name: benchmark + namespace: replaced-by-kustomize -replacements: - - source: - kind: ConfigMap - name: load-patch-source - fieldPath: data.content - targets: - - select: - kind: ConfigMap - name: inference-perf-config - fieldPaths: - - data.config\\.yaml - options: - delimiter: "LOAD_BLOCK_HERE" +patches: +- path: set-compute-class.yaml - - source: +replacements: +- source: + fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE + kind: ConfigMap + name: benchmark + targets: + - fieldPaths: + - metadata.namespace + select: kind: ConfigMap - name: data-patch-source - fieldPath: data.content - targets: - - select: - kind: ConfigMap - name: inference-perf-config - fieldPaths: - - data.config\\.yaml - options: - delimiter: "DATA_BLOCK_HERE" + - fieldPaths: + - metadata.namespace + select: + kind: Job + - fieldPaths: + - metadata.namespace + select: + kind: SecretProviderClass + - fieldPaths: + - metadata.namespace + select: + kind: PodMonitoring +- source: + fieldPath: data.BENCHMARKING_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: benchmark + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + name: inference-perf +- source: + fieldPath: metadata.name + kind: SecretProviderClass + name: huggingface-token-read + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-token].csi.volumeAttributes.secretProviderClass + select: + kind: Job + name: inference-perf +- source: + kind: ConfigMap + name: benchmark + fieldPath: data.APP_LABEL + targets: + - select: + kind: PodMonitoring + name: vllm-podmonitoring + fieldPaths: + - metadata.labels.app + - spec.selector.matchLabels.app resources: - - ../base - \ No newline at end of file +- configmap-benchmark.yaml +- job.yaml +- secretproviderclass-huggingface-tokens.yaml +- pod-monitoring.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/pod-monitoring.yaml 
b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/pod-monitoring.yaml index c21d7e4f6..116753958 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/pod-monitoring.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/pod-monitoring.yaml @@ -32,3 +32,4 @@ spec: - pod - container - node + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/set-compute-class.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/set-compute-class.yaml index fc36cc817..9ff69e8b5 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/set-compute-class.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/set-compute-class.yaml @@ -23,3 +23,4 @@ spec: nodeSelector: cloud.google.com/compute-class: model-download + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/configmap-benchmark.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/configmap-benchmark.tpl.yaml index 2d80035db..67ed00430 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/configmap-benchmark.tpl.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/configmap-benchmark.tpl.yaml @@ -41,19 +41,7 @@ data: tokenizer: pretrained_model_name_or_path: ${HF_MODEL_ID} data: - type: synthetic - input_distribution: # For synthetic/random types - min: 10 # Minimum prompt length (tokens) - max: 100 # Maximum prompt length - mean: 50 # Average length - std: 10 # Standard deviation - total_count: 7000 # Total prompts to generate. 
Choose a large number for sweep so that you don't run out of prompts as the QPS increases to saturation - output_distribution: # Same structure as input_distribution - min: 40 - max: 400 - mean: 200 - std: 50 - total_count: 7000 + type: cnn_dailymail metrics: type: prometheus prometheus: @@ -72,3 +60,4 @@ data: google_cloud_storage: bucket_name: ${hub_models_bucket_bench_results_name} # Required GCS bucket report_file_prefix: null # Optional filename prefix + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/secretproviderclass-huggingface-tokens.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/secretproviderclass-huggingface-tokens.tpl.yaml index 00392a5a3..c6d280263 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/secretproviderclass-huggingface-tokens.tpl.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/templates/secretproviderclass-huggingface-tokens.tpl.yaml @@ -23,3 +23,4 @@ spec: - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_read_secret_manager_secret_name}/versions/latest" path: "token" provider: gke + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh index a05fd0f3b..6846a0736 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh @@ -49,7 +49,7 @@ source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variab envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env" -# envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml" +envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml" envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/secretproviderclass-huggingface-tokens.yaml" @@ -57,3 +57,4 @@ envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml cd "${MY_PATH}" kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/job.yaml index f90eb0759..35ea1119f 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/job.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/job.yaml @@ -58,3 +58,4 @@ spec: secretProviderClass: huggingface-token-read name: huggingface-token + diff --git 
a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml index cd1593eb2..c4b0e1dc0 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/kustomization.yaml @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -16,40 +16,70 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization configMapGenerator: - - name: load-patch-source - files: - - content=patch-load.yaml - - name: data-patch-source - files: - - content=patch-data.yaml +- envs: + - benchmarking.env + name: benchmark + namespace: replaced-by-kustomize -replacements: - - source: - kind: ConfigMap - name: load-patch-source - fieldPath: data.content - targets: - - select: - kind: ConfigMap - name: inference-perf-config - fieldPaths: - - data.config\\.yaml - options: - delimiter: "LOAD_BLOCK_HERE" +patches: +- path: set-compute-class.yaml - - source: +replacements: +- source: + fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE + kind: ConfigMap + name: benchmark + targets: + - fieldPaths: + - metadata.namespace + select: kind: ConfigMap - name: data-patch-source - fieldPath: data.content - targets: - - select: - kind: ConfigMap - name: inference-perf-config - fieldPaths: - - data.config\\.yaml - options: - delimiter: "DATA_BLOCK_HERE" + - fieldPaths: + - metadata.namespace + select: + kind: Job + - fieldPaths: + - metadata.namespace + select: + kind: SecretProviderClass + - fieldPaths: + - metadata.namespace + select: + kind: PodMonitoring +- source: + fieldPath: data.BENCHMARKING_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: benchmark + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + name: inference-perf +- source: + fieldPath: metadata.name + kind: SecretProviderClass + name: huggingface-token-read + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-token].csi.volumeAttributes.secretProviderClass + select: + kind: Job + name: inference-perf +- source: + kind: ConfigMap + name: benchmark + fieldPath: data.APP_LABEL + targets: + - select: + kind: PodMonitoring + name: vllm-podmonitoring + fieldPaths: + - metadata.labels.app + - spec.selector.matchLabels.app resources: - - ../base - \ No newline at end of file +- configmap-benchmark.yaml +- job.yaml +- secretproviderclass-huggingface-tokens.yaml +- pod-monitoring.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/pod-monitoring.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/pod-monitoring.yaml index c21d7e4f6..116753958 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/pod-monitoring.yaml +++ 
b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/pod-monitoring.yaml @@ -32,3 +32,4 @@ spec: - pod - container - node + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/set-compute-class.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/set-compute-class.yaml index fc36cc817..9ff69e8b5 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/set-compute-class.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/set-compute-class.yaml @@ -23,3 +23,4 @@ spec: nodeSelector: cloud.google.com/compute-class: model-download + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/benchmarking.tpl.env b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/benchmarking.tpl.env index 42363b9a8..49fc2dfed 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/benchmarking.tpl.env +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/benchmarking.tpl.env @@ -6,3 +6,4 @@ DATASET_BUCKET_NAME=${hub_models_bucket_bench_dataset_name} MODEL_ID=${HF_MODEL_ID} APP_LABEL=${APP_LABEL} + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml index 3a06090a9..583a9a89c 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/configmap-benchmark.tpl.yaml @@ -60,3 +60,4 @@ data: google_cloud_storage: bucket_name: ${hub_models_bucket_bench_results_name} # Required GCS bucket report_file_prefix: null # Optional filename prefix + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/secretproviderclass-huggingface-tokens.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/secretproviderclass-huggingface-tokens.tpl.yaml index 00392a5a3..c6d280263 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/secretproviderclass-huggingface-tokens.tpl.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/templates/secretproviderclass-huggingface-tokens.tpl.yaml @@ -23,3 +23,4 @@ spec: - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_read_secret_manager_secret_name}/versions/latest" path: "token" provider: gke + diff --git 
a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh index a05fd0f3b..6846a0736 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh @@ -49,7 +49,7 @@ source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variab envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env" -# envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml" +envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml" envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/secretproviderclass-huggingface-tokens.yaml" @@ -57,3 +57,4 @@ envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml cd "${MY_PATH}" kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/job.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/job.yaml index f90eb0759..35ea1119f 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/job.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/job.yaml @@ -58,3 +58,4 @@ spec: secretProviderClass: huggingface-token-read name: huggingface-token + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml index 18c65f2a1..c4b0e1dc0 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/kustomization.yaml @@ -16,65 +16,70 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization configMapGenerator: - - name: benchmark - envs: - - benchmarking.env - - name: inference-perf-config - namespace: replaced-by-kustomize - envs: - - benchmarking.env - files: - - config.yaml=configmap-benchmark.yaml +- envs: + - benchmarking.env + name: benchmark + namespace: replaced-by-kustomize patches: - - path: set-compute-class.yaml +- path: set-compute-class.yaml replacements: - # Metadata & Namespace Sync - - source: +- source: + fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE + kind: ConfigMap + name: benchmark + targets: + - fieldPaths: + - metadata.namespace + select: kind: ConfigMap - name: benchmark - fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE - targets: - - select: {kind: ConfigMap} - fieldPaths: [metadata.namespace] - - select: {kind: Job} - fieldPaths: [metadata.namespace] - - select: {kind: SecretProviderClass} - fieldPaths: [metadata.namespace] - - select: {kind: PodMonitoring} - fieldPaths: [metadata.namespace] - - # Config.yaml String Interpolation - - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.HF_MODEL_ID} - targets: - - select: {kind: ConfigMap, name: 
inference-perf-config} - fieldPaths: ["data.config\\.yaml"] - options: {delimiter: "${HF_MODEL_ID}"} - - - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.APP_LABEL} - targets: - - select: {kind: ConfigMap, name: inference-perf-config} - fieldPaths: ["data.config\\.yaml"] - options: {delimiter: "${APP_LABEL}"} - - - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.BENCHMARKING_KUBERNETES_NAMESPACE} - targets: - - select: {kind: ConfigMap, name: inference-perf-config} - fieldPaths: ["data.config\\.yaml"] - options: {delimiter: "${BENCHMARKING_KUBERNETES_NAMESPACE}"} - - - source: {kind: ConfigMap, name: inference-perf-config, fieldPath: data.hub_models_bucket_bench_results_name} - targets: - - select: {kind: ConfigMap, name: inference-perf-config} - fieldPaths: ["data.config\\.yaml"] - options: {delimiter: "${hub_models_bucket_bench_results_name}"} + - fieldPaths: + - metadata.namespace + select: + kind: Job + - fieldPaths: + - metadata.namespace + select: + kind: SecretProviderClass + - fieldPaths: + - metadata.namespace + select: + kind: PodMonitoring +- source: + fieldPath: data.BENCHMARKING_KUBERNETES_SERVICE_ACCOUNT + kind: ConfigMap + name: benchmark + targets: + - fieldPaths: + - spec.template.spec.serviceAccountName + select: + kind: Job + name: inference-perf +- source: + fieldPath: metadata.name + kind: SecretProviderClass + name: huggingface-token-read + targets: + - fieldPaths: + - spec.template.spec.volumes.[name=huggingface-token].csi.volumeAttributes.secretProviderClass + select: + kind: Job + name: inference-perf +- source: + kind: ConfigMap + name: benchmark + fieldPath: data.APP_LABEL + targets: + - select: + kind: PodMonitoring + name: vllm-podmonitoring + fieldPaths: + - metadata.labels.app + - spec.selector.matchLabels.app resources: - - job.yaml - - secretproviderclass-huggingface-tokens.yaml - - pod-monitoring.yaml - -generatorOptions: - disableNameSuffixHash: true - \ No newline at end of file +- configmap-benchmark.yaml +- job.yaml +- secretproviderclass-huggingface-tokens.yaml +- pod-monitoring.yaml diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/pod-monitoring.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/pod-monitoring.yaml index c21d7e4f6..116753958 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/pod-monitoring.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/pod-monitoring.yaml @@ -32,3 +32,4 @@ spec: - pod - container - node + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/set-compute-class.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/set-compute-class.yaml index fc36cc817..9ff69e8b5 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/set-compute-class.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/set-compute-class.yaml @@ -23,3 +23,4 @@ spec: nodeSelector: cloud.google.com/compute-class: model-download + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/benchmarking.tpl.env 
b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/benchmarking.tpl.env index 42363b9a8..49fc2dfed 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/benchmarking.tpl.env +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/benchmarking.tpl.env @@ -6,3 +6,4 @@ DATASET_BUCKET_NAME=${hub_models_bucket_bench_dataset_name} MODEL_ID=${HF_MODEL_ID} APP_LABEL=${APP_LABEL} + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/configmap-benchmark.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/configmap-benchmark.tpl.yaml index 2d80035db..a73ad0506 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/configmap-benchmark.tpl.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/configmap-benchmark.tpl.yaml @@ -72,3 +72,4 @@ data: google_cloud_storage: bucket_name: ${hub_models_bucket_bench_results_name} # Required GCS bucket report_file_prefix: null # Optional filename prefix + diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/secretproviderclass-huggingface-tokens.tpl.yaml b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/secretproviderclass-huggingface-tokens.tpl.yaml index 00392a5a3..c6d280263 100644 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/secretproviderclass-huggingface-tokens.tpl.yaml +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/templates/secretproviderclass-huggingface-tokens.tpl.yaml @@ -23,3 +23,4 @@ spec: - resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_read_secret_manager_secret_name}/versions/latest" path: "token" provider: gke + From 545ad9bf8bf7d1ccc1d72f9e393f04e8b64a151d Mon Sep 17 00:00:00 2001 From: syeda-anjum Date: Wed, 11 Feb 2026 23:35:04 +0000 Subject: [PATCH 08/12] update readme with prettier fix --- .../inf-perf-benchmarking-with-hf-model.md | 2 +- .../vllm-spec-decoding-with-hf-model.md | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md index d546abc7c..b6164ff33 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md @@ -367,7 +367,7 @@ export ACCELERATOR="TPU" - Configure the benchmarking job. 
```shell - "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/configure_benchmark.sh" + "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh" ``` - OPTIONAL: Customize the load scenario: diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md index c8a78f12d..43ff4ac97 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md @@ -290,16 +290,19 @@ terraform plan -input=false -out=tfplan && \ terraform apply -input=false tfplan && \ rm tfplan ``` + - Source the environment configuration. ```shell source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh" ``` + - Export the vLLM service endpoint ```shell export APP_LABEL="vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" ``` + > > Verify the APP_LABEL > > > > ```shell @@ -308,9 +311,10 @@ rm tfplan #### Run the benchmarking job. - ```shell - "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}/configure_benchmark.sh" - ``` +```shell +"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}/configure_benchmark.sh" +``` + - Deploy the benchmarking job. ```shell @@ -321,13 +325,13 @@ kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inferenc The job can take up an estimated 15 mins to run through all the stages - ```shell watch --color --interval 5 --no-title "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' echo '\nLogs(last 10 lines):' kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-inference-perf --all-containers --tail 10" ``` + When the job is complete, you will see the following: ```text @@ -348,6 +352,7 @@ curves inference-perf --analyze ${hub_models_bucket_bench_results_name}/* ``` + Clean up - Delete the benchmarking job. 
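Note on verifying the restructured overlays: the moves in this series split the single inference-perf-bench kustomization into per-backend overlays (`vllm`, `vllm-spec-decoding/sd-ngram`, `vllm-spec-decoding/sd-eagle`), so each overlay can be rendered and validated locally before anything is applied to the cluster. A minimal sketch, assuming `kustomize` and `kubectl` are installed and the overlay's `configure_benchmark.sh` has already been run (it generates `benchmarking.env` and the rendered YAML files the kustomization references):

```shell
# Render the vLLM benchmark overlay and validate the manifests client-side,
# without touching the cluster. Substitute vllm-spec-decoding/sd-${METHOD}
# for vllm to check the speculative decoding overlays.
kustomize build \
  "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm" |
  kubectl apply --dry-run=client -f -
```

If the generated files are missing, `kustomize build` fails fast, which makes this a cheap preflight check before `kubectl apply --kustomize`.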
From 85fbaf147f1718629756c736d0ab4055de437e89 Mon Sep 17 00:00:00 2001 From: Syeda Anjum Date: Wed, 18 Feb 2026 15:06:25 -0600 Subject: [PATCH 09/12] fixes from PR review --- .../inf-perf-benchmarking-with-hf-model.md | 20 ++++++++--------- .../vllm-spec-decoding-with-hf-model.md | 22 +++++++++++-------- .../sd-eagle/configure_benchmark.sh | 5 +++-- .../sd-ngram/configure_benchmark.sh | 5 +++-- .../vllm/configure_benchmark.sh | 5 +++-- 5 files changed, 32 insertions(+), 25 deletions(-) diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md index b6164ff33..a4f808052 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md @@ -381,14 +381,14 @@ export ACCELERATOR="TPU" > > ```shell - cd "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/ + cd "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/" ``` ## Deploy the benchmarking job. ```shell -kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench" +kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm" ``` ## Check the status of the job @@ -398,19 +398,19 @@ The job can take up an estimated 15 mins to run through all the stages #### For GPUs: ```shell - watch --color --interval 5 --no-title - "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' - echo '\nLogs(last 10 lines):' - kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-inference-perf --all-containers --tail 10" +watch --color --interval 5 --no-title " + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${SHORT_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1'; + echo '\nLogs(last 10 lines):'; + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/${SHORT_HASH}-inference-perf --all-containers --tail 10" ``` #### For TPUs: ```shell - watch --color --interval 5 --no-title - "kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' - echo '\nLogs(last 10 lines):' - kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-inference-perf --all-containers --tail 10" +watch --color --interval 5 --no-title " + kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} get job/${SHORT_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1'; + echo '\nLogs(last 10 lines):'; + kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} logs job/${SHORT_HASH}-inference-perf --all-containers --tail 10" ``` When the job is complete, you will see the following: diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md 
b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md index 43ff4ac97..b8889512b 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md @@ -240,7 +240,7 @@ This example is built on top of the kill -9 ${PF_PID} ``` -## Measuring performance with inference-perf +## Measuring speculative decoding (ngram/eagle) performance with inference-perf Inference-perf allows you to run your own benchmarks and simulate production traffic and ensure the load generation is external to the model server pods. @@ -300,17 +300,19 @@ rm tfplan - Export the vLLM service endpoint ```shell - export APP_LABEL="vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" + export APP_LABEL="vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" ``` > > Verify the APP_LABEL > > > > ```shell - > > echo $APP_LABEL + > > echo $APP_LABEL > > ``` #### Run the benchmarking job. +- Configure the benchmarking job. + ```shell "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}/configure_benchmark.sh" ``` @@ -318,18 +320,20 @@ rm tfplan - Deploy the benchmarking job. ```shell -kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench" +kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}" ``` - Check the status of the job The job can take up an estimated 15 mins to run through all the stages + ```shell - watch --color --interval 5 --no-title - "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${HF_MODEL_ID_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' - echo '\nLogs(last 10 lines):' - kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/${HF_MODEL_ID_HASH}-inference-perf --all-containers --tail 10" +watch --color --interval 5 --no-title " + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${SHORT_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1'; + echo '\nLogs(last 10 lines):'; + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/${SHORT_HASH}-inference-perf --all-containers --tail 10 + " ``` When the job is complete, you will see the following: @@ -358,7 +362,7 @@ Clean up - Delete the benchmarking job. ```shell - kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench" + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}" ``` - Delete the workload. 
diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh index 3beba4ef6..6a73d52b4 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-eagle/configure_benchmark.sh @@ -45,7 +45,7 @@ else echo "Variable not found" fi -source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" +source "${MY_PATH}/../../../../terraform/_shared_config/scripts/set_environment_variables.sh" envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env" @@ -55,5 +55,6 @@ envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml cd "${MY_PATH}" -kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" +SHORT_HASH=$(echo -n "${APP_LABEL}" | sha256sum | cut -c1-10) +kustomize edit set nameprefix "${SHORT_HASH}-" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh index 6846a0736..312beeca7 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-ngram/configure_benchmark.sh @@ -45,7 +45,7 @@ else echo "Variable not found" fi -source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" +source "${MY_PATH}/../../../../terraform/_shared_config/scripts/set_environment_variables.sh" envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env" @@ -55,6 +55,7 @@ envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml cd "${MY_PATH}" -kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" +SHORT_HASH=$(echo -n "${APP_LABEL}" | sha256sum | cut -c1-10) +kustomize edit set nameprefix "${SHORT_HASH}-" diff --git a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh index 6846a0736..eb6c53e60 100755 --- a/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh +++ b/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh @@ -45,7 +45,7 @@ else echo "Variable not found" fi -source "${MY_PATH}/../../terraform/_shared_config/scripts/set_environment_variables.sh" +source "${MY_PATH}/../../../terraform/_shared_config/scripts/set_environment_variables.sh" envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env" @@ -55,6 +55,7 @@ envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml cd "${MY_PATH}" -kustomize edit set nameprefix "${HF_MODEL_ID_HASH}-" +SHORT_HASH=$(echo -n "${APP_LABEL}" | sha256sum | cut 
-c1-10) +kustomize edit set nameprefix "${SHORT_HASH}-" From 34c8bbb759f9c3aa5c6bf0e6e7968e6922523e7f Mon Sep 17 00:00:00 2001 From: syeda-anjum Date: Wed, 18 Feb 2026 21:15:04 +0000 Subject: [PATCH 10/12] prettier fix --- .../inf-perf-benchmarking-with-hf-model.md | 6 ------ .../vllm-spec-decoding-with-hf-model.md | 1 - 2 files changed, 7 deletions(-) diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md index a4f808052..627d3a6a7 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md @@ -26,9 +26,7 @@ datasets. is deployed and configured. - Get access to the models. - - For Gemma: - - Consented to the license on [Kaggle](https://www.kaggle.com/) using a Hugging Face account. - [**google/gemma**](https://www.kaggle.com/models/google/gemma). @@ -74,7 +72,6 @@ pip install matplotlib This example will run through the following steps: 1. Apply the inference_perf_bench terraform, which will: - - Create the GCS bucket for storing inference-perf results - Create the GCS bucket for storing a custom benchmarking dataset - Create the Kubernetes service account for the inference-perf workload @@ -231,7 +228,6 @@ export ACCELERATOR="TPU" | llama-3.3-70b-instruct | ❌ | ✅ | ✅ | ✅ | | llama-4-scout-17b-16e-instruct | ❌ | ✅ | ✅ | ✅ | | qwen3-32b | ✅ | ✅ | ✅ | ✅ | - - **NVIDIA Tesla L4 24GB**: ```shell @@ -270,7 +266,6 @@ export ACCELERATOR="TPU" | gemma-3-4b-it | ✅ | ❌ | | gemma-3-27b-it | ✅ | ✅ | | qwen3-32b | ✅ | ✅ | - - **v5e**: ```shell @@ -288,7 +283,6 @@ export ACCELERATOR="TPU" [Allocation quotas: TPU quota](https://cloud.google.com/compute/resource-usage#tpu_quota). - Choose the model. 
- - **Gemma 3 1B Instruction-Tuned**: ```shell diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md index b8889512b..8d4b762bb 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md @@ -327,7 +327,6 @@ kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inferenc The job can take up an estimated 15 mins to run through all the stages - ```shell watch --color --interval 5 --no-title " kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${SHORT_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1'; From 1892f8b9dc39ea701b38454fa87c0fb9ebc69a09 Mon Sep 17 00:00:00 2001 From: syeda-anjum Date: Wed, 18 Feb 2026 21:18:47 +0000 Subject: [PATCH 11/12] prettier fix --- .../inf-perf-benchmarking-with-hf-model.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md index 627d3a6a7..a4f808052 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/inference-perf-bench/inf-perf-benchmarking-with-hf-model.md @@ -26,7 +26,9 @@ datasets. is deployed and configured. - Get access to the models. + - For Gemma: + - Consented to the license on [Kaggle](https://www.kaggle.com/) using a Hugging Face account. - [**google/gemma**](https://www.kaggle.com/models/google/gemma). @@ -72,6 +74,7 @@ pip install matplotlib This example will run through the following steps: 1. Apply the inference_perf_bench terraform, which will: + - Create the GCS bucket for storing inference-perf results - Create the GCS bucket for storing a custom benchmarking dataset - Create the Kubernetes service account for the inference-perf workload @@ -228,6 +231,7 @@ export ACCELERATOR="TPU" | llama-3.3-70b-instruct | ❌ | ✅ | ✅ | ✅ | | llama-4-scout-17b-16e-instruct | ❌ | ✅ | ✅ | ✅ | | qwen3-32b | ✅ | ✅ | ✅ | ✅ | + - **NVIDIA Tesla L4 24GB**: ```shell @@ -266,6 +270,7 @@ export ACCELERATOR="TPU" | gemma-3-4b-it | ✅ | ❌ | | gemma-3-27b-it | ✅ | ✅ | | qwen3-32b | ✅ | ✅ | + - **v5e**: ```shell @@ -283,6 +288,7 @@ export ACCELERATOR="TPU" [Allocation quotas: TPU quota](https://cloud.google.com/compute/resource-usage#tpu_quota). - Choose the model. 
+ - **Gemma 3 1B Instruction-Tuned**: ```shell From 3240bd6ff98a527fa8bf9a3359ac07782acd6ea1 Mon Sep 17 00:00:00 2001 From: syeda-anjum Date: Wed, 18 Feb 2026 21:59:00 +0000 Subject: [PATCH 12/12] readme reorder --- .../vllm-spec-decoding-with-hf-model.md | 144 +++++++++--------- 1 file changed, 71 insertions(+), 73 deletions(-) diff --git a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md index 8d4b762bb..b7e4a9e5f 100644 --- a/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md +++ b/docs/platforms/gke/base/use-cases/inference-ref-arch/online-inference-gpu/vllm-spec-decoding-with-hf-model.md @@ -240,6 +240,77 @@ This example is built on top of the kill -9 ${PF_PID} ``` +- Delete the workload. + + ```shell + export METHOD=ngram && \ + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" + ``` + +### Speculative Decoding with Eagle + +- Deploy the inference workload. + + ```shell + export METHOD=eagle && \ + kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" + ``` + +- Watch the deployment until it is ready. + + ```shell + export METHOD=eagle && \ + watch --color --interval 5 --no-title "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get deployment/vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' + echo '\nLogs(last 10 lines):' + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs deployment/vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD} --all-containers --tail 10" + ``` + + When the deployment is ready, you will see output similar to the following: + + ```text + NAME READY UP-TO-DATE AVAILABLE AGE + vllm-h100-llama-3-3-70b-it-sd-eagle 1/1 1 1 ### + ``` + + You can press `CTRL`+`c` to terminate the watch. + +- Send a test request to the model. + + Start a port forward to the model service. + + ```shell + export METHOD=eagle && \ + kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} port-forward service/vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD} 8000:8000 >/dev/null & \ + PF_PID=$! + ``` + + Send a test request. + + ```shell + curl http://127.0.0.1:8000/v1/chat/completions \ + --data '{ + "model": "/gcs/'${HF_MODEL_ID}'", + "messages": [ { "role": "user", "content": "Why is the sky blue?" } ] + }' \ + --header "Content-Type: application/json" \ + --request POST \ + --show-error \ + --silent | jq + ``` + + Stop the port forward. + + ```shell + kill -9 ${PF_PID} + ``` + +- Delete the workload. + + ```shell + export METHOD=eagle && \ + kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" + ``` + ## Measuring speculative decoding (ngram/eagle) performance with inference-perf Inference-perf allows you to run your own benchmarks and simulate production @@ -356,85 +427,12 @@ curves ``` -Clean up - - Delete the benchmarking job. 
```shell kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}" ``` -- Delete the workload. - - ```shell - export METHOD=ngram && \ - kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" - ``` - -### Speculative Decoding with Eagle - -- Deploy the inference workload. - - ```shell - export METHOD=eagle && \ - kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" - ``` - -- Watch the deployment until it is ready. - - ```shell - export METHOD=eagle && \ - watch --color --interval 5 --no-title "kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get deployment/vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD} | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1' - echo '\nLogs(last 10 lines):' - kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs deployment/vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD} --all-containers --tail 10" - ``` - - When the deployment is ready, you will see output similar to the following: - - ```text - NAME READY UP-TO-DATE AVAILABLE AGE - vllm-h100-llama-3-3-70b-it-sd-eagle 1/1 1 1 ### - ``` - - You can press `CTRL`+`c` to terminate the watch. - -- Send a test request to the model. - - Start a port forward to the model service. - - ```shell - export METHOD=eagle && \ - kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} port-forward service/vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD} 8000:8000 >/dev/null & \ - PF_PID=$! - ``` - - Send a test request. - - ```shell - curl http://127.0.0.1:8000/v1/chat/completions \ - --data '{ - "model": "/gcs/'${HF_MODEL_ID}'", - "messages": [ { "role": "user", "content": "Why is the sky blue?" } ] - }' \ - --header "Content-Type: application/json" \ - --request POST \ - --show-error \ - --silent | jq - ``` - - Stop the port forward. - - ```shell - kill -9 ${PF_PID} - ``` - -- Delete the workload. - - ```shell - export METHOD=eagle && \ - kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}" - ``` - ## Troubleshooting If you experience any issue while deploying the workload, see the