1 change: 1 addition & 0 deletions .github/workflows/dictionary/vllm.txt
@@ -1,3 +1,4 @@
dailymail
dtype
flashinfer
matplot
@@ -367,7 +367,7 @@ export ACCELERATOR="TPU"
- Configure the benchmarking job.

```shell
"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/configure_benchmark.sh"
```

- OPTIONAL: Customize the load scenario:
@@ -381,14 +381,14 @@ export ACCELERATOR="TPU"

```shell
cd "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm/"
```

## Deploy the benchmarking job

```shell
kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench"
kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm"
```

## Check the status of the job
@@ -398,19 +398,19 @@

The job can take an estimated 15 minutes to run through all of the stages.
#### For GPUs:

```shell
watch --color --interval 5 --no-title "
kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${SHORT_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1';
echo '\nLogs(last 10 lines):';
kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/${SHORT_HASH}-inference-perf --all-containers --tail 10"
```

#### For TPUs:

```shell
watch --color --interval 5 --no-title "
kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} get job/${SHORT_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1';
echo '\nLogs(last 10 lines):';
kubectl --namespace=${ira_online_tpu_kubernetes_namespace_name} logs job/${SHORT_HASH}-inference-perf --all-containers --tail 10"
```

When the job is complete, you will see the following:
@@ -240,8 +240,6 @@ This example is built on top of the
kill -9 ${PF_PID}
```

- Delete the workload.

```shell
@@ -313,6 +311,128 @@ This example is built on top of the
kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/online-inference-gpu/vllm-spec-decoding/${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}"
```

## Measuring speculative decoding (ngram/eagle) performance with inference-perf

Inference-perf lets you run your own benchmarks that simulate production
traffic while keeping load generation external to the model server pods.

This implementation deploys the inference-perf tool as a Kubernetes Job and can
be customized with different load scenarios and datasets.

Stay up to date with the official
[inference-perf tool](https://github.com/kubernetes-sigs/inference-perf) to
learn more about the supported features for metrics, load scenarios, and
datasets.

Optional: install the inference-perf and matplotlib libraries to create
throughput vs. latency curves.

```shell
pip install inference-perf
pip install matplotlib
```

### Workflow

This example will run through the following steps:

1. Apply the inference_perf_bench Terraform, which will:

   - Create the GCS bucket for storing inference-perf results
   - Create the GCS bucket for storing a custom benchmarking dataset
   - Create the Kubernetes service account (KSA) for the inference-perf workload
   - Grant the required IAM permissions to the KSA for Workload Identity

2. Create the custom Kubernetes manifest for the benchmarking job
3. Run the benchmarking job for a load test on the vLLM service
4. Collect the Google Managed Prometheus metrics to generate reports
5. Push the results from the benchmark run to the results GCS bucket

#### Run the inference-perf Terraform

```shell
export TF_VAR_enable_gpu=true
export ACCELERATOR="GPU"
export TF_PLUGIN_CACHE_DIR="${ACP_REPO_DIR}/.terraform.d/plugin-cache"
cd ${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/inference_perf_bench && \
rm -rf .terraform/ terraform.tfstate* && \
terraform init && \
terraform plan -input=false -out=tfplan && \
terraform apply -input=false tfplan && \
rm tfplan
```

- Source the environment configuration.

```shell
source "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/terraform/_shared_config/scripts/set_environment_variables.sh"
```

- Export the app label for the vLLM service.

```shell
export APP_LABEL="vllm-${ACCELERATOR_TYPE}-${HF_MODEL_NAME}-sd-${METHOD}"
```

> > Verify the APP_LABEL
> >
> > ```shell
> > echo $APP_LABEL
> > ```
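The status commands below reference `${SHORT_HASH}`, the prefix that
`configure_benchmark.sh` derives from `APP_LABEL` (the first 10 hex characters
of its SHA-256 hash). Assuming the same scheme, you can reproduce it in your
shell to address the job directly:

```shell
# Mirror the prefix computed by configure_benchmark.sh:
# first 10 hex characters of sha256(APP_LABEL).
SHORT_HASH=$(printf '%s' "${APP_LABEL}" | sha256sum | cut -c1-10)
echo "Benchmark job name: ${SHORT_HASH}-inference-perf"
```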

#### Run the benchmarking job

- Configure the benchmarking job.

```shell
"${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}/configure_benchmark.sh"
```

- Deploy the benchmarking job.

```shell
kubectl apply --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}"
```

- Check the status of the job

The job can take an estimated 15 minutes to run through all of the stages.

```shell
watch --color --interval 5 --no-title "
kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} get job/${SHORT_HASH}-inference-perf | GREP_COLORS='mt=01;92' egrep --color=always -e '^' -e '1/1 1 1';
echo '\nLogs(last 10 lines):';
kubectl --namespace=${ira_online_gpu_kubernetes_namespace_name} logs job/${SHORT_HASH}-inference-perf --all-containers --tail 10
"
```

When the job is complete, you will see the following:

```text
NAME STATUS COMPLETIONS DURATION AGE
XXXXXX-inference-perf Complete 1/1 15m 25m
```

#### Analyze and Interpret Results

The output reports (JSON files) can be viewed in the benchmarking results
bucket, with metrics for each load stage.

Download the reports and run inference-perf to create the throughput and
latency curves:

```shell
gsutil -m cp -r gs://${hub_models_bucket_bench_results_name}/ .
inference-perf --analyze ${hub_models_bucket_bench_results_name}/*
```
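Once the copy completes, the reports land in a local directory named after the
bucket. A quick way to confirm the per-stage JSON files are present before
analyzing (the directory layout is an assumption based on the `gsutil cp -r`
invocation above, and `results` is a hypothetical fallback):

```shell
# RESULTS_DIR mirrors the bucket name used in the gsutil copy above.
RESULTS_DIR="${hub_models_bucket_bench_results_name:-results}"
find "./${RESULTS_DIR}" -type f -name '*.json' | sort
```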

- Delete the benchmarking job.

```shell
kubectl delete --ignore-not-found --kustomize "${ACP_REPO_DIR}/platforms/gke/base/use-cases/inference-ref-arch/kubernetes-manifests/inference-perf-bench/vllm-spec-decoding/sd-${METHOD}"
```

## Troubleshooting

If you experience any issue while deploying the workload, see the
Expand Down
@@ -0,0 +1,60 @@
#!/usr/bin/env bash

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -o errexit
set -o nounset
set -o pipefail

MY_PATH="$(
cd "$(dirname "$0")" >/dev/null 2>&1
pwd -P
)"

# Update benchmarking namespace depending on TPU or GPU selection
TARGET_FILE="${MY_PATH}/templates/benchmarking.tpl.env"
GPU_NS="${ira_online_gpu_kubernetes_namespace_name}"
TPU_NS="${ira_online_tpu_kubernetes_namespace_name}"

# Determine the correct namespace. Guard the expansion so that, under
# `set -o nounset`, an unset ACCELERATOR reaches the error message below
# instead of aborting with an unbound-variable error.
if [[ "${ACCELERATOR:-}" == "GPU" ]]; then
  export BENCHMARKING_KUBERNETES_NAMESPACE=$GPU_NS
elif [[ "${ACCELERATOR:-}" == "TPU" ]]; then
  export BENCHMARKING_KUBERNETES_NAMESPACE=$TPU_NS
else
  echo "Error: ACCELERATOR must be set to 'GPU' or 'TPU'"
  exit 1
fi

# Use sed to update the value
if grep -q "BENCHMARKING_KUBERNETES_NAMESPACE=" "$TARGET_FILE"; then
sed -i "s/^BENCHMARKING_KUBERNETES_NAMESPACE=.*/BENCHMARKING_KUBERNETES_NAMESPACE=$BENCHMARKING_KUBERNETES_NAMESPACE/" "$TARGET_FILE"
echo "Successfully updated $TARGET_FILE: BENCHMARKING_KUBERNETES_NAMESPACE=$BENCHMARKING_KUBERNETES_NAMESPACE"
else
echo "Variable not found"
fi

source "${MY_PATH}/../../../../terraform/_shared_config/scripts/set_environment_variables.sh"

envsubst < "${MY_PATH}/templates/benchmarking.tpl.env" | sponge "${MY_PATH}/benchmarking.env"

envsubst < "${MY_PATH}/templates/configmap-benchmark.tpl.yaml" | sponge "${MY_PATH}/configmap-benchmark.yaml"

envsubst < "${MY_PATH}/templates/secretproviderclass-huggingface-tokens.tpl.yaml" | sponge "${MY_PATH}/secretproviderclass-huggingface-tokens.yaml"


cd "${MY_PATH}"
SHORT_HASH=$(echo -n "${APP_LABEL}" | sha256sum | cut -c1-10)
kustomize edit set nameprefix "${SHORT_HASH}-"

@@ -57,3 +57,5 @@ spec:
volumeAttributes:
secretProviderClass: huggingface-token-read
name: huggingface-token


@@ -32,3 +32,4 @@ spec:
- pod
- container
- node

@@ -22,3 +22,5 @@ spec:
spec:
nodeSelector:
cloud.google.com/compute-class: model-download


@@ -0,0 +1,63 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: v1
kind: ConfigMap
metadata:
name: inference-perf-config
namespace: ${BENCHMARKING_KUBERNETES_NAMESPACE}
data:
config.yaml: |
load:
type: constant
interval: 1.0
sweep:
type: linear
timeout: 250
num_stages: 7
stage_duration: 30
num_workers: 20
worker_max_concurrency: 15
worker_max_tcp_connections: 2500
api:
type: completion
streaming: true
server:
type: vllm
model_name: /gcs/${HF_MODEL_ID}
base_url: http://${APP_LABEL}.${BENCHMARKING_KUBERNETES_NAMESPACE}.svc.cluster.local:8000
ignore_eos: true
tokenizer:
pretrained_model_name_or_path: ${HF_MODEL_ID}
data:
type: cnn_dailymail
metrics:
type: prometheus
prometheus:
scrape_interval: 15
google_managed: true # Whether using Google Managed Prometheus
filters: []
report:
request_lifecycle:
summary: true
per_stage: true
per_request: true
prometheus:
summary: true
per_stage: true
storage:
google_cloud_storage:
bucket_name: ${hub_models_bucket_bench_results_name} # Required GCS bucket
report_file_prefix: null # Optional filename prefix
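As a sanity check on the sweep settings above (`num_stages: 7`,
`stage_duration: 30`), the pure load-generation time works out to three and a
half minutes; the roughly 15-minute job estimate elsewhere in this change
presumably covers model warmup, metric scraping, and report upload on top of
that:

```shell
# Values taken from the sweep config above.
NUM_STAGES=7
STAGE_DURATION=30   # seconds per stage
echo "Minimum load time: $((NUM_STAGES * STAGE_DURATION))s"
# → Minimum load time: 210s
```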

@@ -23,3 +23,4 @@ spec:
- resourceName: "projects/${huggingface_secret_manager_project_id}/secrets/${huggingface_hub_access_token_read_secret_manager_secret_name}/versions/latest"
path: "token"
provider: gke
