Add support for running vLLM #799
Open: amaslenn wants to merge 46 commits into main from am/vllm.
Commits (46, all by amaslenn):

- f699535 Add mock vLLM workload
- 92755ce Add get_vllm_serve_command()
- 2980e23 Add generate_serve_run_and_wait_block()
- 4bdf09c Initial implementation for _gen_srun_command()
- 2df1194 Add acceptance case
- 90b8eaa Control flow from the sbatch
- 8677a02 Add bench runs
- 3f513fa Redirect outputs and use --overlap
- ea48456 Update to use docker cache
- e9080fd Log steps and less output files
- 1390886 Check if run successfull
- 3544898 Fix tests
- 573a35f Prepare for disagg
- e57bc77 vLLM disagg mode
- 812c1c5 Add quotation
- 07074d9 Add wa for conflicting VLLM_NIXL_SIDE_CHANNEL_PORT
- 0ce7b01 Fix port offset value
- cf8cbe1 Use --export for per-run env vars
- a55a969 Support extra args
- ac7e603 More info on cleanup
- 6052bd7 Correct list of devices as arg
- f25e41e Better env vars handling
- f1ec11d Better check for success
- 11b00ed Update port offset logic
- dcee8ec Use ip instead of localhost
- f85086f Configure a git repo for proxy script
- ce78d85 Override prefill/decode gpu list
- bc09add Updates
- 29271b0 Configure output files for bench, allow any options
- 271df0f Add reporting for vLLM
- 1940f05 More human readable column
- b501ceb Control decode/prefill separately
- a353c06 Fix typo
- 1e85881 Install HF model and mount into container
- 34ad553 Merge branch 'main' into am/vllm
- a5bb85a Add doc for vLLM
- 4fd089f Fix copyright year
- 6c19287 Update doc/workloads/vllm.rst
- f2ef121 Update src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
- eafb045 Update tests/job_status_retrieval_strategy/test_vllm_job_status_retri…
- a81447e Make parse_vllm_bench_output safer
- ad9e6b5 Address review comments
- cd8c093 Update doc/workloads/vllm.rst
- d05b7c7 Address review comments
- 8c71afb Address review comments
- 488d1ee Address review comments
doc/workloads/vllm.rst (new file, +154 lines)
vLLM
====

This workload (``test_template_name`` is ``vllm``) allows users to execute vLLM benchmarks within the CloudAI framework.

vLLM is a high-throughput and memory-efficient inference engine for LLMs. This workload supports both aggregated and disaggregated prefill/decode modes.

Usage Examples
--------------

Test + Scenario example
~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: toml
   :caption: test.toml (test definition)

   name = "vllm_test"
   description = "Example vLLM test"
   test_template_name = "vllm"

   [cmd_args]
   docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:0.7.0"
   model = "Qwen/Qwen3-0.6B"

   [bench_cmd_args]
   random_input_len = 16
   random_output_len = 128
   max_concurrency = 16
   num_prompts = 30

.. code-block:: toml
   :caption: scenario.toml (scenario with one test)

   name = "vllm-benchmark"

   [[Tests]]
   id = "vllm.1"
   num_nodes = 1
   time_limit = "00:10:00"
   test_name = "vllm_test"

Test-in-Scenario example
~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: toml
   :caption: scenario.toml (separate test toml is not needed)

   name = "vllm-benchmark"

   [[Tests]]
   id = "vllm.1"
   num_nodes = 1
   time_limit = "00:10:00"
   name = "vllm_test"
   description = "Example vLLM test"
   test_template_name = "vllm"

   [Tests.cmd_args]
   docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:0.7.0"
   model = "Qwen/Qwen3-0.6B"

   [Tests.bench_cmd_args]
   random_input_len = 16
   random_output_len = 128
   max_concurrency = 16
   num_prompts = 30
Control number of GPUs
----------------------

The number of GPUs can be controlled using the options below, listed from lowest to highest priority (a sketch follows the list):

1. ``gpus_per_node`` system property (scalar value)
2. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs)
3. ``gpu_ids`` command argument for ``prefill`` and ``decode`` configurations (comma-separated list of GPU IDs)
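For illustration, a minimal sketch combining the two highest-priority mechanisms; the values are hypothetical, and when both are set the per-role ``gpu_ids`` takes precedence:

.. code-block:: toml
   :caption: test.toml (GPU selection sketch, hypothetical values)

   [extra_env_vars]
   # Priority 2: restricts the run to these four GPUs
   CUDA_VISIBLE_DEVICES = "0,1,2,3"

   [cmd_args.prefill]
   # Priority 3: overrides CUDA_VISIBLE_DEVICES for the prefill instance
   gpu_ids = "0,1"

   [cmd_args.decode]
   gpu_ids = "2,3"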
Control disaggregation
----------------------

By default, vLLM runs without disaggregation as a single process. To enable disaggregation, set the ``prefill`` configuration:

.. code-block:: toml
   :caption: test.toml (disaggregated prefill/decode)

   [cmd_args]
   docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:0.7.0"
   model = "Qwen/Qwen3-0.6B"

   [cmd_args.prefill]

   [extra_env_vars]
   CUDA_VISIBLE_DEVICES = "0,1,2,3"

The config above automatically splits the GPUs specified in ``CUDA_VISIBLE_DEVICES`` into two halves: the first half is used for prefill and the second for decode (here, GPUs 0,1 for prefill and 2,3 for decode).
For more control, one can specify the GPU IDs explicitly in the ``prefill`` and ``decode`` configurations:

.. code-block:: toml
   :caption: test.toml (disaggregated prefill/decode)

   [cmd_args.prefill]
   gpu_ids = "0,1"

   [cmd_args.decode]
   gpu_ids = "2,3"

In this case ``CUDA_VISIBLE_DEVICES`` will be ignored and only the GPUs specified in ``gpu_ids`` will be used.
Control ``proxy_script``
------------------------

``proxy_script`` is used to proxy requests from the client to the prefill and decode instances. It is ignored in non-disaggregated mode. The default value can be found below.

It can be overridden by setting ``proxy_script``, for example to use the latest version of the script from the vLLM repository:

.. code-block:: toml
   :caption: test_scenario.toml (override proxy_script)

   [[Tests.git_repos]]
   url = "https://github.com/vllm-project/vllm.git"
   commit = "main"
   mount_as = "/vllm_repo"

   [Tests.cmd_args]
   docker_image_url = "vllm/vllm-openai:v0.14.0-cu130"
   proxy_script = "/vllm_repo/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py"

In this case the proxy script will be mounted from the locally cloned vLLM repository as ``/vllm_repo`` and used for the test.
API Documentation
-----------------

Command Arguments
~~~~~~~~~~~~~~~~~

.. autoclass:: cloudai.workloads.vllm.vllm.VllmCmdArgs
   :members:
   :show-inheritance:

Benchmark Command Arguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: cloudai.workloads.vllm.vllm.VllmBenchCmdArgs
   :members:
   :show-inheritance:

Test Definition
~~~~~~~~~~~~~~~

.. autoclass:: cloudai.workloads.vllm.vllm.VllmTestDefinition
   :members:
   :show-inheritance:
src/cloudai/workloads/vllm/__init__.py (new file, +29 lines)
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .report_generation_strategy import VLLMBenchReportGenerationStrategy
from .slurm_command_gen_strategy import VllmSlurmCommandGenStrategy
from .vllm import VLLM_BENCH_LOG_FILE, VllmArgs, VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition

__all__ = [
    "VLLM_BENCH_LOG_FILE",
    "VLLMBenchReportGenerationStrategy",
    "VllmArgs",
    "VllmBenchCmdArgs",
    "VllmCmdArgs",
    "VllmSlurmCommandGenStrategy",
    "VllmTestDefinition",
]
src/cloudai/workloads/vllm/report_generation_strategy.py (new file, +91 lines)
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
from functools import cache
from pathlib import Path

from pydantic import BaseModel, ConfigDict
from rich.console import Console
from rich.table import Table

from cloudai.core import ReportGenerationStrategy

from .vllm import VLLM_BENCH_JSON_FILE


class VLLMBenchReport(BaseModel):
    """Report for vLLM benchmark results."""

    model_config = ConfigDict(extra="ignore")

    num_prompts: int
    completed: int
    mean_ttft_ms: float
    median_ttft_ms: float
    p99_ttft_ms: float
    mean_tpot_ms: float
    median_tpot_ms: float
    p99_tpot_ms: float


@cache
def parse_vllm_bench_output(res_file: Path) -> VLLMBenchReport | None:
    """Parse the vLLM benchmark output file and return a VLLMBenchReport object."""
    if not res_file.is_file():
        return None

    try:
        with res_file.open("r") as f:
            data = json.load(f)
        return VLLMBenchReport.model_validate(data)
    except Exception as e:
        logging.debug(f"Error parsing vLLM benchmark output: {e}")
        return None


class VLLMBenchReportGenerationStrategy(ReportGenerationStrategy):
    """Generate a report for vLLM benchmark results."""

    def can_handle_directory(self) -> bool:
        return parse_vllm_bench_output(self.test_run.output_path / VLLM_BENCH_JSON_FILE) is not None

    def generate_report(self) -> None:
        results = parse_vllm_bench_output(self.test_run.output_path / VLLM_BENCH_JSON_FILE)
        if results is None:
            return

        console = Console()
        table = Table(title=f"vLLM Benchmark Results ({self.test_run.output_path})", title_justify="left")
        table.add_column("Successful prompts", justify="right")
        table.add_column("TTFT Mean, ms", justify="right")
        table.add_column("TTFT Median, ms", justify="right")
        table.add_column("TTFT P99, ms", justify="right")
        table.add_column("TPOT Mean, ms", justify="right")
        table.add_column("TPOT Median, ms", justify="right")
        table.add_column("TPOT P99, ms", justify="right")
        table.add_row(
            f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})",
            f"{results.mean_ttft_ms:.4f}",
            f"{results.median_ttft_ms:.4f}",
            f"{results.p99_ttft_ms:.4f}",
            f"{results.mean_tpot_ms:.4f}",
            f"{results.median_tpot_ms:.4f}",
            f"{results.p99_tpot_ms:.4f}",
        )

        console.print(table)
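A minimal sketch (not part of this diff) exercising parse_vllm_bench_output against a hand-written results file. The keys mirror the VLLMBenchReport model above, all metric values are made up, and the extra key shows extra="ignore" at work:

import json
from pathlib import Path

from cloudai.workloads.vllm.report_generation_strategy import parse_vllm_bench_output

# Stand-in for the benchmark results JSON (values are illustrative).
sample = {
    "num_prompts": 30,
    "completed": 29,
    "mean_ttft_ms": 12.3,
    "median_ttft_ms": 11.8,
    "p99_ttft_ms": 25.0,
    "mean_tpot_ms": 3.1,
    "median_tpot_ms": 3.0,
    "p99_tpot_ms": 6.2,
    "request_rate": "inf",  # unknown key, dropped by extra="ignore"
}

res_file = Path("/tmp/results.json")
res_file.write_text(json.dumps(sample))

report = parse_vllm_bench_output(res_file)
assert report is not None and report.completed == 29

# Caveat: parse_vllm_bench_output is wrapped in @cache, so a second call with
# the same path returns the first result even if the file changed on disk.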