From f699535b57cb06d69bb6d5325027d3853e11636d Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 10:19:45 +0100 Subject: [PATCH 01/45] Add mock vLLM workload --- src/cloudai/registration.py | 8 +++- src/cloudai/workloads/vllm/__init__.py | 24 ++++++++++++ .../vllm/slurm_command_gen_strategy.py | 34 +++++++++++++++++ src/cloudai/workloads/vllm/vllm.py | 38 +++++++++++++++++++ tests/test_init.py | 5 ++- 5 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 src/cloudai/workloads/vllm/__init__.py create mode 100644 src/cloudai/workloads/vllm/slurm_command_gen_strategy.py create mode 100644 src/cloudai/workloads/vllm/vllm.py diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py index f9be227e6..3cd803245 100644 --- a/src/cloudai/registration.py +++ b/src/cloudai/registration.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -159,6 +159,10 @@ def register_all(): UCCTestReportGenerationStrategy, UCCTestSlurmCommandGenStrategy, ) + from cloudai.workloads.vllm import ( + VllmSlurmCommandGenStrategy, + VllmTestDefinition, + ) Registry().add_runner("slurm", SlurmRunner) Registry().add_runner("kubernetes", KubernetesRunner) @@ -216,6 +220,7 @@ def register_all(): Registry().add_command_gen_strategy(SlurmSystem, BashCmdTestDefinition, BashCmdCommandGenStrategy) Registry().add_command_gen_strategy(SlurmSystem, NIXLKVBenchTestDefinition, NIXLKVBenchSlurmCommandGenStrategy) Registry().add_command_gen_strategy(SlurmSystem, OSUBenchTestDefinition, OSUBenchSlurmCommandGenStrategy) + Registry().add_command_gen_strategy(SlurmSystem, VllmTestDefinition, VllmSlurmCommandGenStrategy) Registry().add_installer("slurm", SlurmInstaller) Registry().add_installer("standalone", StandaloneInstaller) @@ -251,6 +256,7 @@ def register_all(): Registry().add_test_definition("NIXLKVBench", NIXLKVBenchTestDefinition) Registry().add_test_definition("Aiconfigurator", AiconfiguratorTestDefinition) Registry().add_test_definition("OSUBench", OSUBenchTestDefinition) + Registry().add_test_definition("Vllm", VllmTestDefinition) Registry().add_agent("grid_search", GridSearchAgent) diff --git a/src/cloudai/workloads/vllm/__init__.py b/src/cloudai/workloads/vllm/__init__.py new file mode 100644 index 000000000..07b2be366 --- /dev/null +++ b/src/cloudai/workloads/vllm/__init__.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .slurm_command_gen_strategy import VllmSlurmCommandGenStrategy +from .vllm import VllmCmdArgs, VllmTestDefinition + +__all__ = [ + "VllmCmdArgs", + "VllmSlurmCommandGenStrategy", + "VllmTestDefinition", +] diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py new file mode 100644 index 000000000..7860fe46d --- /dev/null +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, cast + +from cloudai.systems.slurm import SlurmCommandGenStrategy + +from .vllm import VllmCmdArgs, VllmTestDefinition + + +class VllmSlurmCommandGenStrategy(SlurmCommandGenStrategy): + """Command generation strategy for vLLM on Slurm systems.""" + + def _container_mounts(self) -> list[str]: + return [] + + def generate_test_command(self) -> List[str]: + tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) + tdef_cmd_args: VllmCmdArgs = tdef.cmd_args + # TODO: Implement full command generation with bash script + return [f"vllm serve {tdef_cmd_args.model}"] diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py new file mode 100644 index 000000000..11feb312c --- /dev/null +++ b/src/cloudai/workloads/vllm/vllm.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from cloudai.core import Installable +from cloudai.models.workload import CmdArgs, TestDefinition + + +class VllmCmdArgs(CmdArgs): + """vLLM test command arguments.""" + + docker_image_url: str + port: int = 8000 + vllm_server_wait_seconds: int = 300 + model: str = "Qwen/Qwen3-0.6B" + + +class VllmTestDefinition(TestDefinition): + """Test object for vLLM.""" + + cmd_args: VllmCmdArgs + + @property + def installables(self) -> list[Installable]: + return [] diff --git a/tests/test_init.py b/tests/test_init.py index 78b717d8f..8d6b0509f 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -97,6 +97,7 @@ UCCTestGradingStrategy, UCCTestSlurmCommandGenStrategy, ) +from cloudai.workloads.vllm import VllmSlurmCommandGenStrategy, VllmTestDefinition def test_systems(): @@ -144,6 +145,7 @@ def test_runners(): (SlurmSystem, NixlPerftestTestDefinition): NixlPerftestSlurmCommandGenStrategy, (SlurmSystem, NIXLKVBenchTestDefinition): NIXLKVBenchSlurmCommandGenStrategy, (SlurmSystem, OSUBenchTestDefinition): OSUBenchSlurmCommandGenStrategy, + (SlurmSystem, VllmTestDefinition): VllmSlurmCommandGenStrategy, } JSON_GEN_STRATEGIES = { (KubernetesSystem, NCCLTestDefinition): NcclTestKubernetesJsonGenStrategy, @@ -217,7 +219,7 @@ def test_installers(): def test_definitions(): test_defs = Registry().test_definitions_map - assert len(test_defs) == 22 + assert len(test_defs) == 23 for tdef in [ ("UCCTest", UCCTestDefinition), ("DDLBTest", DDLBTestDefinition), @@ -241,6 +243,7 @@ def test_definitions(): ("NIXLKVBench", NIXLKVBenchTestDefinition), ("Aiconfigurator", AiconfiguratorTestDefinition), ("OSUBench", OSUBenchTestDefinition), + ("Vllm", VllmTestDefinition), ]: assert test_defs[tdef[0]] == tdef[1] From 92755ce765d52ef255be5f0e7daad5498de447b1 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 10:28:42 +0100 Subject: [PATCH 02/45] Add get_vllm_serve_command() --- .../vllm/slurm_command_gen_strategy.py | 5 ++ .../test_vllm_slurm_command_gen_strategy.py | 49 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 7860fe46d..bd74fdea4 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -32,3 +32,8 @@ def generate_test_command(self) -> List[str]: tdef_cmd_args: VllmCmdArgs = tdef.cmd_args # TODO: Implement full command generation with bash script return [f"vllm serve {tdef_cmd_args.model}"] + + def get_vllm_serve_command(self) -> list[str]: + tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) + tdef_cmd_args: VllmCmdArgs = tdef.cmd_args + return ["vllm", "serve", tdef_cmd_args.model, "--port", str(tdef_cmd_args.port)] diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py new file mode 100644 index 000000000..973a5b012 --- /dev/null +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path + +import pytest + +from cloudai.core import TestRun +from cloudai.systems.slurm import SlurmSystem +from cloudai.workloads.vllm import VllmCmdArgs, VllmSlurmCommandGenStrategy, VllmTestDefinition + + +@pytest.fixture +def vllm() -> VllmTestDefinition: + return VllmTestDefinition( + name="vllm_test", + description="vLLM benchmark test", + test_template_name="Vllm", + cmd_args=VllmCmdArgs(docker_image_url="nvcr.io/nvidia/vllm:latest", model="Qwen/Qwen3-0.6B", port=8000), + ) + + +@pytest.fixture +def vllm_tr(vllm: VllmTestDefinition, tmp_path: Path) -> TestRun: + return TestRun(test=vllm, num_nodes=1, nodes=[], output_path=tmp_path, name="vllm-job") + + +class TestVllmSlurmCommandGenStrategy: + """Test the VllmSlurmCommandGenStrategy class.""" + + def test_generate_vllm_command(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: + cmd_gen_strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + + command = " ".join(cmd_gen_strategy.get_vllm_serve_command()) + + assert command == f"vllm serve {vllm_tr.test.cmd_args.model} --port {vllm_tr.test.cmd_args.port}" From 2980e239d1361e00fc5d755126e2ce9b0ecf08a2 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 10:37:18 +0100 Subject: [PATCH 03/45] Add generate_serve_run_and_wait_block() --- .../vllm/slurm_command_gen_strategy.py | 33 +++++++++++++++++ src/cloudai/workloads/vllm/vllm.py | 2 +- .../test_vllm_slurm_command_gen_strategy.py | 35 +++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index bd74fdea4..356ccbc1c 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -37,3 +37,36 @@ def get_vllm_serve_command(self) -> list[str]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) tdef_cmd_args: VllmCmdArgs = tdef.cmd_args return ["vllm", "serve", tdef_cmd_args.model, "--port", str(tdef_cmd_args.port)] + + def generate_serve_run_and_wait_block(self) -> str: + """Generate bash block to run vLLM serve and wait for health check.""" + tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) + cmd_args: VllmCmdArgs = tdef.cmd_args + serve_cmd = " ".join(self.get_vllm_serve_command()) + + return f"""\ +{serve_cmd} & +VLLM_PID=$! + +TIMEOUT={cmd_args.vllm_serve_wait_seconds} +SLEEP_INTERVAL=5 +HOST=0.0.0.0 +PORT={cmd_args.port} + +end_time=$(($(date +%s) + TIMEOUT)) +while [ "$(date +%s)" -lt "$end_time" ]; do + if curl -sf "http://${{HOST}}:${{PORT}}/health" > /dev/null 2>&1; then + echo "vLLM server is ready!" + break + fi + if ! kill -0 "$VLLM_PID" 2>/dev/null; then + echo "vLLM server process died unexpectedly!" + exit 1 + fi + sleep "$SLEEP_INTERVAL" +done + +if ! 
curl -sf "http://${{HOST}}:${{PORT}}/health" > /dev/null 2>&1; then + echo "Timeout waiting for vLLM to start" + exit 1 +fi""" diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 11feb312c..3aa5447a3 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -24,7 +24,7 @@ class VllmCmdArgs(CmdArgs): docker_image_url: str port: int = 8000 - vllm_server_wait_seconds: int = 300 + vllm_serve_wait_seconds: int = 300 model: str = "Qwen/Qwen3-0.6B" diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 973a5b012..03888c525 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -47,3 +47,38 @@ def test_generate_vllm_command(self, vllm_tr: TestRun, slurm_system: SlurmSystem command = " ".join(cmd_gen_strategy.get_vllm_serve_command()) assert command == f"vllm serve {vllm_tr.test.cmd_args.model} --port {vllm_tr.test.cmd_args.port}" + + def test_generate_serve_run_and_wait_block(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: + cmd_gen_strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + cmd_args = vllm_tr.test.cmd_args + + block = cmd_gen_strategy.generate_serve_run_and_wait_block() + + expected = f"""\ +vllm serve {cmd_args.model} --port {cmd_args.port} & +VLLM_PID=$! + +TIMEOUT={cmd_args.vllm_server_wait_seconds} +SLEEP_INTERVAL=5 +HOST=0.0.0.0 +PORT={cmd_args.port} + +end_time=$(($(date +%s) + TIMEOUT)) +while [ "$(date +%s)" -lt "$end_time" ]; do + if curl -sf "http://${{HOST}}:${{PORT}}/health" > /dev/null 2>&1; then + echo "vLLM server is ready!" + break + fi + if ! kill -0 "$VLLM_PID" 2>/dev/null; then + echo "vLLM server process died unexpectedly!" + exit 1 + fi + sleep "$SLEEP_INTERVAL" +done + +if ! curl -sf "http://${{HOST}}:${{PORT}}/health" > /dev/null 2>&1; then + echo "Timeout waiting for vLLM to start" + exit 1 +fi""" + + assert block == expected From 4bdf09c84424b5a9faac2417014f68eff279c24b Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 10:54:12 +0100 Subject: [PATCH 04/45] Initial implementation for _gen_srun_command() --- .../vllm/slurm_command_gen_strategy.py | 24 +++++++++++++------ .../test_vllm_slurm_command_gen_strategy.py | 24 ++++++++++++++++++- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 356ccbc1c..68c0c1b72 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, cast +from typing import cast from cloudai.systems.slurm import SlurmCommandGenStrategy @@ -24,15 +24,11 @@ class VllmSlurmCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for vLLM on Slurm systems.""" + VLLM_RUN_SCRIPT_NAME = "vllm_run.sh" + def _container_mounts(self) -> list[str]: return [] - def generate_test_command(self) -> List[str]: - tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) - tdef_cmd_args: VllmCmdArgs = tdef.cmd_args - # TODO: Implement full command generation with bash script - return [f"vllm serve {tdef_cmd_args.model}"] - def get_vllm_serve_command(self) -> list[str]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) tdef_cmd_args: VllmCmdArgs = tdef.cmd_args @@ -70,3 +66,17 @@ def generate_serve_run_and_wait_block(self) -> str: echo "Timeout waiting for vLLM to start" exit 1 fi""" + + def _gen_srun_command(self) -> str: + script_path = self.test_run.output_path / self.VLLM_RUN_SCRIPT_NAME + script_path.write_text(self.generate_serve_run_and_wait_block()) + + srun_parts = [ + *self.gen_srun_prefix(), + "--ntasks-per-node=1", + "--ntasks=1", + "bash", + "-c", + f'"{script_path.absolute()}"', + ] + return " ".join(srun_parts) diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 03888c525..f21d3a3c7 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -38,6 +38,11 @@ def vllm_tr(vllm: VllmTestDefinition, tmp_path: Path) -> TestRun: return TestRun(test=vllm, num_nodes=1, nodes=[], output_path=tmp_path, name="vllm-job") +@pytest.fixture +def vllm_cmd_gen_strategy(vllm_tr: TestRun, slurm_system: SlurmSystem) -> VllmSlurmCommandGenStrategy: + return VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + + class TestVllmSlurmCommandGenStrategy: """Test the VllmSlurmCommandGenStrategy class.""" @@ -58,7 +63,7 @@ def test_generate_serve_run_and_wait_block(self, vllm_tr: TestRun, slurm_system: vllm serve {cmd_args.model} --port {cmd_args.port} & VLLM_PID=$! 
-TIMEOUT={cmd_args.vllm_server_wait_seconds} +TIMEOUT={cmd_args.vllm_serve_wait_seconds} SLEEP_INTERVAL=5 HOST=0.0.0.0 PORT={cmd_args.port} @@ -82,3 +87,20 @@ def test_generate_serve_run_and_wait_block(self, vllm_tr: TestRun, slurm_system: fi""" assert block == expected + + def test_gen_srun_command_writes_script_and_returns_srun( + self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy + ) -> None: + srun_command = vllm_cmd_gen_strategy._gen_srun_command() + + script_path = vllm_cmd_gen_strategy.test_run.output_path / VllmSlurmCommandGenStrategy.VLLM_RUN_SCRIPT_NAME + assert script_path.exists() + + expected_script = vllm_cmd_gen_strategy.generate_serve_run_and_wait_block() + assert script_path.read_text() == expected_script + + expected_srun = ( + " ".join(vllm_cmd_gen_strategy.gen_srun_prefix()) + + f' --ntasks-per-node=1 --ntasks=1 bash -c "{script_path.absolute()}"' + ) + assert srun_command == expected_srun From 2df119440ee5953b0f044f9a860551ff759fdea2 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 11:00:27 +0100 Subject: [PATCH 05/45] Add acceptance case --- tests/ref_data/vllm.sbatch | 17 +++++++++++++++++ tests/test_acceptance.py | 16 ++++++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 tests/ref_data/vllm.sbatch diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch new file mode 100644 index 000000000..437fc7c27 --- /dev/null +++ b/tests/ref_data/vllm.sbatch @@ -0,0 +1,17 @@ +#!/bin/bash +# generated by CloudAI@__CLOUDAI_VERSION__ +#SBATCH --job-name=__JOB_NAME__ +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt +#SBATCH --partition=main +#SBATCH -N 1 +#SBATCH --gpus-per-node=8 +#SBATCH --gres=gpu:8 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) + +srun --export=ALL --mpi=pmix -N1 --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 
+ +srun --export=ALL --mpi=pmix -N1 --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash __OUTPUT_DIR__/install/slurm-metadata.sh + +srun --export=ALL --mpi=pmix -N1 --ntasks-per-node=1 --ntasks=1 bash -c "__OUTPUT_DIR__/output/vllm_run.sh" diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index df3bbc06b..48a319369 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -73,6 +73,7 @@ TritonInferenceTestDefinition, ) from cloudai.workloads.ucc_test import UCCCmdArgs, UCCTestDefinition +from cloudai.workloads.vllm import VllmCmdArgs, VllmTestDefinition SLURM_TEST_SCENARIOS = [ {"path": Path("conf/common/test_scenario/sleep.toml"), "expected_dirs_number": 4, "log_file": "sleep_debug.log"}, @@ -262,6 +263,7 @@ def build_special_test_run( "nixl-kvbench", "deepep-benchmark", "osu-bench", + "vllm", ] ) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> Tuple[TestRun, str, Optional[str]]: @@ -492,6 +494,20 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - ), ), ), + "vllm": lambda: create_test_run( + partial_tr, + "vllm", + VllmTestDefinition( + name="vllm", + description="vLLM benchmark", + test_template_name="Vllm", + cmd_args=VllmCmdArgs( + docker_image_url="nvcr.io/nvidia/vllm:latest", + model="Qwen/Qwen3-0.6B", + port=8000, + ), + ), + ), } if request.param.startswith(("gpt-", "grok-", "nemo-run-", "nemo-launcher")): From 90b8eaab901f6da576d99ca5344c814eeb85fd96 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 11:51:22 +0100 Subject: [PATCH 06/45] Control flow from the sbatch --- .../vllm/slurm_command_gen_strategy.py | 82 ++++++++--------- tests/ref_data/vllm.sbatch | 31 ++++++- .../test_vllm_slurm_command_gen_strategy.py | 88 ++++++++++--------- 3 files changed, 116 insertions(+), 85 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 68c0c1b72..5a00acb1b 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -24,8 +24,6 @@ class VllmSlurmCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for vLLM on Slurm systems.""" - VLLM_RUN_SCRIPT_NAME = "vllm_run.sh" - def _container_mounts(self) -> list[str]: return [] @@ -34,49 +32,51 @@ def get_vllm_serve_command(self) -> list[str]: tdef_cmd_args: VllmCmdArgs = tdef.cmd_args return ["vllm", "serve", tdef_cmd_args.model, "--port", str(tdef_cmd_args.port)] - def generate_serve_run_and_wait_block(self) -> str: - """Generate bash block to run vLLM serve and wait for health check.""" + def generate_wait_for_health_function(self) -> str: + """Generate bash function for health check.""" tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args - serve_cmd = " ".join(self.get_vllm_serve_command()) return f"""\ -{serve_cmd} & -VLLM_PID=$! 
+wait_for_health() {{ + local endpoint="$1" + local timeout={cmd_args.vllm_serve_wait_seconds} + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + sleep "$interval" + done -TIMEOUT={cmd_args.vllm_serve_wait_seconds} -SLEEP_INTERVAL=5 -HOST=0.0.0.0 -PORT={cmd_args.port} - -end_time=$(($(date +%s) + TIMEOUT)) -while [ "$(date +%s)" -lt "$end_time" ]; do - if curl -sf "http://${{HOST}}:${{PORT}}/health" > /dev/null 2>&1; then - echo "vLLM server is ready!" - break - fi - if ! kill -0 "$VLLM_PID" 2>/dev/null; then - echo "vLLM server process died unexpectedly!" - exit 1 - fi - sleep "$SLEEP_INTERVAL" -done - -if ! curl -sf "http://${{HOST}}:${{PORT}}/health" > /dev/null 2>&1; then - echo "Timeout waiting for vLLM to start" - exit 1 -fi""" + echo "Timeout waiting for: $endpoint" + return 1 +}}""" def _gen_srun_command(self) -> str: - script_path = self.test_run.output_path / self.VLLM_RUN_SCRIPT_NAME - script_path.write_text(self.generate_serve_run_and_wait_block()) - - srun_parts = [ - *self.gen_srun_prefix(), - "--ntasks-per-node=1", - "--ntasks=1", - "bash", - "-c", - f'"{script_path.absolute()}"', - ] - return " ".join(srun_parts) + """Generate full command flow: server start, health check, cleanup.""" + tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) + cmd_args: VllmCmdArgs = tdef.cmd_args + + srun_prefix = " ".join(self.gen_srun_prefix()) + serve_cmd = " ".join(self.get_vllm_serve_command()) + health_func = self.generate_wait_for_health_function() + + return f"""\ +cleanup() {{ + [ -n "$VLLM_PID" ] && kill $VLLM_PID 2>/dev/null +}} +trap cleanup EXIT + +{health_func} + +# Start vLLM in background +{srun_prefix} --ntasks-per-node=1 --ntasks=1 {serve_cmd} & +VLLM_PID=$! + +# Wait for instances to be ready +NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1""" diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index 437fc7c27..eedb6eeeb 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -14,4 +14,33 @@ srun --export=ALL --mpi=pmix -N1 --output=__OUTPUT_DIR__/output/mapping-stdout.t srun --export=ALL --mpi=pmix -N1 --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash __OUTPUT_DIR__/install/slurm-metadata.sh -srun --export=ALL --mpi=pmix -N1 --ntasks-per-node=1 --ntasks=1 bash -c "__OUTPUT_DIR__/output/vllm_run.sh" +cleanup() { + [ -n "$VLLM_PID" ] && kill $VLLM_PID 2>/dev/null +} +trap cleanup EXIT + +wait_for_health() { + local endpoint="$1" + local timeout=300 + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + sleep "$interval" + done + + echo "Timeout waiting for: $endpoint" + return 1 +} + +# Start vLLM in background +srun --export=ALL --mpi=pmix -N1 --ntasks-per-node=1 --ntasks=1 vllm serve Qwen/Qwen3-0.6B --port 8000 & +VLLM_PID=$! 
+ +# Wait for instances to be ready +NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +wait_for_health "http://${NODE}:8000/health" || exit 1 diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index f21d3a3c7..40076eda7 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -53,54 +53,56 @@ def test_generate_vllm_command(self, vllm_tr: TestRun, slurm_system: SlurmSystem assert command == f"vllm serve {vllm_tr.test.cmd_args.model} --port {vllm_tr.test.cmd_args.port}" - def test_generate_serve_run_and_wait_block(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: - cmd_gen_strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) - cmd_args = vllm_tr.test.cmd_args + def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: + """Test that wait_for_health bash function is generated correctly.""" + cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args - block = cmd_gen_strategy.generate_serve_run_and_wait_block() + func = vllm_cmd_gen_strategy.generate_wait_for_health_function() expected = f"""\ -vllm serve {cmd_args.model} --port {cmd_args.port} & -VLLM_PID=$! +wait_for_health() {{ + local endpoint="$1" + local timeout={cmd_args.vllm_serve_wait_seconds} + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + sleep "$interval" + done + + echo "Timeout waiting for: $endpoint" + return 1 +}}""" + + assert func == expected + + def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: + """Test that _gen_srun_command returns full flow: cleanup, server, health check.""" + cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args + srun_prefix = " ".join(vllm_cmd_gen_strategy.gen_srun_prefix()) + serve_cmd = " ".join(vllm_cmd_gen_strategy.get_vllm_serve_command()) + health_func = vllm_cmd_gen_strategy.generate_wait_for_health_function() -TIMEOUT={cmd_args.vllm_serve_wait_seconds} -SLEEP_INTERVAL=5 -HOST=0.0.0.0 -PORT={cmd_args.port} - -end_time=$(($(date +%s) + TIMEOUT)) -while [ "$(date +%s)" -lt "$end_time" ]; do - if curl -sf "http://${{HOST}}:${{PORT}}/health" > /dev/null 2>&1; then - echo "vLLM server is ready!" - break - fi - if ! kill -0 "$VLLM_PID" 2>/dev/null; then - echo "vLLM server process died unexpectedly!" - exit 1 - fi - sleep "$SLEEP_INTERVAL" -done - -if ! curl -sf "http://${{HOST}}:${{PORT}}/health" > /dev/null 2>&1; then - echo "Timeout waiting for vLLM to start" - exit 1 -fi""" - - assert block == expected - - def test_gen_srun_command_writes_script_and_returns_srun( - self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy - ) -> None: srun_command = vllm_cmd_gen_strategy._gen_srun_command() - script_path = vllm_cmd_gen_strategy.test_run.output_path / VllmSlurmCommandGenStrategy.VLLM_RUN_SCRIPT_NAME - assert script_path.exists() + expected = f"""\ +cleanup() {{ + [ -n "$VLLM_PID" ] && kill $VLLM_PID 2>/dev/null +}} +trap cleanup EXIT + +{health_func} + +# Start vLLM in background +{srun_prefix} --ntasks-per-node=1 --ntasks=1 {serve_cmd} & +VLLM_PID=$! 
- expected_script = vllm_cmd_gen_strategy.generate_serve_run_and_wait_block() - assert script_path.read_text() == expected_script +# Wait for instances to be ready +NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1""" - expected_srun = ( - " ".join(vllm_cmd_gen_strategy.gen_srun_prefix()) - + f' --ntasks-per-node=1 --ntasks=1 bash -c "{script_path.absolute()}"' - ) - assert srun_command == expected_srun + assert srun_command == expected From 8677a02a8df3bcdcc34bcad5f92137f6229dddab Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 12:06:07 +0100 Subject: [PATCH 07/45] Add bench runs --- src/cloudai/workloads/vllm/__init__.py | 3 +- .../vllm/slurm_command_gen_strategy.py | 33 +++++++++++++++++-- src/cloudai/workloads/vllm/vllm.py | 12 ++++++- tests/ref_data/vllm.sbatch | 9 +++-- .../test_vllm_slurm_command_gen_strategy.py | 29 ++++++++++++++-- 5 files changed, 76 insertions(+), 10 deletions(-) diff --git a/src/cloudai/workloads/vllm/__init__.py b/src/cloudai/workloads/vllm/__init__.py index 07b2be366..a38af059c 100644 --- a/src/cloudai/workloads/vllm/__init__.py +++ b/src/cloudai/workloads/vllm/__init__.py @@ -15,9 +15,10 @@ # limitations under the License. from .slurm_command_gen_strategy import VllmSlurmCommandGenStrategy -from .vllm import VllmCmdArgs, VllmTestDefinition +from .vllm import VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition __all__ = [ + "VllmBenchCmdArgs", "VllmCmdArgs", "VllmSlurmCommandGenStrategy", "VllmTestDefinition", diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 5a00acb1b..bdb0c539a 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -27,11 +27,37 @@ class VllmSlurmCommandGenStrategy(SlurmCommandGenStrategy): def _container_mounts(self) -> list[str]: return [] + def image_path(self) -> str | None: + tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) + return tdef.cmd_args.docker_image_url + def get_vllm_serve_command(self) -> list[str]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) tdef_cmd_args: VllmCmdArgs = tdef.cmd_args return ["vllm", "serve", tdef_cmd_args.model, "--port", str(tdef_cmd_args.port)] + def get_vllm_bench_command(self) -> list[str]: + tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) + cmd_args: VllmCmdArgs = tdef.cmd_args + bench_args = tdef.bench_cmd_args + return [ + "vllm", + "bench", + "serve", + "--model", + cmd_args.model, + "--base-url", + f"http://${{NODE}}:{cmd_args.port}", + "--random-input-len", + str(bench_args.random_input_len), + "--random-output-len", + str(bench_args.random_output_len), + "--max-concurrency", + str(bench_args.max_concurrency), + "--num-prompts", + str(bench_args.num_prompts), + ] + def generate_wait_for_health_function(self) -> str: """Generate bash function for health check.""" tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) @@ -57,12 +83,12 @@ def generate_wait_for_health_function(self) -> str: }}""" def _gen_srun_command(self) -> str: - """Generate full command flow: server start, health check, cleanup.""" tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args srun_prefix = " ".join(self.gen_srun_prefix()) serve_cmd = " ".join(self.get_vllm_serve_command()) + bench_cmd = " 
".join(self.get_vllm_bench_command()) health_func = self.generate_wait_for_health_function() return f"""\ @@ -79,4 +105,7 @@ def _gen_srun_command(self) -> str: # Wait for instances to be ready NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1""" +wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 + +# Run benchmark +{srun_prefix} --ntasks-per-node=1 --ntasks=1 {bench_cmd}""" diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 3aa5447a3..755e7eb2b 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -20,7 +20,7 @@ class VllmCmdArgs(CmdArgs): - """vLLM test command arguments.""" + """vLLM serve command arguments.""" docker_image_url: str port: int = 8000 @@ -28,10 +28,20 @@ class VllmCmdArgs(CmdArgs): model: str = "Qwen/Qwen3-0.6B" +class VllmBenchCmdArgs(CmdArgs): + """vLLM bench serve command arguments.""" + + random_input_len: int = 16 + random_output_len: int = 128 + max_concurrency: int = 16 + num_prompts: int = 30 + + class VllmTestDefinition(TestDefinition): """Test object for vLLM.""" cmd_args: VllmCmdArgs + bench_cmd_args: VllmBenchCmdArgs = VllmBenchCmdArgs() @property def installables(self) -> list[Installable]: diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index eedb6eeeb..af2f6d716 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -srun --export=ALL --mpi=pmix -N1 --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." -srun --export=ALL --mpi=pmix -N1 --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash __OUTPUT_DIR__/install/slurm-metadata.sh +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { [ -n "$VLLM_PID" ] && kill $VLLM_PID 2>/dev/null @@ -38,9 +38,12 @@ wait_for_health() { } # Start vLLM in background -srun --export=ALL --mpi=pmix -N1 --ntasks-per-node=1 --ntasks=1 vllm serve Qwen/Qwen3-0.6B --port 8000 & +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks-per-node=1 --ntasks=1 vllm serve Qwen/Qwen3-0.6B --port 8000 & VLLM_PID=$! 
# Wait for instances to be ready NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) wait_for_health "http://${NODE}:8000/health" || exit 1 + +# Run benchmark +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks-per-node=1 --ntasks=1 vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 40076eda7..54c54a4ee 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. from pathlib import Path +from typing import cast import pytest @@ -80,11 +81,30 @@ def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlur assert func == expected + def test_get_vllm_bench_command(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: + tdef = cast(VllmTestDefinition, vllm_cmd_gen_strategy.test_run.test) + cmd_args = tdef.cmd_args + bench_args = tdef.bench_cmd_args + + command = " ".join(vllm_cmd_gen_strategy.get_vllm_bench_command()) + + expected = ( + f"vllm bench serve --model {cmd_args.model} " + f"--base-url http://${{NODE}}:{cmd_args.port} " + f"--random-input-len {bench_args.random_input_len} " + f"--random-output-len {bench_args.random_output_len} " + f"--max-concurrency {bench_args.max_concurrency} " + f"--num-prompts {bench_args.num_prompts}" + ) + assert command == expected + def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: - """Test that _gen_srun_command returns full flow: cleanup, server, health check.""" - cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args + """Test that _gen_srun_command returns full flow: cleanup, server, health check, bench.""" + tdef = vllm_cmd_gen_strategy.test_run.test + cmd_args = tdef.cmd_args srun_prefix = " ".join(vllm_cmd_gen_strategy.gen_srun_prefix()) serve_cmd = " ".join(vllm_cmd_gen_strategy.get_vllm_serve_command()) + bench_cmd = " ".join(vllm_cmd_gen_strategy.get_vllm_bench_command()) health_func = vllm_cmd_gen_strategy.generate_wait_for_health_function() srun_command = vllm_cmd_gen_strategy._gen_srun_command() @@ -103,6 +123,9 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman # Wait for instances to be ready NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) -wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1""" +wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 + +# Run benchmark +{srun_prefix} --ntasks-per-node=1 --ntasks=1 {bench_cmd}""" assert srun_command == expected From 3f513fa7a29fb3406a251ef9859c7b642ed1ef3c Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 12:34:26 +0100 Subject: [PATCH 08/45] Redirect outputs and use --overlap --- .../workloads/vllm/slurm_command_gen_strategy.py | 15 ++++++++++----- tests/ref_data/vllm.sbatch | 13 ++++++++----- .../test_vllm_slurm_command_gen_strategy.py | 14 +++++++++----- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py 
b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index bdb0c539a..3c4627a58 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -83,8 +83,10 @@ def generate_wait_for_health_function(self) -> str: }}""" def _gen_srun_command(self) -> str: + """Generate full command flow: cleanup, server start, health check, benchmark.""" tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args + output_path = self.test_run.output_path.absolute() srun_prefix = " ".join(self.gen_srun_prefix()) serve_cmd = " ".join(self.get_vllm_serve_command()) @@ -99,13 +101,16 @@ def _gen_srun_command(self) -> str: {health_func} -# Start vLLM in background -{srun_prefix} --ntasks-per-node=1 --ntasks=1 {serve_cmd} & +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-serve-stdout.txt \\ + --error={output_path}/vllm-serve-stderr.txt \\ + {serve_cmd} & VLLM_PID=$! -# Wait for instances to be ready NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 -# Run benchmark -{srun_prefix} --ntasks-per-node=1 --ntasks=1 {bench_cmd}""" +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-bench-stdout.txt \\ + --error={output_path}/vllm-bench-stderr.txt \\ + {bench_cmd}""" diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index af2f6d716..7a2a92970 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -37,13 +37,16 @@ wait_for_health() { return 1 } -# Start vLLM in background -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks-per-node=1 --ntasks=1 vllm serve Qwen/Qwen3-0.6B --port 8000 & +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ + --output=__OUTPUT_DIR__/output/vllm-serve-stdout.txt \ + --error=__OUTPUT_DIR__/output/vllm-serve-stderr.txt \ + vllm serve Qwen/Qwen3-0.6B --port 8000 & VLLM_PID=$! 
-# Wait for instances to be ready NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) wait_for_health "http://${NODE}:8000/health" || exit 1 -# Run benchmark -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks-per-node=1 --ntasks=1 vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ + --output=__OUTPUT_DIR__/output/vllm-bench-stdout.txt \ + --error=__OUTPUT_DIR__/output/vllm-bench-stderr.txt \ + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 54c54a4ee..49e907541 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -102,6 +102,7 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman """Test that _gen_srun_command returns full flow: cleanup, server, health check, bench.""" tdef = vllm_cmd_gen_strategy.test_run.test cmd_args = tdef.cmd_args + output_path = vllm_cmd_gen_strategy.test_run.output_path.absolute() srun_prefix = " ".join(vllm_cmd_gen_strategy.gen_srun_prefix()) serve_cmd = " ".join(vllm_cmd_gen_strategy.get_vllm_serve_command()) bench_cmd = " ".join(vllm_cmd_gen_strategy.get_vllm_bench_command()) @@ -117,15 +118,18 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman {health_func} -# Start vLLM in background -{srun_prefix} --ntasks-per-node=1 --ntasks=1 {serve_cmd} & +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-serve-stdout.txt \\ + --error={output_path}/vllm-serve-stderr.txt \\ + {serve_cmd} & VLLM_PID=$! 
-# Wait for instances to be ready NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 -# Run benchmark -{srun_prefix} --ntasks-per-node=1 --ntasks=1 {bench_cmd}""" +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-bench-stdout.txt \\ + --error={output_path}/vllm-bench-stderr.txt \\ + {bench_cmd}""" assert srun_command == expected From ea4845646c58f5d3a9e5aeda9602bcdcd6e5c2bc Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 12:39:08 +0100 Subject: [PATCH 09/45] Update to use docker cache --- .../workloads/vllm/slurm_command_gen_strategy.py | 2 +- src/cloudai/workloads/vllm/vllm.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 3c4627a58..fa8fc0625 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -29,7 +29,7 @@ def _container_mounts(self) -> list[str]: def image_path(self) -> str | None: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) - return tdef.cmd_args.docker_image_url + return str(tdef.docker_image.installed_path) def get_vllm_serve_command(self) -> list[str]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 755e7eb2b..84224254a 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -15,7 +15,7 @@ # limitations under the License. -from cloudai.core import Installable +from cloudai.core import DockerImage, Installable from cloudai.models.workload import CmdArgs, TestDefinition @@ -43,6 +43,14 @@ class VllmTestDefinition(TestDefinition): cmd_args: VllmCmdArgs bench_cmd_args: VllmBenchCmdArgs = VllmBenchCmdArgs() + _docker_image: DockerImage | None = None + + @property + def docker_image(self) -> DockerImage: + if not self._docker_image: + self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) + return self._docker_image + @property def installables(self) -> list[Installable]: - return [] + return [*self.git_repos, self.docker_image] From e9080fd40883827433dc9028312eb36f52fa05c5 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 12:52:11 +0100 Subject: [PATCH 10/45] Log steps and less output files --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 9 +++++---- tests/ref_data/vllm.sbatch | 9 +++++---- .../test_vllm_slurm_command_gen_strategy.py | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index fa8fc0625..c48bb0792 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -101,16 +101,17 @@ def _gen_srun_command(self) -> str: {health_func} +echo "Starting vLLM instances..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-serve-stdout.txt \\ - --error={output_path}/vllm-serve-stderr.txt \\ + --output={output_path}/vllm-serve.txt \\ {serve_cmd} & VLLM_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +echo "Waiting for vLLM on $NODE to be ready..." 
wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 +echo "Running benchmark..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-bench-stdout.txt \\ - --error={output_path}/vllm-bench-stderr.txt \\ + --output={output_path}/vllm-bench.txt \\ {bench_cmd}""" diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index 7a2a92970..a7ff313c9 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -37,16 +37,17 @@ wait_for_health() { return 1 } +echo "Starting vLLM instances..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ - --output=__OUTPUT_DIR__/output/vllm-serve-stdout.txt \ - --error=__OUTPUT_DIR__/output/vllm-serve-stderr.txt \ + --output=__OUTPUT_DIR__/output/vllm-serve.txt \ vllm serve Qwen/Qwen3-0.6B --port 8000 & VLLM_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +echo "Waiting for vLLM on $NODE to be ready..." wait_for_health "http://${NODE}:8000/health" || exit 1 +echo "Running benchmark..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ - --output=__OUTPUT_DIR__/output/vllm-bench-stdout.txt \ - --error=__OUTPUT_DIR__/output/vllm-bench-stderr.txt \ + --output=__OUTPUT_DIR__/output/vllm-bench.txt \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 49e907541..5c76202e6 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -118,18 +118,19 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman {health_func} +echo "Starting vLLM instances..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-serve-stdout.txt \\ - --error={output_path}/vllm-serve-stderr.txt \\ + --output={output_path}/vllm-serve.txt \\ {serve_cmd} & VLLM_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +echo "Waiting for vLLM on $NODE to be ready..." wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 +echo "Running benchmark..." 
{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-bench-stdout.txt \\ - --error={output_path}/vllm-bench-stderr.txt \\ + --output={output_path}/vllm-bench.txt \\ {bench_cmd}""" assert srun_command == expected From 1390886af5efff310f848eaaffb8cfec82f0d3b0 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 13:05:44 +0100 Subject: [PATCH 11/45] Check if run successfull --- src/cloudai/workloads/vllm/__init__.py | 3 +- .../vllm/slurm_command_gen_strategy.py | 6 +- src/cloudai/workloads/vllm/vllm.py | 19 +++++- ...test_vllm_job_status_retrieval_strategy.py | 59 +++++++++++++++++++ 4 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py diff --git a/src/cloudai/workloads/vllm/__init__.py b/src/cloudai/workloads/vllm/__init__.py index a38af059c..65c9cdd08 100644 --- a/src/cloudai/workloads/vllm/__init__.py +++ b/src/cloudai/workloads/vllm/__init__.py @@ -15,9 +15,10 @@ # limitations under the License. from .slurm_command_gen_strategy import VllmSlurmCommandGenStrategy -from .vllm import VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition +from .vllm import VLLM_BENCH_LOG_FILE, VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition __all__ = [ + "VLLM_BENCH_LOG_FILE", "VllmBenchCmdArgs", "VllmCmdArgs", "VllmSlurmCommandGenStrategy", diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index c48bb0792..a6c49614a 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -18,7 +18,7 @@ from cloudai.systems.slurm import SlurmCommandGenStrategy -from .vllm import VllmCmdArgs, VllmTestDefinition +from .vllm import VLLM_BENCH_LOG_FILE, VLLM_SERVE_LOG_FILE, VllmCmdArgs, VllmTestDefinition class VllmSlurmCommandGenStrategy(SlurmCommandGenStrategy): @@ -103,7 +103,7 @@ def _gen_srun_command(self) -> str: echo "Starting vLLM instances..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-serve.txt \\ + --output={output_path}/{VLLM_SERVE_LOG_FILE} \\ {serve_cmd} & VLLM_PID=$! @@ -113,5 +113,5 @@ def _gen_srun_command(self) -> str: echo "Running benchmark..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-bench.txt \\ + --output={output_path}/{VLLM_BENCH_LOG_FILE} \\ {bench_cmd}""" diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 84224254a..bed2f89f6 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -15,9 +15,12 @@ # limitations under the License. 
-from cloudai.core import DockerImage, Installable +from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun from cloudai.models.workload import CmdArgs, TestDefinition +VLLM_SERVE_LOG_FILE = "vllm-serve.log" +VLLM_BENCH_LOG_FILE = "vllm-bench.log" + class VllmCmdArgs(CmdArgs): """vLLM serve command arguments.""" @@ -54,3 +57,17 @@ def docker_image(self) -> DockerImage: @property def installables(self) -> list[Installable]: return [*self.git_repos, self.docker_image] + + def was_run_successful(self, tr: TestRun) -> JobStatusResult: + log_path = tr.output_path / VLLM_BENCH_LOG_FILE + if not log_path.is_file(): + return JobStatusResult(is_successful=False, error_message=f"vLLM bench log not found in {tr.output_path}.") + + with log_path.open("r") as f: + for line in f: + if "============ Serving Benchmark Result ============" in line: + return JobStatusResult(is_successful=True) + + return JobStatusResult( + is_successful=False, error_message=f"vLLM bench log does not contain benchmark result in {tr.output_path}." + ) diff --git a/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py b/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py new file mode 100644 index 000000000..6e49a358d --- /dev/null +++ b/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cloudai.core import TestRun +from cloudai.workloads.vllm import VLLM_BENCH_LOG_FILE, VllmCmdArgs, VllmTestDefinition + + +class TestVllmSuccessCheck: + def setup_method(self) -> None: + self.vllm_tdef = VllmTestDefinition( + name="vllm", + description="vLLM benchmark", + test_template_name="Vllm", + cmd_args=VllmCmdArgs(docker_image_url="nvcr.io/nvidia/vllm:latest"), + ) + + def test_no_bench_log_file(self, base_tr: TestRun) -> None: + result = self.vllm_tdef.was_run_successful(base_tr) + assert not result.is_successful + assert result.error_message == f"vLLM bench log not found in {base_tr.output_path}." 
+ + def test_successful_job(self, base_tr: TestRun) -> None: + base_tr.output_path.mkdir(parents=True, exist_ok=True) + log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE + log_content = "============ Serving Benchmark Result ============" + log_file.write_text(log_content) + result = self.vllm_tdef.was_run_successful(base_tr) + assert result.is_successful + assert result.error_message == "" + + def test_failed_job_no_result(self, base_tr: TestRun) -> None: + base_tr.output_path.mkdir(parents=True, exist_ok=True) + log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE + log_content = "Starting benchmark...\nsome line\n" + log_file.write_text(log_content) + result = self.vllm_tdef.was_run_successful(base_tr) + assert not result.is_successful + assert result.error_message == f"vLLM bench log does not contain benchmark result in {base_tr.output_path}." + + def test_empty_log_file(self, base_tr: TestRun) -> None: + base_tr.output_path.mkdir(parents=True, exist_ok=True) + log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE + log_file.touch() + result = self.vllm_tdef.was_run_successful(base_tr) + assert not result.is_successful + assert result.error_message == f"vLLM bench log does not contain benchmark result in {base_tr.output_path}." From 3544898a580b42fa5d77753aed591e5ba01fe0b8 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 13:32:44 +0100 Subject: [PATCH 12/45] Fix tests --- tests/ref_data/vllm.sbatch | 4 ++-- .../test_vllm_slurm_command_gen_strategy.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index a7ff313c9..11356b48f 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -39,7 +39,7 @@ wait_for_health() { echo "Starting vLLM instances..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ - --output=__OUTPUT_DIR__/output/vllm-serve.txt \ + --output=__OUTPUT_DIR__/output/vllm-serve.log \ vllm serve Qwen/Qwen3-0.6B --port 8000 & VLLM_PID=$! @@ -49,5 +49,5 @@ wait_for_health "http://${NODE}:8000/health" || exit 1 echo "Running benchmark..." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ - --output=__OUTPUT_DIR__/output/vllm-bench.txt \ + --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 5c76202e6..e78a14797 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -22,6 +22,7 @@ from cloudai.core import TestRun from cloudai.systems.slurm import SlurmSystem from cloudai.workloads.vllm import VllmCmdArgs, VllmSlurmCommandGenStrategy, VllmTestDefinition +from cloudai.workloads.vllm.vllm import VLLM_BENCH_LOG_FILE, VLLM_SERVE_LOG_FILE @pytest.fixture @@ -31,6 +32,7 @@ def vllm() -> VllmTestDefinition: description="vLLM benchmark test", test_template_name="Vllm", cmd_args=VllmCmdArgs(docker_image_url="nvcr.io/nvidia/vllm:latest", model="Qwen/Qwen3-0.6B", port=8000), + extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"}, ) @@ -44,8 +46,8 @@ def vllm_cmd_gen_strategy(vllm_tr: TestRun, slurm_system: SlurmSystem) -> VllmSl return VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) -class TestVllmSlurmCommandGenStrategy: - """Test the VllmSlurmCommandGenStrategy class.""" +class TestVllmAggregatedMode: + """Tests for vLLM non-disaggregated mode with 1 GPU.""" def test_generate_vllm_command(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: cmd_gen_strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) @@ -55,7 +57,6 @@ def test_generate_vllm_command(self, vllm_tr: TestRun, slurm_system: SlurmSystem assert command == f"vllm serve {vllm_tr.test.cmd_args.model} --port {vllm_tr.test.cmd_args.port}" def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: - """Test that wait_for_health bash function is generated correctly.""" cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args func = vllm_cmd_gen_strategy.generate_wait_for_health_function() @@ -99,7 +100,6 @@ def test_get_vllm_bench_command(self, vllm_cmd_gen_strategy: VllmSlurmCommandGen assert command == expected def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: - """Test that _gen_srun_command returns full flow: cleanup, server, health check, bench.""" tdef = vllm_cmd_gen_strategy.test_run.test cmd_args = tdef.cmd_args output_path = vllm_cmd_gen_strategy.test_run.output_path.absolute() @@ -120,7 +120,7 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman echo "Starting vLLM instances..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-serve.txt \\ + --output={output_path}/{VLLM_SERVE_LOG_FILE} \\ {serve_cmd} & VLLM_PID=$! @@ -130,7 +130,7 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman echo "Running benchmark..." 
{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --output={output_path}/vllm-bench.txt \\ + --output={output_path}/{VLLM_BENCH_LOG_FILE} \\ {bench_cmd}""" assert srun_command == expected From 573a35fae016831d5f9739071aca402923dc4b5d Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 13:52:04 +0100 Subject: [PATCH 13/45] Prepare for disagg --- .../vllm/slurm_command_gen_strategy.py | 20 ++++++++--- tests/ref_data/vllm.sbatch | 2 +- .../test_vllm_slurm_command_gen_strategy.py | 34 ++++++++++++++++--- tests/test_acceptance.py | 1 + 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index a6c49614a..7530c92e9 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -31,10 +31,21 @@ def image_path(self) -> str | None: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) return str(tdef.docker_image.installed_path) - def get_vllm_serve_command(self) -> list[str]: + @property + def gpu_ids(self) -> list[int]: + cuda_devices = self.test_run.test.extra_env_vars.get("CUDA_VISIBLE_DEVICES") + if cuda_devices: + return [int(gpu_id) for gpu_id in str(cuda_devices).split(",")] + return list(range(self.system.gpus_per_node or 1)) + + def get_vllm_serve_commands(self) -> list[list[str]]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) - tdef_cmd_args: VllmCmdArgs = tdef.cmd_args - return ["vllm", "serve", tdef_cmd_args.model, "--port", str(tdef_cmd_args.port)] + cmd_args: VllmCmdArgs = tdef.cmd_args + base_cmd = ["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port)] + if len(self.gpu_ids) == 1: + return [base_cmd] + + return [[]] # TODO def get_vllm_bench_command(self) -> list[str]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) @@ -83,13 +94,12 @@ def generate_wait_for_health_function(self) -> str: }}""" def _gen_srun_command(self) -> str: - """Generate full command flow: cleanup, server start, health check, benchmark.""" tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args output_path = self.test_run.output_path.absolute() srun_prefix = " ".join(self.gen_srun_prefix()) - serve_cmd = " ".join(self.get_vllm_serve_command()) + serve_cmd = " ".join(self.get_vllm_serve_commands()[0]) bench_cmd = " ".join(self.get_vllm_bench_command()) health_func = self.generate_wait_for_health_function() diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index 11356b48f..73eba98c7 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -9,7 +9,7 @@ #SBATCH --gres=gpu:8 export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) - +export CUDA_VISIBLE_DEVICES=0 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index e78a14797..3c7b0645f 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -46,15 +46,39 @@ def vllm_cmd_gen_strategy(vllm_tr: TestRun, slurm_system: SlurmSystem) -> VllmSl return VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) +class TestGpuDetection: + """Tests for GPU detection logic.""" + + @pytest.mark.parametrize("cuda_visible_devices", ["0", "0,1,2,3", "0,1,2,3,4,5,6,7"]) + def test_gpu_ids_from_cuda_visible_devices_single( + self, cuda_visible_devices: str, vllm_tr: TestRun, slurm_system: SlurmSystem + ) -> None: + vllm_tr.test.extra_env_vars = {"CUDA_VISIBLE_DEVICES": cuda_visible_devices} + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + assert strategy.gpu_ids == [int(gpu_id) for gpu_id in cuda_visible_devices.split(",")] + + @pytest.mark.parametrize("gpus_per_node", [None, 1, 8]) + def test_gpu_ids_fallback_to_system( + self, gpus_per_node: int | None, vllm_tr: TestRun, slurm_system: SlurmSystem + ) -> None: + vllm_tr.test.extra_env_vars = {} + slurm_system.gpus_per_node = gpus_per_node + + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + + assert strategy.gpu_ids == list(range(gpus_per_node or 1)) + + class TestVllmAggregatedMode: """Tests for vLLM non-disaggregated mode with 1 GPU.""" - def test_generate_vllm_command(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: - cmd_gen_strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + def test_get_vllm_serve_commands_single_gpu(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: + cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args - command = " ".join(cmd_gen_strategy.get_vllm_serve_command()) + commands = vllm_cmd_gen_strategy.get_vllm_serve_commands() - assert command == f"vllm serve {vllm_tr.test.cmd_args.model} --port {vllm_tr.test.cmd_args.port}" + assert len(commands) == 1 + assert commands[0] == ["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port)] def test_generate_wait_for_health_function(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: cmd_args = vllm_cmd_gen_strategy.test_run.test.cmd_args @@ -104,7 +128,7 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman cmd_args = tdef.cmd_args output_path = vllm_cmd_gen_strategy.test_run.output_path.absolute() srun_prefix = " ".join(vllm_cmd_gen_strategy.gen_srun_prefix()) - serve_cmd = " ".join(vllm_cmd_gen_strategy.get_vllm_serve_command()) + serve_cmd = " ".join(vllm_cmd_gen_strategy.get_vllm_serve_commands()[0]) bench_cmd = " ".join(vllm_cmd_gen_strategy.get_vllm_bench_command()) health_func = vllm_cmd_gen_strategy.generate_wait_for_health_function() diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 48a319369..5eafb58be 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -506,6 +506,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: 
partial[TestRun]) - model="Qwen/Qwen3-0.6B", port=8000, ), + extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"}, ), ), } From e57bc770dd7fe6ac7b902609d95a4e4a0fce5ca4 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 14:50:20 +0100 Subject: [PATCH 14/45] vLLM disagg mode --- .../vllm/slurm_command_gen_strategy.py | 135 +++++++++++++++++- src/cloudai/workloads/vllm/vllm.py | 1 + tests/ref_data/vllm-disagg.sbatch | 67 +++++++++ .../test_vllm_slurm_command_gen_strategy.py | 126 ++++++++++++++++ tests/test_acceptance.py | 16 +++ 5 files changed, 340 insertions(+), 5 deletions(-) create mode 100644 tests/ref_data/vllm-disagg.sbatch diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 7530c92e9..f134c77a4 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -38,14 +38,68 @@ def gpu_ids(self) -> list[int]: return [int(gpu_id) for gpu_id in str(cuda_devices).split(",")] return list(range(self.system.gpus_per_node or 1)) + @property + def prefill_gpu_ids(self) -> list[int]: + """Return first half of GPUs for prefill.""" + mid = len(self.gpu_ids) // 2 + return self.gpu_ids[:mid] + + @property + def decode_gpu_ids(self) -> list[int]: + """Return second half of GPUs for decode.""" + mid = len(self.gpu_ids) // 2 + return self.gpu_ids[mid:] + def get_vllm_serve_commands(self) -> list[list[str]]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args - base_cmd = ["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port)] + if len(self.gpu_ids) == 1: - return [base_cmd] + return [["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port)]] + + prefill_port = cmd_args.port + 100 + decode_port = cmd_args.port + 200 + + prefill_cmd = [ + "vllm", + "serve", + cmd_args.model, + "--port", + str(prefill_port), + "--kv-transfer-config", + '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}', + ] + decode_cmd = [ + "vllm", + "serve", + cmd_args.model, + "--port", + str(decode_port), + "--kv-transfer-config", + '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}', + ] + return [prefill_cmd, decode_cmd] - return [[]] # TODO + def get_proxy_command(self) -> list[str]: + """Return proxy server command for disaggregated mode.""" + tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) + cmd_args: VllmCmdArgs = tdef.cmd_args + prefill_port = cmd_args.port + 100 + decode_port = cmd_args.port + 200 + return [ + "python3", + cmd_args.proxy_script, + "--port", + str(cmd_args.port), + "--prefiller-hosts", + "localhost", + "--prefiller-ports", + str(prefill_port), + "--decoder-hosts", + "localhost", + "--decoder-ports", + str(decode_port), + ] def get_vllm_bench_command(self) -> list[str]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) @@ -99,10 +153,28 @@ def _gen_srun_command(self) -> str: output_path = self.test_run.output_path.absolute() srun_prefix = " ".join(self.gen_srun_prefix()) - serve_cmd = " ".join(self.get_vllm_serve_commands()[0]) + serve_commands = self.get_vllm_serve_commands() bench_cmd = " ".join(self.get_vllm_bench_command()) health_func = self.generate_wait_for_health_function() + if len(serve_commands) == 1: + return self._gen_aggregated_script( + srun_prefix, serve_commands[0], bench_cmd, health_func, cmd_args, output_path + ) + else: + return self._gen_disaggregated_script( + srun_prefix, serve_commands, 
bench_cmd, health_func, cmd_args, output_path + ) + + def _gen_aggregated_script( + self, + srun_prefix: str, + serve_cmd: list[str], + bench_cmd: str, + health_func: str, + cmd_args: VllmCmdArgs, + output_path, + ) -> str: return f"""\ cleanup() {{ [ -n "$VLLM_PID" ] && kill $VLLM_PID 2>/dev/null @@ -114,13 +186,66 @@ def _gen_srun_command(self) -> str: echo "Starting vLLM instances..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/{VLLM_SERVE_LOG_FILE} \\ - {serve_cmd} & + {" ".join(serve_cmd)} & VLLM_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) echo "Waiting for vLLM on $NODE to be ready..." wait_for_health "http://${{NODE}}:{cmd_args.port}/health" || exit 1 +echo "Running benchmark..." +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/{VLLM_BENCH_LOG_FILE} \\ + {bench_cmd}""" + + def _gen_disaggregated_script( + self, + srun_prefix: str, + serve_commands: list[list[str]], + bench_cmd: str, + health_func: str, + cmd_args: VllmCmdArgs, + output_path, + ) -> str: + prefill_cmd, decode_cmd = serve_commands + proxy_cmd = self.get_proxy_command() + prefill_port = cmd_args.port + 100 + decode_port = cmd_args.port + 200 + prefill_gpus = ",".join(str(g) for g in self.prefill_gpu_ids) + decode_gpus = ",".join(str(g) for g in self.decode_gpu_ids) + + return f"""\ +cleanup() {{ + [ -n "$PREFILL_PID" ] && kill $PREFILL_PID 2>/dev/null + [ -n "$DECODE_PID" ] && kill $DECODE_PID 2>/dev/null + [ -n "$PROXY_PID" ] && kill $PROXY_PID 2>/dev/null +}} +trap cleanup EXIT + +{health_func} + +echo "Starting vLLM instances..." +CUDA_VISIBLE_DEVICES={prefill_gpus} {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-prefill.log \\ + {" ".join(prefill_cmd)} & +PREFILL_PID=$! + +CUDA_VISIBLE_DEVICES={decode_gpus} {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-decode.log \\ + {" ".join(decode_cmd)} & +DECODE_PID=$! + +NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +echo "Waiting for vLLM on $NODE to be ready..." +wait_for_health "http://${{NODE}}:{prefill_port}/health" || exit 1 +wait_for_health "http://${{NODE}}:{decode_port}/health" || exit 1 + +echo "Starting proxy..." +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-proxy.log \\ + {" ".join(proxy_cmd)} & +PROXY_PID=$! + echo "Running benchmark..." 
{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/{VLLM_BENCH_LOG_FILE} \\ diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index bed2f89f6..54df95797 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -29,6 +29,7 @@ class VllmCmdArgs(CmdArgs): port: int = 8000 vllm_serve_wait_seconds: int = 300 model: str = "Qwen/Qwen3-0.6B" + proxy_script: str = "/opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" class VllmBenchCmdArgs(CmdArgs): diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch new file mode 100644 index 000000000..412adc9be --- /dev/null +++ b/tests/ref_data/vllm-disagg.sbatch @@ -0,0 +1,67 @@ +#!/bin/bash +# generated by CloudAI@__CLOUDAI_VERSION__ +#SBATCH --job-name=__JOB_NAME__ +#SBATCH --output=__OUTPUT_DIR__/output/stdout.txt +#SBATCH --error=__OUTPUT_DIR__/output/stderr.txt +#SBATCH --partition=main +#SBATCH -N 1 +#SBATCH --gpus-per-node=8 +#SBATCH --gres=gpu:8 + +export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +export CUDA_VISIBLE_DEVICES=0,1,2,3 +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." + +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh + +cleanup() { + [ -n "$PREFILL_PID" ] && kill $PREFILL_PID 2>/dev/null + [ -n "$DECODE_PID" ] && kill $DECODE_PID 2>/dev/null + [ -n "$PROXY_PID" ] && kill $PROXY_PID 2>/dev/null +} +trap cleanup EXIT + +wait_for_health() { + local endpoint="$1" + local timeout=300 + local interval=5 + local end_time=$(($(date +%s) + timeout)) + + while [ "$(date +%s)" -lt "$end_time" ]; do + if curl -sf "$endpoint" > /dev/null 2>&1; then + echo "Health check passed: $endpoint" + return 0 + fi + sleep "$interval" + done + + echo "Timeout waiting for: $endpoint" + return 1 +} + +echo "Starting vLLM instances..." +CUDA_VISIBLE_DEVICES=0,1 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ + --output=__OUTPUT_DIR__/output/vllm-prefill.log \ + vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config {"kv_connector":"NixlConnector","kv_role":"kv_producer"} & +PREFILL_PID=$! + +CUDA_VISIBLE_DEVICES=2,3 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ + --output=__OUTPUT_DIR__/output/vllm-decode.log \ + vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config {"kv_connector":"NixlConnector","kv_role":"kv_consumer"} & +DECODE_PID=$! 
+ +NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +echo "Waiting for vLLM on $NODE to be ready..." +wait_for_health "http://${NODE}:8100/health" || exit 1 +wait_for_health "http://${NODE}:8200/health" || exit 1 + +echo "Starting proxy..." +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ + --output=__OUTPUT_DIR__/output/vllm-proxy.log \ + python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts localhost --prefiller-ports 8100 --decoder-hosts localhost --decoder-ports 8200 & +PROXY_PID=$! + +echo "Running benchmark..." +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ + --output=__OUTPUT_DIR__/output/vllm-bench.log \ + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 3c7b0645f..37b41e47c 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -46,6 +46,13 @@ def vllm_cmd_gen_strategy(vllm_tr: TestRun, slurm_system: SlurmSystem) -> VllmSl return VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) +@pytest.fixture +def vllm_disagg_tr(vllm: VllmTestDefinition, tmp_path: Path) -> TestRun: + """TestRun for disaggregated mode with 4 GPUs.""" + vllm.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} + return TestRun(test=vllm, num_nodes=1, nodes=[], output_path=tmp_path, name="vllm-disagg-job") + + class TestGpuDetection: """Tests for GPU detection logic.""" @@ -158,3 +165,122 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman {bench_cmd}""" assert srun_command == expected + + +class TestVllmDisaggregatedMode: + """Tests for vLLM disaggregated mode with multiple GPUs.""" + + def test_prefill_gpu_ids(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: + """Prefill gets first half of GPUs.""" + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) + assert strategy.prefill_gpu_ids == [0, 1] + + def test_decode_gpu_ids(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: + """Decode gets second half of GPUs.""" + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) + assert strategy.decode_gpu_ids == [2, 3] + + def test_get_vllm_serve_commands_returns_two(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: + """Disagg mode returns prefill and decode commands.""" + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) + cmd_args = vllm_disagg_tr.test.cmd_args + + commands = strategy.get_vllm_serve_commands() + + assert len(commands) == 2 + prefill_cmd, decode_cmd = commands + + assert prefill_cmd == [ + "vllm", + "serve", + cmd_args.model, + "--port", + str(cmd_args.port + 100), + "--kv-transfer-config", + '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}', + ] + assert decode_cmd == [ + "vllm", + "serve", + cmd_args.model, + "--port", + 
str(cmd_args.port + 200), + "--kv-transfer-config", + '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}', + ] + + def test_get_proxy_command(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: + """Proxy command routes to prefill and decode ports.""" + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) + cmd_args = vllm_disagg_tr.test.cmd_args + + command = strategy.get_proxy_command() + + assert command == [ + "python3", + cmd_args.proxy_script, + "--port", + str(cmd_args.port), + "--prefiller-hosts", + "localhost", + "--prefiller-ports", + str(cmd_args.port + 100), + "--decoder-hosts", + "localhost", + "--decoder-ports", + str(cmd_args.port + 200), + ] + + def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: + """Disagg mode starts prefill, decode, and proxy, waits for health checks.""" + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_disagg_tr) + cmd_args = vllm_disagg_tr.test.cmd_args + output_path = vllm_disagg_tr.output_path.absolute() + srun_prefix = " ".join(strategy.gen_srun_prefix()) + prefill_cmd, decode_cmd = strategy.get_vllm_serve_commands() + proxy_cmd = strategy.get_proxy_command() + bench_cmd = " ".join(strategy.get_vllm_bench_command()) + health_func = strategy.generate_wait_for_health_function() + prefill_gpus = ",".join(str(g) for g in strategy.prefill_gpu_ids) + decode_gpus = ",".join(str(g) for g in strategy.decode_gpu_ids) + + srun_command = strategy._gen_srun_command() + + expected = f"""\ +cleanup() {{ + [ -n "$PREFILL_PID" ] && kill $PREFILL_PID 2>/dev/null + [ -n "$DECODE_PID" ] && kill $DECODE_PID 2>/dev/null + [ -n "$PROXY_PID" ] && kill $PROXY_PID 2>/dev/null +}} +trap cleanup EXIT + +{health_func} + +echo "Starting vLLM instances..." +CUDA_VISIBLE_DEVICES={prefill_gpus} {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-prefill.log \\ + {" ".join(prefill_cmd)} & +PREFILL_PID=$! + +CUDA_VISIBLE_DEVICES={decode_gpus} {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-decode.log \\ + {" ".join(decode_cmd)} & +DECODE_PID=$! + +NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) +echo "Waiting for vLLM on $NODE to be ready..." +wait_for_health "http://${{NODE}}:{cmd_args.port + 100}/health" || exit 1 +wait_for_health "http://${{NODE}}:{cmd_args.port + 200}/health" || exit 1 + +echo "Starting proxy..." +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/vllm-proxy.log \\ + {" ".join(proxy_cmd)} & +PROXY_PID=$! + +echo "Running benchmark..." 
+{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --output={output_path}/{VLLM_BENCH_LOG_FILE} \\ + {bench_cmd}""" + + assert srun_command == expected diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 5eafb58be..8d81ea5b3 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -264,6 +264,7 @@ def build_special_test_run( "deepep-benchmark", "osu-bench", "vllm", + "vllm-disagg", ] ) def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -> Tuple[TestRun, str, Optional[str]]: @@ -509,6 +510,21 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"}, ), ), + "vllm-disagg": lambda: create_test_run( + partial_tr, + "vllm-disagg", + VllmTestDefinition( + name="vllm-disagg", + description="vLLM disaggregated benchmark", + test_template_name="Vllm", + cmd_args=VllmCmdArgs( + docker_image_url="nvcr.io/nvidia/vllm:latest", + model="Qwen/Qwen3-0.6B", + port=8000, + ), + extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, + ), + ), } if request.param.startswith(("gpt-", "grok-", "nemo-run-", "nemo-launcher")): From 812c1c562470fb444decffbca8f198c516a6b15c Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 14:57:40 +0100 Subject: [PATCH 15/45] Add quotation --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 4 ++-- tests/ref_data/vllm-disagg.sbatch | 4 ++-- .../test_vllm_slurm_command_gen_strategy.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index f134c77a4..486e4d284 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -67,7 +67,7 @@ def get_vllm_serve_commands(self) -> list[list[str]]: "--port", str(prefill_port), "--kv-transfer-config", - '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}', + '\'{"kv_connector":"NixlConnector","kv_role":"kv_producer"}\'', ] decode_cmd = [ "vllm", @@ -76,7 +76,7 @@ def get_vllm_serve_commands(self) -> list[list[str]]: "--port", str(decode_port), "--kv-transfer-config", - '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}', + '\'{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}\'', ] return [prefill_cmd, decode_cmd] diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 412adc9be..22c24c4b2 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -42,12 +42,12 @@ wait_for_health() { echo "Starting vLLM instances..." CUDA_VISIBLE_DEVICES=0,1 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ - vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config {"kv_connector":"NixlConnector","kv_role":"kv_producer"} & + vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! 
CUDA_VISIBLE_DEVICES=2,3 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ - vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config {"kv_connector":"NixlConnector","kv_role":"kv_consumer"} & + vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 37b41e47c..bf19a1f16 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -197,7 +197,7 @@ def test_get_vllm_serve_commands_returns_two(self, vllm_disagg_tr: TestRun, slur "--port", str(cmd_args.port + 100), "--kv-transfer-config", - '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}', + '\'{"kv_connector":"NixlConnector","kv_role":"kv_producer"}\'', ] assert decode_cmd == [ "vllm", @@ -206,7 +206,7 @@ def test_get_vllm_serve_commands_returns_two(self, vllm_disagg_tr: TestRun, slur "--port", str(cmd_args.port + 200), "--kv-transfer-config", - '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}', + '\'{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}\'', ] def test_get_proxy_command(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: From 07074d966717fb89614482d4ee67a917cc2ae7d0 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 15:05:41 +0100 Subject: [PATCH 16/45] Add wa for conflicting VLLM_NIXL_SIDE_CHANNEL_PORT --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 6 ++++-- tests/ref_data/vllm-disagg.sbatch | 6 ++++-- .../test_vllm_slurm_command_gen_strategy.py | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 486e4d284..cc479d9e8 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -225,12 +225,14 @@ def _gen_disaggregated_script( {health_func} echo "Starting vLLM instances..." -CUDA_VISIBLE_DEVICES={prefill_gpus} {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +CUDA_VISIBLE_DEVICES={prefill_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID)) \\ + {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES={decode_gpus} {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +CUDA_VISIBLE_DEVICES={decode_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID + 1)) \\ + {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & DECODE_PID=$! diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 22c24c4b2..f2c724add 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -40,12 +40,14 @@ wait_for_health() { } echo "Starting vLLM instances..." 
-CUDA_VISIBLE_DEVICES=0,1 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +CUDA_VISIBLE_DEVICES=0,1 VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID)) \ + srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES=2,3 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +CUDA_VISIBLE_DEVICES=2,3 VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID + 1)) \ + srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index bf19a1f16..eab5dc3ed 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -257,12 +257,14 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste {health_func} echo "Starting vLLM instances..." -CUDA_VISIBLE_DEVICES={prefill_gpus} {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +CUDA_VISIBLE_DEVICES={prefill_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID)) \\ + {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES={decode_gpus} {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +CUDA_VISIBLE_DEVICES={decode_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID + 1)) \\ + {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & DECODE_PID=$! 
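Note on the workaround above: each NIXL-enabled vLLM server opens its own side channel, and with prefill and decode co-located on one node (and possibly several jobs sharing it) both instances would otherwise contend for the same port, which is what the commit subject refers to. Below is a minimal Python sketch of the arithmetic the generated script performs; it is illustrative only and not part of the patch. A raw SLURM_JOB_ID offset can push the result past the valid TCP port range (65535), which is what the next patch addresses.

    # Illustrative sketch, not part of the patch: the side-channel port
    # arithmetic emitted into the sbatch script, expressed in Python.
    def nixl_side_channel_ports(slurm_job_id: int, base: int = 5557) -> tuple[int, int]:
        """Return (prefill_port, decode_port); decode is shifted by one to avoid a clash."""
        offset = slurm_job_id  # unbounded here; large job ids can exceed port 65535
        return base + offset, base + offset + 1
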
From 0ce7b0120452ee617ed15dd34ab606d5df83887c Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 15:09:21 +0100 Subject: [PATCH 17/45] Fix port offset value --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 6 ++++-- tests/ref_data/vllm-disagg.sbatch | 6 ++++-- .../test_vllm_slurm_command_gen_strategy.py | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index cc479d9e8..c5be7f431 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -224,14 +224,16 @@ def _gen_disaggregated_script( {health_func} +PORT_OFFSET=$((SLURM_JOB_ID % 1000)) + echo "Starting vLLM instances..." -CUDA_VISIBLE_DEVICES={prefill_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID)) \\ +CUDA_VISIBLE_DEVICES={prefill_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET)) \\ {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES={decode_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID + 1)) \\ +CUDA_VISIBLE_DEVICES={decode_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET + 1)) \\ {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index f2c724add..6f61812b0 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -39,14 +39,16 @@ wait_for_health() { return 1 } +PORT_OFFSET=$((SLURM_JOB_ID % 1000)) + echo "Starting vLLM instances..." -CUDA_VISIBLE_DEVICES=0,1 VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID)) \ +CUDA_VISIBLE_DEVICES=0,1 VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET)) \ srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES=2,3 VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID + 1)) \ +CUDA_VISIBLE_DEVICES=2,3 VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET + 1)) \ srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index eab5dc3ed..bdb76dc2f 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -256,14 +256,16 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste {health_func} +PORT_OFFSET=$((SLURM_JOB_ID % 1000)) + echo "Starting vLLM instances..." 
-CUDA_VISIBLE_DEVICES={prefill_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID)) \\ +CUDA_VISIBLE_DEVICES={prefill_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET)) \\ {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES={decode_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + SLURM_JOB_ID + 1)) \\ +CUDA_VISIBLE_DEVICES={decode_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET + 1)) \\ {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & From cf8cbe18ffcf13828e7e9b6fc977691155f69933 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 15:18:05 +0100 Subject: [PATCH 18/45] Use --export for per-run env vars --- .../workloads/vllm/slurm_command_gen_strategy.py | 10 ++++++---- tests/ref_data/vllm-disagg.sbatch | 10 ++++++---- .../test_vllm_slurm_command_gen_strategy.py | 10 ++++++---- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index c5be7f431..437b05b70 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -225,16 +225,18 @@ def _gen_disaggregated_script( {health_func} PORT_OFFSET=$((SLURM_JOB_ID % 1000)) +PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) +DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) echo "Starting vLLM instances..." -CUDA_VISIBLE_DEVICES={prefill_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET)) \\ - {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --export=ALL,CUDA_VISIBLE_DEVICES={prefill_gpus},VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES={decode_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET + 1)) \\ - {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --export=ALL,CUDA_VISIBLE_DEVICES={decode_gpus},VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & DECODE_PID=$! diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 6f61812b0..be6cabac9 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -40,16 +40,18 @@ wait_for_health() { } PORT_OFFSET=$((SLURM_JOB_ID % 1000)) +PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) +DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) echo "Starting vLLM instances..." 
-CUDA_VISIBLE_DEVICES=0,1 VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET)) \ - srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ + --export=ALL,CUDA_VISIBLE_DEVICES=0,1,VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES=2,3 VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET + 1)) \ - srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ + --export=ALL,CUDA_VISIBLE_DEVICES=2,3,VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index bdb76dc2f..3b4e00a19 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -257,16 +257,18 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste {health_func} PORT_OFFSET=$((SLURM_JOB_ID % 1000)) +PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) +DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) echo "Starting vLLM instances..." -CUDA_VISIBLE_DEVICES={prefill_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET)) \\ - {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --export=ALL,CUDA_VISIBLE_DEVICES={prefill_gpus},VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! -CUDA_VISIBLE_DEVICES={decode_gpus} VLLM_NIXL_SIDE_CHANNEL_PORT=$((5557 + PORT_OFFSET + 1)) \\ - {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ +{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ + --export=ALL,CUDA_VISIBLE_DEVICES={decode_gpus},VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & DECODE_PID=$! 
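The --export=ALL,NAME=value form used above asks srun to forward the caller's full environment plus the listed overrides to the launched task, so the prefill and decode servers each receive their own GPU set and NIXL side-channel port while sharing everything else. A rough sketch of the two override sets implied by the prefill/decode split; the helper name is hypothetical and only mirrors logic already present in the strategy.

    # Rough sketch (helper name hypothetical): per-instance overrides mirroring
    # the prefill_gpu_ids / decode_gpu_ids split from the strategy.
    def instance_env_overrides(gpu_ids: list[int], base_nixl_port: int) -> list[dict[str, str]]:
        mid = len(gpu_ids) // 2
        halves = [gpu_ids[:mid], gpu_ids[mid:]]  # first half -> prefill, second half -> decode
        return [
            {
                "CUDA_VISIBLE_DEVICES": ",".join(str(g) for g in half),
                "VLLM_NIXL_SIDE_CHANNEL_PORT": str(base_nixl_port + i),
            }
            for i, half in enumerate(halves)
        ]

    # instance_env_overrides([0, 1, 2, 3], 5557) ->
    #   [{'CUDA_VISIBLE_DEVICES': '0,1', 'VLLM_NIXL_SIDE_CHANNEL_PORT': '5557'},
    #    {'CUDA_VISIBLE_DEVICES': '2,3', 'VLLM_NIXL_SIDE_CHANNEL_PORT': '5558'}]
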
From a55a969f0099900f25d266d0ba89630f95d69385 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 15:34:03 +0100 Subject: [PATCH 19/45] Support extra args --- .../vllm/slurm_command_gen_strategy.py | 5 +- src/cloudai/workloads/vllm/vllm.py | 14 ++++++ .../test_vllm_slurm_command_gen_strategy.py | 50 +++++++++++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 437b05b70..1266ca681 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -53,9 +53,10 @@ def decode_gpu_ids(self) -> list[int]: def get_vllm_serve_commands(self) -> list[list[str]]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args + extra_args = tdef.serve_extra_args if len(self.gpu_ids) == 1: - return [["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port)]] + return [["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port), *extra_args]] prefill_port = cmd_args.port + 100 decode_port = cmd_args.port + 200 @@ -68,6 +69,7 @@ def get_vllm_serve_commands(self) -> list[list[str]]: str(prefill_port), "--kv-transfer-config", '\'{"kv_connector":"NixlConnector","kv_role":"kv_producer"}\'', + *extra_args, ] decode_cmd = [ "vllm", @@ -77,6 +79,7 @@ def get_vllm_serve_commands(self) -> list[list[str]]: str(decode_port), "--kv-transfer-config", '\'{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}\'', + *extra_args, ] return [prefill_cmd, decode_cmd] diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 54df95797..07001916e 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -59,6 +59,20 @@ def docker_image(self) -> DockerImage: def installables(self) -> list[Installable]: return [*self.git_repos, self.docker_image] + @property + def cmd_args_dict(self) -> dict[str, str | list[str]]: + """Return cmd_args as dict, excluding fields handled separately.""" + excluded = {"docker_image_url", "port", "vllm_serve_wait_seconds", "model", "proxy_script"} + return {k: str(v) for k, v in self.cmd_args.model_dump().items() if k not in excluded} + + @property + def serve_extra_args(self) -> list[str]: + """Convert cmd_args_dict to command-line arguments list for vllm serve.""" + args = [] + for k, v in self.cmd_args_dict.items(): + args.extend([f"--{k.replace('_', '-')}", str(v)]) + return args + def was_run_successful(self, tr: TestRun) -> JobStatusResult: log_path = tr.output_path / VLLM_BENCH_LOG_FILE if not log_path.is_file(): diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 3b4e00a19..562328cb5 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -76,6 +76,56 @@ def test_gpu_ids_fallback_to_system( assert strategy.gpu_ids == list(range(gpus_per_node or 1)) +class TestServeExtraArgs: + """Tests for serve_extra_args property.""" + + def test_serve_extra_args_empty_by_default(self) -> None: + """Default cmd_args produces empty extra args (all fields excluded).""" + tdef = VllmTestDefinition( + name="vllm", + description="test", + test_template_name="Vllm", + cmd_args=VllmCmdArgs(docker_image_url="image:latest"), + ) + assert 
tdef.serve_extra_args == [] + + def test_serve_extra_args_with_custom_fields(self) -> None: + """Extra fields in cmd_args appear in serve_extra_args.""" + tdef = VllmTestDefinition( + name="vllm", + description="test", + test_template_name="Vllm", + cmd_args=VllmCmdArgs.model_validate( + { + "docker_image_url": "image:latest", + "tensor_parallel_size": 4, + "max_model_len": 8192, + } + ), + ) + assert tdef.serve_extra_args == [ + "--tensor-parallel-size", + "4", + "--max-model-len", + "8192", + ] + + def test_serve_extra_args_underscore_to_dash(self) -> None: + """Underscores in field names are converted to dashes.""" + tdef = VllmTestDefinition( + name="vllm", + description="test", + test_template_name="Vllm", + cmd_args=VllmCmdArgs.model_validate( + { + "docker_image_url": "image:latest", + "some_long_arg": "value", + } + ), + ) + assert "--some-long-arg" in tdef.serve_extra_args + + class TestVllmAggregatedMode: """Tests for vLLM non-disaggregated mode with 1 GPU.""" From ac7e6037d34be6f0f2a98be3f8f9bd578eaa183d Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 15:39:47 +0100 Subject: [PATCH 20/45] More info on cleanup --- .../workloads/vllm/slurm_command_gen_strategy.py | 10 ++++++---- tests/ref_data/vllm-disagg.sbatch | 7 ++++--- tests/ref_data/vllm.sbatch | 3 ++- .../test_vllm_slurm_command_gen_strategy.py | 10 ++++++---- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 1266ca681..6b2d6f556 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -180,7 +180,8 @@ def _gen_aggregated_script( ) -> str: return f"""\ cleanup() {{ - [ -n "$VLLM_PID" ] && kill $VLLM_PID 2>/dev/null + echo "Cleaning up PIDs: VLLM_PID=$VLLM_PID" + [ -n "$VLLM_PID" ] && kill -9 $VLLM_PID 2>/dev/null }} trap cleanup EXIT @@ -219,9 +220,10 @@ def _gen_disaggregated_script( return f"""\ cleanup() {{ - [ -n "$PREFILL_PID" ] && kill $PREFILL_PID 2>/dev/null - [ -n "$DECODE_PID" ] && kill $DECODE_PID 2>/dev/null - [ -n "$PROXY_PID" ] && kill $PROXY_PID 2>/dev/null + echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID PROXY_PID=$PROXY_PID" + [ -n "$PREFILL_PID" ] && kill -9 $PREFILL_PID 2>/dev/null + [ -n "$DECODE_PID" ] && kill -9 $DECODE_PID 2>/dev/null + [ -n "$PROXY_PID" ] && kill -9 $PROXY_PID 2>/dev/null }} trap cleanup EXIT diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index be6cabac9..4096fdcd3 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -15,9 +15,10 @@ srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest -- srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { - [ -n "$PREFILL_PID" ] && kill $PREFILL_PID 2>/dev/null - [ -n "$DECODE_PID" ] && kill $DECODE_PID 2>/dev/null - [ -n "$PROXY_PID" ] && kill $PROXY_PID 2>/dev/null + echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID PROXY_PID=$PROXY_PID" + [ -n "$PREFILL_PID" ] && kill -9 $PREFILL_PID 2>/dev/null + [ -n "$DECODE_PID" ] && kill -9 $DECODE_PID 
2>/dev/null + [ -n "$PROXY_PID" ] && kill -9 $PROXY_PID 2>/dev/null } trap cleanup EXIT diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index 73eba98c7..d1e089ce7 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -15,7 +15,8 @@ srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest -- srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { - [ -n "$VLLM_PID" ] && kill $VLLM_PID 2>/dev/null + echo "Cleaning up PIDs: VLLM_PID=$VLLM_PID" + [ -n "$VLLM_PID" ] && kill -9 $VLLM_PID 2>/dev/null } trap cleanup EXIT diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 562328cb5..30fdc1557 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -193,7 +193,8 @@ def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmComman expected = f"""\ cleanup() {{ - [ -n "$VLLM_PID" ] && kill $VLLM_PID 2>/dev/null + echo "Cleaning up PIDs: VLLM_PID=$VLLM_PID" + [ -n "$VLLM_PID" ] && kill -9 $VLLM_PID 2>/dev/null }} trap cleanup EXIT @@ -298,9 +299,10 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste expected = f"""\ cleanup() {{ - [ -n "$PREFILL_PID" ] && kill $PREFILL_PID 2>/dev/null - [ -n "$DECODE_PID" ] && kill $DECODE_PID 2>/dev/null - [ -n "$PROXY_PID" ] && kill $PROXY_PID 2>/dev/null + echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID PROXY_PID=$PROXY_PID" + [ -n "$PREFILL_PID" ] && kill -9 $PREFILL_PID 2>/dev/null + [ -n "$DECODE_PID" ] && kill -9 $DECODE_PID 2>/dev/null + [ -n "$PROXY_PID" ] && kill -9 $PROXY_PID 2>/dev/null }} trap cleanup EXIT From 6052bd761753004c486ab2042f6b27a31540a450 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 16:01:12 +0100 Subject: [PATCH 21/45] Correct list of devices as arg --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 4 ++-- tests/ref_data/vllm-disagg.sbatch | 4 ++-- .../test_vllm_slurm_command_gen_strategy.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 6b2d6f556..1ea6bfd89 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -235,13 +235,13 @@ def _gen_disaggregated_script( echo "Starting vLLM instances..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --export=ALL,CUDA_VISIBLE_DEVICES={prefill_gpus},VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \\ + --export=ALL,CUDA_VISIBLE_DEVICES="{prefill_gpus}",VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! 
{srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --export=ALL,CUDA_VISIBLE_DEVICES={decode_gpus},VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \\ + --export=ALL,CUDA_VISIBLE_DEVICES="{decode_gpus}",VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & DECODE_PID=$! diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 4096fdcd3..a746b8237 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -46,13 +46,13 @@ DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) echo "Starting vLLM instances..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ - --export=ALL,CUDA_VISIBLE_DEVICES=0,1,VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \ + --export=ALL,CUDA_VISIBLE_DEVICES="0,1",VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ - --export=ALL,CUDA_VISIBLE_DEVICES=2,3,VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \ + --export=ALL,CUDA_VISIBLE_DEVICES="2,3",VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 30fdc1557..a711f6112 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -314,13 +314,13 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste echo "Starting vLLM instances..." {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --export=ALL,CUDA_VISIBLE_DEVICES={prefill_gpus},VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \\ + --export=ALL,CUDA_VISIBLE_DEVICES="{prefill_gpus}",VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --export=ALL,CUDA_VISIBLE_DEVICES={decode_gpus},VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \\ + --export=ALL,CUDA_VISIBLE_DEVICES="{decode_gpus}",VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & DECODE_PID=$! 
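The quoting added above matters because the value of CUDA_VISIBLE_DEVICES itself contains commas, and commas are also the separator inside an srun --export list, so an unquoted --export=ALL,CUDA_VISIBLE_DEVICES=0,1,... would likely be split into a stray, malformed entry. A hedged sketch of building such a flag with the same quoting (the helper name is hypothetical); the following patch ultimately sidesteps the issue by exporting the variables before each srun call instead.

    # Hypothetical helper: build an srun --export value, quoting any entry whose
    # value contains a comma, matching the quoting this patch adds to the script.
    def build_export_flag(extra: dict[str, str]) -> str:
        parts = ["ALL"]
        for name, value in extra.items():
            parts.append(f'{name}="{value}"' if "," in value else f"{name}={value}")
        return "--export=" + ",".join(parts)

    build_export_flag({"CUDA_VISIBLE_DEVICES": "0,1"})
    # -> '--export=ALL,CUDA_VISIBLE_DEVICES="0,1"'
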
From f25e41e94529e0629f056abc4b073c0e220bb5a5 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 16:35:40 +0100 Subject: [PATCH 22/45] Better env vars handling --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 6 ++++-- tests/ref_data/vllm-disagg.sbatch | 6 ++++-- .../test_vllm_slurm_command_gen_strategy.py | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 1ea6bfd89..fb55e2729 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -234,14 +234,16 @@ def _gen_disaggregated_script( DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) echo "Starting vLLM instances..." +export CUDA_VISIBLE_DEVICES="{prefill_gpus}" +export VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --export=ALL,CUDA_VISIBLE_DEVICES="{prefill_gpus}",VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! +export CUDA_VISIBLE_DEVICES="{decode_gpus}" +export VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --export=ALL,CUDA_VISIBLE_DEVICES="{decode_gpus}",VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & DECODE_PID=$! diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index a746b8237..496a5ea98 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -45,14 +45,16 @@ PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) echo "Starting vLLM instances..." +export CUDA_VISIBLE_DEVICES="0,1" +export VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ - --export=ALL,CUDA_VISIBLE_DEVICES="0,1",VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! +export CUDA_VISIBLE_DEVICES="2,3" +export VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ - --export=ALL,CUDA_VISIBLE_DEVICES="2,3",VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! 
diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index a711f6112..20f79b348 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -313,14 +313,16 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) echo "Starting vLLM instances..." +export CUDA_VISIBLE_DEVICES="{prefill_gpus}" +export VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --export=ALL,CUDA_VISIBLE_DEVICES="{prefill_gpus}",VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT \\ --output={output_path}/vllm-prefill.log \\ {" ".join(prefill_cmd)} & PREFILL_PID=$! +export CUDA_VISIBLE_DEVICES="{decode_gpus}" +export VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT {srun_prefix} --overlap --ntasks-per-node=1 --ntasks=1 \\ - --export=ALL,CUDA_VISIBLE_DEVICES="{decode_gpus}",VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT \\ --output={output_path}/vllm-decode.log \\ {" ".join(decode_cmd)} & DECODE_PID=$! From f1ec11d4bfd9dea50db13a382ddf9d54a09cd289 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 16:45:52 +0100 Subject: [PATCH 23/45] Better check for success --- src/cloudai/workloads/vllm/vllm.py | 8 +++++++- .../test_vllm_job_status_retrieval_strategy.py | 17 ++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 07001916e..86a8b1ac6 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -78,10 +78,16 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: if not log_path.is_file(): return JobStatusResult(is_successful=False, error_message=f"vLLM bench log not found in {tr.output_path}.") + has_results_marker = False with log_path.open("r") as f: for line in f: if "============ Serving Benchmark Result ============" in line: - return JobStatusResult(is_successful=True) + has_results_marker = True + continue + if has_results_marker and "Successful requests:" in line: + num_successful_requests = int(line.split()[2]) + if num_successful_requests > 0: + return JobStatusResult(is_successful=True) return JobStatusResult( is_successful=False, error_message=f"vLLM bench log does not contain benchmark result in {tr.output_path}." 
diff --git a/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py b/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py index 6e49a358d..a5ade87b8 100644 --- a/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py +++ b/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py @@ -35,7 +35,10 @@ def test_no_bench_log_file(self, base_tr: TestRun) -> None: def test_successful_job(self, base_tr: TestRun) -> None: base_tr.output_path.mkdir(parents=True, exist_ok=True) log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE - log_content = "============ Serving Benchmark Result ============" + log_content = """ +============ Serving Benchmark Result ============ +Successful requests: 1 +""" log_file.write_text(log_content) result = self.vllm_tdef.was_run_successful(base_tr) assert result.is_successful @@ -57,3 +60,15 @@ def test_empty_log_file(self, base_tr: TestRun) -> None: result = self.vllm_tdef.was_run_successful(base_tr) assert not result.is_successful assert result.error_message == f"vLLM bench log does not contain benchmark result in {base_tr.output_path}." + + def test_no_succesfull_requests(self, base_tr: TestRun) -> None: + base_tr.output_path.mkdir(parents=True, exist_ok=True) + log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE + log_content = """ +============ Serving Benchmark Result ============ +Successful requests: 0 +""" + log_file.write_text(log_content) + result = self.vllm_tdef.was_run_successful(base_tr) + assert not result.is_successful + assert result.error_message == f"vLLM bench log does not contain benchmark result in {base_tr.output_path}." From 11b00ed4314701a0e9122e1e66f72e7a18d11d37 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 16:52:47 +0100 Subject: [PATCH 24/45] Update port offset logic --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 2 +- tests/ref_data/vllm-disagg.sbatch | 2 +- .../test_vllm_slurm_command_gen_strategy.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index fb55e2729..ec785b990 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -231,7 +231,7 @@ def _gen_disaggregated_script( PORT_OFFSET=$((SLURM_JOB_ID % 1000)) PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) -DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) +DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + {len(self.gpu_ids)})) echo "Starting vLLM instances..." export CUDA_VISIBLE_DEVICES="{prefill_gpus}" diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 496a5ea98..405bb2658 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -42,7 +42,7 @@ wait_for_health() { PORT_OFFSET=$((SLURM_JOB_ID % 1000)) PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) -DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) +DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 4)) echo "Starting vLLM instances..." 
export CUDA_VISIBLE_DEVICES="0,1" diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 20f79b348..1155cd326 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -310,7 +310,7 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste PORT_OFFSET=$((SLURM_JOB_ID % 1000)) PREFILL_NIXL_PORT=$((5557 + PORT_OFFSET)) -DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 1)) +DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + {len(strategy.gpu_ids)})) echo "Starting vLLM instances..." export CUDA_VISIBLE_DEVICES="{prefill_gpus}" From dcee8ec66764f9c1c4f5731db39816a739d42f2c Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 3 Feb 2026 17:11:18 +0100 Subject: [PATCH 25/45] Use ip instead of localhost --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 6 +++--- tests/ref_data/vllm-disagg.sbatch | 4 ++-- tests/ref_data/vllm.sbatch | 2 +- .../test_vllm_slurm_command_gen_strategy.py | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index ec785b990..0049fd528 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -95,11 +95,11 @@ def get_proxy_command(self) -> list[str]: "--port", str(cmd_args.port), "--prefiller-hosts", - "localhost", + "0.0.0.0", "--prefiller-ports", str(prefill_port), "--decoder-hosts", - "localhost", + "0.0.0.0", "--decoder-ports", str(decode_port), ] @@ -115,7 +115,7 @@ def get_vllm_bench_command(self) -> list[str]: "--model", cmd_args.model, "--base-url", - f"http://${{NODE}}:{cmd_args.port}", + f"http://0.0.0.0:{cmd_args.port}", "--random-input-len", str(bench_args.random_input_len), "--random-output-len", diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 405bb2658..410387a05 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -67,10 +67,10 @@ wait_for_health "http://${NODE}:8200/health" || exit 1 echo "Starting proxy..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-proxy.log \ - python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts localhost --prefiller-ports 8100 --decoder-hosts localhost --decoder-ports 8200 & + python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts 0.0.0.0 --prefiller-ports 8100 --decoder-hosts 0.0.0.0 --decoder-ports 8200 & PROXY_PID=$! echo "Running benchmark..." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://0.0.0.0:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index d1e089ce7..ae687a6bc 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -51,4 +51,4 @@ wait_for_health "http://${NODE}:8000/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://0.0.0.0:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 1155cd326..587aa35e5 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -172,7 +172,7 @@ def test_get_vllm_bench_command(self, vllm_cmd_gen_strategy: VllmSlurmCommandGen expected = ( f"vllm bench serve --model {cmd_args.model} " - f"--base-url http://${{NODE}}:{cmd_args.port} " + f"--base-url http://0.0.0.0:{cmd_args.port} " f"--random-input-len {bench_args.random_input_len} " f"--random-output-len {bench_args.random_output_len} " f"--max-concurrency {bench_args.max_concurrency} " @@ -273,11 +273,11 @@ def test_get_proxy_command(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSys "--port", str(cmd_args.port), "--prefiller-hosts", - "localhost", + "0.0.0.0", "--prefiller-ports", str(cmd_args.port + 100), "--decoder-hosts", - "localhost", + "0.0.0.0", "--decoder-ports", str(cmd_args.port + 200), ] From f85086f0ecd5d9cf386f2944ca760daa7e89d382 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 5 Feb 2026 14:45:33 +0100 Subject: [PATCH 26/45] Configure a git repo for proxy script --- src/cloudai/workloads/vllm/vllm.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 86a8b1ac6..8bb07a035 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -15,7 +15,7 @@ # limitations under the License. 
-from cloudai.core import DockerImage, Installable, JobStatusResult, TestRun +from cloudai.core import DockerImage, GitRepo, Installable, JobStatusResult, TestRun from cloudai.models.workload import CmdArgs, TestDefinition VLLM_SERVE_LOG_FILE = "vllm-serve.log" @@ -46,6 +46,7 @@ class VllmTestDefinition(TestDefinition): cmd_args: VllmCmdArgs bench_cmd_args: VllmBenchCmdArgs = VllmBenchCmdArgs() + proxy_script_repo: GitRepo | None = None _docker_image: DockerImage | None = None @@ -57,7 +58,10 @@ def docker_image(self) -> DockerImage: @property def installables(self) -> list[Installable]: - return [*self.git_repos, self.docker_image] + installables = [*self.git_repos, self.docker_image] + if self.proxy_script_repo: + installables.append(self.proxy_script_repo) + return installables @property def cmd_args_dict(self) -> dict[str, str | list[str]]: From ce78d8549a7498e2aa4baf9ba38875a883f96348 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 5 Feb 2026 15:01:57 +0100 Subject: [PATCH 27/45] Override prefill/decode gpu list --- .../vllm/slurm_command_gen_strategy.py | 14 +++++++---- src/cloudai/workloads/vllm/vllm.py | 20 +++++++++++++++- .../test_vllm_slurm_command_gen_strategy.py | 24 +++++++++++++++++++ 3 files changed, 53 insertions(+), 5 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 0049fd528..9c720ae22 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -31,22 +31,30 @@ def image_path(self) -> str | None: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) return str(tdef.docker_image.installed_path) + @property + def tdef(self) -> VllmTestDefinition: + return cast(VllmTestDefinition, self.test_run.test) + @property def gpu_ids(self) -> list[int]: cuda_devices = self.test_run.test.extra_env_vars.get("CUDA_VISIBLE_DEVICES") + if self.tdef.cmd_args.prefill_gpu_ids and self.tdef.cmd_args.decode_gpu_ids: + cuda_devices = f"{self.tdef.cmd_args.prefill_gpu_ids},{self.tdef.cmd_args.decode_gpu_ids}" if cuda_devices: return [int(gpu_id) for gpu_id in str(cuda_devices).split(",")] return list(range(self.system.gpus_per_node or 1)) @property def prefill_gpu_ids(self) -> list[int]: - """Return first half of GPUs for prefill.""" + if self.tdef.cmd_args.prefill_gpu_ids: + return [int(gpu_id) for gpu_id in str(self.tdef.cmd_args.prefill_gpu_ids).split(",")] mid = len(self.gpu_ids) // 2 return self.gpu_ids[:mid] @property def decode_gpu_ids(self) -> list[int]: - """Return second half of GPUs for decode.""" + if self.tdef.cmd_args.decode_gpu_ids: + return [int(gpu_id) for gpu_id in str(self.tdef.cmd_args.decode_gpu_ids).split(",")] mid = len(self.gpu_ids) // 2 return self.gpu_ids[mid:] @@ -84,7 +92,6 @@ def get_vllm_serve_commands(self) -> list[list[str]]: return [prefill_cmd, decode_cmd] def get_proxy_command(self) -> list[str]: - """Return proxy server command for disaggregated mode.""" tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args prefill_port = cmd_args.port + 100 @@ -127,7 +134,6 @@ def get_vllm_bench_command(self) -> list[str]: ] def generate_wait_for_health_function(self) -> str: - """Generate bash function for health check.""" tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py 
index 8bb07a035..caabf2258 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -15,6 +15,8 @@ # limitations under the License. +from pydantic import Field + from cloudai.core import DockerImage, GitRepo, Installable, JobStatusResult, TestRun from cloudai.models.workload import CmdArgs, TestDefinition @@ -30,6 +32,14 @@ class VllmCmdArgs(CmdArgs): vllm_serve_wait_seconds: int = 300 model: str = "Qwen/Qwen3-0.6B" proxy_script: str = "/opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" + prefill_gpu_ids: str | list[str] | None = Field( + default=None, + description="Comma-separated GPU IDs for prefill. If not set, will use first half of available GPUs.", + ) + decode_gpu_ids: str | list[str] | None = Field( + default=None, + description="Comma-separated GPU IDs for decode. If not set, will use second half of available GPUs.", + ) class VllmBenchCmdArgs(CmdArgs): @@ -66,7 +76,15 @@ def installables(self) -> list[Installable]: @property def cmd_args_dict(self) -> dict[str, str | list[str]]: """Return cmd_args as dict, excluding fields handled separately.""" - excluded = {"docker_image_url", "port", "vllm_serve_wait_seconds", "model", "proxy_script"} + excluded = { + "docker_image_url", + "port", + "vllm_serve_wait_seconds", + "model", + "proxy_script", + "prefill_gpu_ids", + "decode_gpu_ids", + } return {k: str(v) for k, v in self.cmd_args.model_dump().items() if k not in excluded} @property diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 587aa35e5..4daac56cd 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -75,6 +75,30 @@ def test_gpu_ids_fallback_to_system( assert strategy.gpu_ids == list(range(gpus_per_node or 1)) + def test_gpu_ids_use_prefill_and_decode_gpu_ids(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: + slurm_system.gpus_per_node = 4 + vllm_tr.test.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} + vllm_tr.test.cmd_args.prefill_gpu_ids = "4" + vllm_tr.test.cmd_args.decode_gpu_ids = "5" + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + assert strategy.gpu_ids == [4, 5] + assert strategy.prefill_gpu_ids == [4] + assert strategy.decode_gpu_ids == [5] + + def test_prefill_nodes_set(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: + slurm_system.gpus_per_node = 4 + vllm_tr.test.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} + vllm_tr.test.cmd_args.prefill_gpu_ids = "0,3" + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + assert strategy.prefill_gpu_ids == [0, 3] + + def test_decode_nodes_set(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: + slurm_system.gpus_per_node = 4 + vllm_tr.test.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} + vllm_tr.test.cmd_args.decode_gpu_ids = "1,2" + strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + assert strategy.decode_gpu_ids == [1, 2] + class TestServeExtraArgs: """Tests for serve_extra_args property.""" From bc09add15fffb05c87012086a3e5bcf56334e249 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 5 Feb 2026 15:40:35 +0100 Subject: [PATCH 28/45] Updates --- .../vllm/slurm_command_gen_strategy.py | 17 ++++++----------- src/cloudai/workloads/vllm/vllm.py | 11 +++-------- .../test_vllm_slurm_command_gen_strategy.py | 6 ++++++ 3 files changed, 15 insertions(+), 
19 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 9c720ae22..33e158f5c 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -37,11 +37,11 @@ def tdef(self) -> VllmTestDefinition: @property def gpu_ids(self) -> list[int]: - cuda_devices = self.test_run.test.extra_env_vars.get("CUDA_VISIBLE_DEVICES") + cuda_devices = str(self.test_run.test.extra_env_vars.get("CUDA_VISIBLE_DEVICES", "")) if self.tdef.cmd_args.prefill_gpu_ids and self.tdef.cmd_args.decode_gpu_ids: cuda_devices = f"{self.tdef.cmd_args.prefill_gpu_ids},{self.tdef.cmd_args.decode_gpu_ids}" if cuda_devices: - return [int(gpu_id) for gpu_id in str(cuda_devices).split(",")] + return [int(gpu_id) for gpu_id in cuda_devices.split(",")] return list(range(self.system.gpus_per_node or 1)) @property @@ -63,31 +63,26 @@ def get_vllm_serve_commands(self) -> list[list[str]]: cmd_args: VllmCmdArgs = tdef.cmd_args extra_args = tdef.serve_extra_args + base_cmd = ["vllm", "serve", cmd_args.model, *extra_args, "--port", str(cmd_args.port)] if len(self.gpu_ids) == 1: - return [["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port), *extra_args]] + return [base_cmd] prefill_port = cmd_args.port + 100 decode_port = cmd_args.port + 200 prefill_cmd = [ - "vllm", - "serve", - cmd_args.model, + *base_cmd[:3], "--port", str(prefill_port), "--kv-transfer-config", '\'{"kv_connector":"NixlConnector","kv_role":"kv_producer"}\'', - *extra_args, ] decode_cmd = [ - "vllm", - "serve", - cmd_args.model, + *base_cmd[:3], "--port", str(decode_port), "--kv-transfer-config", '\'{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}\'', - *extra_args, ] return [prefill_cmd, decode_cmd] diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index caabf2258..97e807a78 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -74,8 +74,8 @@ def installables(self) -> list[Installable]: return installables @property - def cmd_args_dict(self) -> dict[str, str | list[str]]: - """Return cmd_args as dict, excluding fields handled separately.""" + def serve_extra_args(self) -> list[str]: + """Convert cmd_args_dict to command-line arguments list for vllm serve.""" excluded = { "docker_image_url", "port", @@ -85,13 +85,8 @@ def cmd_args_dict(self) -> dict[str, str | list[str]]: "prefill_gpu_ids", "decode_gpu_ids", } - return {k: str(v) for k, v in self.cmd_args.model_dump().items() if k not in excluded} - - @property - def serve_extra_args(self) -> list[str]: - """Convert cmd_args_dict to command-line arguments list for vllm serve.""" args = [] - for k, v in self.cmd_args_dict.items(): + for k, v in self.cmd_args.model_dump(exclude=excluded).items(): args.extend([f"--{k.replace('_', '-')}", str(v)]) return args diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 4daac56cd..239bc54dd 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -368,3 +368,9 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste {bench_cmd}""" assert srun_command == expected + + +def test_sweep_detection(vllm: VllmTestDefinition) -> None: + assert vllm.is_dse_job is False + 
vllm.cmd_args.decode_gpu_ids = ["1"] + assert vllm.is_dse_job is True From 29271b01902b8188f2482283d667b1eb6dfc9622 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 5 Feb 2026 16:23:41 +0100 Subject: [PATCH 29/45] Configure output files for bench, allow any options --- .../vllm/slurm_command_gen_strategy.py | 26 ++++++------ src/cloudai/workloads/vllm/vllm.py | 1 + .../test_vllm_slurm_command_gen_strategy.py | 41 ++++++++++++++----- 3 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 33e158f5c..f52ba1547 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -18,7 +18,7 @@ from cloudai.systems.slurm import SlurmCommandGenStrategy -from .vllm import VLLM_BENCH_LOG_FILE, VLLM_SERVE_LOG_FILE, VllmCmdArgs, VllmTestDefinition +from .vllm import VLLM_BENCH_JSON_FILE, VLLM_BENCH_LOG_FILE, VLLM_SERVE_LOG_FILE, VllmCmdArgs, VllmTestDefinition class VllmSlurmCommandGenStrategy(SlurmCommandGenStrategy): @@ -110,22 +110,22 @@ def get_vllm_bench_command(self) -> list[str]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args bench_args = tdef.bench_cmd_args + extra_args = tdef.bench_cmd_args.model_extra or {} + extras = ["--" + k.replace("_", "-") + " " + str(v) for k, v in extra_args.items()] return [ "vllm", "bench", "serve", - "--model", - cmd_args.model, - "--base-url", - f"http://0.0.0.0:{cmd_args.port}", - "--random-input-len", - str(bench_args.random_input_len), - "--random-output-len", - str(bench_args.random_output_len), - "--max-concurrency", - str(bench_args.max_concurrency), - "--num-prompts", - str(bench_args.num_prompts), + f"--model {cmd_args.model}", + f"--base-url http://0.0.0.0:{cmd_args.port}", + f"--random-input-len {bench_args.random_input_len}", + f"--random-output-len {bench_args.random_output_len}", + f"--max-concurrency {bench_args.max_concurrency}", + f"--num-prompts {bench_args.num_prompts}", + f"--result-dir {self.test_run.output_path.absolute()}", + f"--result-filename {VLLM_BENCH_JSON_FILE}", + "--save-result", + *extras, ] def generate_wait_for_health_function(self) -> str: diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 97e807a78..c1e698620 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -22,6 +22,7 @@ VLLM_SERVE_LOG_FILE = "vllm-serve.log" VLLM_BENCH_LOG_FILE = "vllm-bench.log" +VLLM_BENCH_JSON_FILE = "vllm-bench.json" class VllmCmdArgs(CmdArgs): diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 239bc54dd..f0a4329ca 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -21,8 +21,8 @@ from cloudai.core import TestRun from cloudai.systems.slurm import SlurmSystem -from cloudai.workloads.vllm import VllmCmdArgs, VllmSlurmCommandGenStrategy, VllmTestDefinition -from cloudai.workloads.vllm.vllm import VLLM_BENCH_LOG_FILE, VLLM_SERVE_LOG_FILE +from cloudai.workloads.vllm import VllmBenchCmdArgs, VllmCmdArgs, VllmSlurmCommandGenStrategy, VllmTestDefinition +from cloudai.workloads.vllm.vllm import VLLM_BENCH_JSON_FILE, VLLM_BENCH_LOG_FILE, VLLM_SERVE_LOG_FILE @pytest.fixture @@ -192,18 
+192,37 @@ def test_get_vllm_bench_command(self, vllm_cmd_gen_strategy: VllmSlurmCommandGen cmd_args = tdef.cmd_args bench_args = tdef.bench_cmd_args - command = " ".join(vllm_cmd_gen_strategy.get_vllm_bench_command()) + command = vllm_cmd_gen_strategy.get_vllm_bench_command() - expected = ( - f"vllm bench serve --model {cmd_args.model} " - f"--base-url http://0.0.0.0:{cmd_args.port} " - f"--random-input-len {bench_args.random_input_len} " - f"--random-output-len {bench_args.random_output_len} " - f"--max-concurrency {bench_args.max_concurrency} " - f"--num-prompts {bench_args.num_prompts}" - ) + expected = [ + "vllm", + "bench", + "serve", + f"--model {cmd_args.model}", + f"--base-url http://0.0.0.0:{cmd_args.port}", + f"--random-input-len {bench_args.random_input_len}", + f"--random-output-len {bench_args.random_output_len}", + f"--max-concurrency {bench_args.max_concurrency}", + f"--num-prompts {bench_args.num_prompts}", + f"--result-dir {vllm_cmd_gen_strategy.test_run.output_path.absolute()}", + f"--result-filename {VLLM_BENCH_JSON_FILE}", + "--save-result", + ] assert command == expected + def test_get_vllm_bench_command_with_extra_args( + self, vllm: VllmTestDefinition, vllm_tr: TestRun, slurm_system: SlurmSystem + ) -> None: + vllm.bench_cmd_args = VllmBenchCmdArgs.model_validate({"extra1": 1, "extra-2": 2, "extra_3": 3}) + vllm_tr.test = vllm + vllm_cmd_gen_strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) + + cmd = vllm_cmd_gen_strategy.get_vllm_bench_command() + + assert "--extra1 1" in cmd + assert "--extra-2 2" in cmd + assert "--extra-3 3" in cmd + def test_gen_srun_command_full_flow(self, vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: tdef = vllm_cmd_gen_strategy.test_run.test cmd_args = tdef.cmd_args From 271df0f79c3e2a9dfe254e3d61263c3c01ece552 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 5 Feb 2026 16:46:28 +0100 Subject: [PATCH 30/45] Add reporting for vLLM --- src/cloudai/registration.py | 2 + src/cloudai/workloads/vllm/__init__.py | 2 + .../vllm/report_generation_strategy.py | 73 +++++++++++++++++++ tests/ref_data/vllm-disagg.sbatch | 2 +- tests/ref_data/vllm.sbatch | 2 +- tests/test_test_scenario.py | 2 +- 6 files changed, 80 insertions(+), 3 deletions(-) create mode 100644 src/cloudai/workloads/vllm/report_generation_strategy.py diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py index 3cd803245..a1f9840e5 100644 --- a/src/cloudai/registration.py +++ b/src/cloudai/registration.py @@ -160,6 +160,7 @@ def register_all(): UCCTestSlurmCommandGenStrategy, ) from cloudai.workloads.vllm import ( + VLLMBenchReportGenerationStrategy, VllmSlurmCommandGenStrategy, VllmTestDefinition, ) @@ -277,6 +278,7 @@ def register_all(): Registry().add_report(AIDynamoTestDefinition, AIDynamoReportGenerationStrategy) Registry().add_report(AiconfiguratorTestDefinition, AiconfiguratorReportGenerationStrategy) Registry().add_report(NixlPerftestTestDefinition, NIXLKVBenchDummyReport) + Registry().add_report(VllmTestDefinition, VLLMBenchReportGenerationStrategy) Registry().add_scenario_report("per_test", PerTestReporter, ReportConfig(enable=True)) Registry().add_scenario_report("status", StatusReporter, ReportConfig(enable=True)) diff --git a/src/cloudai/workloads/vllm/__init__.py b/src/cloudai/workloads/vllm/__init__.py index 65c9cdd08..54782b25a 100644 --- a/src/cloudai/workloads/vllm/__init__.py +++ b/src/cloudai/workloads/vllm/__init__.py @@ -14,11 +14,13 @@ # See the License for the specific language governing permissions and # 
limitations under the License. +from .report_generation_strategy import VLLMBenchReportGenerationStrategy from .slurm_command_gen_strategy import VllmSlurmCommandGenStrategy from .vllm import VLLM_BENCH_LOG_FILE, VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition __all__ = [ "VLLM_BENCH_LOG_FILE", + "VLLMBenchReportGenerationStrategy", "VllmBenchCmdArgs", "VllmCmdArgs", "VllmSlurmCommandGenStrategy", diff --git a/src/cloudai/workloads/vllm/report_generation_strategy.py b/src/cloudai/workloads/vllm/report_generation_strategy.py new file mode 100644 index 000000000..74db74a50 --- /dev/null +++ b/src/cloudai/workloads/vllm/report_generation_strategy.py @@ -0,0 +1,73 @@ +import json +from functools import cache +from pathlib import Path + +from pydantic import BaseModel, ConfigDict +from rich.console import Console +from rich.table import Table + +from cloudai.core import ReportGenerationStrategy + +from .vllm import VLLM_BENCH_JSON_FILE + + +class VLLMBenchReport(BaseModel): + """Report for vLLM benchmark results.""" + + model_config = ConfigDict(extra="ignore") + + num_prompts: int + completed: int + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + p99_ttft_ms: float + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + p99_tpot_ms: float + + +@cache +def parse_vllm_bench_output(res_file: Path) -> VLLMBenchReport | None: + """Parse the vLLM benchmark output file and return a VLLMBenchReport object.""" + if not res_file.is_file(): + return None + + with res_file.open("r") as f: + data = json.load(f) + + return VLLMBenchReport.model_validate(data) + + +class VLLMBenchReportGenerationStrategy(ReportGenerationStrategy): + """Generate a report for vLLM benchmark results.""" + + def can_handle_directory(self) -> bool: + return parse_vllm_bench_output(self.test_run.output_path / VLLM_BENCH_JSON_FILE) is not None + + def generate_report(self) -> None: + results = parse_vllm_bench_output(self.test_run.output_path / VLLM_BENCH_JSON_FILE) + if results is None: + return + + console = Console() + table = Table(title=f"vLLM Benchmark Results ({self.test_run.output_path})", title_justify="left") + table.add_column("Successful prompt rate, %", justify="right") + table.add_column("TTFT Mean, ms", justify="right") + table.add_column("TTFT Median, ms", justify="right") + table.add_column("TTFT P99, ms", justify="right") + table.add_column("TPOT Mean, ms", justify="right") + table.add_column("TPOT Median, ms", justify="right") + table.add_column("TPOT P99, ms", justify="right") + table.add_row( + f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} / {results.num_prompts})", + f"{results.mean_ttft_ms:.4f}", + f"{results.median_ttft_ms:.4f}", + f"{results.p99_ttft_ms:.4f}", + f"{results.mean_tpot_ms:.4f}", + f"{results.median_tpot_ms:.4f}", + f"{results.p99_tpot_ms:.4f}", + ) + + console.print(table) diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 410387a05..c7e8ef454 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -73,4 +73,4 @@ PROXY_PID=$! echo "Running benchmark..." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://0.0.0.0:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://0.0.0.0:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index ae687a6bc..ebb50a835 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -51,4 +51,4 @@ wait_for_health "http://${NODE}:8000/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://0.0.0.0:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://0.0.0.0:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index c2af1373b..656fde3bc 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -472,7 +472,7 @@ def test_default(self): assert len(reporters) == 0 def test_default_reporters_size(self): - assert len(Registry().reports_map) == 16 + assert len(Registry().reports_map) == 17 @pytest.mark.parametrize( "tdef,expected_reporters", From 1940f05e386579a0e48fbfda7560c512d7409022 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Thu, 5 Feb 2026 16:50:37 +0100 Subject: [PATCH 31/45] More human readable column --- src/cloudai/workloads/vllm/report_generation_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/workloads/vllm/report_generation_strategy.py b/src/cloudai/workloads/vllm/report_generation_strategy.py index 74db74a50..dff6eedc9 100644 --- a/src/cloudai/workloads/vllm/report_generation_strategy.py +++ b/src/cloudai/workloads/vllm/report_generation_strategy.py @@ -53,7 +53,7 @@ def generate_report(self) -> None: console = Console() table = Table(title=f"vLLM Benchmark Results ({self.test_run.output_path})", title_justify="left") - table.add_column("Successful prompt rate, %", justify="right") + table.add_column("Successful prompts", justify="right") table.add_column("TTFT Mean, ms", justify="right") table.add_column("TTFT Median, ms", justify="right") table.add_column("TTFT P99, ms", justify="right") @@ -61,7 +61,7 @@ def generate_report(self) -> None: table.add_column("TPOT Median, ms", justify="right") table.add_column("TPOT P99, ms", justify="right") table.add_row( - f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} / {results.num_prompts})", + f"{results.completed / results.num_prompts * 100:.2f}% ({results.completed} of {results.num_prompts})", f"{results.mean_ttft_ms:.4f}", 
f"{results.median_ttft_ms:.4f}", f"{results.p99_ttft_ms:.4f}", From b501ceb53ad27ca29abb90a8253d34133b8c0cdf Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Fri, 6 Feb 2026 02:25:32 -0800 Subject: [PATCH 32/45] Control decode/prefill separately --- src/cloudai/registration.py | 2 +- src/cloudai/workloads/vllm/__init__.py | 3 +- .../vllm/slurm_command_gen_strategy.py | 26 +++++---- src/cloudai/workloads/vllm/vllm.py | 50 ++++++++-------- .../test_vllm_slurm_command_gen_strategy.py | 57 +++++++++++-------- 5 files changed, 74 insertions(+), 64 deletions(-) diff --git a/src/cloudai/registration.py b/src/cloudai/registration.py index a1f9840e5..749d7da24 100644 --- a/src/cloudai/registration.py +++ b/src/cloudai/registration.py @@ -257,7 +257,7 @@ def register_all(): Registry().add_test_definition("NIXLKVBench", NIXLKVBenchTestDefinition) Registry().add_test_definition("Aiconfigurator", AiconfiguratorTestDefinition) Registry().add_test_definition("OSUBench", OSUBenchTestDefinition) - Registry().add_test_definition("Vllm", VllmTestDefinition) + Registry().add_test_definition("vllm", VllmTestDefinition) Registry().add_agent("grid_search", GridSearchAgent) diff --git a/src/cloudai/workloads/vllm/__init__.py b/src/cloudai/workloads/vllm/__init__.py index 54782b25a..e0ec73351 100644 --- a/src/cloudai/workloads/vllm/__init__.py +++ b/src/cloudai/workloads/vllm/__init__.py @@ -16,11 +16,12 @@ from .report_generation_strategy import VLLMBenchReportGenerationStrategy from .slurm_command_gen_strategy import VllmSlurmCommandGenStrategy -from .vllm import VLLM_BENCH_LOG_FILE, VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition +from .vllm import VLLM_BENCH_LOG_FILE, VllmArgs, VllmBenchCmdArgs, VllmCmdArgs, VllmTestDefinition __all__ = [ "VLLM_BENCH_LOG_FILE", "VLLMBenchReportGenerationStrategy", + "VllmArgs", "VllmBenchCmdArgs", "VllmCmdArgs", "VllmSlurmCommandGenStrategy", diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index f52ba1547..a1db60be9 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -38,51 +38,53 @@ def tdef(self) -> VllmTestDefinition: @property def gpu_ids(self) -> list[int]: cuda_devices = str(self.test_run.test.extra_env_vars.get("CUDA_VISIBLE_DEVICES", "")) - if self.tdef.cmd_args.prefill_gpu_ids and self.tdef.cmd_args.decode_gpu_ids: - cuda_devices = f"{self.tdef.cmd_args.prefill_gpu_ids},{self.tdef.cmd_args.decode_gpu_ids}" + if (self.tdef.cmd_args.prefill and self.tdef.cmd_args.prefill.gpu_ids) and self.tdef.cmd_args.decode.gpu_ids: + cuda_devices = f"{self.tdef.cmd_args.prefill.gpu_ids},{self.tdef.cmd_args.decode.gpu_ids}" if cuda_devices: return [int(gpu_id) for gpu_id in cuda_devices.split(",")] return list(range(self.system.gpus_per_node or 1)) @property def prefill_gpu_ids(self) -> list[int]: - if self.tdef.cmd_args.prefill_gpu_ids: - return [int(gpu_id) for gpu_id in str(self.tdef.cmd_args.prefill_gpu_ids).split(",")] + if self.tdef.cmd_args.prefill and self.tdef.cmd_args.prefill.gpu_ids: + return [int(gpu_id) for gpu_id in str(self.tdef.cmd_args.prefill.gpu_ids).split(",")] mid = len(self.gpu_ids) // 2 return self.gpu_ids[:mid] @property def decode_gpu_ids(self) -> list[int]: - if self.tdef.cmd_args.decode_gpu_ids: - return [int(gpu_id) for gpu_id in str(self.tdef.cmd_args.decode_gpu_ids).split(",")] + if self.tdef.cmd_args.decode.gpu_ids: + return [int(gpu_id) for gpu_id in 
str(self.tdef.cmd_args.decode.gpu_ids).split(",")] mid = len(self.gpu_ids) // 2 return self.gpu_ids[mid:] def get_vllm_serve_commands(self) -> list[list[str]]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args - extra_args = tdef.serve_extra_args - base_cmd = ["vllm", "serve", cmd_args.model, *extra_args, "--port", str(cmd_args.port)] - if len(self.gpu_ids) == 1: - return [base_cmd] + base_cmd = ["vllm", "serve", cmd_args.model] + if not tdef.cmd_args.prefill: + return [[*base_cmd, *tdef.cmd_args.decode.serve_args, "--port", str(cmd_args.port)]] prefill_port = cmd_args.port + 100 decode_port = cmd_args.port + 200 + prefill_extra_args = tdef.cmd_args.prefill.serve_args if tdef.cmd_args.prefill else [] prefill_cmd = [ - *base_cmd[:3], + *base_cmd, "--port", str(prefill_port), "--kv-transfer-config", '\'{"kv_connector":"NixlConnector","kv_role":"kv_producer"}\'', + *prefill_extra_args, ] decode_cmd = [ - *base_cmd[:3], + *base_cmd, "--port", str(decode_port), "--kv-transfer-config", '\'{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}\'', + *tdef.cmd_args.decode.serve_args, ] return [prefill_cmd, decode_cmd] diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index c1e698620..cb9aa372c 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -15,7 +15,7 @@ # limitations under the License. -from pydantic import Field +from pydantic import ConfigDict, Field from cloudai.core import DockerImage, GitRepo, Installable, JobStatusResult, TestRun from cloudai.models.workload import CmdArgs, TestDefinition @@ -25,22 +25,39 @@ VLLM_BENCH_JSON_FILE = "vllm-bench.json" +class VllmArgs(CmdArgs): + """Base command arguments for vLLM instances.""" + + gpu_ids: str | list[str] | None = Field( + default=None, + description="Comma-separated GPU IDs. If not set, will use all available GPUs.", + ) + + @property + def serve_args(self) -> list[str]: + """Convert cmd_args_dict to command-line arguments list for vllm serve.""" + args = [] + for k, v in self.model_dump(exclude={"gpu_ids"}).items(): + args.extend([f"--{k.replace('_', '-')}", str(v)]) + return args + + class VllmCmdArgs(CmdArgs): """vLLM serve command arguments.""" + model_config = ConfigDict(extra="forbid") # arbitrary fileds are allowed per decode/prefill, not here + docker_image_url: str port: int = 8000 vllm_serve_wait_seconds: int = 300 - model: str = "Qwen/Qwen3-0.6B" proxy_script: str = "/opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" - prefill_gpu_ids: str | list[str] | None = Field( - default=None, - description="Comma-separated GPU IDs for prefill. If not set, will use first half of available GPUs.", - ) - decode_gpu_ids: str | list[str] | None = Field( + + model: str = "Qwen/Qwen3-0.6B" + prefill: VllmArgs | None = Field( default=None, - description="Comma-separated GPU IDs for decode. If not set, will use second half of available GPUs.", + description="Prefill instance arguments. 
If not set, a single instance without disaggregation will be used.", ) + decode: VllmArgs = Field(default_factory=VllmArgs, description="Decode instance arguments.") class VllmBenchCmdArgs(CmdArgs): @@ -74,23 +91,6 @@ def installables(self) -> list[Installable]: installables.append(self.proxy_script_repo) return installables - @property - def serve_extra_args(self) -> list[str]: - """Convert cmd_args_dict to command-line arguments list for vllm serve.""" - excluded = { - "docker_image_url", - "port", - "vllm_serve_wait_seconds", - "model", - "proxy_script", - "prefill_gpu_ids", - "decode_gpu_ids", - } - args = [] - for k, v in self.cmd_args.model_dump(exclude=excluded).items(): - args.extend([f"--{k.replace('_', '-')}", str(v)]) - return args - def was_run_successful(self, tr: TestRun) -> JobStatusResult: log_path = tr.output_path / VLLM_BENCH_LOG_FILE if not log_path.is_file(): diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index f0a4329ca..9068e71ad 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -21,7 +21,13 @@ from cloudai.core import TestRun from cloudai.systems.slurm import SlurmSystem -from cloudai.workloads.vllm import VllmBenchCmdArgs, VllmCmdArgs, VllmSlurmCommandGenStrategy, VllmTestDefinition +from cloudai.workloads.vllm import ( + VllmArgs, + VllmBenchCmdArgs, + VllmCmdArgs, + VllmSlurmCommandGenStrategy, + VllmTestDefinition, +) from cloudai.workloads.vllm.vllm import VLLM_BENCH_JSON_FILE, VLLM_BENCH_LOG_FILE, VLLM_SERVE_LOG_FILE @@ -50,6 +56,7 @@ def vllm_cmd_gen_strategy(vllm_tr: TestRun, slurm_system: SlurmSystem) -> VllmSl def vllm_disagg_tr(vllm: VllmTestDefinition, tmp_path: Path) -> TestRun: """TestRun for disaggregated mode with 4 GPUs.""" vllm.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} + vllm.cmd_args.prefill = VllmArgs() return TestRun(test=vllm, num_nodes=1, nodes=[], output_path=tmp_path, name="vllm-disagg-job") @@ -78,8 +85,8 @@ def test_gpu_ids_fallback_to_system( def test_gpu_ids_use_prefill_and_decode_gpu_ids(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: slurm_system.gpus_per_node = 4 vllm_tr.test.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} - vllm_tr.test.cmd_args.prefill_gpu_ids = "4" - vllm_tr.test.cmd_args.decode_gpu_ids = "5" + vllm_tr.test.cmd_args.prefill = VllmArgs(gpu_ids="4") + vllm_tr.test.cmd_args.decode.gpu_ids = "5" strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) assert strategy.gpu_ids == [4, 5] assert strategy.prefill_gpu_ids == [4] @@ -88,33 +95,25 @@ def test_gpu_ids_use_prefill_and_decode_gpu_ids(self, vllm_tr: TestRun, slurm_sy def test_prefill_nodes_set(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: slurm_system.gpus_per_node = 4 vllm_tr.test.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} - vllm_tr.test.cmd_args.prefill_gpu_ids = "0,3" + vllm_tr.test.cmd_args.prefill = VllmArgs(gpu_ids="0,3") strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) assert strategy.prefill_gpu_ids == [0, 3] def test_decode_nodes_set(self, vllm_tr: TestRun, slurm_system: SlurmSystem) -> None: slurm_system.gpus_per_node = 4 vllm_tr.test.extra_env_vars = {"CUDA_VISIBLE_DEVICES": "0,1,2,3"} - vllm_tr.test.cmd_args.decode_gpu_ids = "1,2" + vllm_tr.test.cmd_args.decode.gpu_ids = "1,2" strategy = VllmSlurmCommandGenStrategy(slurm_system, vllm_tr) assert 
strategy.decode_gpu_ids == [1, 2] class TestServeExtraArgs: - """Tests for serve_extra_args property.""" + """Tests for serve_args property.""" - def test_serve_extra_args_empty_by_default(self) -> None: - """Default cmd_args produces empty extra args (all fields excluded).""" - tdef = VllmTestDefinition( - name="vllm", - description="test", - test_template_name="Vllm", - cmd_args=VllmCmdArgs(docker_image_url="image:latest"), - ) - assert tdef.serve_extra_args == [] + def test_serve_args_empty_by_default(self) -> None: + assert VllmArgs().serve_args == [] - def test_serve_extra_args_with_custom_fields(self) -> None: - """Extra fields in cmd_args appear in serve_extra_args.""" + def test_decode_serve_args_with_custom_fields(self) -> None: tdef = VllmTestDefinition( name="vllm", description="test", @@ -122,20 +121,20 @@ def test_serve_extra_args_with_custom_fields(self) -> None: cmd_args=VllmCmdArgs.model_validate( { "docker_image_url": "image:latest", - "tensor_parallel_size": 4, - "max_model_len": 8192, + "decode": {"tensor_parallel_size": 4, "max_model_len": 8192, "some_long_arg": "value"}, } ), ) - assert tdef.serve_extra_args == [ + assert tdef.cmd_args.decode.serve_args == [ "--tensor-parallel-size", "4", "--max-model-len", "8192", + "--some-long-arg", + "value", ] - def test_serve_extra_args_underscore_to_dash(self) -> None: - """Underscores in field names are converted to dashes.""" + def test_prefill_serve_args_with_custom_fields(self) -> None: tdef = VllmTestDefinition( name="vllm", description="test", @@ -143,11 +142,19 @@ def test_serve_extra_args_underscore_to_dash(self) -> None: cmd_args=VllmCmdArgs.model_validate( { "docker_image_url": "image:latest", - "some_long_arg": "value", + "prefill": {"tensor_parallel_size": 4, "max_model_len": 8192, "some_long_arg": "value"}, } ), ) - assert "--some-long-arg" in tdef.serve_extra_args + assert tdef.cmd_args.prefill is not None + assert tdef.cmd_args.prefill.serve_args == [ + "--tensor-parallel-size", + "4", + "--max-model-len", + "8192", + "--some-long-arg", + "value", + ] class TestVllmAggregatedMode: @@ -391,5 +398,5 @@ def test_gen_srun_command_disagg_flow(self, vllm_disagg_tr: TestRun, slurm_syste def test_sweep_detection(vllm: VllmTestDefinition) -> None: assert vllm.is_dse_job is False - vllm.cmd_args.decode_gpu_ids = ["1"] + vllm.cmd_args.decode.gpu_ids = ["1"] assert vllm.is_dse_job is True From a353c0609b4456ff2c6155473779d179394415f8 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 11:30:31 +0100 Subject: [PATCH 33/45] Fix typo --- src/cloudai/workloads/vllm/vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index cb9aa372c..dfeeca1dc 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -45,7 +45,7 @@ def serve_args(self) -> list[str]: class VllmCmdArgs(CmdArgs): """vLLM serve command arguments.""" - model_config = ConfigDict(extra="forbid") # arbitrary fileds are allowed per decode/prefill, not here + model_config = ConfigDict(extra="forbid") # arbitrary fields are allowed per decode/prefill, not here docker_image_url: str port: int = 8000 From 1e858812cc22fb1a81bc3cd75d7154c95eb9b62a Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 04:13:08 -0800 Subject: [PATCH 34/45] Install HF model and mount into container --- .../workloads/vllm/slurm_command_gen_strategy.py | 2 +- src/cloudai/workloads/vllm/vllm.py | 11 +++++++++-- 
tests/ref_data/vllm-disagg.sbatch | 12 ++++++------ tests/ref_data/vllm.sbatch | 8 ++++---- .../test_vllm_slurm_command_gen_strategy.py | 6 ++++++ tests/test_acceptance.py | 3 ++- tests/test_init.py | 2 +- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index a1db60be9..14145bbe3 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -25,7 +25,7 @@ class VllmSlurmCommandGenStrategy(SlurmCommandGenStrategy): """Command generation strategy for vLLM on Slurm systems.""" def _container_mounts(self) -> list[str]: - return [] + return [f"{self.system.hf_home_path.absolute()}:/root/.cache/huggingface"] def image_path(self) -> str | None: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index dfeeca1dc..f2d0b947f 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -17,7 +17,7 @@ from pydantic import ConfigDict, Field -from cloudai.core import DockerImage, GitRepo, Installable, JobStatusResult, TestRun +from cloudai.core import DockerImage, GitRepo, HFModel, Installable, JobStatusResult, TestRun from cloudai.models.workload import CmdArgs, TestDefinition VLLM_SERVE_LOG_FILE = "vllm-serve.log" @@ -77,6 +77,7 @@ class VllmTestDefinition(TestDefinition): proxy_script_repo: GitRepo | None = None _docker_image: DockerImage | None = None + _hf_model: HFModel | None = None @property def docker_image(self) -> DockerImage: @@ -84,9 +85,15 @@ def docker_image(self) -> DockerImage: self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) return self._docker_image + @property + def hf_model(self) -> HFModel: + if not self._hf_model: + self._hf_model = HFModel(model_name=self.cmd_args.model) + return self._hf_model + @property def installables(self) -> list[Installable]: - installables = [*self.git_repos, self.docker_image] + installables = [*self.git_repos, self.docker_image, self.hf_model] if self.proxy_script_repo: installables.append(self.proxy_script_repo) return installables diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index c7e8ef454..78d2cf613 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0,1,2,3 -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 
-srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: PREFILL_PID=$PREFILL_PID DECODE_PID=$DECODE_PID PROXY_PID=$PROXY_PID" @@ -47,14 +47,14 @@ DECODE_NIXL_PORT=$((5557 + PORT_OFFSET + 4)) echo "Starting vLLM instances..." export CUDA_VISIBLE_DEVICES="0,1" export VLLM_NIXL_SIDE_CHANNEL_PORT=$PREFILL_NIXL_PORT -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! export CUDA_VISIBLE_DEVICES="2,3" export VLLM_NIXL_SIDE_CHANNEL_PORT=$DECODE_NIXL_PORT -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! @@ -65,12 +65,12 @@ wait_for_health "http://${NODE}:8100/health" || exit 1 wait_for_health "http://${NODE}:8200/health" || exit 1 echo "Starting proxy..." 
-srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-proxy.log \ python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts 0.0.0.0 --prefiller-ports 8100 --decoder-hosts 0.0.0.0 --decoder-ports 8200 & PROXY_PID=$! echo "Running benchmark..." -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://0.0.0.0:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index ebb50a835..9d0c99be1 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -10,9 +10,9 @@ export SLURM_JOB_MASTER_NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) export CUDA_VISIBLE_DEVICES=0 -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --output=__OUTPUT_DIR__/output/mapping-stdout.txt --error=__OUTPUT_DIR__/output/mapping-stderr.txt bash -c "echo \$(date): \$(hostname):node \${SLURM_NODEID}:rank \${SLURM_PROCID}." 
-srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --ntasks=1 --ntasks-per-node=1 --output=__OUTPUT_DIR__/output/metadata/node-%N.toml --error=__OUTPUT_DIR__/output/metadata/nodes.err bash /cloudai_install/slurm-metadata.sh cleanup() { echo "Cleaning up PIDs: VLLM_PID=$VLLM_PID" @@ -39,7 +39,7 @@ wait_for_health() { } echo "Starting vLLM instances..." -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-serve.log \ vllm serve Qwen/Qwen3-0.6B --port 8000 & VLLM_PID=$! @@ -49,6 +49,6 @@ echo "Waiting for vLLM on $NODE to be ready..." wait_for_health "http://${NODE}:8000/health" || exit 1 echo "Running benchmark..." -srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output --overlap --ntasks-per-node=1 --ntasks=1 \ +srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://0.0.0.0:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 9068e71ad..3071674f2 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -157,6 +157,12 @@ def test_prefill_serve_args_with_custom_fields(self) -> None: ] +def test_container_mounts(vllm_cmd_gen_strategy: VllmSlurmCommandGenStrategy) -> None: + assert vllm_cmd_gen_strategy._container_mounts() == [ + f"{vllm_cmd_gen_strategy.system.hf_home_path.absolute()}:/root/.cache/huggingface" + ] + + class TestVllmAggregatedMode: """Tests for vLLM non-disaggregated mode with 1 GPU.""" diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 8d81ea5b3..48af3d09b 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -73,7 +73,7 @@ TritonInferenceTestDefinition, ) from 
cloudai.workloads.ucc_test import UCCCmdArgs, UCCTestDefinition -from cloudai.workloads.vllm import VllmCmdArgs, VllmTestDefinition +from cloudai.workloads.vllm import VllmArgs, VllmCmdArgs, VllmTestDefinition SLURM_TEST_SCENARIOS = [ {"path": Path("conf/common/test_scenario/sleep.toml"), "expected_dirs_number": 4, "log_file": "sleep_debug.log"}, @@ -521,6 +521,7 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - docker_image_url="nvcr.io/nvidia/vllm:latest", model="Qwen/Qwen3-0.6B", port=8000, + prefill=VllmArgs(), ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, ), diff --git a/tests/test_init.py b/tests/test_init.py index 8d6b0509f..9b0e8a54f 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -243,7 +243,7 @@ def test_definitions(): ("NIXLKVBench", NIXLKVBenchTestDefinition), ("Aiconfigurator", AiconfiguratorTestDefinition), ("OSUBench", OSUBenchTestDefinition), - ("Vllm", VllmTestDefinition), + ("vllm", VllmTestDefinition), ]: assert test_defs[tdef[0]] == tdef[1] From a5bb85a72c14e37bd7e7b6f2b05fd4f04b761893 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 13:37:43 +0100 Subject: [PATCH 35/45] Add doc for vLLM --- doc/workloads/index.rst | 1 + doc/workloads/vllm.rst | 154 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 doc/workloads/vllm.rst diff --git a/doc/workloads/index.rst b/doc/workloads/index.rst index b8d23ddad..af6dccb7f 100644 --- a/doc/workloads/index.rst +++ b/doc/workloads/index.rst @@ -29,6 +29,7 @@ Available Workloads ":doc:`slurm_container`", "✅", "❌", "❌", "❌" "Triton Inference", "✅", "❌", "❌", "❌" ":doc:`ucc`", "✅", "❌", "❌", "❌" + ":doc:`vllm`", "✅", "❌", "❌", "❌" .. toctree:: :hidden: diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst new file mode 100644 index 000000000..c504d1fa9 --- /dev/null +++ b/doc/workloads/vllm.rst @@ -0,0 +1,154 @@ +vLLM +==== + +This workload (``test_template_name`` is ``vllm``) allows users to execute vLLM benchmarks within the CloudAI framework. + +vLLM is a high-throughput and memory-efficient inference engine for LLMs. This workload supports both aggregated and disaggregated prefill/decode modes. + +Usage Examples +-------------- + +Test + Scenario example +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: toml + :caption: test.toml (test definition) + + name = "vllm_test" + description = "Example vLLM test" + test_template_name = "vllm" + + [cmd_args] + docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:0.7.0" + model = "Qwen/Qwen3-0.6B" + + [bench_cmd_args] + random_input_len = 16 + random_output_len = 128 + max_concurrency = 16 + num_prompts = 30 + + +.. code-block:: toml + :caption: scenario.toml (scenario with one test) + + name = "vllm-benchmark" + + [[Tests]] + id = "vllm.1" + num_nodes = 1 + time_limit = "00:10:00" + test_name = "vllm_test" + +Test-in-Scenario example +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: toml + :caption: scenario.toml (separate test toml is not needed) + + name = "vllm-benchmark" + + [[Tests]] + id = "vllm.1" + num_nodes = 1 + time_limit = "00:10:00" + + name = "vllm_test" + description = "Example vLLM test" + test_template_name = "vllm" + + [Tests.cmd_args] + docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:0.7.0" + model = "Qwen/Qwen3-0.6B" + + [Tests.bench_cmd_args] + random_input_len = 16 + random_output_len = 128 + max_concurrency = 16 + num_prompts = 30 + + +Control number of GPUs +---------------------- +The number of GPUs can be controlled using the options below, listed from less priority to more priority: +1. ``gpus_per_node`` system property (scalar value) +2. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs) +3. ``gpu_ids`` command argument for ``prefill`` and ``decode`` configurations (comma-separated list of GPU IDs) + + +Control disaggregation +---------------------- +By default, vLLM will run without disaggregation as a single process. To enable disaggregation, one need to set ``prefill`` configuration: + +.. code-block:: toml + :caption: test.toml (disaggregated prefill/decode) + + [cmd_args] + docker_image_url = "nvcr.io#nvidia/ai-dynamo/vllm-runtime:0.7.0" + model = "Qwen/Qwen3-0.6B" + + [cmd_args.prefill] + + [extra_env_vars] + CUDA_VISIBLE_DEVICES = "0,1,2,3" + +The config above will automatically split the GPUs specified in ``CUDA_VISIBLE_DEVICES`` into two halves: the first half will be used for prefill and the second half for decode. + +For more control, one can specify the GPU IDs explicitly in ``prefill`` and ``decode`` configurations: + +.. code-block:: toml + :caption: test.toml (disaggregated prefill/decode) + + [cmd_args.prefill] + gpu_ids = "0,1" + + [cmd_args.decode] + gpu_ids = "2,3" + +In this case, ``CUDA_VISIBLE_DEVICES`` will be ignored and only the GPUs specified in ``gpu_ids`` will be used. + + +Control ``proxy_script`` +------------------------ +``proxy_script`` is used to proxy requests from the client to the prefill and decode instances. It is ignored in non-disaggregated mode. The default value can be found below. + +It can be overridden by setting ``proxy_script``, for example to use the latest version of the script from the vLLM repository: + +.. code-block:: toml + :caption: test_scenario.toml (override proxy_script) + + [[Tests.git_repos]] + url = "https://github.com/vllm-project/vllm.git" + commit = "main" + mount_as = "/vllm_repo" + + [Tests.cmd_args] + docker_image_url = "vllm/vllm-openai:v0.14.0-cu130" + proxy_script = "/vllm_repo/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" + +In this case, the proxy script will be mounted from the locally cloned vLLM repository as ``/vllm_repo`` and used for the test. + + +API Documentation +----------------- + +Command Arguments +~~~~~~~~~~~~~~~~~ + +.. autoclass:: cloudai.workloads.vllm.vllm.VllmCmdArgs + :members: + :show-inheritance: + +Benchmark Command Arguments +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: cloudai.workloads.vllm.vllm.VllmBenchCmdArgs + :members: + :show-inheritance: + +Test Definition +~~~~~~~~~~~~~~~ + +.. 
autoclass:: cloudai.workloads.vllm.vllm.VllmTestDefinition + :members: + :show-inheritance: From 4fd089faf427714c0de6df1285086e5425387231 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 13:43:43 +0100 Subject: [PATCH 36/45] Fix copyright year --- .../workloads/vllm/report_generation_strategy.py | 16 ++++++++++++++++ tests/test_init.py | 2 +- tests/test_test_scenario.py | 2 +- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/cloudai/workloads/vllm/report_generation_strategy.py b/src/cloudai/workloads/vllm/report_generation_strategy.py index dff6eedc9..a299e68ca 100644 --- a/src/cloudai/workloads/vllm/report_generation_strategy.py +++ b/src/cloudai/workloads/vllm/report_generation_strategy.py @@ -1,3 +1,19 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json from functools import cache from pathlib import Path diff --git a/tests/test_init.py b/tests/test_init.py index 9b0e8a54f..9a5da9db7 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py index 656fde3bc..e55fda0d5 100644 --- a/tests/test_test_scenario.py +++ b/tests/test_test_scenario.py @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES -# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 6c19287a37e72959fdc5ad42c1d3c820482a9a42 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 13:57:50 +0100 Subject: [PATCH 37/45] Update doc/workloads/vllm.rst Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- doc/workloads/vllm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index c504d1fa9..749a79afd 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -78,7 +78,7 @@ The number of GPUs can be controlled using the options below, listed from less p Control disaggregation ---------------------- -By default, vLLM will run without disaggregation as a single process. To enable disaggregation, one need to set ``prefill`` configuration: +By default, vLLM will run without disaggregation as a single process. To enable disaggregation, one needs to set ``prefill`` configuration: .. 
code-block:: toml :caption: test.toml (disaggregated prefill/decode) From f2ef121f553aae019bda63b219e11a6287ad7efc Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 14:04:49 +0100 Subject: [PATCH 38/45] Update src/cloudai/workloads/vllm/slurm_command_gen_strategy.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- src/cloudai/workloads/vllm/slurm_command_gen_strategy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 14145bbe3..5f45c28ea 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -28,8 +28,7 @@ def _container_mounts(self) -> list[str]: return [f"{self.system.hf_home_path.absolute()}:/root/.cache/huggingface"] def image_path(self) -> str | None: - tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) - return str(tdef.docker_image.installed_path) + return str(self.tdef.docker_image.installed_path) @property def tdef(self) -> VllmTestDefinition: From eafb0450cdf9a52afbe66eb5499d0aa3afeb8b46 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 14:05:35 +0100 Subject: [PATCH 39/45] Update tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../test_vllm_job_status_retrieval_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py b/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py index a5ade87b8..ec7678199 100644 --- a/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py +++ b/tests/job_status_retrieval_strategy/test_vllm_job_status_retrieval_strategy.py @@ -61,7 +61,7 @@ def test_empty_log_file(self, base_tr: TestRun) -> None: assert not result.is_successful assert result.error_message == f"vLLM bench log does not contain benchmark result in {base_tr.output_path}." 
- def test_no_succesfull_requests(self, base_tr: TestRun) -> None: + def test_no_successful_requests(self, base_tr: TestRun) -> None: base_tr.output_path.mkdir(parents=True, exist_ok=True) log_file = base_tr.output_path / VLLM_BENCH_LOG_FILE log_content = """ From a81447e072dd749c38a09649e9de63ccacb31f06 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 14:07:10 +0100 Subject: [PATCH 40/45] Make parse_vllm_bench_output safer --- .../workloads/vllm/report_generation_strategy.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/cloudai/workloads/vllm/report_generation_strategy.py b/src/cloudai/workloads/vllm/report_generation_strategy.py index a299e68ca..25f96199a 100644 --- a/src/cloudai/workloads/vllm/report_generation_strategy.py +++ b/src/cloudai/workloads/vllm/report_generation_strategy.py @@ -50,10 +50,12 @@ def parse_vllm_bench_output(res_file: Path) -> VLLMBenchReport | None: if not res_file.is_file(): return None - with res_file.open("r") as f: - data = json.load(f) - - return VLLMBenchReport.model_validate(data) + try: + with res_file.open("r") as f: + data = json.load(f) + return VLLMBenchReport.model_validate(data) + except (json.JSONDecodeError, Exception): + return None class VLLMBenchReportGenerationStrategy(ReportGenerationStrategy): From ad9e6b524b6edd98416ca119f96d3e65adf8220a Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 14:21:32 +0100 Subject: [PATCH 41/45] Address review comments --- .../vllm/slurm_command_gen_strategy.py | 5 +++-- src/cloudai/workloads/vllm/vllm.py | 17 +++++++++++++---- .../test_vllm_slurm_command_gen_strategy.py | 7 +++++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index 5f45c28ea..9b5f0522a 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from pathlib import Path from typing import cast from cloudai.systems.slurm import SlurmCommandGenStrategy @@ -178,7 +179,7 @@ def _gen_aggregated_script( bench_cmd: str, health_func: str, cmd_args: VllmCmdArgs, - output_path, + output_path: Path, ) -> str: return f"""\ cleanup() {{ @@ -211,7 +212,7 @@ def _gen_disaggregated_script( bench_cmd: str, health_func: str, cmd_args: VllmCmdArgs, - output_path, + output_path: Path, ) -> str: prefill_cmd, decode_cmd = serve_commands proxy_cmd = self.get_proxy_command() diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index f2d0b947f..1d0e837b8 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -15,6 +15,8 @@ # limitations under the License. 
+import logging + from pydantic import ConfigDict, Field from cloudai.core import DockerImage, GitRepo, HFModel, Installable, JobStatusResult, TestRun @@ -38,7 +40,11 @@ def serve_args(self) -> list[str]: """Convert cmd_args_dict to command-line arguments list for vllm serve.""" args = [] for k, v in self.model_dump(exclude={"gpu_ids"}).items(): - args.extend([f"--{k.replace('_', '-')}", str(v)]) + opt = f"--{k.replace('_', '-')}" + if v: + args.extend([opt, str(v)]) + else: + args.append(opt) return args @@ -110,9 +116,12 @@ def was_run_successful(self, tr: TestRun) -> JobStatusResult: has_results_marker = True continue if has_results_marker and "Successful requests:" in line: - num_successful_requests = int(line.split()[2]) - if num_successful_requests > 0: - return JobStatusResult(is_successful=True) + try: + num_successful_requests = int(line.split()[2]) + if num_successful_requests > 0: + return JobStatusResult(is_successful=True) + except Exception as e: + logging.debug(f"Error parsing number of successful requests: {e}") return JobStatusResult( is_successful=False, error_message=f"vLLM bench log does not contain benchmark result in {tr.output_path}." diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index 3071674f2..ba68f2419 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -113,6 +113,13 @@ class TestServeExtraArgs: def test_serve_args_empty_by_default(self) -> None: assert VllmArgs().serve_args == [] + def test_empty_string_value_means_flag(self) -> None: + assert VllmArgs.model_validate({"some_flag": "", "some_arg": "value"}).serve_args == [ + "--some-flag", + "--some-arg", + "value", + ] + def test_decode_serve_args_with_custom_fields(self) -> None: tdef = VllmTestDefinition( name="vllm", From cd8c093a72679757277ddb8be9e8f8aaed4a209b Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 14:28:10 +0100 Subject: [PATCH 42/45] Update doc/workloads/vllm.rst Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- doc/workloads/vllm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/workloads/vllm.rst b/doc/workloads/vllm.rst index 749a79afd..65ec9d1f6 100644 --- a/doc/workloads/vllm.rst +++ b/doc/workloads/vllm.rst @@ -70,7 +70,7 @@ Test-in-Scenario example Control number of GPUs ---------------------- -The number of GPUs can be controlled using the options below, listed from less priority to more priority: +The number of GPUs can be controlled using the options below, listed from lowest to highest priority: 1. ``gpus_per_node`` system property (scalar value) 2. ``CUDA_VISIBLE_DEVICES`` environment variable (comma-separated list of GPU IDs) 3. 
``gpu_ids`` command argument for ``prefill`` and ``decode`` configurations (comma-separated list of GPU IDs) From d05b7c7c495d569882bce9b4033682f675b7fadb Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 14:30:45 +0100 Subject: [PATCH 43/45] Address review comments --- src/cloudai/workloads/vllm/report_generation_strategy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cloudai/workloads/vllm/report_generation_strategy.py b/src/cloudai/workloads/vllm/report_generation_strategy.py index 25f96199a..ab724fa3c 100644 --- a/src/cloudai/workloads/vllm/report_generation_strategy.py +++ b/src/cloudai/workloads/vllm/report_generation_strategy.py @@ -15,6 +15,7 @@ # limitations under the License. import json +import logging from functools import cache from pathlib import Path @@ -54,7 +55,8 @@ def parse_vllm_bench_output(res_file: Path) -> VLLMBenchReport | None: with res_file.open("r") as f: data = json.load(f) return VLLMBenchReport.model_validate(data) - except (json.JSONDecodeError, Exception): + except Exception as e: + logging.debug(f"Error parsing vLLM benchmark output: {e}") return None From 8c71afb8b74421d8542d115f7d7013a3de4a7044 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 16:53:04 +0100 Subject: [PATCH 44/45] Address review comments --- src/cloudai/workloads/vllm/report_generation_strategy.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cloudai/workloads/vllm/report_generation_strategy.py b/src/cloudai/workloads/vllm/report_generation_strategy.py index ab724fa3c..7ade3e325 100644 --- a/src/cloudai/workloads/vllm/report_generation_strategy.py +++ b/src/cloudai/workloads/vllm/report_generation_strategy.py @@ -37,11 +37,9 @@ class VLLMBenchReport(BaseModel): completed: int mean_ttft_ms: float median_ttft_ms: float - std_ttft_ms: float p99_ttft_ms: float mean_tpot_ms: float median_tpot_ms: float - std_tpot_ms: float p99_tpot_ms: float From 488d1eef03490fe71536fd8cbd5781a9d822c261 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Tue, 10 Feb 2026 17:23:14 +0100 Subject: [PATCH 45/45] Address review comments --- src/cloudai/workloads/vllm/vllm.py | 6 +++--- .../test_vllm_slurm_command_gen_strategy.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/cloudai/workloads/vllm/vllm.py b/src/cloudai/workloads/vllm/vllm.py index 1d0e837b8..2c184c0a3 100644 --- a/src/cloudai/workloads/vllm/vllm.py +++ b/src/cloudai/workloads/vllm/vllm.py @@ -41,10 +41,10 @@ def serve_args(self) -> list[str]: args = [] for k, v in self.model_dump(exclude={"gpu_ids"}).items(): opt = f"--{k.replace('_', '-')}" - if v: - args.extend([opt, str(v)]) - else: + if v == "": args.append(opt) + else: + args.extend([opt, str(v)]) return args diff --git a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py index ba68f2419..7909b3d67 100644 --- a/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_vllm_slurm_command_gen_strategy.py @@ -114,10 +114,12 @@ def test_serve_args_empty_by_default(self) -> None: assert VllmArgs().serve_args == [] def test_empty_string_value_means_flag(self) -> None: - assert VllmArgs.model_validate({"some_flag": "", "some_arg": "value"}).serve_args == [ + assert VllmArgs.model_validate({"some_flag": "", "some_arg": "value", "zero_value": 0}).serve_args == [ "--some-flag", "--some-arg", "value", + "--zero-value", + "0", ] def 
test_decode_serve_args_with_custom_fields(self) -> None: