Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/cloudai/models/scenario.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -108,7 +108,7 @@ def tdef_model_dump(self, by_alias: bool) -> dict:
"extra_env_vars": self.extra_env_vars if self.extra_env_vars else None,
"cmd_args": self.cmd_args.model_dump(by_alias=by_alias) if self.cmd_args else None,
"git_repos": [repo.model_dump() for repo in self.git_repos] if self.git_repos else None,
"nsys": self.nsys.model_dump() if self.nsys else None,
"nsys": self.nsys.model_dump(exclude_unset=True) if self.nsys else None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't that lead to a dict without required fields set?

BTW, I think this is similar to what you did for reports. Maybe we should have a function merge_models(tdef, tscenario) or something like that? Do you think that will be useful?

Our current approach is "all or nothing", meaning that everything should be either set on one level or another. If we start doing smarter merge, I'd prefer we do it consistently. Overall this feels as a better and friendlier approach.

cc @podkidyshev

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

still learning, so my opinion should be rather optional here :)

  1. the change seems safe and positive. agree with Andrey that it would be nice to make it consistent across all model merges but I would assume it's on the cloudai dev team's shoulders

  2. Can't that lead to a dict without required fields set?

seems safe, if this is the risk you meant

>>> import pydantic
>>> from typing import Optional
>>> class A(pydantic.BaseModel):
...     model_config = pydantic.ConfigDict(extra='forbid')
...     a: str
...     b: Optional[str] = 'b'
... 
>>> A.model_validate({'b': '33'})
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users/ipodkidyshev/projects/cloudai/.venv/lib/python3.10/site-packages/pydantic/main.py", line 705, in model_validate
    return cls.__pydantic_validator__.validate_python(
pydantic_core._pydantic_core.ValidationError: 1 validation error for A
a
  Field required [type=missing, input_value={'b': '33'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.11/v/missing

}
return {k: v for k, v in data.items() if v is not None}

Expand Down
159 changes: 158 additions & 1 deletion tests/test_test_scenario.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -515,3 +515,160 @@ def test_get_reporters_nccl(self):
assert len(reporters) == 2
assert NcclTestPerformanceReportGenerationStrategy in reporters
assert NcclTestPredictionReportGenerationStrategy in reporters


class TestNsysMerging:
def test_nsys_partial_override_preserves_base_config(
self, test_scenario_parser: TestScenarioParser, slurm_system: SlurmSystem
):
from cloudai.core import NsysConfiguration

test_scenario_parser.test_mapping = {
"nccl": NCCLTestDefinition(
name="nccl",
description="desc",
test_template_name="NcclTest",
cmd_args=NCCLCmdArgs(docker_image_url="fake://url/nccl"),
nsys=NsysConfiguration(
enable=True,
nsys_binary="/custom/nsys",
output="/base/output",
trace="cuda,nvtx",
sample="cpu",
),
)
}
model = TestScenarioModel.model_validate(
toml.loads(
"""
name = "test"

[[Tests]]
id = "1"
test_name = "nccl"

[Tests.nsys]
output = "/scenario/output"
"""
)
)
tdef = test_scenario_parser._prepare_tdef(model.tests[0])

assert tdef.nsys is not None
assert tdef.nsys.output == "/scenario/output"
assert tdef.nsys.nsys_binary == "/custom/nsys"
assert tdef.nsys.trace == "cuda,nvtx"
assert tdef.nsys.sample == "cpu"
assert tdef.nsys.enable is True

def test_nsys_multiple_fields_override(self, test_scenario_parser: TestScenarioParser, slurm_system: SlurmSystem):
from cloudai.core import NsysConfiguration
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can it be imported on top of this file? (same for all tests)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@juntaowww I assume you used an AI assistant to help you with the code. they really tend to bring imports deep inside. could you please share what is the assistant so we could prepare some common agents config (if there's such an option) to let the agent know about the code style?


test_scenario_parser.test_mapping = {
"nccl": NCCLTestDefinition(
name="nccl",
description="desc",
test_template_name="NcclTest",
cmd_args=NCCLCmdArgs(docker_image_url="fake://url/nccl"),
nsys=NsysConfiguration(
enable=True,
nsys_binary="/base/nsys",
output="/base/output",
trace="cuda",
force_overwrite=False,
),
)
}
model = TestScenarioModel.model_validate(
toml.loads(
"""
name = "test"

[[Tests]]
id = "1"
test_name = "nccl"

[Tests.nsys]
output = "/new/output"
force_overwrite = true
"""
)
)
tdef = test_scenario_parser._prepare_tdef(model.tests[0])

assert tdef.nsys is not None
assert tdef.nsys.output == "/new/output"
assert tdef.nsys.force_overwrite is True
assert tdef.nsys.nsys_binary == "/base/nsys"
assert tdef.nsys.trace == "cuda"
assert tdef.nsys.enable is True

def test_nsys_scenario_adds_to_base_without_nsys(
self, test_scenario_parser: TestScenarioParser, slurm_system: SlurmSystem
):
test_scenario_parser.test_mapping = {
"nccl": NCCLTestDefinition(
name="nccl",
description="desc",
test_template_name="NcclTest",
cmd_args=NCCLCmdArgs(docker_image_url="fake://url/nccl"),
# No nsys in base config
)
}
model = TestScenarioModel.model_validate(
toml.loads(
"""
name = "test"

[[Tests]]
id = "1"
test_name = "nccl"

[Tests.nsys]
output = "/scenario/output"
trace = "cuda,nvtx"
"""
)
)
tdef = test_scenario_parser._prepare_tdef(model.tests[0])

assert tdef.nsys is not None
assert tdef.nsys.output == "/scenario/output"
assert tdef.nsys.trace == "cuda,nvtx"
assert tdef.nsys.enable is True
assert tdef.nsys.nsys_binary == "nsys"

def test_nsys_disable_override(self, test_scenario_parser: TestScenarioParser, slurm_system: SlurmSystem):
from cloudai.core import NsysConfiguration

test_scenario_parser.test_mapping = {
"nccl": NCCLTestDefinition(
name="nccl",
description="desc",
test_template_name="NcclTest",
cmd_args=NCCLCmdArgs(docker_image_url="fake://url/nccl"),
nsys=NsysConfiguration(
enable=True,
output="/base/output",
),
)
}
model = TestScenarioModel.model_validate(
toml.loads(
"""
name = "test"

[[Tests]]
id = "1"
test_name = "nccl"

[Tests.nsys]
enable = false
"""
)
)
tdef = test_scenario_parser._prepare_tdef(model.tests[0])

assert tdef.nsys is not None
assert tdef.nsys.enable is False
assert tdef.nsys.output == "/base/output"