Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 58 additions & 16 deletions lisa/microsoft/testsuites/stress/stress_ng_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any, Dict, List, Tuple, cast

import yaml
from exceptiongroup import ExceptionGroup

from lisa import (
Environment,
Expand All @@ -19,7 +20,7 @@
from lisa.messages import TestStatus, send_sub_test_result_message
from lisa.testsuite import TestResult
from lisa.tools import StressNg
from lisa.util import SkippedException
from lisa.util import KernelPanicException, SkippedException
from lisa.util.logger import Logger
from lisa.util.process import Process

Expand Down Expand Up @@ -73,12 +74,14 @@ def stress_ng_jobfile(
@TestCaseMetadata(
description="Runs stress-ng's 'cpu' class stressors for 60s each.",
priority=4,
timeout=7200, # 2 hours
)
def stress_ng_cpu_stressors(
self,
environment: Environment,
log: Logger,
) -> None:
self._run_stressor_class(environment, "cpu")
self._run_stressor_class(environment, "cpu", log)

@TestCaseMetadata(
description="Runs stress-ng's 'memory' class stressors for 60s each.",
Expand All @@ -87,8 +90,9 @@ def stress_ng_cpu_stressors(
def stress_ng_memory_stressors(
self,
environment: Environment,
log: Logger,
) -> None:
self._run_stressor_class(environment, "memory")
self._run_stressor_class(environment, "memory", log)

@TestCaseMetadata(
description="Runs stress-ng's 'vm' class stressors for 60s each.",
Expand All @@ -97,8 +101,9 @@ def stress_ng_memory_stressors(
def stress_ng_vm_stressors(
self,
environment: Environment,
log: Logger,
) -> None:
self._run_stressor_class(environment, "vm")
self._run_stressor_class(environment, "vm", log)

@TestCaseMetadata(
description="Runs stress-ng's 'io' class stressors for 60s each.",
Expand All @@ -107,8 +112,9 @@ def stress_ng_vm_stressors(
def stress_ng_io_stressors(
self,
environment: Environment,
log: Logger,
) -> None:
self._run_stressor_class(environment, "io")
self._run_stressor_class(environment, "io", log)

@TestCaseMetadata(
description="Runs stress-ng's 'network' class stressors for 60s each.",
Expand All @@ -117,8 +123,9 @@ def stress_ng_io_stressors(
def stress_ng_network_stressors(
self,
environment: Environment,
log: Logger,
) -> None:
self._run_stressor_class(environment, "network")
self._run_stressor_class(environment, "network", log)

@TestCaseMetadata(
description="""
Expand Down Expand Up @@ -174,21 +181,56 @@ def multi_vm_stress_test(
for job_file in jobs:
self._run_stress_ng_job(job_file, environment, result, log)

def _run_stressor_class(self, environment: Environment, class_name: str) -> None:
def _run_stressor_class(
self, environment: Environment, class_name: str, log: Logger
) -> None:
nodes = [cast(RemoteNode, node) for node in environment.nodes.list()]
procs: List[Process] = []
try:
for node in nodes:
procs.append(node.tools[StressNg].launch_class_async(class_name))
for proc in procs:
procs: List[Tuple[RemoteNode, Process]] = []

# Launch Processes
start_failures: List[Tuple[RemoteNode, Exception]] = []
for node in nodes:
try:
procs.append(
(node, node.tools[StressNg].launch_class_async(class_name))
)
except Exception as e:
start_failures.append((node, e))

# Validate Results
result_failures: List[Tuple[RemoteNode, Exception]] = []
for node, proc in procs:
try:
proc.wait_result(timeout=self.TIME_OUT, expected_exit_code=0)
except Exception as e:
for node in nodes:
# check_panic will automatically log and raise if panic detected
except Exception as e:
result_failures.append((node, e))

# Check for kernel panics on all failed nodes
kernel_panics: List[Tuple[RemoteNode, Exception]] = []
for node, _e in start_failures + result_failures:
try:
node.features[SerialConsole].check_panic(
saved_path=None, force_run=True
)
raise e
except KernelPanicException as e:
kernel_panics.append((node, e))

# Raise exceptions if there were any failures
if start_failures or result_failures or kernel_panics:
total = len(start_failures) + len(result_failures) + len(kernel_panics)
log.error(
f"{total} node(s) encountered errors during "
f"stress_ng_{class_name}_stressors."
)

raise ExceptionGroup(
f"{len(start_failures)} start failures, "
f"{len(result_failures)} exit code failures, "
f"{len(kernel_panics)} kernel panics.",
[exc for _node, exc in start_failures]
+ [exc for _node, exc in result_failures]
+ [exc for _node, exc in kernel_panics],
)

def _run_stress_ng_job(
self,
Expand Down
2 changes: 1 addition & 1 deletion lisa/tools/stress_ng.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def launch_class_async(
v_flag = "-v" if verbose else ""
return self.run_async(
f"{v_flag} --sequential {num_workers} --class {class_name} "
f"--timeout {timeout_secs}",
f"--timeout {timeout_secs} --oom-avoid",
sudo=sudo,
)

Expand Down
8 changes: 6 additions & 2 deletions lisa/util/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class ExecutableResult:
_stderr: str
exit_code: Optional[int]
cmd: Union[str, List[str]]
_id_: str
elapsed: float
is_timeout: bool = False

Expand Down Expand Up @@ -71,7 +72,9 @@ def assert_exit_code(
message: str = "",
include_output: bool = False,
) -> AssertionBuilder:
message = "\n".join([message, f"Get unexpected exit code on cmd {self.cmd}"])
message = "\n".join(
[message, f"Get unexpected exit code on cmd {self._id_} {self.cmd}"]
)
if include_output:
message = "\n".join(
[message, "stdout:", self.stdout, "stderr:", self.stderr]
Expand Down Expand Up @@ -322,7 +325,7 @@ def start(
# FileNotFoundError: not found command on Windows
# NoSuchCommandError: not found command on remote Posix
self._result = ExecutableResult(
"", e.strerror, 1, split_command, self._timer.elapsed()
"", e.strerror, 1, split_command, self._id_, self._timer.elapsed()
)
self._log.log(stderr_level, f"not found command: {e}")
except SshSpawnTimeoutException:
Expand Down Expand Up @@ -431,6 +434,7 @@ def _wait_result(
process_result.stderr_output.strip(),
process_result.return_code,
self._cmd,
self._id_,
self._timer.elapsed(),
is_timeout,
)
Expand Down
7 changes: 4 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,21 @@ classifiers = [
]
dependencies = [
"assertpy ~= 1.1",
"func-timeout ~= 4.3.5",
"charset_normalizer ~= 2.1.1",
"dataclasses-json ~= 0.5.2",
"exceptiongroup ~= 1.3.1",
"func-timeout ~= 4.3.5",
"paramiko ~= 3.5.1",
"pluggy ~= 0.13.1",
"python-dateutil ~= 2.8.1",
"PyYAML ~= 6.0.1",
"randmac ~= 0.1",
"requests ~= 2.32.4",
"retry ~= 0.9.2",
"semver ~= 2.13.0",
"simpleeval ~= 0.9.12",
"spurplus ~= 2.3.5",
"websockets ~= 15.0.1",
"charset_normalizer ~= 2.1.1",
"requests ~= 2.32.4",
]
dynamic = ["version"]
license = {text = "MIT"}
Expand Down
Loading