From 6c5ab7959b201abdec07adb4c7187296fc8d9662 Mon Sep 17 00:00:00 2001 From: iximeow Date: Mon, 9 Feb 2026 23:57:09 +0000 Subject: [PATCH 1/2] phd: check exit status of commands similar to std::process::Command: if a command fails with status, we should report that more proactively. further, some commands are expected to exit with status in tests, so adjust test expectations to match the new regime. --- phd-tests/framework/src/test_vm/mod.rs | 144 ++++++++++++++++++++---- phd-tests/tests/src/boot_order.rs | 18 ++- phd-tests/tests/src/crucible/migrate.rs | 3 +- phd-tests/tests/src/crucible/smoke.rs | 3 +- phd-tests/tests/src/hw.rs | 16 ++- phd-tests/tests/src/hyperv.rs | 42 ++++--- phd-tests/tests/src/migrate.rs | 19 +++- 7 files changed, 189 insertions(+), 56 deletions(-) diff --git a/phd-tests/framework/src/test_vm/mod.rs b/phd-tests/framework/src/test_vm/mod.rs index 50d971cae..269399b80 100644 --- a/phd-tests/framework/src/test_vm/mod.rs +++ b/phd-tests/framework/src/test_vm/mod.rs @@ -23,7 +23,7 @@ use crate::{ TestCtx, }; -use anyhow::{anyhow, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use camino::Utf8PathBuf; use core::result::Result as StdResult; use propolis_client::{ @@ -152,6 +152,107 @@ enum VmState { Ensured { serial: SerialConsole }, } +/// Description of the acceptable status codes from executing a command in a +/// [`TestVm::run_shell_command`]. +// This could reasonably have a `Status(u16)` variant to check specific non-zero +// statuses, but specific codes are not terribly portable! In the few cases we +// can expect a specific status for errors, those specific codes change between +// f.ex illumos and Linux guests. +enum StatusCheck { + Ok, + NotOk, +} + +pub struct ShellOutputExecutor<'ctx> { + vm: &'ctx TestVm, + cmd: &'ctx str, + status_check: Option, +} + +impl<'a> ShellOutputExecutor<'a> { + pub fn ignore_status(mut self) -> ShellOutputExecutor<'a> { + self.status_check = None; + self + } + + pub fn check_ok(mut self) -> ShellOutputExecutor<'a> { + self.status_check = Some(StatusCheck::Ok); + self + } + + pub fn check_err(mut self) -> ShellOutputExecutor<'a> { + self.status_check = Some(StatusCheck::NotOk); + self + } +} +use futures::FutureExt; + +impl<'a> std::future::IntoFuture for ShellOutputExecutor<'a> { + type Output = Result; + type IntoFuture = futures::future::BoxFuture<'a, Result>; + + fn into_future(self) -> Self::IntoFuture { + Box::pin(async move { + // Allow the guest OS to transform the input command into a + // guest-specific command sequence. This accounts for the guest's + // shell type (which affects e.g. affects how it displays multi-line + // commands) and serial console buffering discipline. + let command_sequence = + self.vm.guest_os.shell_command_sequence(self.cmd); + self.vm.run_command_sequence(command_sequence).await?; + + // `shell_command_sequence` promises that the generated command + // sequence clears buffer of everything up to and including the + // input command before actually issuing the final '\n' that issues + // the command. This ensures that the buffer contents returned by + // this call contain only the command's output. + let output = self + .vm + .wait_for_serial_output( + self.vm.guest_os.get_shell_prompt(), + Duration::from_secs(300), + ) + .await?; + + // Trim any leading newlines inserted when the command was issued + // and any trailing whitespace that isn't actually part of the + // command output. Any other embedded whitespace is the caller's + // problem. + let output = output.trim().to_string(); + + if let Some(check) = self.status_check { + let status_command_sequence = + self.vm.guest_os.shell_command_sequence("echo $?"); + self.vm.run_command_sequence(status_command_sequence).await?; + let status = self + .vm + .wait_for_serial_output( + self.vm.guest_os.get_shell_prompt(), + Duration::from_secs(300), + ) + .await?; + let status = status.trim().parse::()?; + + match check { + StatusCheck::Ok => { + if status != 0 { + bail!("expected status 0, got {}", status); + } + } + StatusCheck::NotOk => { + if status == 0 { + bail!("expected non-zero status, got {}", status); + } + } + } + } + + Ok(output) + }) + .boxed() + } +} + /// A virtual machine running in a Propolis server. Test cases create these VMs /// using the `factory::VmFactory` embedded in their test contexts. /// @@ -885,30 +986,23 @@ impl TestVm { /// waits for another shell prompt to appear using /// [`Self::wait_for_serial_output`] and returns any text that was buffered /// to the serial console after the command was sent. - pub async fn run_shell_command(&self, cmd: &str) -> Result { - // Allow the guest OS to transform the input command into a - // guest-specific command sequence. This accounts for the guest's shell - // type (which affects e.g. affects how it displays multi-line commands) - // and serial console buffering discipline. - let command_sequence = self.guest_os.shell_command_sequence(cmd); - self.run_command_sequence(command_sequence).await?; - - // `shell_command_sequence` promises that the generated command sequence - // clears buffer of everything up to and including the input command - // before actually issuing the final '\n' that issues the command. - // This ensures that the buffer contents returned by this call contain - // only the command's output. - let out = self - .wait_for_serial_output( - self.guest_os.get_shell_prompt(), - Duration::from_secs(300), - ) - .await?; - - // Trim any leading newlines inserted when the command was issued and - // any trailing whitespace that isn't actually part of the command - // output. Any other embedded whitespace is the caller's problem. - Ok(out.trim().to_string()) + /// + /// After running the shell command, sends `echo $?` to query and return the + /// command's return status as well. + /// + /// This will return an error if the command returns a non-zero status by + /// default; to ignore the status or expect a non-zero as a positive + /// condition, see [`ShellOutputExecutor::ignore_status`] or + /// [`ShellOutputExecutor::check_err`]. + pub fn run_shell_command<'a>( + &'a self, + cmd: &'a str, + ) -> ShellOutputExecutor<'a> { + ShellOutputExecutor { + vm: self, + cmd, + status_check: Some(StatusCheck::Ok), + } } pub async fn graceful_reboot(&self) -> Result<()> { diff --git a/phd-tests/tests/src/boot_order.rs b/phd-tests/tests/src/boot_order.rs index a048ae6cf..6ae79ff0a 100644 --- a/phd-tests/tests/src/boot_order.rs +++ b/phd-tests/tests/src/boot_order.rs @@ -256,7 +256,8 @@ async fn guest_can_adjust_boot_order(ctx: &TestCtx) { // If the guest doesn't have an EFI partition then there's no way for boot // order preferences to be persisted. - let mountline = vm.run_shell_command("mount | grep efivarfs").await?; + let mountline = + vm.run_shell_command("mount | grep efivarfs").ignore_status().await?; if !mountline.starts_with("efivarfs on ") { warn!( @@ -268,11 +269,16 @@ async fn guest_can_adjust_boot_order(ctx: &TestCtx) { // Try adding a few new boot options, then add them to the boot order, // reboot, and make sure they're all as we set them. - if !vm - .run_shell_command(&format!("ls {}", efipath(&bootvar(0xffff)))) - .await? - .is_empty() - { + let bootffff_path = efipath(&bootvar(0xffff)); + let bootffff_res = vm + .run_shell_command(&format!("ls {bootffff_path}")) + .ignore_status() + .await?; + // `ls` just prints the file path if it exists, but the error text varies a + // bit depending on Alpine, Ubuntu, Busybox, etc. Notionally we could + // `check_err()` above, but having a `BootFFFF` entry already is merely + // weird; we can still replace it and continue with the test. + if bootffff_res == bootffff_path { warn!( "guest environment already has a BootFFFF entry; \ is this not a fresh image?" diff --git a/phd-tests/tests/src/crucible/migrate.rs b/phd-tests/tests/src/crucible/migrate.rs index 05c6cab30..c6f64cd88 100644 --- a/phd-tests/tests/src/crucible/migrate.rs +++ b/phd-tests/tests/src/crucible/migrate.rs @@ -19,7 +19,8 @@ async fn smoke_test(ctx: &TestCtx) { source.launch().await?; source.wait_to_boot().await?; - let lsout = source.run_shell_command("ls foo.bar 2> /dev/null").await?; + let lsout = + source.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?; assert_eq!(lsout, ""); source.run_shell_command("touch ./foo.bar").await?; source.run_shell_command("sync ./foo.bar").await?; diff --git a/phd-tests/tests/src/crucible/smoke.rs b/phd-tests/tests/src/crucible/smoke.rs index 0532ac0a2..815e2782c 100644 --- a/phd-tests/tests/src/crucible/smoke.rs +++ b/phd-tests/tests/src/crucible/smoke.rs @@ -65,7 +65,8 @@ async fn shutdown_persistence_test(ctx: &TestCtx) { // Verify that the test file doesn't exist yet, then touch it, flush it, and // shut down the VM. - let lsout = vm.run_shell_command("ls foo.bar 2> /dev/null").await?; + let lsout = + vm.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?; assert_eq!(lsout, ""); vm.run_shell_command("touch ./foo.bar").await?; vm.run_shell_command("sync ./foo.bar").await?; diff --git a/phd-tests/tests/src/hw.rs b/phd-tests/tests/src/hw.rs index db2b2cbd3..10325d6a3 100644 --- a/phd-tests/tests/src/hw.rs +++ b/phd-tests/tests/src/hw.rs @@ -17,15 +17,23 @@ async fn lspci_lifecycle_test(ctx: &TestCtx) { vm.launch().await?; vm.wait_to_boot().await?; - let lspci = vm.run_shell_command(LSPCI).await?; - let lshw = vm.run_shell_command(LSHW).await?; + // XXX: do not `ignore_status()` on these commands! They fail for any number + // of reasons on different guests: + // * sudo may not exist (some Alpine) + // * lshw may not exist (Debian) + // * we may not input a sudo password (Ubuntu) + + let lspci = vm.run_shell_command(LSPCI).ignore_status().await?; + let lshw = vm.run_shell_command(LSHW).ignore_status().await?; ctx.lifecycle_test(vm, &[Action::StopAndStart], move |vm| { let lspci = lspci.clone(); let lshw = lshw.clone(); Box::pin(async move { - let new_lspci = vm.run_shell_command(LSPCI).await.unwrap(); + let new_lspci = + vm.run_shell_command(LSPCI).ignore_status().await.unwrap(); assert_eq!(new_lspci, lspci); - let new_lshw = vm.run_shell_command(LSHW).await.unwrap(); + let new_lshw = + vm.run_shell_command(LSHW).ignore_status().await.unwrap(); assert_eq!(new_lshw, lshw); }) }) diff --git a/phd-tests/tests/src/hyperv.rs b/phd-tests/tests/src/hyperv.rs index 8ab9df6ba..02ce28f92 100644 --- a/phd-tests/tests/src/hyperv.rs +++ b/phd-tests/tests/src/hyperv.rs @@ -24,20 +24,19 @@ use uuid::Uuid; /// startup and shutdown with Hyper-V emulation enabled. async fn guest_detect_hyperv(vm: &TestVm) -> anyhow::Result<()> { if vm.guest_os_kind().is_linux() { - // Many Linux distros come with systemd installed out of the box. On - // these distros, it's easiest to use `systemd-detect-virt` to determine - // whether the guest thinks it's running on a Hyper-V-compatible - // hypervisor. (Whether any actual enlightenments are enabled is another - // story, but those can often be detected by other means.) - let out = vm.run_shell_command("systemd-detect-virt").await?; - if out.contains("systemd-detect-virt: not found") { - warn!( - "guest doesn't support systemd-detect-virt, can't verify it \ - detected Hyper-V support" - ); - } else { - assert_eq!(out, "microsoft"); - } + // One might imagine we could simply use `systemd-detect-virt` to check + // hypervisor information here, but it's not present out of the box on + // Alpine. In the interest of exercising Hyper-V in typical CI runs on a + // standard Alpine image, detect Hyper-V in a.. worse but reliable way: + // looking for relevant logs in dmesg. This should work for all Linuxes + // from later than May-ish 2010 (>2.6.34 or so). If we don't see Hyper-V + // reported in dmesg either it's genuinely not detected, it's a very old + // Linux, or it's a new Linux and dmesg text has changed. + const HV_TEXT: &str = "Hypervisor detected: Microsoft Hyper-V"; + + // No "sudo" here because Alpine doesn't have sudo; for Linux tests we + // expect to run test commands as root anyway. + vm.run_shell_command(&format!("dmesg | grep \"{HV_TEXT}\"")).await?; } else if vm.guest_os_kind().is_windows() { // Windows is good about giving signals that it's running in a Hyper-V // *root partition*, but offers no clear signal as to whether it has @@ -45,6 +44,8 @@ async fn guest_detect_hyperv(vm: &TestVm) -> anyhow::Result<()> { // are methods for detecting whether Windows is running as a guest, but // these don't identify the detected hypervisor type.) warn!("running on Windows, can't verify it detected Hyper-V support"); + } else { + warn!("unknown guest type, can't verify it detected Hyper-V support"); } Ok(()) @@ -56,6 +57,16 @@ async fn hyperv_smoke_test(ctx: &TestCtx) { cfg.guest_hv_interface(GuestHypervisorInterface::HyperV { features: Default::default(), }); + // For reasons absolutely indecipherable to me, Alpine (3.16, kernel + // 5.15.41-0-virt) seems to lose some early dmesg lines if booted with less + // than four vCPUs. Among the early dmesg lines are `Hypervisor detected: + // Microsoft Hyper-V` that we look for as confirmation that Linux knows + // there's a Hyper-V-like hypervisor present. + // + // I'd love to debug exactly why this is relevant to the contents of dmesg + // (the remaining log is identical and it doesn't seem that the ring buffer + // is full), but I really can't justify the time! + cfg.cpus(4); let mut vm = ctx.spawn_vm(&cfg, None).await?; vm.launch().await?; vm.wait_to_boot().await?; @@ -69,6 +80,9 @@ async fn hyperv_lifecycle_test(ctx: &TestCtx) { cfg.guest_hv_interface(GuestHypervisorInterface::HyperV { features: Default::default(), }); + // Spooky load-bearing vCPU count to preserve dmesg log lines. See the + // comment in `hyperv_smoke_test`. + cfg.cpus(4); let mut vm = ctx.spawn_vm(&cfg, None).await?; vm.launch().await?; vm.wait_to_boot().await?; diff --git a/phd-tests/tests/src/migrate.rs b/phd-tests/tests/src/migrate.rs index e7875acb1..c55436c7f 100644 --- a/phd-tests/tests/src/migrate.rs +++ b/phd-tests/tests/src/migrate.rs @@ -55,7 +55,11 @@ mod from_base { spawn_base_vm(ctx, "migration_from_base_and_back").await?; source.launch().await?; source.wait_to_boot().await?; - let lsout = source.run_shell_command("ls foo.bar 2> /dev/null").await?; + // `ls` with no results exits non-zero, so expect an error here. + let lsout = source + .run_shell_command("ls foo.bar 2> /dev/null") + .check_err() + .await?; assert_eq!(lsout, ""); // create an empty file on the source VM. @@ -74,8 +78,9 @@ mod from_base { // the file should still exist on the target VM after migration. let lsout = target .run_shell_command("ls foo.bar") + .ignore_status() .await - .expect("`ls foo.bar` should succeed"); + .expect("can try to run `ls foo.bar`"); assert_eq!(lsout, "foo.bar"); }) }, @@ -275,7 +280,9 @@ mod running_process { )) .await?; vm.run_shell_command("chmod +x dirt.sh").await?; - let run_dirt = vm.run_shell_command("./dirt.sh").await?; + // When dirt.sh suspends itself, the parent shell will report a non-zero + // status (one example is 148: 128 + SIGTSTP aka 20 on Linux). + let run_dirt = vm.run_shell_command("./dirt.sh").check_err().await?; assert!(run_dirt.contains("made dirt"), "dirt.sh failed: {run_dirt:?}"); assert!( run_dirt.contains("Stopped"), @@ -314,7 +321,8 @@ async fn multiple_migrations(ctx: &TestCtx) { async fn run_smoke_test(ctx: &TestCtx, mut source: TestVm) -> Result<()> { source.launch().await?; source.wait_to_boot().await?; - let lsout = source.run_shell_command("ls foo.bar 2> /dev/null").await?; + let lsout = + source.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?; assert_eq!(lsout, ""); // create an empty file on the source VM. @@ -329,8 +337,9 @@ async fn run_smoke_test(ctx: &TestCtx, mut source: TestVm) -> Result<()> { // the file should still exist on the target VM after migration. let lsout = target .run_shell_command("ls foo.bar") + .ignore_status() .await - .expect("`ls foo.bar` should succeed after migration"); + .expect("can try to run `ls foo.bar`"); assert_eq!(lsout, "foo.bar"); }) }, From bebcd040308ca6b845d17df6e7af86d1b787961e Mon Sep 17 00:00:00 2001 From: iximeow Date: Tue, 10 Feb 2026 01:28:45 +0000 Subject: [PATCH 2/2] pr feedback and drag cpu topo test into the future --- phd-tests/framework/src/test_vm/mod.rs | 2 +- phd-tests/tests/src/cpuid.rs | 1 + phd-tests/tests/src/hw.rs | 2 ++ phd-tests/tests/src/hyperv.rs | 2 +- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/phd-tests/framework/src/test_vm/mod.rs b/phd-tests/framework/src/test_vm/mod.rs index 269399b80..8997bf644 100644 --- a/phd-tests/framework/src/test_vm/mod.rs +++ b/phd-tests/framework/src/test_vm/mod.rs @@ -26,6 +26,7 @@ use crate::{ use anyhow::{anyhow, bail, Context, Result}; use camino::Utf8PathBuf; use core::result::Result as StdResult; +use futures::FutureExt; use propolis_client::{ instance_spec::{ ComponentV0, InstanceProperties, InstanceSpecGetResponse, @@ -185,7 +186,6 @@ impl<'a> ShellOutputExecutor<'a> { self } } -use futures::FutureExt; impl<'a> std::future::IntoFuture for ShellOutputExecutor<'a> { type Output = Result; diff --git a/phd-tests/tests/src/cpuid.rs b/phd-tests/tests/src/cpuid.rs index 0c76cf136..cbf764efa 100644 --- a/phd-tests/tests/src/cpuid.rs +++ b/phd-tests/tests/src/cpuid.rs @@ -193,6 +193,7 @@ impl<'a> LinuxGuestTopo<'a> { "ls {}", Self::cpu_stem(this.cpus().await) )) + .ignore_status() .await .expect("can run ls of a directory that doesn't exist"); assert!(out.contains("No such file or directory")); diff --git a/phd-tests/tests/src/hw.rs b/phd-tests/tests/src/hw.rs index 10325d6a3..e0defefdf 100644 --- a/phd-tests/tests/src/hw.rs +++ b/phd-tests/tests/src/hw.rs @@ -22,6 +22,8 @@ async fn lspci_lifecycle_test(ctx: &TestCtx) { // * sudo may not exist (some Alpine) // * lshw may not exist (Debian) // * we may not input a sudo password (Ubuntu) + // + // see also: https://github.com/oxidecomputer/propolis/issues/792 let lspci = vm.run_shell_command(LSPCI).ignore_status().await?; let lshw = vm.run_shell_command(LSHW).ignore_status().await?; diff --git a/phd-tests/tests/src/hyperv.rs b/phd-tests/tests/src/hyperv.rs index 02ce28f92..d51b29962 100644 --- a/phd-tests/tests/src/hyperv.rs +++ b/phd-tests/tests/src/hyperv.rs @@ -27,7 +27,7 @@ async fn guest_detect_hyperv(vm: &TestVm) -> anyhow::Result<()> { // One might imagine we could simply use `systemd-detect-virt` to check // hypervisor information here, but it's not present out of the box on // Alpine. In the interest of exercising Hyper-V in typical CI runs on a - // standard Alpine image, detect Hyper-V in a.. worse but reliable way: + // standard Alpine image, detect Hyper-V in a... worse but reliable way: // looking for relevant logs in dmesg. This should work for all Linuxes // from later than May-ish 2010 (>2.6.34 or so). If we don't see Hyper-V // reported in dmesg either it's genuinely not detected, it's a very old