From 6c5ab7959b201abdec07adb4c7187296fc8d9662 Mon Sep 17 00:00:00 2001
From: iximeow <iximeow@oxide.computer>
Date: Mon, 9 Feb 2026 23:57:09 +0000
Subject: [PATCH 1/2] phd: check exit status of commands

similar to std::process::Command: if a command fails with status, we
should report that more proactively. further, some commands are expected
to exit with status in tests, so adjust test expectations to match the
new regime.
---
 phd-tests/framework/src/test_vm/mod.rs  | 144 ++++++++++++++++++++----
 phd-tests/tests/src/boot_order.rs       |  18 ++-
 phd-tests/tests/src/crucible/migrate.rs |   3 +-
 phd-tests/tests/src/crucible/smoke.rs   |   3 +-
 phd-tests/tests/src/hw.rs               |  16 ++-
 phd-tests/tests/src/hyperv.rs           |  42 ++++---
 phd-tests/tests/src/migrate.rs          |  19 +++-
 7 files changed, 189 insertions(+), 56 deletions(-)
diff --git a/phd-tests/framework/src/test_vm/mod.rs b/phd-tests/framework/src/test_vm/mod.rs
index 50d971cae..269399b80 100644
--- a/phd-tests/framework/src/test_vm/mod.rs
+++ b/phd-tests/framework/src/test_vm/mod.rs
@@ -23,7 +23,7 @@ use crate::{
     TestCtx,
 };
 
-use anyhow::{anyhow, Context, Result};
+use anyhow::{anyhow, bail, Context, Result};
 use camino::Utf8PathBuf;
 use core::result::Result as StdResult;
 use propolis_client::{
@@ -152,6 +152,107 @@ enum VmState {
     Ensured { serial: SerialConsole },
 }
 
+/// Description of the acceptable status codes from executing a command in a
+/// [`TestVm::run_shell_command`].
+// This could reasonably have a `Status(u16)` variant to check specific non-zero
+// statuses, but specific codes are not terribly portable! In the few cases we
+// can expect a specific status for errors, those specific codes change between
+// f.ex illumos and Linux guests.
+enum StatusCheck {
+    Ok,
+    NotOk,
+}
+
+pub struct ShellOutputExecutor<'ctx> {
+    vm: &'ctx TestVm,
+    cmd: &'ctx str,
+    status_check: Option<StatusCheck>,
+}
+
+impl<'a> ShellOutputExecutor<'a> {
+    pub fn ignore_status(mut self) -> ShellOutputExecutor<'a> {
+        self.status_check = None;
+        self
+    }
+
+    pub fn check_ok(mut self) -> ShellOutputExecutor<'a> {
+        self.status_check = Some(StatusCheck::Ok);
+        self
+    }
+
+    pub fn check_err(mut self) -> ShellOutputExecutor<'a> {
+        self.status_check = Some(StatusCheck::NotOk);
+        self
+    }
+}
+use futures::FutureExt;
+
+impl<'a> std::future::IntoFuture for ShellOutputExecutor<'a> {
+    type Output = Result<String>;
+    type IntoFuture = futures::future::BoxFuture<'a, Result<String>>;
+
+    fn into_future(self) -> Self::IntoFuture {
+        Box::pin(async move {
+            // Allow the guest OS to transform the input command into a
+            // guest-specific command sequence. This accounts for the guest's
+            // shell type (which affects e.g. affects how it displays multi-line
+            // commands) and serial console buffering discipline.
+            let command_sequence =
+                self.vm.guest_os.shell_command_sequence(self.cmd);
+            self.vm.run_command_sequence(command_sequence).await?;
+
+            // `shell_command_sequence` promises that the generated command
+            // sequence clears buffer of everything up to and including the
+            // input command before actually issuing the final '\n' that issues
+            // the command.  This ensures that the buffer contents returned by
+            // this call contain only the command's output.
+            let output = self
+                .vm
+                .wait_for_serial_output(
+                    self.vm.guest_os.get_shell_prompt(),
+                    Duration::from_secs(300),
+                )
+                .await?;
+
+            // Trim any leading newlines inserted when the command was issued
+            // and any trailing whitespace that isn't actually part of the
+            // command output. Any other embedded whitespace is the caller's
+            // problem.
+            let output = output.trim().to_string();
+
+            if let Some(check) = self.status_check {
+                let status_command_sequence =
+                    self.vm.guest_os.shell_command_sequence("echo $?");
+                self.vm.run_command_sequence(status_command_sequence).await?;
+                let status = self
+                    .vm
+                    .wait_for_serial_output(
+                        self.vm.guest_os.get_shell_prompt(),
+                        Duration::from_secs(300),
+                    )
+                    .await?;
+                let status = status.trim().parse::<u16>()?;
+
+                match check {
+                    StatusCheck::Ok => {
+                        if status != 0 {
+                            bail!("expected status 0, got {}", status);
+                        }
+                    }
+                    StatusCheck::NotOk => {
+                        if status == 0 {
+                            bail!("expected non-zero status, got {}", status);
+                        }
+                    }
+                }
+            }
+
+            Ok(output)
+        })
+        .boxed()
+    }
+}
+
 /// A virtual machine running in a Propolis server. Test cases create these VMs
 /// using the `factory::VmFactory` embedded in their test contexts.
 ///
@@ -885,30 +986,23 @@ impl TestVm {
     /// waits for another shell prompt to appear using
     /// [`Self::wait_for_serial_output`] and returns any text that was buffered
     /// to the serial console after the command was sent.
-    pub async fn run_shell_command(&self, cmd: &str) -> Result<String> {
-        // Allow the guest OS to transform the input command into a
-        // guest-specific command sequence. This accounts for the guest's shell
-        // type (which affects e.g. affects how it displays multi-line commands)
-        // and serial console buffering discipline.
-        let command_sequence = self.guest_os.shell_command_sequence(cmd);
-        self.run_command_sequence(command_sequence).await?;
-
-        // `shell_command_sequence` promises that the generated command sequence
-        // clears buffer of everything up to and including the input command
-        // before actually issuing the final '\n' that issues the command.
-        // This ensures that the buffer contents returned by this call contain
-        // only the command's output.
-        let out = self
-            .wait_for_serial_output(
-                self.guest_os.get_shell_prompt(),
-                Duration::from_secs(300),
-            )
-            .await?;
-
-        // Trim any leading newlines inserted when the command was issued and
-        // any trailing whitespace that isn't actually part of the command
-        // output. Any other embedded whitespace is the caller's problem.
-        Ok(out.trim().to_string())
+    ///
+    /// After running the shell command, sends `echo $?` to query and return the
+    /// command's return status as well.
+    ///
+    /// This will return an error if the command returns a non-zero status by
+    /// default; to ignore the status or expect a non-zero as a positive
+    /// condition, see [`ShellOutputExecutor::ignore_status`] or
+    /// [`ShellOutputExecutor::check_err`].
+    pub fn run_shell_command<'a>(
+        &'a self,
+        cmd: &'a str,
+    ) -> ShellOutputExecutor<'a> {
+        ShellOutputExecutor {
+            vm: self,
+            cmd,
+            status_check: Some(StatusCheck::Ok),
+        }
     }
 
     pub async fn graceful_reboot(&self) -> Result<()> {
diff --git a/phd-tests/tests/src/boot_order.rs b/phd-tests/tests/src/boot_order.rs
index a048ae6cf..6ae79ff0a 100644
--- a/phd-tests/tests/src/boot_order.rs
+++ b/phd-tests/tests/src/boot_order.rs
@@ -256,7 +256,8 @@ async fn guest_can_adjust_boot_order(ctx: &TestCtx) {
 
     // If the guest doesn't have an EFI partition then there's no way for boot
     // order preferences to be persisted.
-    let mountline = vm.run_shell_command("mount | grep efivarfs").await?;
+    let mountline =
+        vm.run_shell_command("mount | grep efivarfs").ignore_status().await?;
 
     if !mountline.starts_with("efivarfs on ") {
         warn!(
@@ -268,11 +269,16 @@ async fn guest_can_adjust_boot_order(ctx: &TestCtx) {
 
     // Try adding a few new boot options, then add them to the boot order,
     // reboot, and make sure they're all as we set them.
-    if !vm
-        .run_shell_command(&format!("ls {}", efipath(&bootvar(0xffff))))
-        .await?
-        .is_empty()
-    {
+    let bootffff_path = efipath(&bootvar(0xffff));
+    let bootffff_res = vm
+        .run_shell_command(&format!("ls {bootffff_path}"))
+        .ignore_status()
+        .await?;
+    // `ls` just prints the file path if it exists, but the error text varies a
+    // bit depending on Alpine, Ubuntu, Busybox, etc. Notionally we could
+    // `check_err()` above, but having a `BootFFFF` entry already is merely
+    // weird; we can still replace it and continue with the test.
+    if bootffff_res == bootffff_path {
         warn!(
             "guest environment already has a BootFFFF entry; \
             is this not a fresh image?"
diff --git a/phd-tests/tests/src/crucible/migrate.rs b/phd-tests/tests/src/crucible/migrate.rs
index 05c6cab30..c6f64cd88 100644
--- a/phd-tests/tests/src/crucible/migrate.rs
+++ b/phd-tests/tests/src/crucible/migrate.rs
@@ -19,7 +19,8 @@ async fn smoke_test(ctx: &TestCtx) {
     source.launch().await?;
     source.wait_to_boot().await?;
 
-    let lsout = source.run_shell_command("ls foo.bar 2> /dev/null").await?;
+    let lsout =
+        source.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?;
     assert_eq!(lsout, "");
     source.run_shell_command("touch ./foo.bar").await?;
     source.run_shell_command("sync ./foo.bar").await?;
diff --git a/phd-tests/tests/src/crucible/smoke.rs b/phd-tests/tests/src/crucible/smoke.rs
index 0532ac0a2..815e2782c 100644
--- a/phd-tests/tests/src/crucible/smoke.rs
+++ b/phd-tests/tests/src/crucible/smoke.rs
@@ -65,7 +65,8 @@ async fn shutdown_persistence_test(ctx: &TestCtx) {
 
     // Verify that the test file doesn't exist yet, then touch it, flush it, and
     // shut down the VM.
-    let lsout = vm.run_shell_command("ls foo.bar 2> /dev/null").await?;
+    let lsout =
+        vm.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?;
     assert_eq!(lsout, "");
     vm.run_shell_command("touch ./foo.bar").await?;
     vm.run_shell_command("sync ./foo.bar").await?;
diff --git a/phd-tests/tests/src/hw.rs b/phd-tests/tests/src/hw.rs
index db2b2cbd3..10325d6a3 100644
--- a/phd-tests/tests/src/hw.rs
+++ b/phd-tests/tests/src/hw.rs
@@ -17,15 +17,23 @@ async fn lspci_lifecycle_test(ctx: &TestCtx) {
     vm.launch().await?;
     vm.wait_to_boot().await?;
 
-    let lspci = vm.run_shell_command(LSPCI).await?;
-    let lshw = vm.run_shell_command(LSHW).await?;
+    // XXX: do not `ignore_status()` on these commands! They fail for any number
+    // of reasons on different guests:
+    // * sudo may not exist (some Alpine)
+    // * lshw may not exist (Debian)
+    // * we may not input a sudo password (Ubuntu)
+
+    let lspci = vm.run_shell_command(LSPCI).ignore_status().await?;
+    let lshw = vm.run_shell_command(LSHW).ignore_status().await?;
     ctx.lifecycle_test(vm, &[Action::StopAndStart], move |vm| {
         let lspci = lspci.clone();
         let lshw = lshw.clone();
         Box::pin(async move {
-            let new_lspci = vm.run_shell_command(LSPCI).await.unwrap();
+            let new_lspci =
+                vm.run_shell_command(LSPCI).ignore_status().await.unwrap();
             assert_eq!(new_lspci, lspci);
-            let new_lshw = vm.run_shell_command(LSHW).await.unwrap();
+            let new_lshw =
+                vm.run_shell_command(LSHW).ignore_status().await.unwrap();
             assert_eq!(new_lshw, lshw);
         })
     })
diff --git a/phd-tests/tests/src/hyperv.rs b/phd-tests/tests/src/hyperv.rs
index 8ab9df6ba..02ce28f92 100644
--- a/phd-tests/tests/src/hyperv.rs
+++ b/phd-tests/tests/src/hyperv.rs
@@ -24,20 +24,19 @@ use uuid::Uuid;
 /// startup and shutdown with Hyper-V emulation enabled.
 async fn guest_detect_hyperv(vm: &TestVm) -> anyhow::Result<()> {
     if vm.guest_os_kind().is_linux() {
-        // Many Linux distros come with systemd installed out of the box. On
-        // these distros, it's easiest to use `systemd-detect-virt` to determine
-        // whether the guest thinks it's running on a Hyper-V-compatible
-        // hypervisor. (Whether any actual enlightenments are enabled is another
-        // story, but those can often be detected by other means.)
-        let out = vm.run_shell_command("systemd-detect-virt").await?;
-        if out.contains("systemd-detect-virt: not found") {
-            warn!(
-                "guest doesn't support systemd-detect-virt, can't verify it \
-                detected Hyper-V support"
-            );
-        } else {
-            assert_eq!(out, "microsoft");
-        }
+        // One might imagine we could simply use `systemd-detect-virt` to check
+        // hypervisor information here, but it's not present out of the box on
+        // Alpine. In the interest of exercising Hyper-V in typical CI runs on a
+        // standard Alpine image, detect Hyper-V in a.. worse but reliable way:
+        // looking for relevant logs in dmesg. This should work for all Linuxes
+        // from later than May-ish 2010 (>2.6.34 or so). If we don't see Hyper-V
+        // reported in dmesg either it's genuinely not detected, it's a very old
+        // Linux, or it's a new Linux and dmesg text has changed.
+        const HV_TEXT: &str = "Hypervisor detected: Microsoft Hyper-V";
+
+        // No "sudo" here because Alpine doesn't have sudo; for Linux tests we
+        // expect to run test commands as root anyway.
+        vm.run_shell_command(&format!("dmesg | grep \"{HV_TEXT}\"")).await?;
     } else if vm.guest_os_kind().is_windows() {
         // Windows is good about giving signals that it's running in a Hyper-V
         // *root partition*, but offers no clear signal as to whether it has
@@ -45,6 +44,8 @@ async fn guest_detect_hyperv(vm: &TestVm) -> anyhow::Result<()> {
         // are methods for detecting whether Windows is running as a guest, but
         // these don't identify the detected hypervisor type.)
         warn!("running on Windows, can't verify it detected Hyper-V support");
+    } else {
+        warn!("unknown guest type, can't verify it detected Hyper-V support");
     }
 
     Ok(())
@@ -56,6 +57,16 @@ async fn hyperv_smoke_test(ctx: &TestCtx) {
     cfg.guest_hv_interface(GuestHypervisorInterface::HyperV {
         features: Default::default(),
     });
+    // For reasons absolutely indecipherable to me, Alpine (3.16, kernel
+    // 5.15.41-0-virt) seems to lose some early dmesg lines if booted with less
+    // than four vCPUs. Among the early dmesg lines are `Hypervisor detected:
+    // Microsoft Hyper-V` that we look for as confirmation that Linux knows
+    // there's a Hyper-V-like hypervisor present.
+    //
+    // I'd love to debug exactly why this is relevant to the contents of dmesg
+    // (the remaining log is identical and it doesn't seem that the ring buffer
+    // is full), but I really can't justify the time!
+    cfg.cpus(4);
     let mut vm = ctx.spawn_vm(&cfg, None).await?;
     vm.launch().await?;
     vm.wait_to_boot().await?;
@@ -69,6 +80,9 @@ async fn hyperv_lifecycle_test(ctx: &TestCtx) {
     cfg.guest_hv_interface(GuestHypervisorInterface::HyperV {
         features: Default::default(),
     });
+    // Spooky load-bearing vCPU count to preserve dmesg log lines. See the
+    // comment in `hyperv_smoke_test`.
+    cfg.cpus(4);
     let mut vm = ctx.spawn_vm(&cfg, None).await?;
     vm.launch().await?;
     vm.wait_to_boot().await?;
diff --git a/phd-tests/tests/src/migrate.rs b/phd-tests/tests/src/migrate.rs
index e7875acb1..c55436c7f 100644
--- a/phd-tests/tests/src/migrate.rs
+++ b/phd-tests/tests/src/migrate.rs
@@ -55,7 +55,11 @@ mod from_base {
             spawn_base_vm(ctx, "migration_from_base_and_back").await?;
         source.launch().await?;
         source.wait_to_boot().await?;
-        let lsout = source.run_shell_command("ls foo.bar 2> /dev/null").await?;
+        // `ls` with no results exits non-zero, so expect an error here.
+        let lsout = source
+            .run_shell_command("ls foo.bar 2> /dev/null")
+            .check_err()
+            .await?;
         assert_eq!(lsout, "");
 
         // create an empty file on the source VM.
@@ -74,8 +78,9 @@ mod from_base {
                     // the file should still exist on the target VM after migration.
                     let lsout = target
                         .run_shell_command("ls foo.bar")
+                        .ignore_status()
                         .await
-                        .expect("`ls foo.bar` should succeed");
+                        .expect("can try to run `ls foo.bar`");
                     assert_eq!(lsout, "foo.bar");
                 })
             },
@@ -275,7 +280,9 @@ mod running_process {
         ))
         .await?;
         vm.run_shell_command("chmod +x dirt.sh").await?;
-        let run_dirt = vm.run_shell_command("./dirt.sh").await?;
+        // When dirt.sh suspends itself, the parent shell will report a non-zero
+        // status (one example is 148: 128 + SIGTSTP aka 20 on Linux).
+        let run_dirt = vm.run_shell_command("./dirt.sh").check_err().await?;
         assert!(run_dirt.contains("made dirt"), "dirt.sh failed: {run_dirt:?}");
         assert!(
             run_dirt.contains("Stopped"),
@@ -314,7 +321,8 @@ async fn multiple_migrations(ctx: &TestCtx) {
 async fn run_smoke_test(ctx: &TestCtx, mut source: TestVm) -> Result<()> {
     source.launch().await?;
     source.wait_to_boot().await?;
-    let lsout = source.run_shell_command("ls foo.bar 2> /dev/null").await?;
+    let lsout =
+        source.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?;
     assert_eq!(lsout, "");
 
     // create an empty file on the source VM.
@@ -329,8 +337,9 @@ async fn run_smoke_test(ctx: &TestCtx, mut source: TestVm) -> Result<()> {
                 // the file should still exist on the target VM after migration.
                 let lsout = target
                     .run_shell_command("ls foo.bar")
+                    .ignore_status()
                     .await
-                    .expect("`ls foo.bar` should succeed after migration");
+                    .expect("can try to run `ls foo.bar`");
                 assert_eq!(lsout, "foo.bar");
             })
         },

From bebcd040308ca6b845d17df6e7af86d1b787961e Mon Sep 17 00:00:00 2001
From: iximeow <iximeow@oxide.computer>
Date: Tue, 10 Feb 2026 01:28:45 +0000
Subject: [PATCH 2/2] pr feedback and drag cpu topo test into the future

---
 phd-tests/framework/src/test_vm/mod.rs | 2 +-
 phd-tests/tests/src/cpuid.rs           | 1 +
 phd-tests/tests/src/hw.rs              | 2 ++
 phd-tests/tests/src/hyperv.rs          | 2 +-
 4 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/phd-tests/framework/src/test_vm/mod.rs b/phd-tests/framework/src/test_vm/mod.rs
index 269399b80..8997bf644 100644
--- a/phd-tests/framework/src/test_vm/mod.rs
+++ b/phd-tests/framework/src/test_vm/mod.rs
@@ -26,6 +26,7 @@ use crate::{
 use anyhow::{anyhow, bail, Context, Result};
 use camino::Utf8PathBuf;
 use core::result::Result as StdResult;
+use futures::FutureExt;
 use propolis_client::{
     instance_spec::{
         ComponentV0, InstanceProperties, InstanceSpecGetResponse,
@@ -185,7 +186,6 @@ impl<'a> ShellOutputExecutor<'a> {
         self
     }
 }
-use futures::FutureExt;
 
 impl<'a> std::future::IntoFuture for ShellOutputExecutor<'a> {
     type Output = Result<String>;
diff --git a/phd-tests/tests/src/cpuid.rs b/phd-tests/tests/src/cpuid.rs
index 0c76cf136..cbf764efa 100644
--- a/phd-tests/tests/src/cpuid.rs
+++ b/phd-tests/tests/src/cpuid.rs
@@ -193,6 +193,7 @@ impl<'a> LinuxGuestTopo<'a> {
                 "ls {}",
                 Self::cpu_stem(this.cpus().await)
             ))
+            .ignore_status()
             .await
             .expect("can run ls of a directory that doesn't exist");
         assert!(out.contains("No such file or directory"));
diff --git a/phd-tests/tests/src/hw.rs b/phd-tests/tests/src/hw.rs
index 10325d6a3..e0defefdf 100644
--- a/phd-tests/tests/src/hw.rs
+++ b/phd-tests/tests/src/hw.rs
@@ -22,6 +22,8 @@ async fn lspci_lifecycle_test(ctx: &TestCtx) {
     // * sudo may not exist (some Alpine)
     // * lshw may not exist (Debian)
     // * we may not input a sudo password (Ubuntu)
+    //
+    // see also: https://github.com/oxidecomputer/propolis/issues/792
 
     let lspci = vm.run_shell_command(LSPCI).ignore_status().await?;
     let lshw = vm.run_shell_command(LSHW).ignore_status().await?;
diff --git a/phd-tests/tests/src/hyperv.rs b/phd-tests/tests/src/hyperv.rs
index 02ce28f92..d51b29962 100644
--- a/phd-tests/tests/src/hyperv.rs
+++ b/phd-tests/tests/src/hyperv.rs
@@ -27,7 +27,7 @@ async fn guest_detect_hyperv(vm: &TestVm) -> anyhow::Result<()> {
         // One might imagine we could simply use `systemd-detect-virt` to check
         // hypervisor information here, but it's not present out of the box on
         // Alpine. In the interest of exercising Hyper-V in typical CI runs on a
-        // standard Alpine image, detect Hyper-V in a.. worse but reliable way:
+        // standard Alpine image, detect Hyper-V in a... worse but reliable way:
         // looking for relevant logs in dmesg. This should work for all Linuxes
         // from later than May-ish 2010 (>2.6.34 or so). If we don't see Hyper-V
         // reported in dmesg either it's genuinely not detected, it's a very old