From 667bec2cf43f26a537c508bcfa22482156a71876 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 13 Jan 2026 15:45:12 +1300 Subject: [PATCH 1/3] [sim] Optionally enable health monitor --- Cargo.lock | 1 + dev-tools/omicron-dev/Cargo.toml | 1 + dev-tools/omicron-dev/src/main.rs | 15 ++++++++++++++- nexus/inventory/src/collector.rs | 3 +++ nexus/test-utils/src/nexus_test.rs | 3 +++ nexus/test-utils/src/starter.rs | 14 ++++++++++++++ nexus/tests/integration_tests/instances.rs | 4 ++++ nexus/tests/integration_tests/sleds.rs | 1 + sled-agent/src/bin/sled-agent-sim.rs | 5 +++++ sled-agent/src/long_running_tasks.rs | 2 +- sled-agent/src/sim/config.rs | 12 ++++++++++++ sled-agent/src/sim/mod.rs | 5 +++-- sled-agent/src/sim/sled_agent.rs | 11 +++++++++-- 13 files changed, 71 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b3cf4ebbf0b..c0c43c38aaf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8158,6 +8158,7 @@ dependencies = [ "omicron-dev-lib", "omicron-nexus", "omicron-rpaths", + "omicron-sled-agent", "omicron-test-utils", "omicron-workspace-hack", "oxide-client", diff --git a/dev-tools/omicron-dev/Cargo.toml b/dev-tools/omicron-dev/Cargo.toml index 21a4bc7210c..971e5af5bfd 100644 --- a/dev-tools/omicron-dev/Cargo.toml +++ b/dev-tools/omicron-dev/Cargo.toml @@ -23,6 +23,7 @@ nexus-test-interface.workspace = true nexus-test-utils = { workspace = true, features = ["omicron-dev"] } omicron-nexus.workspace = true omicron-workspace-hack.workspace = true +omicron-sled-agent.workspace = true oxide-tokio-rt.workspace = true # See omicron-rpaths for more about the "pq-sys" dependency. 
pq-sys = "*" diff --git a/dev-tools/omicron-dev/src/main.rs b/dev-tools/omicron-dev/src/main.rs index 9fa5ac0fc05..a5efe7c766c 100644 --- a/dev-tools/omicron-dev/src/main.rs +++ b/dev-tools/omicron-dev/src/main.rs @@ -11,6 +11,7 @@ use libc::SIGINT; use nexus_config::NexusConfig; use nexus_test_interface::NexusServer; use nexus_test_utils::resource_helpers::DiskTest; +use omicron_sled_agent::sim::ConfigHealthMonitor; use signal_hook_tokio::Signals; use std::fs; @@ -57,6 +58,9 @@ struct RunAllArgs { /// Override the nexus configuration file. #[clap(long, default_value = DEFAULT_NEXUS_CONFIG)] nexus_config: Utf8PathBuf, + /// Enable the sled agent health monitor + #[clap(long, default_value_t = false, action)] + enable_sled_agent_health_monitor: bool, } impl RunAllArgs { @@ -87,10 +91,19 @@ impl RunAllArgs { .set_port(p); } + let sled_agent_health_monitor = ConfigHealthMonitor { + enabled: self.enable_sled_agent_health_monitor, + }; + println!("omicron-dev: setting up all services ... "); let cptestctx = nexus_test_utils::omicron_dev_setup_with_config::< omicron_nexus::Server, - >(&mut config, 0, self.gateway_config.clone()) + >( + &mut config, + 0, + self.gateway_config.clone(), + sled_agent_health_monitor, + ) .await .context("error setting up services")?; diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs index 915cfa58af3..9dd16fc028c 100644 --- a/nexus/inventory/src/collector.rs +++ b/nexus/inventory/src/collector.rs @@ -980,6 +980,9 @@ mod test { None, sim::ZpoolConfig::None, SledCpuFamily::AmdMilan, + // For now we disable the health monitor; we can change this preference + // later if necessary. 
+ sim::ConfigHealthMonitor { enabled: false }, ); let agent = diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index a57bf139a9b..65b18c7ed3e 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -90,6 +90,7 @@ impl<'a> ControlPlaneBuilder<'a> { self.nextra_sled_agents, DEFAULT_SP_SIM_CONFIG.into(), false, + sim::ConfigHealthMonitor { enabled: false }, ) .await } @@ -361,6 +362,7 @@ pub async fn omicron_dev_setup_with_config( config: &mut NexusConfig, extra_sled_agents: u16, gateway_config_file: Utf8PathBuf, + sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> Result> { let starter = ControlPlaneStarter::::new("omicron-dev", config); @@ -388,6 +390,7 @@ pub async fn omicron_dev_setup_with_config( extra_sled_agents, gateway_config_file, true, + sled_agent_health_monitor, ) .await) } diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index 696a40b8e88..fb0c0b848a1 100644 --- a/nexus/test-utils/src/starter.rs +++ b/nexus/test-utils/src/starter.rs @@ -880,6 +880,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { sled_id: SledUuid, sled_index: u16, sim_mode: sim::SimMode, + health_monitor: sim::ConfigHealthMonitor, ) { let nexus_address = self.nexus_internal_addr.expect("Must launch Nexus first"); @@ -896,6 +897,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { tempdir.path(), sim_mode, &self.simulated_upstairs, + health_monitor, ) .await .expect("Failed to start sled agent"); @@ -1000,6 +1002,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { sled_id: SledUuid, sled_index: u16, sim_mode: sim::SimMode, + health_monitor: sim::ConfigHealthMonitor, ) { let nexus_address = self.nexus_internal_addr.expect("Must launch Nexus first"); @@ -1016,6 +1019,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> { tempdir.path(), sim_mode, &self.simulated_upstairs, + health_monitor, ) .await .expect("Failed to start sled agent"); @@ -1542,6 +1546,7 
@@ pub(crate) async fn setup_with_config_impl( extra_sled_agents: u16, gateway_config_file: Utf8PathBuf, second_nexus: bool, + sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> ControlPlaneTestContext { const STEP_TIMEOUT: Duration = Duration::from_secs(600); @@ -1705,6 +1710,7 @@ pub(crate) async fn setup_with_config_impl( // The first and second sled agents have special UUIDs, and any extra ones // after that are random. + let health_monitor = sled_agent_health_monitor.clone(); starter .init_with_steps( vec![( @@ -1715,6 +1721,7 @@ pub(crate) async fn setup_with_config_impl( SLED_AGENT_UUID.parse().unwrap(), 0, sim_mode, + health_monitor, ) .boxed() }), @@ -1723,6 +1730,7 @@ pub(crate) async fn setup_with_config_impl( ) .await; + let health_monitor = sled_agent_health_monitor.clone(); if extra_sled_agents > 0 { starter .init_with_steps( @@ -1734,6 +1742,7 @@ pub(crate) async fn setup_with_config_impl( SLED_AGENT2_UUID.parse().unwrap(), 1, sim_mode, + health_monitor, ) .boxed() }), @@ -1743,7 +1752,9 @@ pub(crate) async fn setup_with_config_impl( .await; } + let health_monitor = sled_agent_health_monitor.clone(); for index in 1..extra_sled_agents { + let health_monitor = health_monitor.clone(); starter .init_with_steps( vec![( @@ -1754,6 +1765,7 @@ pub(crate) async fn setup_with_config_impl( SledUuid::new_v4(), index.checked_add(1).unwrap(), sim_mode, + health_monitor.clone(), ) .boxed() }), @@ -1847,6 +1859,7 @@ pub async fn start_sled_agent( update_directory: &Utf8Path, sim_mode: sim::SimMode, simulated_upstairs: &Arc, + health_monitor: sim::ConfigHealthMonitor, ) -> Result { // Generate a baseboard serial number that matches the SP configuration // (SimGimlet00, SimGimlet01, etc.) 
so that inventory can link sled agents @@ -1861,6 +1874,7 @@ pub async fn start_sled_agent( sim::ZpoolConfig::None, SledCpuFamily::AmdMilan, Some(baseboard_serial), + health_monitor, ); start_sled_agent_with_config(log, &config, sled_index, simulated_upstairs) .await diff --git a/nexus/tests/integration_tests/instances.rs b/nexus/tests/integration_tests/instances.rs index 70503aa0c1a..614405701cc 100644 --- a/nexus/tests/integration_tests/instances.rs +++ b/nexus/tests/integration_tests/instances.rs @@ -1160,6 +1160,7 @@ async fn test_instance_migration_compatible_cpu_platforms( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, ); let new_sled_id = config.id; @@ -1349,6 +1350,7 @@ async fn test_instance_migration_incompatible_cpu_platforms( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, ); let turin_sled_id = config.id; @@ -1426,6 +1428,7 @@ async fn test_instance_migration_unknown_sled_type( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::Unknown, + omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, ); let new_sled_id = config.id; @@ -7125,6 +7128,7 @@ async fn test_can_start_instance_with_cpu_platform( Some(&camino::Utf8Path::new("/an/unused/update/directory")), omicron_sled_agent::sim::ZpoolConfig::None, sled_agent_types::inventory::SledCpuFamily::AmdTurin, + omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false }, ); let new_sled_id = config.id; diff --git a/nexus/tests/integration_tests/sleds.rs b/nexus/tests/integration_tests/sleds.rs index 53ae7d92394..9518872c385 100644 --- 
a/nexus/tests/integration_tests/sleds.rs +++ b/nexus/tests/integration_tests/sleds.rs @@ -79,6 +79,7 @@ async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) { &update_directory, sim::SimMode::Explicit, &cptestctx.first_sled_agent().simulated_upstairs, + sim::ConfigHealthMonitor { enabled: false }, ) .await .unwrap(), diff --git a/sled-agent/src/bin/sled-agent-sim.rs b/sled-agent/src/bin/sled-agent-sim.rs index e18ab69c213..287bb54b45e 100644 --- a/sled-agent/src/bin/sled-agent-sim.rs +++ b/sled-agent/src/bin/sled-agent-sim.rs @@ -15,6 +15,7 @@ use dropshot::ConfigLoggingLevel; use omicron_common::api::internal::nexus::Certificate; use omicron_common::cmd::CmdError; use omicron_common::cmd::fatal; +use omicron_sled_agent::sim::ConfigHealthMonitor; use omicron_sled_agent::sim::RssArgs; use omicron_sled_agent::sim::{ Config, ConfigHardware, ConfigStorage, ConfigZpool, SimMode, ZpoolConfig, @@ -56,6 +57,9 @@ struct Args { #[clap(action)] nexus_lockstep_port: u16, + #[clap(long, default_value_t = false, action)] + enable_health_monitor: bool, + #[clap(long, name = "NEXUS_EXTERNAL_IP:PORT", action)] /// If specified, when the simulated sled agent initializes the rack, it /// will record the Nexus service running with the specified external IP @@ -127,6 +131,7 @@ async fn do_run() -> Result<(), CmdError> { Some(tmp.path()), ZpoolConfig::TenVirtualU2s, SledCpuFamily::AmdMilan, + ConfigHealthMonitor { enabled: args.enable_health_monitor }, ) }; diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 700d4a08f4b..b9fb087f073 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -275,7 +275,7 @@ async fn spawn_bootstore_tasks( node_handle } -async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { +pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle { info!(log, "Starting health monitor"); let log = log.new(o!("component" => "HealthMonitor")); 
HealthMonitorHandle::spawn(log) diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index 744ebb1bea3..5b9d231cf84 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -66,6 +66,12 @@ pub struct ConfigHardware { pub baseboard: Baseboard, } +/// Configuration for the health monitor. +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct ConfigHealthMonitor { + pub enabled: bool, +} + /// Configuration for a sled agent #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] pub struct Config { @@ -83,6 +89,8 @@ pub struct Config { pub updates: ConfigUpdates, /// configuration to emulate the sled agent's hardware pub hardware: ConfigHardware, + /// configuration for the sled agent's health monitor + pub health_monitor: ConfigHealthMonitor, } pub enum ZpoolConfig { @@ -101,6 +109,7 @@ impl Config { update_directory: Option<&Utf8Path>, zpool_config: ZpoolConfig, cpu_family: SledCpuFamily, + health_monitor: ConfigHealthMonitor, ) -> Config { Self::for_testing_with_baseboard( id, @@ -110,6 +119,7 @@ impl Config { zpool_config, cpu_family, None, + health_monitor, ) } @@ -121,6 +131,7 @@ impl Config { zpool_config: ZpoolConfig, cpu_family: SledCpuFamily, baseboard_serial: Option, + health_monitor: ConfigHealthMonitor, ) -> Config { // This IP range is guaranteed by RFC 6666 to discard traffic. 
// For tests that don't use a Nexus, we use this address to simulate a @@ -173,6 +184,7 @@ impl Config { revision: 3, }, }, + health_monitor, } } } diff --git a/sled-agent/src/sim/mod.rs b/sled-agent/src/sim/mod.rs index ef7915293e8..6662ee2c1ba 100644 --- a/sled-agent/src/sim/mod.rs +++ b/sled-agent/src/sim/mod.rs @@ -20,8 +20,9 @@ mod upstairs; pub use crate::updates::ConfigUpdates; pub use config::{ - Baseboard, Config, ConfigHardware, ConfigStorage, ConfigZpool, SimMode, - TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM, ZpoolConfig, + Baseboard, Config, ConfigHardware, ConfigHealthMonitor, ConfigStorage, + ConfigZpool, SimMode, TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM, + ZpoolConfig, }; pub use server::{RssArgs, Server, run_standalone_server}; pub use sled_agent::SledAgent; diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index bb6e9c028e8..3fc0fd30d61 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -12,9 +12,10 @@ use super::instance::{self, SimInstance}; use super::storage::CrucibleData; use super::storage::Storage; use crate::artifact_store::ArtifactStore; +use crate::long_running_tasks::spawn_health_monitor_tasks; use crate::nexus::NexusClient; -use crate::sim::SimulatedUpstairs; use crate::sim::simulatable::Simulatable; +use crate::sim::{ConfigHealthMonitor, SimulatedUpstairs}; use crate::support_bundle::storage::SupportBundleQueryType; use crate::updates::UpdateManager; use anyhow::Context; @@ -168,7 +169,13 @@ impl SledAgent { .await .start(&log, &config.dropshot); - let health_monitor = HealthMonitorHandle::stub(); + let ConfigHealthMonitor { enabled } = config.health_monitor; + + let health_monitor = if enabled { + spawn_health_monitor_tasks(&log).await + } else { + HealthMonitorHandle::stub() + }; Arc::new(SledAgent { id, From 893b8501259919c35cf3ee0b0918b63b1b7cb3d7 Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 13 Jan 2026 16:08:04 +1300 Subject: [PATCH 2/3] clippy --- 
nexus/test-utils/src/nexus_test.rs | 21 +++++++++++++++------ nexus/test-utils/src/starter.rs | 17 ++++++++++++++--- sled-agent/src/sim/config.rs | 1 + 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 65b18c7ed3e..6e781ef9b67 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -7,6 +7,7 @@ use crate::ControlPlaneStarter; use crate::ControlPlaneTestContextSledAgent; use crate::starter::PopulateCrdb; +use crate::starter::SledAgentOptions; use crate::starter::setup_with_config_impl; #[cfg(feature = "omicron-dev")] use anyhow::Context; @@ -85,12 +86,16 @@ impl<'a> ControlPlaneBuilder<'a> { setup_with_config_impl( starter, PopulateCrdb::FromEnvironmentSeed, - sim::SimMode::Explicit, + SledAgentOptions { + sim_mode: sim::SimMode::Explicit, + extra_sled_agents: self.nextra_sled_agents, + sled_agent_health_monitor: sim::ConfigHealthMonitor { + enabled: false, + }, + }, self.tls_cert, - self.nextra_sled_agents, DEFAULT_SP_SIM_CONFIG.into(), false, - sim::ConfigHealthMonitor { enabled: false }, ) .await } @@ -364,6 +369,8 @@ pub async fn omicron_dev_setup_with_config( gateway_config_file: Utf8PathBuf, sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> Result> { + use crate::starter::SledAgentOptions; + let starter = ControlPlaneStarter::::new("omicron-dev", config); let log = &starter.logctx.log; @@ -385,12 +392,14 @@ pub async fn omicron_dev_setup_with_config( Ok(setup_with_config_impl( starter, PopulateCrdb::FromSeed { input_tar: seed_tar }, - sim::SimMode::Auto, + SledAgentOptions { + sim_mode: sim::SimMode::Auto, + extra_sled_agents, + sled_agent_health_monitor, + }, None, - extra_sled_agents, gateway_config_file, true, - sled_agent_health_monitor, ) .await) } diff --git a/nexus/test-utils/src/starter.rs b/nexus/test-utils/src/starter.rs index fb0c0b848a1..27281b2affc 100644 --- a/nexus/test-utils/src/starter.rs +++ 
b/nexus/test-utils/src/starter.rs @@ -1538,16 +1538,26 @@ impl RackInitRequestBuilder { } } +#[derive(Debug, Clone)] +pub(crate) struct SledAgentOptions { + pub sim_mode: sim::SimMode, + pub extra_sled_agents: u16, + pub sled_agent_health_monitor: sim::ConfigHealthMonitor, +} + pub(crate) async fn setup_with_config_impl( mut starter: ControlPlaneStarter<'_, N>, populate: PopulateCrdb, - sim_mode: sim::SimMode, + sled_agent_opts: SledAgentOptions, initial_cert: Option, - extra_sled_agents: u16, gateway_config_file: Utf8PathBuf, second_nexus: bool, - sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> ControlPlaneTestContext { + let SledAgentOptions { + sim_mode, + extra_sled_agents, + sled_agent_health_monitor, + } = sled_agent_opts; const STEP_TIMEOUT: Duration = Duration::from_secs(600); // All setups will start with CRDB and clickhouse @@ -1851,6 +1861,7 @@ pub(crate) enum PopulateCrdb { /// /// Note: you should probably use the `extra_sled_agents` macro parameter on /// `nexus_test` instead! 
+#[allow(clippy::too_many_arguments)] pub async fn start_sled_agent( log: Logger, nexus_address: SocketAddr, diff --git a/sled-agent/src/sim/config.rs b/sled-agent/src/sim/config.rs index 5b9d231cf84..c2e96b54df3 100644 --- a/sled-agent/src/sim/config.rs +++ b/sled-agent/src/sim/config.rs @@ -123,6 +123,7 @@ impl Config { ) } + #[allow(clippy::too_many_arguments)] pub fn for_testing_with_baseboard( id: SledUuid, sim_mode: SimMode, From b40ee44f4f367b154f0fa1cd868e2365d3e95add Mon Sep 17 00:00:00 2001 From: karencfv Date: Tue, 13 Jan 2026 16:12:33 +1300 Subject: [PATCH 3/3] clean up --- nexus/test-utils/src/nexus_test.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/nexus/test-utils/src/nexus_test.rs b/nexus/test-utils/src/nexus_test.rs index 6e781ef9b67..090e73dca8f 100644 --- a/nexus/test-utils/src/nexus_test.rs +++ b/nexus/test-utils/src/nexus_test.rs @@ -369,8 +369,6 @@ pub async fn omicron_dev_setup_with_config( gateway_config_file: Utf8PathBuf, sled_agent_health_monitor: sim::ConfigHealthMonitor, ) -> Result> { - use crate::starter::SledAgentOptions; - let starter = ControlPlaneStarter::::new("omicron-dev", config); let log = &starter.logctx.log;