Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions dev-tools/omicron-dev/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ nexus-test-interface.workspace = true
nexus-test-utils = { workspace = true, features = ["omicron-dev"] }
omicron-nexus.workspace = true
omicron-workspace-hack.workspace = true
omicron-sled-agent.workspace = true
oxide-tokio-rt.workspace = true
# See omicron-rpaths for more about the "pq-sys" dependency.
pq-sys = "*"
Expand Down
15 changes: 14 additions & 1 deletion dev-tools/omicron-dev/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use libc::SIGINT;
use nexus_config::NexusConfig;
use nexus_test_interface::NexusServer;
use nexus_test_utils::resource_helpers::DiskTest;
use omicron_sled_agent::sim::ConfigHealthMonitor;
use signal_hook_tokio::Signals;
use std::fs;

Expand Down Expand Up @@ -57,6 +58,9 @@ struct RunAllArgs {
/// Override the nexus configuration file.
#[clap(long, default_value = DEFAULT_NEXUS_CONFIG)]
nexus_config: Utf8PathBuf,
/// Enable the sled agent health monitor
#[clap(long, default_value_t = false, action)]
enable_sled_agent_health_monitor: bool,
}

impl RunAllArgs {
Expand Down Expand Up @@ -87,10 +91,19 @@ impl RunAllArgs {
.set_port(p);
}

let sled_agent_health_monitor = ConfigHealthMonitor {
enabled: self.enable_sled_agent_health_monitor,
};

println!("omicron-dev: setting up all services ... ");
let cptestctx = nexus_test_utils::omicron_dev_setup_with_config::<
omicron_nexus::Server,
>(&mut config, 0, self.gateway_config.clone())
>(
&mut config,
0,
self.gateway_config.clone(),
sled_agent_health_monitor,
)
.await
.context("error setting up services")?;

Expand Down
3 changes: 3 additions & 0 deletions nexus/inventory/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -980,6 +980,9 @@ mod test {
None,
sim::ZpoolConfig::None,
SledCpuFamily::AmdMilan,
// The health monitor is disabled for now; this preference can be changed
// later if necessary.
sim::ConfigHealthMonitor { enabled: false },
);

let agent =
Expand Down
18 changes: 14 additions & 4 deletions nexus/test-utils/src/nexus_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
use crate::ControlPlaneStarter;
use crate::ControlPlaneTestContextSledAgent;
use crate::starter::PopulateCrdb;
use crate::starter::SledAgentOptions;
use crate::starter::setup_with_config_impl;
#[cfg(feature = "omicron-dev")]
use anyhow::Context;
Expand Down Expand Up @@ -85,9 +86,14 @@ impl<'a> ControlPlaneBuilder<'a> {
setup_with_config_impl(
starter,
PopulateCrdb::FromEnvironmentSeed,
sim::SimMode::Explicit,
SledAgentOptions {
sim_mode: sim::SimMode::Explicit,
extra_sled_agents: self.nextra_sled_agents,
sled_agent_health_monitor: sim::ConfigHealthMonitor {
enabled: false,
},
},
self.tls_cert,
self.nextra_sled_agents,
DEFAULT_SP_SIM_CONFIG.into(),
false,
)
Expand Down Expand Up @@ -361,6 +367,7 @@ pub async fn omicron_dev_setup_with_config<N: NexusServer>(
config: &mut NexusConfig,
extra_sled_agents: u16,
gateway_config_file: Utf8PathBuf,
sled_agent_health_monitor: sim::ConfigHealthMonitor,
) -> Result<ControlPlaneTestContext<N>> {
let starter = ControlPlaneStarter::<N>::new("omicron-dev", config);

Expand All @@ -383,9 +390,12 @@ pub async fn omicron_dev_setup_with_config<N: NexusServer>(
Ok(setup_with_config_impl(
starter,
PopulateCrdb::FromSeed { input_tar: seed_tar },
sim::SimMode::Auto,
SledAgentOptions {
sim_mode: sim::SimMode::Auto,
extra_sled_agents,
sled_agent_health_monitor,
},
None,
extra_sled_agents,
gateway_config_file,
true,
)
Expand Down
29 changes: 27 additions & 2 deletions nexus/test-utils/src/starter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> {
sled_id: SledUuid,
sled_index: u16,
sim_mode: sim::SimMode,
health_monitor: sim::ConfigHealthMonitor,
) {
let nexus_address =
self.nexus_internal_addr.expect("Must launch Nexus first");
Expand All @@ -896,6 +897,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> {
tempdir.path(),
sim_mode,
&self.simulated_upstairs,
health_monitor,
)
.await
.expect("Failed to start sled agent");
Expand Down Expand Up @@ -1000,6 +1002,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> {
sled_id: SledUuid,
sled_index: u16,
sim_mode: sim::SimMode,
health_monitor: sim::ConfigHealthMonitor,
) {
let nexus_address =
self.nexus_internal_addr.expect("Must launch Nexus first");
Expand All @@ -1016,6 +1019,7 @@ impl<'a, N: NexusServer> ControlPlaneStarter<'a, N> {
tempdir.path(),
sim_mode,
&self.simulated_upstairs,
health_monitor,
)
.await
.expect("Failed to start sled agent");
Expand Down Expand Up @@ -1534,15 +1538,26 @@ impl RackInitRequestBuilder {
}
}

/// Sled-agent-related options for [`setup_with_config_impl`], bundled into a
/// single argument.
#[derive(Debug, Clone)]
pub(crate) struct SledAgentOptions {
/// Simulation mode passed to every sled agent that gets started.
pub sim_mode: sim::SimMode,
/// Number of sled agents to start in addition to the first one.
pub extra_sled_agents: u16,
/// Health monitor configuration passed to every sled agent that gets
/// started.
pub sled_agent_health_monitor: sim::ConfigHealthMonitor,
}

pub(crate) async fn setup_with_config_impl<N: NexusServer>(
mut starter: ControlPlaneStarter<'_, N>,
populate: PopulateCrdb,
sim_mode: sim::SimMode,
sled_agent_opts: SledAgentOptions,
initial_cert: Option<Certificate>,
extra_sled_agents: u16,
gateway_config_file: Utf8PathBuf,
second_nexus: bool,
) -> ControlPlaneTestContext<N> {
let SledAgentOptions {
sim_mode,
extra_sled_agents,
sled_agent_health_monitor,
} = sled_agent_opts;
const STEP_TIMEOUT: Duration = Duration::from_secs(600);

// All setups will start with CRDB and clickhouse
Expand Down Expand Up @@ -1705,6 +1720,7 @@ pub(crate) async fn setup_with_config_impl<N: NexusServer>(
// The first and second sled agents have special UUIDs, and any extra ones
// after that are random.

let health_monitor = sled_agent_health_monitor.clone();
starter
.init_with_steps(
vec![(
Expand All @@ -1715,6 +1731,7 @@ pub(crate) async fn setup_with_config_impl<N: NexusServer>(
SLED_AGENT_UUID.parse().unwrap(),
0,
sim_mode,
health_monitor,
)
.boxed()
}),
Expand All @@ -1723,6 +1740,7 @@ pub(crate) async fn setup_with_config_impl<N: NexusServer>(
)
.await;

let health_monitor = sled_agent_health_monitor.clone();
if extra_sled_agents > 0 {
starter
.init_with_steps(
Expand All @@ -1734,6 +1752,7 @@ pub(crate) async fn setup_with_config_impl<N: NexusServer>(
SLED_AGENT2_UUID.parse().unwrap(),
1,
sim_mode,
health_monitor,
)
.boxed()
}),
Expand All @@ -1743,7 +1762,9 @@ pub(crate) async fn setup_with_config_impl<N: NexusServer>(
.await;
}

let health_monitor = sled_agent_health_monitor.clone();
for index in 1..extra_sled_agents {
let health_monitor = health_monitor.clone();
starter
.init_with_steps(
vec![(
Expand All @@ -1754,6 +1775,7 @@ pub(crate) async fn setup_with_config_impl<N: NexusServer>(
SledUuid::new_v4(),
index.checked_add(1).unwrap(),
sim_mode,
health_monitor.clone(),
)
.boxed()
}),
Expand Down Expand Up @@ -1839,6 +1861,7 @@ pub(crate) enum PopulateCrdb {
///
/// Note: you should probably use the `extra_sled_agents` macro parameter on
/// `nexus_test` instead!
#[allow(clippy::too_many_arguments)]
pub async fn start_sled_agent(
log: Logger,
nexus_address: SocketAddr,
Expand All @@ -1847,6 +1870,7 @@ pub async fn start_sled_agent(
update_directory: &Utf8Path,
sim_mode: sim::SimMode,
simulated_upstairs: &Arc<sim::SimulatedUpstairs>,
health_monitor: sim::ConfigHealthMonitor,
) -> Result<sim::Server, String> {
// Generate a baseboard serial number that matches the SP configuration
// (SimGimlet00, SimGimlet01, etc.) so that inventory can link sled agents
Expand All @@ -1861,6 +1885,7 @@ pub async fn start_sled_agent(
sim::ZpoolConfig::None,
SledCpuFamily::AmdMilan,
Some(baseboard_serial),
health_monitor,
);
start_sled_agent_with_config(log, &config, sled_index, simulated_upstairs)
.await
Expand Down
4 changes: 4 additions & 0 deletions nexus/tests/integration_tests/instances.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,7 @@ async fn test_instance_migration_compatible_cpu_platforms(
Some(&camino::Utf8Path::new("/an/unused/update/directory")),
omicron_sled_agent::sim::ZpoolConfig::None,
sled_agent_types::inventory::SledCpuFamily::AmdTurin,
omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false },
);
let new_sled_id = config.id;

Expand Down Expand Up @@ -1349,6 +1350,7 @@ async fn test_instance_migration_incompatible_cpu_platforms(
Some(&camino::Utf8Path::new("/an/unused/update/directory")),
omicron_sled_agent::sim::ZpoolConfig::None,
sled_agent_types::inventory::SledCpuFamily::AmdTurin,
omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false },
);
let turin_sled_id = config.id;

Expand Down Expand Up @@ -1426,6 +1428,7 @@ async fn test_instance_migration_unknown_sled_type(
Some(&camino::Utf8Path::new("/an/unused/update/directory")),
omicron_sled_agent::sim::ZpoolConfig::None,
sled_agent_types::inventory::SledCpuFamily::Unknown,
omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false },
);
let new_sled_id = config.id;

Expand Down Expand Up @@ -7125,6 +7128,7 @@ async fn test_can_start_instance_with_cpu_platform(
Some(&camino::Utf8Path::new("/an/unused/update/directory")),
omicron_sled_agent::sim::ZpoolConfig::None,
sled_agent_types::inventory::SledCpuFamily::AmdTurin,
omicron_sled_agent::sim::ConfigHealthMonitor { enabled: false },
);
let new_sled_id = config.id;

Expand Down
1 change: 1 addition & 0 deletions nexus/tests/integration_tests/sleds.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ async fn test_sleds_list(cptestctx: &ControlPlaneTestContext) {
&update_directory,
sim::SimMode::Explicit,
&cptestctx.first_sled_agent().simulated_upstairs,
sim::ConfigHealthMonitor { enabled: false },
)
.await
.unwrap(),
Expand Down
5 changes: 5 additions & 0 deletions sled-agent/src/bin/sled-agent-sim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use dropshot::ConfigLoggingLevel;
use omicron_common::api::internal::nexus::Certificate;
use omicron_common::cmd::CmdError;
use omicron_common::cmd::fatal;
use omicron_sled_agent::sim::ConfigHealthMonitor;
use omicron_sled_agent::sim::RssArgs;
use omicron_sled_agent::sim::{
Config, ConfigHardware, ConfigStorage, ConfigZpool, SimMode, ZpoolConfig,
Expand Down Expand Up @@ -56,6 +57,9 @@ struct Args {
#[clap(action)]
nexus_lockstep_port: u16,

#[clap(long, default_value_t = false, action)]
enable_health_monitor: bool,

#[clap(long, name = "NEXUS_EXTERNAL_IP:PORT", action)]
/// If specified, when the simulated sled agent initializes the rack, it
/// will record the Nexus service running with the specified external IP
Expand Down Expand Up @@ -127,6 +131,7 @@ async fn do_run() -> Result<(), CmdError> {
Some(tmp.path()),
ZpoolConfig::TenVirtualU2s,
SledCpuFamily::AmdMilan,
ConfigHealthMonitor { enabled: args.enable_health_monitor },
)
};

Expand Down
2 changes: 1 addition & 1 deletion sled-agent/src/long_running_tasks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ async fn spawn_bootstore_tasks(
node_handle
}

async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle {
pub async fn spawn_health_monitor_tasks(log: &Logger) -> HealthMonitorHandle {
info!(log, "Starting health monitor");
let log = log.new(o!("component" => "HealthMonitor"));
HealthMonitorHandle::spawn(log)
Expand Down
13 changes: 13 additions & 0 deletions sled-agent/src/sim/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ pub struct ConfigHardware {
pub baseboard: Baseboard,
}

/// Configuration for the health monitor.
///
/// Stored in [`Config::health_monitor`] and supplied by callers of
/// [`Config::for_testing`] and [`Config::for_testing_with_baseboard`].
#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
pub struct ConfigHealthMonitor {
/// Whether the health monitor should be enabled.
pub enabled: bool,
}

/// Configuration for a sled agent
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct Config {
Expand All @@ -83,6 +89,8 @@ pub struct Config {
pub updates: ConfigUpdates,
/// configuration to emulate the sled agent's hardware
pub hardware: ConfigHardware,
/// configuration for the sled agent's health monitor
pub health_monitor: ConfigHealthMonitor,
}

pub enum ZpoolConfig {
Expand All @@ -101,6 +109,7 @@ impl Config {
update_directory: Option<&Utf8Path>,
zpool_config: ZpoolConfig,
cpu_family: SledCpuFamily,
health_monitor: ConfigHealthMonitor,
) -> Config {
Self::for_testing_with_baseboard(
id,
Expand All @@ -110,9 +119,11 @@ impl Config {
zpool_config,
cpu_family,
None,
health_monitor,
)
}

#[allow(clippy::too_many_arguments)]
pub fn for_testing_with_baseboard(
id: SledUuid,
sim_mode: SimMode,
Expand All @@ -121,6 +132,7 @@ impl Config {
zpool_config: ZpoolConfig,
cpu_family: SledCpuFamily,
baseboard_serial: Option<String>,
health_monitor: ConfigHealthMonitor,
) -> Config {
// This IP range is guaranteed by RFC 6666 to discard traffic.
// For tests that don't use a Nexus, we use this address to simulate a
Expand Down Expand Up @@ -173,6 +185,7 @@ impl Config {
revision: 3,
},
},
health_monitor,
}
}
}
5 changes: 3 additions & 2 deletions sled-agent/src/sim/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ mod upstairs;

pub use crate::updates::ConfigUpdates;
pub use config::{
Baseboard, Config, ConfigHardware, ConfigStorage, ConfigZpool, SimMode,
TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM, ZpoolConfig,
Baseboard, Config, ConfigHardware, ConfigHealthMonitor, ConfigStorage,
ConfigZpool, SimMode, TEST_HARDWARE_THREADS, TEST_RESERVOIR_RAM,
ZpoolConfig,
};
pub use server::{RssArgs, Server, run_standalone_server};
pub use sled_agent::SledAgent;
Expand Down
Loading
Loading