Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions clients/sled-agent-client/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,7 @@ schemars.workspace = true
serde.workspace = true
serde_json.workspace = true
sled-agent-types.workspace = true
sled-hardware-types.workspace = true
trust-quorum-types.workspace = true
slog.workspace = true
uuid.workspace = true
11 changes: 9 additions & 2 deletions clients/sled-agent-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ use std::convert::TryFrom;
use uuid::Uuid;

pub use propolis_client::{CrucibleOpts, VolumeConstructionRequest};

progenitor::generate_api!(
spec = "../../openapi/sled-agent/sled-agent-latest.json",
interface = Positional,
Expand Down Expand Up @@ -47,14 +46,19 @@ progenitor::generate_api!(
},
replace = {
Baseboard = sled_agent_types_versions::latest::inventory::Baseboard,
BaseboardId = sled_hardware_types::BaseboardId,
ByteCount = omicron_common::api::external::ByteCount,
CommitRequest = trust_quorum_types::messages::CommitRequest,
CommitStatus = trust_quorum_types::status::CommitStatus,
CoordinatorStatus = trust_quorum_types::status::CoordinatorStatus,
DatasetsConfig = omicron_common::disk::DatasetsConfig,
DatasetManagementStatus = omicron_common::disk::DatasetManagementStatus,
DatasetKind = omicron_common::api::internal::shared::DatasetKind,
DiskIdentity = omicron_common::disk::DiskIdentity,
DiskManagementStatus = omicron_common::disk::DiskManagementStatus,
DiskManagementError = omicron_common::disk::DiskManagementError,
DiskVariant = omicron_common::disk::DiskVariant,
Epoch = trust_quorum_types::types::Epoch,
ExternalIpGatewayMap = omicron_common::api::internal::shared::ExternalIpGatewayMap,
ExternalIpConfig = omicron_common::api::internal::shared::ExternalIpConfig,
ExternalIpv4Config = omicron_common::api::internal::shared::ExternalIpv4Config,
Expand All @@ -79,15 +83,18 @@ progenitor::generate_api!(
OmicronZonesConfig = sled_agent_types_versions::latest::inventory::OmicronZonesConfig,
PortFec = omicron_common::api::internal::shared::PortFec,
PortSpeed = omicron_common::api::internal::shared::PortSpeed,
RouterId = omicron_common::api::internal::shared::RouterId,
PrepareAndCommitRequest = trust_quorum_types::messages::PrepareAndCommitRequest,
ReconfigureMsg = trust_quorum_types::messages::ReconfigureMsg,
ResolvedVpcFirewallRule = omicron_common::api::internal::shared::ResolvedVpcFirewallRule,
ResolvedVpcRoute = omicron_common::api::internal::shared::ResolvedVpcRoute,
ResolvedVpcRouteSet = omicron_common::api::internal::shared::ResolvedVpcRouteSet,
RouterId = omicron_common::api::internal::shared::RouterId,
RouterTarget = omicron_common::api::internal::shared::RouterTarget,
RouterVersion = omicron_common::api::internal::shared::RouterVersion,
SledRole = sled_agent_types_versions::latest::inventory::SledRole,
SourceNatConfigGeneric = omicron_common::api::internal::shared::SourceNatConfigGeneric,
SwitchLocation = omicron_common::api::external::SwitchLocation,
Threshold = trust_quorum_types::types::Threshold,
Vni = omicron_common::api::external::Vni,
VpcFirewallIcmpFilter = omicron_common::api::external::VpcFirewallIcmpFilter,
ZpoolKind = omicron_common::zpool_name::ZpoolKind,
Expand Down
38 changes: 38 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ use nexus_types::internal_api::background::SupportBundleCleanupReport;
use nexus_types::internal_api::background::SupportBundleCollectionReport;
use nexus_types::internal_api::background::SupportBundleCollectionStepStatus;
use nexus_types::internal_api::background::SupportBundleEreportStatus;
use nexus_types::internal_api::background::TrustQuorumManagerStatus;
use nexus_types::internal_api::background::TufArtifactReplicationCounters;
use nexus_types::internal_api::background::TufArtifactReplicationRequest;
use nexus_types::internal_api::background::TufArtifactReplicationStatus;
Expand Down Expand Up @@ -1250,6 +1251,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
"fm_sitrep_gc" => {
print_task_fm_sitrep_gc(details);
}
"trust_quorum_manager" => {
print_task_trust_quorum_manager(details);
}
_ => {
println!(
"warning: unknown background task: {:?} \
Expand Down Expand Up @@ -3243,6 +3247,40 @@ fn print_task_fm_sitrep_gc(details: &serde_json::Value) {
);
}

/// Render the `trust_quorum_manager` background task's details blob for
/// `omdb nexus background-tasks show`.
///
/// Falls back to a warning on stderr if the JSON does not deserialize into
/// [`TrustQuorumManagerStatus`].
fn print_task_trust_quorum_manager(details: &serde_json::Value) {
    let parsed: Result<TrustQuorumManagerStatus, _> =
        serde_json::from_value(details.clone());

    let status = match parsed {
        Ok(s) => s,
        Err(error) => {
            // The task and omdb may be from different versions; don't fail
            // hard, just show the raw blob alongside the decode error.
            eprintln!(
                "warning: failed to interpret task details: {:?}: {:#?}",
                error, details
            );
            return;
        }
    };

    match status {
        TrustQuorumManagerStatus::PerRackStatus { statuses, errors } => {
            if statuses.is_empty() && errors.is_empty() {
                println!("No active reconfigurations");
            } else {
                // Per-rack progress lines first, then any per-rack errors.
                for line in statuses {
                    println!("{line}");
                }
                for line in errors {
                    println!("{line}");
                }
            }
        }
        TrustQuorumManagerStatus::Error(error) => {
            println!("    task did not complete successfully: {error}");
        }
    }
}

const ERRICON: &str = "/!\\";

fn warn_if_nonzero(n: usize) -> &'static str {
Expand Down
12 changes: 12 additions & 0 deletions dev-tools/omdb/tests/env.out
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,10 @@ task: "switch_port_config_manager"
manages switch port settings for rack switches


task: "trust_quorum_manager"
Drive trust quorum reconfigurations to completion


task: "tuf_artifact_replication"
replicate update repo artifacts across sleds

Expand Down Expand Up @@ -449,6 +453,10 @@ task: "switch_port_config_manager"
manages switch port settings for rack switches


task: "trust_quorum_manager"
Drive trust quorum reconfigurations to completion


task: "tuf_artifact_replication"
replicate update repo artifacts across sleds

Expand Down Expand Up @@ -669,6 +677,10 @@ task: "switch_port_config_manager"
manages switch port settings for rack switches


task: "trust_quorum_manager"
Drive trust quorum reconfigurations to completion


task: "tuf_artifact_replication"
replicate update repo artifacts across sleds

Expand Down
16 changes: 16 additions & 0 deletions dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,10 @@ task: "switch_port_config_manager"
manages switch port settings for rack switches


task: "trust_quorum_manager"
Drive trust quorum reconfigurations to completion


task: "tuf_artifact_replication"
replicate update repo artifacts across sleds

Expand Down Expand Up @@ -857,6 +861,12 @@ task: "switch_port_config_manager"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {})

task: "trust_quorum_manager"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
No active reconfigurations

task: "tuf_artifact_replication"
configured period: every <REDACTED_DURATION>h
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down Expand Up @@ -1425,6 +1435,12 @@ task: "switch_port_config_manager"
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {})

task: "trust_quorum_manager"
configured period: every <REDACTED_DURATION>m
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
No active reconfigurations

task: "tuf_artifact_replication"
configured period: every <REDACTED_DURATION>h
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
Expand Down
16 changes: 16 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,6 +434,8 @@ pub struct BackgroundTaskConfig {
pub probe_distributor: ProbeDistributorConfig,
/// configuration for multicast reconciler (group+members) task
pub multicast_reconciler: MulticastGroupReconcilerConfig,
/// configuration for trust quorum manager task
pub trust_quorum: TrustQuorumConfig,
}

#[serde_as]
Expand Down Expand Up @@ -965,6 +967,15 @@ pub struct ProbeDistributorConfig {
pub period_secs: Duration,
}

/// Configuration for the `trust_quorum_manager` background task
/// (the `trust_quorum` section of the Nexus config; see
/// `BackgroundTaskConfig::trust_quorum`).
#[serde_as]
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct TrustQuorumConfig {
    /// period (in seconds) for periodic activations of the background task that
    /// completes trust quorum reconfigurations.
    // Serialized as a bare integer number of seconds (e.g.
    // `trust_quorum.period_secs = 60` in TOML).
    #[serde_as(as = "DurationSeconds<u64>")]
    pub period_secs: Duration,
}

/// Configuration for a nexus server
#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)]
pub struct PackageConfig {
Expand Down Expand Up @@ -1269,6 +1280,7 @@ mod test {
fm.sitrep_gc_period_secs = 49
probe_distributor.period_secs = 50
multicast_reconciler.period_secs = 60
trust_quorum.period_secs = 60
[default_region_allocation_strategy]
type = "random"
seed = 0
Expand Down Expand Up @@ -1526,6 +1538,9 @@ mod test {
sled_cache_ttl_secs: MulticastGroupReconcilerConfig::default_sled_cache_ttl_secs(),
backplane_cache_ttl_secs: MulticastGroupReconcilerConfig::default_backplane_cache_ttl_secs(),
},
trust_quorum: TrustQuorumConfig {
period_secs: Duration::from_secs(60),
},
},
multicast: MulticastConfig { enabled: false },
default_region_allocation_strategy:
Expand Down Expand Up @@ -1629,6 +1644,7 @@ mod test {
fm.sitrep_gc_period_secs = 46
probe_distributor.period_secs = 47
multicast_reconciler.period_secs = 60
trust_quorum.period_secs = 60

[default_region_allocation_strategy]
type = "random"
Expand Down
2 changes: 2 additions & 0 deletions nexus/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ slog.workspace = true
slog-async.workspace = true
slog-dtrace.workspace = true
slog-error-chain.workspace = true
swrite.workspace = true
display-error-chain.workspace = true
slog-term.workspace = true
static_assertions.workspace = true
Expand All @@ -119,6 +120,7 @@ tokio = { workspace = true, features = ["full"] }
tokio-postgres = { workspace = true, features = ["with-serde_json-1"] }
tokio-util = { workspace = true, features = ["codec", "rt"] }
tough.workspace = true
trust-quorum-types.workspace = true
tufaceous-artifact.workspace = true
usdt.workspace = true
uuid.workspace = true
Expand Down
1 change: 1 addition & 0 deletions nexus/background-task-interface/src/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ pub struct BackgroundTasks {
pub task_fm_sitrep_gc: Activator,
pub task_probe_distributor: Activator,
pub task_multicast_reconciler: Activator,
pub task_trust_quorum_manager: Activator,

// Handles to activate background tasks that do not get used by Nexus
// at-large. These background tasks are implementation details as far as
Expand Down
26 changes: 26 additions & 0 deletions nexus/db-queries/src/db/datastore/sled.rs
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,32 @@ impl DataStore {
Ok(rack_id.map(RackUuid::from))
}

/// Look up a commissioned sled in the given rack by its `BaseboardId`.
///
/// Returns `Ok(None)` when no non-deleted, commissioned sled with the given
/// part and serial numbers exists in rack `rack_id`.
///
/// # Errors
///
/// Fails if the caller is not authorized to list fleet children, or on a
/// database error.
pub async fn sled_get_commissioned_by_baseboard_and_rack_id(
    &self,
    opctx: &OpContext,
    rack_id: RackUuid,
    baseboard_id: BaseboardId,
) -> Result<Option<Sled>, Error> {
    use nexus_db_schema::schema::sled::dsl;

    opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?;
    let conn = self.pool_connection_authorized(opctx).await?;

    dsl::sled
        .filter(dsl::time_deleted.is_null())
        .filter(dsl::part_number.eq(baseboard_id.part_number))
        .filter(dsl::serial_number.eq(baseboard_id.serial_number))
        .filter(dsl::rack_id.eq(rack_id.into_untyped_uuid()))
        .sled_filter(SledFilter::Commissioned)
        .select(Sled::as_select())
        .get_result_async::<Sled>(&*conn)
        .await
        // `.optional()` maps "no rows" to `Ok(None)` rather than an error.
        .optional()
        .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
}

pub async fn sled_list(
&self,
opctx: &OpContext,
Expand Down
26 changes: 23 additions & 3 deletions nexus/db-queries/src/db/datastore/trust_quorum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ use nexus_db_model::TrustQuorumConfiguration as DbTrustQuorumConfiguration;
use nexus_db_model::TrustQuorumMember as DbTrustQuorumMember;
use nexus_types::trust_quorum::ProposedTrustQuorumConfig;
use nexus_types::trust_quorum::{
TrustQuorumConfig, TrustQuorumMemberData, TrustQuorumMemberState,
TrustQuorumConfig, TrustQuorumConfigState, TrustQuorumMemberData,
TrustQuorumMemberState,
};
use omicron_common::api::external::Error;
use omicron_common::api::external::OptionalLookupResult;
Expand Down Expand Up @@ -163,6 +164,23 @@ impl DataStore {
.map_err(|err| err.into_public_ignore_retries())
}

/// Get the trust quorum configuration from the database for the given Epoch.
///
/// Returns `Ok(None)` when no configuration exists for `epoch` in rack
/// `rack_id`.
pub async fn tq_get_config(
    &self,
    opctx: &OpContext,
    rack_id: RackUuid,
    epoch: Epoch,
) -> OptionalLookupResult<TrustQuorumConfig> {
    opctx.authorize(authz::Action::Read, &authz::FLEET).await?;
    let conn = self.pool_connection_authorized(opctx).await?;

    let result = Self::tq_get_config_with_members_from_epoch_conn(
        opctx, &*conn, rack_id, epoch,
    )
    .await;
    // Transaction-retry errors are internal; collapse to a public error.
    result.map_err(|err| err.into_public_ignore_retries())
}

async fn tq_get_latest_config_with_members_conn(
opctx: &OpContext,
conn: &async_bb8_diesel::Connection<DbConnection>,
Expand Down Expand Up @@ -591,7 +609,7 @@ impl DataStore {
opctx: &OpContext,
config: trust_quorum_types::configuration::Configuration,
acked_prepares: BTreeSet<BaseboardId>,
) -> Result<(), Error> {
) -> Result<TrustQuorumConfigState, Error> {
opctx.authorize(authz::Action::Modify, &authz::FLEET).await?;
let conn = &*self.pool_connection_authorized(opctx).await?;

Expand Down Expand Up @@ -739,9 +757,11 @@ impl DataStore {
)
.await
.map_err(|txn_error| txn_error.into_diesel(&err))?;

return Ok(TrustQuorumConfigState::Committing);
}

Ok(())
Ok(db_config.state.into())
}
})
.await
Expand Down
1 change: 1 addition & 0 deletions nexus/examples/config-second.toml
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ multicast_reconciler.period_secs = 60
# TTL for backplane topology cache (static platform configuration)
# Default: 86400 seconds (24 hours) - refreshed on-demand when validation fails
# multicast_reconciler.backplane_cache_ttl_secs = 86400
trust_quorum.period_secs = 60

[default_region_allocation_strategy]
# allocate region on 3 random distinct zpools, on 3 random distinct sleds.
Expand Down
Loading