From 487df6329932664fb15bb29beae9183ca89eafcc Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 5 Feb 2026 19:20:53 +0000 Subject: [PATCH 1/3] awful: pre-grow propolis-server heap to avoid #1008 the gory details are in that issue, but for VMs with large address spaces it is relatively easy for a guest picking random pages to cause long streams of page faults as Propolis does I/O against those pages. The faults then starve out anything that would change the address space, most importantly `brk()` and friends which may need to grow the heap to support an allocation made as part of device operations. At that point, the device will be (partially) stuck. Bad enough. Then the guest OS may notice the situation and try to restart the device. To do this, a vCPU will do some kind of access to the stuck device, which may be stuck in a way that the vCPU becomes blocked on the device. That vCPU won't be responsive to interrupts and from the guest perspective the whole machine is extremely broken. We're not immediately sure how to untangle the faults or AS lock bits, so for the time being we can at least try to not brk() at runtime by growing the heap probably-enough to serve real needs. 
--- bin/propolis-server/src/lib/initializer.rs | 39 ++++++++++++++++++++-- bin/propolis-server/src/lib/vm/ensure.rs | 35 +++++++++++++++++-- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index c53ddb34d..dfa7199dc 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -481,6 +481,7 @@ impl MachineInitializer<'_> { backend_spec: &StorageBackend, backend_id: &SpecKey, nexus_client: &Option, + wanted_heap: &mut usize, ) -> Result { match backend_spec { StorageBackend::Crucible(spec) => { @@ -504,6 +505,17 @@ impl MachineInitializer<'_> { } }; + // Wild guess: we might collect up to 1MB (assuming we're + // limited by NVMe MDTS) of data in each Crucible worker. That + // is accumulated into a BytesMut, which is backed by a + // Vec::with_capacity. With a power of two capacity it's + // *probably* not rounded up further. + const PER_WORKER_HEAP: usize = MB; + // And Crucible workers are not currently tunable, so this is + // how many there are + // (see propolis::block::crucible::Crucible::WORKER_COUNT) + *wanted_heap = 8 * PER_WORKER_HEAP; + let be = propolis::block::CrucibleBackend::create( vcr, propolis::block::BackendOpts { @@ -560,6 +572,13 @@ impl MachineInitializer<'_> { } None => NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap(), }; + + // Similar to Crucible backends above: we might collect up to + // 1MB (assuming we're limited by NVMe MDTS) of data in each + // worker. This is a hack in its own right, see Propolis#985. 
+ const PER_WORKER_HEAP: usize = MB; + *wanted_heap = nworkers.get() * PER_WORKER_HEAP; + let be = propolis::block::FileBackend::create( &spec.path, propolis::block::BackendOpts { @@ -617,7 +636,9 @@ impl MachineInitializer<'_> { &mut self, chipset: &RegisteredChipset, nexus_client: Option, - ) -> Result<(), MachineInitError> { + ) -> Result { + let mut wanted_heap = 0usize; + enum DeviceInterface { Virtio, Nvme, @@ -642,6 +663,19 @@ impl MachineInitializer<'_> { } }; + // For all storage devices we'll have a QueueMinder connecting + // each emulated device queue to storage backends. The minder and + // structures in its supporting logic don't have much state, but may + // do some dynamic allocation. Assume they won't need more than 1KiB + // of state (`in_flight` at most nworkers entries currently and will + // need to grow only once or twice to a small capacity, the number + // of outstanding boxed requests and responses is at most nworkers, + // etc). + // + // 64 * 1K is a wild over-estimate while we support 1-15 queues + // across virtio-block and nvme. 
+ wanted_heap += 64 * 1024; + let bdf: pci::Bdf = pci_path.into(); let StorageBackendInstance { be: backend, crucible } = self @@ -649,6 +683,7 @@ impl MachineInitializer<'_> { &disk.backend_spec, backend_id, &nexus_client, + &mut wanted_heap, ) .await?; @@ -751,7 +786,7 @@ impl MachineInitializer<'_> { }; } } - Ok(()) + Ok(wanted_heap) } /// Initialize network devices, add them to the device map, and attach them diff --git a/bin/propolis-server/src/lib/vm/ensure.rs b/bin/propolis-server/src/lib/vm/ensure.rs index 354975d0f..022a876a9 100644 --- a/bin/propolis-server/src/lib/vm/ensure.rs +++ b/bin/propolis-server/src/lib/vm/ensure.rs @@ -573,7 +573,8 @@ async fn initialize_vm_objects( init.initialize_9pfs(&chipset); } - init.initialize_storage_devices(&chipset, options.nexus_client.clone()) + let wanted_heap = init + .initialize_storage_devices(&chipset, options.nexus_client.clone()) .await?; let ramfb = @@ -630,7 +631,7 @@ async fn initialize_vm_objects( devices, block_backends, crucible_backends, .. } = init; - Ok(InputVmObjects { + let res = InputVmObjects { instance_spec: spec.clone(), vcpu_tasks, machine, @@ -640,7 +641,35 @@ async fn initialize_vm_objects( com1, framebuffer: Some(ramfb), ps2ctrl, - }) + }; + + // Another really terrible hack. As we've found in Propolis#1008, brk() + // can end up starved by a long stream of page faults. We allocate + // and deallocate often enough in the hot parts of Propolis at runtime, + // but the working set is expected to be relatively tiny; we'd brk() + // once or twice, get up to a reasonable size, and never think twice + // about it. + // + // In practice that later allocation may be blocked for tens of + // *minutes*, and might be blocked while holding locks for device state + // (in 1008 we hold a queue state lock, for example). Everything goes + // off the rails from there, as vCPUs can easily get dragged into the + // mess, guest NMIs aren't acknowledged because vCPUs can't run, etc. + // + // So, the awful hack. 
Most of the runtime growth that we can think of + comes from storage backend implementation. We guess at that amount in + `initialize_storage_devices`. Now we'll alloc at least that much, free + it, and try to avoid brk() when the OS won't be able to get to it. + // + // This should be able to be removed when we are confident we can + // actually brk() at runtime without starving ourselves out. + // + // As one last step, include an extra 16 MiB of heap as space we might want + // when running `propolis-server`, handling HTTP requests, etc. + let balloon = vec![0u8; wanted_heap + 16 * propolis::common::MB]; + std::mem::drop(balloon); + + Ok(res) } /// Create an object used to sample kstats. From ebb847e6fd6949d9aef6cd4d8e1aee9392ee45c0 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 5 Feb 2026 20:20:46 +0000 Subject: [PATCH 2/3] the world's most annoying typo --- bin/propolis-server/src/lib/initializer.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index dfa7199dc..3779411ee 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -514,7 +514,7 @@ impl MachineInitializer<'_> { // And Crucible workers are not currently tunable, so this is // how many there are // (see propolis::block::crucible::Crucible::WORKER_COUNT) - *wanted_heap = 8 * PER_WORKER_HEAP; + *wanted_heap += 8 * PER_WORKER_HEAP; let be = propolis::block::CrucibleBackend::create( vcr, @@ -577,7 +577,7 @@ impl MachineInitializer<'_> { // Similar to Crucible backends above: we might collect up to // 1MB (assuming we're limited by NVMe MDTS) of data in each // worker. This is a hack in its own right, see Propolis#985. 
const PER_WORKER_HEAP: usize = MB; - *wanted_heap = nworkers.get() * PER_WORKER_HEAP; + *wanted_heap += nworkers.get() * PER_WORKER_HEAP; let be = propolis::block::FileBackend::create( &spec.path, From 3ea6fa80f2f7405107870afcb76a3aa3da02e006 Mon Sep 17 00:00:00 2001 From: iximeow Date: Thu, 5 Feb 2026 20:41:03 +0000 Subject: [PATCH 3/3] words --- bin/propolis-server/src/lib/initializer.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index 3779411ee..c93908100 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -667,10 +667,10 @@ impl MachineInitializer<'_> { // each emulated device queue to storage backends. The minder and // structures in its supporting logic don't have much state, but may // do some dynamic allocation. Assume they won't need more than 1KiB - // of state (`in_flight` at most nworkers entries currently and will - // need to grow only once or twice to a small capacity, the number - // of outstanding boxed requests and responses is at most nworkers, - // etc). + // of state (`in_flight` has at most nworkers entries currently and + // will need to grow only once or twice to a small capacity. The + // number of outstanding boxed requests and responses is at most + // nworkers. Might be more, but not much). // // 64 * 1K is a wild over-estimate while we support 1-15 queues // across virtio-block and nvme.