awful: pre-grow propolis-server heap to avoid #1008 #1032
@@ -481,6 +481,7 @@ impl MachineInitializer<'_> {
         backend_spec: &StorageBackend,
         backend_id: &SpecKey,
         nexus_client: &Option<NexusClient>,
+        wanted_heap: &mut usize,
     ) -> Result<StorageBackendInstance, MachineInitError> {
         match backend_spec {
             StorageBackend::Crucible(spec) => {
@@ -504,6 +505,17 @@ impl MachineInitializer<'_> {
                     }
                 };

+                // Wild guess: we might collect up to 1MB (assuming we're
+                // limited by NVMe MDTS) of data in each Crucible worker. That
+                // is accumulated into a BytesMut, which is backed by a
+                // Vec::with_capacity. With a power of two capacity it's
+                // *probably* not rounded up further.
+                const PER_WORKER_HEAP: usize = MB;
+                // And Crucible workers are not currently tunable, so this is
+                // how many there are
+                // (see propolis::block::crucible::Crucible::WORKER_COUNT)
+                *wanted_heap += 8 * PER_WORKER_HEAP;
+
                 let be = propolis::block::CrucibleBackend::create(
                     vcr,
                     propolis::block::BackendOpts {
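For scale, the guess above works out to 8 workers × 1 MiB = 8 MiB of anticipated heap per Crucible-backed disk. A minimal sketch of that sizing, outside the PR (it assumes the `bytes` crate's `BytesMut::with_capacity`, which reserves its backing `Vec` up front as the comment describes):

```rust
use bytes::BytesMut;

const MB: usize = 1024 * 1024;

fn main() {
    // BytesMut::with_capacity reserves its backing storage immediately, like
    // Vec::with_capacity; a power-of-two request is (probably) not rounded up.
    let buf = BytesMut::with_capacity(MB);
    assert!(buf.capacity() >= MB);

    // The PR's wild guess: eight Crucible workers, each potentially holding
    // one MDTS-sized (1 MiB) accumulation buffer at the same time.
    let per_disk_estimate = 8 * MB;
    println!("per-disk Crucible heap estimate: {per_disk_estimate} bytes");
}
```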
@@ -560,6 +572,13 @@ impl MachineInitializer<'_> {
                     }
                     None => NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap(),
                 };
+
+                // Similar to Crucible backends above: we might collect up to
+                // 1MB (assuming we're limited by NVMe MDTS) of data in each
+                // worker. This is a hack in its own right, see Propolis#985.
+                const PER_WORKER_HEAP: usize = MB;
+                *wanted_heap += nworkers.get() * PER_WORKER_HEAP;
+
                 let be = propolis::block::FileBackend::create(
                     &spec.path,
                     propolis::block::BackendOpts {
@@ -617,7 +636,9 @@ impl MachineInitializer<'_> {
         &mut self,
         chipset: &RegisteredChipset,
         nexus_client: Option<NexusClient>,
-    ) -> Result<(), MachineInitError> {
+    ) -> Result<usize, MachineInitError> {
+        let mut wanted_heap = 0usize;
+
         enum DeviceInterface {
             Virtio,
             Nvme,
@@ -642,13 +663,27 @@ impl MachineInitializer<'_> {
                 }
             };

+            // For all storage devices we'll have a QueueMinder connecting
+            // each emulated device queue to storage backends. The minder and
+            // structures in its supporting logic don't have much state, but may
+            // do some dynamic allocation. Assume they won't need more than 1KiB
+            // of state (`in_flight` has at most nworkers entries currently and
+            // will need to grow only once or twice to a small capacity. The
+            // number of outstanding boxed requests and responses is at most
+            // nworkers. Might be more, but not much).
+            //
+            // 64 * 1K is a wild over-estimate while we support 1-15 queues
+            // across virtio-block and nvme.
+            wanted_heap += 64 * 1024;
+
Comment on lines +675 to +677

Member: 64k ought to be enough for anybody?
             let bdf: pci::Bdf = pci_path.into();

             let StorageBackendInstance { be: backend, crucible } = self
                 .create_storage_backend_from_spec(
                     &disk.backend_spec,
                     backend_id,
                     &nexus_client,
+                    &mut wanted_heap,
                 )
                 .await?;
@@ -751,7 +786,7 @@ impl MachineInitializer<'_> {
                 };
             }
         }
-        Ok(())
+        Ok(wanted_heap)
     }

     /// Initialize network devices, add them to the device map, and attach them
@@ -573,7 +573,8 @@ async fn initialize_vm_objects(
         init.initialize_9pfs(&chipset);
     }

-    init.initialize_storage_devices(&chipset, options.nexus_client.clone())
+    let wanted_heap = init
+        .initialize_storage_devices(&chipset, options.nexus_client.clone())
         .await?;

     let ramfb =
@@ -630,7 +631,7 @@ async fn initialize_vm_objects(
         devices, block_backends, crucible_backends, ..
     } = init;

-    Ok(InputVmObjects {
+    let res = InputVmObjects {
         instance_spec: spec.clone(),
         vcpu_tasks,
         machine,
@@ -640,7 +641,35 @@ async fn initialize_vm_objects(
         com1,
         framebuffer: Some(ramfb),
         ps2ctrl,
-    })
+    };
+
+    // Another really terrible hack. As we've found in Propolis#1008, brk()
+    // can end up starved by a long stream of page faults. We allocate and
+    // deallocate often enough in the hot parts of Propolis at runtime, but
+    // the working set is expected to be relatively tiny; we'd brk() once or
+    // twice, get up to a reasonable size, and never think twice about it.
+    //
+    // In practice that later allocation may be blocked for tens of
+    // *minutes*, and might happen while holding locks for device state (in
+    // 1008 we hold a queue state lock, for example). Everything goes off
+    // the rails from there, as vCPUs can easily get dragged into the mess,
+    // guest NMIs aren't acknowledged because vCPUs can't run, etc.
+    //
+    // So, the awful hack. Most of the runtime growth that we can think of
+    // comes from the storage backend implementations. We guess at that
+    // amount in `initialize_storage_devices`. Now we'll allocate at least
+    // that much, free it, and try to avoid calling brk() at a point where
+    // the OS won't be able to get to it.
+    //
+    // This should be able to be removed when we are confident we can
+    // actually brk() at runtime without starving ourselves out.
+    //
+    // As one last step, include an extra 16 MiB of heap as space we might
+    // want when running `propolis-server`, handling HTTP requests, etc.
+    let balloon = vec![0u8; wanted_heap + 16 * propolis::common::MB];
+    std::mem::drop(balloon);
Comment on lines +669 to +670

Member: question: rather than allocating a balloon, zero-initializing it, […]

Member (PR author): we know in practice that growing the heap ended up at […], so there's still the iffy bit that maybe the allocations have weird alignment constraints, or the allocator gets fragmented, such that 320 contiguous MiB or whatever doesn't cut it later on. to be more confident about that I think we'd want the kind of pooled buffers you were talking about earlier, but also I really really want to fix the OS so we don't need this or the file backend buffers lol

Member: agreed wholeheartedly with...all of this.
+
+    Ok(res)
 }

 /// Create an object used to sample kstats.
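To put the pre-grow amount in perspective, here is a small illustration (not code from the PR; the helper name and the disk count are made up) of what the estimates above add up to for a VM whose disks all use the default of eight workers:

```rust
const MB: usize = 1024 * 1024;
const KB: usize = 1024;

// Hypothetical helper, not part of the PR: the pre-grow amount implied by the
// estimates in this change, for `ndisks` storage devices.
fn estimated_balloon_bytes(ndisks: usize) -> usize {
    let per_disk_backend = 8 * MB; // 8 workers x 1 MiB accumulation buffer
    let per_disk_queues = 64 * KB; // QueueMinder / queue-plumbing slack
    let server_slack = 16 * MB;    // HTTP handling and other propolis-server heap
    ndisks * (per_disk_backend + per_disk_queues) + server_slack
}

fn main() {
    // Two Crucible disks: 2 * (8 MiB + 64 KiB) + 16 MiB, a bit over 32 MiB.
    println!("{} bytes", estimated_balloon_bytes(2));
}
```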
Member: nit, sorry: could we perhaps import that `WORKER_COUNT` constant here?

Member (PR author): I'd thought about it, but I think we really should do the `// TODO` there and make it tunable. then the number here would be that tunable, defaulted to `DEFAULT_WORKER_COUNT` (also 8)! and be basically the same as `nworkers` for the file backend.

Member: fine with me!
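For what that could look like, here is a hypothetical sketch (nothing below exists in Propolis today; `crucible_worker_count` and an optional `worker_count` tunable are assumptions) of the tunable being discussed, with the heap estimate reading the same knob instead of hard-coding 8:

```rust
use std::num::NonZeroUsize;

const MB: usize = 1024 * 1024;
// Matches the current Crucible worker count and the file backend's default.
const DEFAULT_WORKER_COUNT: usize = 8;

// Hypothetical: resolve an optional per-backend `worker_count` tunable,
// falling back to the default when it isn't set.
fn crucible_worker_count(requested: Option<NonZeroUsize>) -> NonZeroUsize {
    requested.unwrap_or_else(|| NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap())
}

fn main() {
    // With no tunable set, the per-disk heap estimate comes out the same as
    // the hard-coded `8 * PER_WORKER_HEAP` in the diff above.
    let nworkers = crucible_worker_count(None);
    assert_eq!(nworkers.get() * MB, 8 * MB);
}
```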