diff --git a/bin/propolis-server/src/lib/initializer.rs b/bin/propolis-server/src/lib/initializer.rs index c53ddb34d..c93908100 100644 --- a/bin/propolis-server/src/lib/initializer.rs +++ b/bin/propolis-server/src/lib/initializer.rs @@ -481,6 +481,7 @@ impl MachineInitializer<'_> { backend_spec: &StorageBackend, backend_id: &SpecKey, nexus_client: &Option, + wanted_heap: &mut usize, ) -> Result { match backend_spec { StorageBackend::Crucible(spec) => { @@ -504,6 +505,17 @@ impl MachineInitializer<'_> { } }; + // Wild guess: we might collect up to 1MB (assuming we're + // limited by NVMe MDTS) of data in each Crucible worker. That + // is accumulated into a BytesMut, which is backed by a + // Vec::with_capacity. With a power of two capacity it's + // *probably* not rounded up further. + const PER_WORKER_HEAP: usize = MB; + // And Crucible workers are not currently tunable, so this is + // how many there are + // (see propolis::block::crucible::Crucible::WORKER_COUNT) + *wanted_heap += 8 * PER_WORKER_HEAP; + let be = propolis::block::CrucibleBackend::create( vcr, propolis::block::BackendOpts { @@ -560,6 +572,13 @@ impl MachineInitializer<'_> { } None => NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap(), }; + + // Similar to Crucible backends above: we might collect up to + // 1MB (assuming we're limited by NVMe MDTS) of data in each + // worker. This is a hack in its own right, see Propolis#985. 
+ const PER_WORKER_HEAP: usize = MB; + *wanted_heap += nworkers.get() * PER_WORKER_HEAP; + let be = propolis::block::FileBackend::create( &spec.path, propolis::block::BackendOpts { @@ -617,7 +636,9 @@ impl MachineInitializer<'_> { &mut self, chipset: &RegisteredChipset, nexus_client: Option, - ) -> Result<(), MachineInitError> { + ) -> Result { + let mut wanted_heap = 0usize; + enum DeviceInterface { Virtio, Nvme, @@ -642,6 +663,19 @@ impl MachineInitializer<'_> { } }; + // For all storage devices we'll have a QueueMinder connecting + // each emulated device queue to storage backends. The minder and + // structures in its supporting logic don't have much state, but may + // do some dynamic allocation. Assume they won't need more than 1KiB + // of state (`in_flight` has at most nworkers entries currently and + // will need to grow only once or twice to a small capacity. The + // number of outstanding boxed requests and responses is at most + // nworkers. Might be more, but not much). + // + // 64 * 1K is a wild over-estimate while we support 1-15 queues + // across virtio-block and nvme. 
+ wanted_heap += 64 * 1024; + let bdf: pci::Bdf = pci_path.into(); let StorageBackendInstance { be: backend, crucible } = self @@ -649,6 +683,7 @@ impl MachineInitializer<'_> { &disk.backend_spec, backend_id, &nexus_client, + &mut wanted_heap, ) .await?; @@ -751,7 +786,7 @@ impl MachineInitializer<'_> { }; } } - Ok(()) + Ok(wanted_heap) } /// Initialize network devices, add them to the device map, and attach them diff --git a/bin/propolis-server/src/lib/vm/ensure.rs b/bin/propolis-server/src/lib/vm/ensure.rs index 354975d0f..022a876a9 100644 --- a/bin/propolis-server/src/lib/vm/ensure.rs +++ b/bin/propolis-server/src/lib/vm/ensure.rs @@ -573,7 +573,8 @@ async fn initialize_vm_objects( init.initialize_9pfs(&chipset); } - init.initialize_storage_devices(&chipset, options.nexus_client.clone()) + let wanted_heap = init + .initialize_storage_devices(&chipset, options.nexus_client.clone()) .await?; let ramfb = @@ -630,7 +631,7 @@ async fn initialize_vm_objects( devices, block_backends, crucible_backends, .. } = init; - Ok(InputVmObjects { + let res = InputVmObjects { instance_spec: spec.clone(), vcpu_tasks, machine, @@ -640,7 +641,35 @@ async fn initialize_vm_objects( com1, framebuffer: Some(ramfb), ps2ctrl, - }) + }; + + // Another really terrible hack. As we've found in Propolis#1008, brk() + // can end up starved by a long stream of page faults. We allocate + // and deallocate often enough in the hot parts of Propolis at runtime, + // but the working set is expected to be relatively tiny; we'd brk() + // once or twice, get up to a reasonable size, and never think twice + // about it. + // + // In practice that later allocation may be blocked for tens of + // *minutes*, and might be while holding locks for device state (in + // 1008 we hold a queue state lock, for example). Everything goes off + // the rails from there, as vCPUs can easily get dragged into the mess, + // guest NMIs aren't acknowledged because vCPUs can't run, etc. + // + // So, the awful hack. 
Most of the runtime growth, that we can think of, + comes from storage backend implementation. We guess at that amount in + `initialize_storage_devices`. Now we'll alloc at least that much, free + it, and try to avoid brk() when the OS won't be able to get to it. + // + // This should be able to be removed when we are confident we can + // actually brk() at runtime without starving ourselves out. + // + // As one last step, include an extra 16 MiB of heap as space we might want + // when running `propolis-server`, handling HTTP requests, etc. + let balloon = vec![0u8; wanted_heap + 16 * propolis::common::MB]; + std::mem::drop(balloon); + + Ok(res) } /// Create an object used to sample kstats.