
Commit 15a6afe

helsaawy and Abhishek Singh (Manifold) authored
Initial multipod support (#2546)
* support multipod scenarios with VirtualPodID annotation

  This is in continuation of the azcri changes:
  https://msazure.visualstudio.com/ContainerPlatform/_git/azcri/pullrequest/12968264

  - Add VirtualPodID, TenantSandboxID, and SkipPodNetworking annotations to pkg/annotations
  - Update create.go to treat containers whose VirtualPodID equals the container ID as sandboxes for networking, supporting a separate network namespace for each pod in the UVM

  (cherry picked from commit c196086161b65682e0d923e3bb8b3c5ed789a497)
  Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>

* multi pod changes for GCS

  Introduce cgroup changes and per-pod mount changes to support multiple pods.

  (cherry picked from commit 7170f3fae8d26fef6a975cdff8300f9ca67691d1)
  Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>

* Add multi-pod tmpfs support; fix lint errors

  Remove the unused functions `get(Sandbox|Standalone)(Hostname|Hosts|Resolv)Path` and replace them with their `VirtualPodAware` counterparts to satisfy the linter. (The original functions are already replaced wholesale.)

  Expand multi-pod functionality to include tmpfs-backed sandbox mounts.

  Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>

---------

Signed-off-by: Hamza El-Saawy <hamzaelsaawy@microsoft.com>
Co-authored-by: Abhishek Singh (Manifold) <Abhishek.Singh@microsoft.com>
1 parent c9c7431 commit 15a6afe
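
For context, the three annotations named in the first commit bullet are added in pkg/annotations, one of the nine changed files not captured in this view. A minimal sketch of what that addition plausibly looks like — the constant names come from the commit message, but the key strings here are invented placeholders, not the real values:

	// Sketch only: constant names are taken from the commit message; the key
	// strings are placeholders, since pkg/annotations is not shown in this view.
	package annotations

	const (
		// VirtualPodID associates a container with a virtual pod inside the UVM.
		// A container whose VirtualPodID equals its own ID is that pod's sandbox.
		VirtualPodID = "io.microsoft.cri.virtualpodid" // placeholder key

		// TenantSandboxID identifies the tenant sandbox a container belongs to.
		TenantSandboxID = "io.microsoft.cri.tenantsandboxid" // placeholder key

		// SkipPodNetworking, when "true", skips pod network namespace setup.
		SkipPodNetworking = "io.microsoft.cri.skippodnetworking" // placeholder key
	)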

9 files changed: +662 additions, -118 deletions

9 files changed

+662
-118
lines changed

cmd/gcs/main.go

Lines changed: 63 additions & 34 deletions
@@ -10,6 +10,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"strings"
 	"syscall"
 	"time"

@@ -67,7 +68,12 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre
 	}

 	count++
-	msg := "memory usage for cgroup exceeded threshold"
+	var msg string
+	if strings.HasPrefix(cgName, "/virtual-pods") {
+		msg = "memory usage for virtual pods cgroup exceeded threshold"
+	} else {
+		msg = "memory usage for cgroup exceeded threshold"
+	}
 	entry := logrus.WithFields(logrus.Fields{
 		"gcsStartTime": startTime,
 		"time":         time.Now(),
@@ -294,40 +300,9 @@ func main() {
 	// Continuously log /dev/kmsg
 	go kmsg.ReadForever(kmsg.LogLevel(*kmsgLogLevel))

-	tport := &transport.VsockTransport{}
-	rtime, err := runc.NewRuntime(baseLogPath)
-	if err != nil {
-		logrus.WithError(err).Fatal("failed to initialize new runc runtime")
-	}
-	mux := bridge.NewBridgeMux()
-	b := bridge.Bridge{
-		Handler:  mux,
-		EnableV4: *v4,
-	}
-	h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
-	b.AssignHandlers(mux, h)
-
-	var bridgeIn io.ReadCloser
-	var bridgeOut io.WriteCloser
-	if *useInOutErr {
-		bridgeIn = os.Stdin
-		bridgeOut = os.Stdout
-	} else {
-		const commandPort uint32 = 0x40000000
-		bridgeCon, err := tport.Dial(commandPort)
-		if err != nil {
-			logrus.WithFields(logrus.Fields{
-				"port":          commandPort,
-				logrus.ErrorKey: err,
-			}).Fatal("failed to dial host vsock connection")
-		}
-		bridgeIn = bridgeCon
-		bridgeOut = bridgeCon
-	}
-
 	// Setup the UVM cgroups to protect against a workload taking all available
-	// memory and causing the GCS to malfunction we create two cgroups: gcs,
-	// containers.
+	// memory and causing the GCS to malfunction we create cgroups: gcs,
+	// containers, and virtual-pods for multi-pod support.
 	//

 	// Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy
@@ -357,6 +332,18 @@ func main() {
 	}
 	defer containersControl.Delete() //nolint:errcheck

+	// Create virtual-pods cgroup hierarchy for multi-pod support
+	// This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID}
+	virtualPodsControl, err := cgroups.New(cgroups.StaticPath("/containers/virtual-pods"), &oci.LinuxResources{
+		Memory: &oci.LinuxMemory{
+			Limit: &containersLimit, // Share the same limit as containers
+		},
+	})
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup")
+	}
+	defer virtualPodsControl.Delete() //nolint:errcheck
+
 	gcsControl, err := cgroups.New(cgroups.StaticPath("/gcs"), &oci.LinuxResources{})
 	if err != nil {
 		logrus.WithError(err).Fatal("failed to create gcs cgroup")
@@ -366,6 +353,39 @@ func main() {
 		logrus.WithError(err).Fatal("failed add gcs pid to gcs cgroup")
 	}

+	tport := &transport.VsockTransport{}
+	rtime, err := runc.NewRuntime(baseLogPath)
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to initialize new runc runtime")
+	}
+	mux := bridge.NewBridgeMux()
+	b := bridge.Bridge{
+		Handler:  mux,
+		EnableV4: *v4,
+	}
+	h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter)
+	// Initialize virtual pod support in the host
+	h.InitializeVirtualPodSupport(virtualPodsControl)
+	b.AssignHandlers(mux, h)
+
+	var bridgeIn io.ReadCloser
+	var bridgeOut io.WriteCloser
+	if *useInOutErr {
+		bridgeIn = os.Stdin
+		bridgeOut = os.Stdout
+	} else {
+		const commandPort uint32 = 0x40000000
+		bridgeCon, err := tport.Dial(commandPort)
+		if err != nil {
+			logrus.WithFields(logrus.Fields{
+				"port":          commandPort,
+				logrus.ErrorKey: err,
+			}).Fatal("failed to dial host vsock connection")
+		}
+		bridgeIn = bridgeCon
+		bridgeOut = bridgeCon
+	}
+
 	event := cgroups.MemoryThresholdEvent(*gcsMemLimitBytes, false)
 	gefd, err := gcsControl.RegisterMemoryEvent(event)
 	if err != nil {
@@ -381,6 +401,14 @@ func main() {
 	oomFile := os.NewFile(oom, "cefd")
 	defer oomFile.Close()

+	// Setup OOM monitoring for virtual-pods cgroup
+	virtualPodsOom, err := virtualPodsControl.OOMEventFD()
+	if err != nil {
+		logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd")
+	}
+	virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd")
+	defer virtualPodsOomFile.Close()
+
 	// time synchronization service
 	if !(*disableTimeSync) {
 		if err = startTimeSyncService(); err != nil {
@@ -390,6 +418,7 @@ func main() {

 	go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl)
 	go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl)
+	go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl)
 	err = b.ListenAndServe(bridgeIn, bridgeOut)
 	if err != nil {
 		logrus.WithFields(logrus.Fields{
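
The readMemoryEvents helper extended in the first two hunks blocks on a cgroup eventfd and logs each threshold crossing; the new goroutine gives the virtual-pods cgroup the same treatment as /gcs and /containers. Most of the function body lies outside this diff, so here is a minimal sketch of the loop it implements, assuming only the eventfd(2) contract (each read returns an 8-byte counter, host byte order — little-endian assumed) plus the message selection visible in the hunk:

	// Sketch of the eventfd consumption loop behind readMemoryEvents; the
	// threshold messages are from the diff, everything else is assumed.
	package memwatch

	import (
		"encoding/binary"
		"os"
		"strings"

		"github.com/sirupsen/logrus"
	)

	func watchMemoryEvents(efdFile *os.File, cgName string) {
		buf := make([]byte, 8) // eventfd(2): each read yields a 64-bit event counter
		for {
			if _, err := efdFile.Read(buf); err != nil {
				logrus.WithError(err).WithField("cgroup", cgName).Error("failed to read memory event")
				return
			}
			// Mirror the diff's message selection for virtual-pods cgroups.
			msg := "memory usage for cgroup exceeded threshold"
			if strings.HasPrefix(cgName, "/virtual-pods") {
				msg = "memory usage for virtual pods cgroup exceeded threshold"
			}
			logrus.WithFields(logrus.Fields{
				"cgroup": cgName,
				"count":  binary.LittleEndian.Uint64(buf), // events since last read
			}).Warn(msg)
		}
	}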

internal/guest/runtime/hcsv2/container.go

Lines changed: 16 additions & 4 deletions
@@ -30,6 +30,7 @@ import (
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/protocol/guestrequest"
 	"github.com/Microsoft/hcsshim/internal/protocol/guestresource"
+	"github.com/Microsoft/hcsshim/pkg/annotations"
 )

 // containerStatus has been introduced to enable parallel container creation
@@ -193,13 +194,24 @@ func (c *Container) Delete(ctx context.Context) error {
 	entity := log.G(ctx).WithField(logfields.ContainerID, c.id)
 	entity.Info("opengcs::Container::Delete")
 	if c.isSandbox {
-		// remove user mounts in sandbox container
-		if err := storage.UnmountAllInPath(ctx, specGuest.SandboxMountsDir(c.id), true); err != nil {
+		// Check if this is a virtual pod
+		virtualSandboxID := ""
+		if c.spec != nil && c.spec.Annotations != nil {
+			virtualSandboxID = c.spec.Annotations[annotations.VirtualPodID]
+		}
+
+		// remove user mounts in sandbox container - use virtual pod aware paths
+		if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxMountsDir(c.id, virtualSandboxID), true); err != nil {
 			entity.WithError(err).Error("failed to unmount sandbox mounts")
 		}

-		// remove hugepages mounts in sandbox container
-		if err := storage.UnmountAllInPath(ctx, specGuest.HugePagesMountsDir(c.id), true); err != nil {
+		// remove user mounts in tmpfs sandbox container - use virtual pod aware paths
+		if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxTmpfsMountsDir(c.id, virtualSandboxID), true); err != nil {
+			entity.WithError(err).Error("failed to unmount tmpfs sandbox mounts")
+		}
+
+		// remove hugepages mounts in sandbox container - use virtual pod aware paths
+		if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareHugePagesMountsDir(c.id, virtualSandboxID), true); err != nil {
 			entity.WithError(err).Error("failed to unmount hugepages mounts")
 		}
 	}
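
The VirtualPodAware* helpers called here live in internal/guest/spec, which is not part of this view; only their names and two-argument signatures appear in the diff. A hypothetical sketch of the shape such a helper likely has — the base directory and layout are assumptions:

	// Hypothetical sketch; the real helper is in internal/guest/spec and is not
	// shown in this commit. The base path and directory layout are assumptions.
	package spec

	import "path/filepath"

	const guestSandboxRoot = "/run/gcs/c" // assumed base directory

	// VirtualPodAwareSandboxMountsDir nests the sandbox mounts directory under
	// the virtual pod's directory when virtualSandboxID is set, giving each
	// virtual pod an isolated mount root inside the shared UVM.
	func VirtualPodAwareSandboxMountsDir(id, virtualSandboxID string) string {
		if virtualSandboxID != "" {
			return filepath.Join(guestSandboxRoot, virtualSandboxID, id, "sandboxMounts")
		}
		return filepath.Join(guestSandboxRoot, id, "sandboxMounts")
	}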

internal/guest/runtime/hcsv2/sandbox_container.go

Lines changed: 52 additions & 28 deletions
@@ -15,20 +15,21 @@ import (

 	"github.com/Microsoft/hcsshim/internal/guest/network"
 	specGuest "github.com/Microsoft/hcsshim/internal/guest/spec"
+	"github.com/Microsoft/hcsshim/internal/log"
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/pkg/annotations"
 )

-func getSandboxHostnamePath(id string) string {
-	return filepath.Join(specGuest.SandboxRootDir(id), "hostname")
+func getSandboxHostnamePath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hostname")
 }

-func getSandboxHostsPath(id string) string {
-	return filepath.Join(specGuest.SandboxRootDir(id), "hosts")
+func getSandboxHostsPath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hosts")
 }

-func getSandboxResolvPath(id string) string {
-	return filepath.Join(specGuest.SandboxRootDir(id), "resolv.conf")
+func getSandboxResolvPath(id, virtualSandboxID string) string {
+	return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "resolv.conf")
 }

 func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) {
@@ -37,8 +38,11 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
 	defer func() { oc.SetSpanStatus(span, err) }()
 	span.AddAttributes(trace.StringAttribute("cid", id))

-	// Generate the sandbox root dir
-	rootDir := specGuest.SandboxRootDir(id)
+	// Check if this is a virtual pod to use appropriate root directory
+	virtualSandboxID := spec.Annotations[annotations.VirtualPodID]
+
+	// Generate the sandbox root dir - virtual pod aware
+	rootDir := specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID)
 	if err := os.MkdirAll(rootDir, 0755); err != nil {
 		return errors.Wrapf(err, "failed to create sandbox root directory %q", rootDir)
 	}
@@ -58,39 +62,53 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
 		}
 	}

-	sandboxHostnamePath := getSandboxHostnamePath(id)
+	sandboxHostnamePath := getSandboxHostnamePath(id, virtualSandboxID)
 	if err := os.WriteFile(sandboxHostnamePath, []byte(hostname+"\n"), 0644); err != nil {
 		return errors.Wrapf(err, "failed to write hostname to %q", sandboxHostnamePath)
 	}

 	// Write the hosts
 	sandboxHostsContent := network.GenerateEtcHostsContent(ctx, hostname)
-	sandboxHostsPath := getSandboxHostsPath(id)
+	sandboxHostsPath := getSandboxHostsPath(id, virtualSandboxID)
 	if err := os.WriteFile(sandboxHostsPath, []byte(sandboxHostsContent), 0644); err != nil {
 		return errors.Wrapf(err, "failed to write sandbox hosts to %q", sandboxHostsPath)
 	}

+	// Check if this is a virtual pod sandbox container by comparing container ID with virtual pod ID
+	isVirtualPodSandbox := virtualSandboxID != "" && id == virtualSandboxID
+	if strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") || isVirtualPodSandbox {
+		ns := GetOrAddNetworkNamespace(specGuest.GetNetworkNamespaceID(spec))
+		err := ns.Sync(ctx)
+		if err != nil {
+			return err
+		}
+	}
 	// Write resolv.conf
 	ns, err := getNetworkNamespace(specGuest.GetNetworkNamespaceID(spec))
 	if err != nil {
-		return err
-	}
-	var searches, servers []string
-	for _, n := range ns.Adapters() {
-		if len(n.DNSSuffix) > 0 {
-			searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ","))
+		if !strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") {
+			return err
 		}
-		if len(n.DNSServerList) > 0 {
-			servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ","))
+		// Networking is skipped, do not error out
+		log.G(ctx).Infof("setupSandboxContainerSpec: Did not find NS spec %v, err %v", spec, err)
+	} else {
+		var searches, servers []string
+		for _, n := range ns.Adapters() {
+			if len(n.DNSSuffix) > 0 {
+				searches = network.MergeValues(searches, strings.Split(n.DNSSuffix, ","))
+			}
+			if len(n.DNSServerList) > 0 {
+				servers = network.MergeValues(servers, strings.Split(n.DNSServerList, ","))
+			}
+		}
+		resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil)
+		if err != nil {
+			return errors.Wrap(err, "failed to generate sandbox resolv.conf content")
+		}
+		sandboxResolvPath := getSandboxResolvPath(id, virtualSandboxID)
+		if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil {
+			return errors.Wrap(err, "failed to write sandbox resolv.conf")
 		}
-	}
-	resolvContent, err := network.GenerateResolvConfContent(ctx, searches, servers, nil)
-	if err != nil {
-		return errors.Wrap(err, "failed to generate sandbox resolv.conf content")
-	}
-	sandboxResolvPath := getSandboxResolvPath(id)
-	if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil {
-		return errors.Wrap(err, "failed to write sandbox resolv.conf")
 	}

 	// User.Username is generally only used on Windows, but as there's no (easy/fast at least) way to grab
@@ -113,8 +131,14 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (
 	// also has a concept of a sandbox/shm file when the IPC NamespaceMode !=
 	// NODE.

-	// Force the parent cgroup into our /containers root
-	spec.Linux.CgroupsPath = "/containers/" + id
+	// Set cgroup path - check if this is a virtual pod
+	if virtualSandboxID != "" {
+		// Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID
+		spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID
+	} else {
+		// Traditional sandbox goes under /containers
+		spec.Linux.CgroupsPath = "/containers/" + id
+	}

 	// Clear the windows section as we dont want to forward to runc
 	spec.Windows = nil
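
Taken together, the sandbox detection and cgroup selection above reduce to two small rules, restated here as standalone Go (grounded in the hunks above; annotations.VirtualPodID is the only external reference):

	package sketch

	import "github.com/Microsoft/hcsshim/pkg/annotations"

	// A container whose VirtualPodID annotation equals its own ID is treated as
	// the virtual pod's sandbox (per the commit message and setupSandboxContainerSpec).
	func isVirtualPodSandbox(id string, annots map[string]string) bool {
		virtualSandboxID := annots[annotations.VirtualPodID]
		return virtualSandboxID != "" && id == virtualSandboxID
	}

	// cgroupPathFor mirrors the CgroupsPath selection in the final hunk: every
	// container of a virtual pod shares /containers/virtual-pods/{virtualSandboxID},
	// while traditional sandboxes keep a per-container path.
	func cgroupPathFor(id, virtualSandboxID string) string {
		if virtualSandboxID != "" {
			return "/containers/virtual-pods/" + virtualSandboxID
		}
		return "/containers/" + id
	}

Pooling all of a virtual pod's containers into one cgroup means memory accounting, and the OOM eventfd added in cmd/gcs/main.go, operate per pod rather than per container.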
