@@ -6,13 +6,17 @@ import (
66 "errors"
77 "fmt"
88 "io"
9+ "maps"
910 "net"
1011 "os"
1112 "os/exec"
13+ "path"
1214 "path/filepath"
1315 "runtime"
1416 "strconv"
17+ "strings"
1518 "sync"
19+ "syscall"
1620 "time"
1721
1822 "github.com/opencontainers/runtime-spec/specs-go"
@@ -153,7 +157,6 @@ func (p *containerProcess) wait() (*os.ProcessState, error) { //nolint:unparam
153157
154158type setnsProcess struct {
155159 containerProcess
156- cgroupPaths map [string ]string
157160 rootlessCgroups bool
158161 intelRdtPath string
159162 initProcessPid int
@@ -244,12 +247,142 @@ func (p *setnsProcess) setFinalCPUAffinity() error {
244247 return nil
245248}
246249
250+ func (p * setnsProcess ) addIntoCgroupV1 () error {
251+ if sub , ok := p .process .SubCgroupPaths ["" ]; ok || len (p .process .SubCgroupPaths ) == 0 {
252+ // Either same sub-cgroup for all paths, or no sub-cgroup.
253+ err := p .manager .AddPid (sub , p .pid ())
254+ if err != nil && ! p .rootlessCgroups {
255+ return fmt .Errorf ("error adding pid %d to cgroups: %w" , p .pid (), err )
256+ }
257+ return nil
258+ }
259+
260+ // Per-controller sub-cgroup paths. Not supported by AddPid (or systemd),
261+ // so we have to calculate and check all sub-cgroup paths, and write
262+ // directly to cgroupfs.
263+ paths := maps .Clone (p .manager .GetPaths ())
264+ for ctrl , sub := range p .process .SubCgroupPaths {
265+ base , ok := paths [ctrl ]
266+ if ! ok {
267+ return fmt .Errorf ("unknown controller %s in SubCgroupPaths" , ctrl )
268+ }
269+ cgPath := path .Join (base , sub )
270+ if ! strings .HasPrefix (cgPath , base ) {
271+ return fmt .Errorf ("bad sub cgroup path: %s" , sub )
272+ }
273+ paths [ctrl ] = cgPath
274+ }
275+
276+ for _ , path := range paths {
277+ if err := cgroups .WriteCgroupProc (path , p .pid ()); err != nil && ! p .rootlessCgroups {
278+ return fmt .Errorf ("error adding pid %d to cgroups: %w" , p .pid (), err )
279+ }
280+ }
281+
282+ return nil
283+ }
284+
285+ func (p * setnsProcess ) addIntoCgroupV2 () error {
286+ sub := p .process .SubCgroupPaths ["" ]
287+ err := p .manager .AddPid (sub , p .pid ())
288+ if err != nil && ! p .rootlessCgroups {
289+ // On cgroup v2 + nesting + domain controllers, adding to initial cgroup may fail with EBUSY.
290+ // https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
291+ // Try to join the cgroup of InitProcessPid, unless sub-cgroup is explicitly set.
292+ if p .initProcessPid != 0 && sub == "" {
293+ initProcCgroupFile := fmt .Sprintf ("/proc/%d/cgroup" , p .initProcessPid )
294+ initCg , initCgErr := cgroups .ParseCgroupFile (initProcCgroupFile )
295+ if initCgErr == nil {
296+ if initCgPath , ok := initCg ["" ]; ok {
297+ initCgDirpath := filepath .Join (fs2 .UnifiedMountpoint , initCgPath )
298+ logrus .Debugf ("adding pid %d to cgroup failed (%v), attempting to join %s" ,
299+ p .pid (), err , initCgDirpath )
300+ // NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
301+ err = cgroups .WriteCgroupProc (initCgDirpath , p .pid ())
302+ }
303+ }
304+ }
305+ if err != nil {
306+ return fmt .Errorf ("error adding pid %d to cgroups: %w" , p .pid (), err )
307+ }
308+ }
309+
310+ return nil
311+ }
312+
313+ func (p * setnsProcess ) addIntoCgroup () error {
314+ if p .cmd .SysProcAttr .UseCgroupFD {
315+ // We've used cgroupfd successfully, so the process is
316+ // already in the proper cgroup, nothing to do here.
317+ return nil
318+ }
319+ if cgroups .IsCgroup2UnifiedMode () {
320+ return p .addIntoCgroupV2 ()
321+ }
322+ return p .addIntoCgroupV1 ()
323+ }
324+
325+ // prepareCgroupFD sets up p.cmd to use clone3 with CLONE_INTO_CGROUP
326+ // to join cgroup early, in p.cmd.Start. Returns an *os.File which
327+ // must be closed by the caller after p.Cmd.Start return.
328+ func (p * setnsProcess ) prepareCgroupFD () (* os.File , error ) {
329+ if ! cgroups .IsCgroup2UnifiedMode () {
330+ return nil , nil
331+ }
332+
333+ base := p .manager .Path ("" )
334+ if base == "" { // No cgroup to join.
335+ return nil , nil
336+ }
337+ sub := ""
338+ if p .process .SubCgroupPaths != nil {
339+ sub = p .process .SubCgroupPaths ["" ]
340+ }
341+ cgroup := path .Join (base , sub )
342+ if ! strings .HasPrefix (cgroup , base ) {
343+ return nil , fmt .Errorf ("bad sub cgroup path: %s" , sub )
344+ }
345+
346+ fd , err := cgroups .OpenFile (base , sub , unix .O_PATH | unix .O_DIRECTORY | unix .O_CLOEXEC )
347+ if err != nil {
348+ if p .rootlessCgroups {
349+ return nil , nil
350+ }
351+ return nil , fmt .Errorf ("can't open cgroup: %w" , err )
352+ }
353+
354+ logrus .Debugf ("using CLONE_INTO_CGROUP %q" , cgroup )
355+ if p .cmd .SysProcAttr == nil {
356+ p .cmd .SysProcAttr = & syscall.SysProcAttr {}
357+ }
358+ p .cmd .SysProcAttr .UseCgroupFD = true
359+ p .cmd .SysProcAttr .CgroupFD = int (fd .Fd ())
360+
361+ return fd , nil
362+ }
363+
247364func (p * setnsProcess ) start () (retErr error ) {
248365 defer p .comm .closeParent ()
249366
367+ fd , err := p .prepareCgroupFD ()
368+ if err != nil {
369+ return err
370+ }
371+
250372 // Get the "before" value of oom kill count.
251373 oom , _ := p .manager .OOMKillCount ()
252- err := p .startWithCPUAffinity ()
374+
375+ err = p .startWithCPUAffinity ()
376+ if fd != nil {
377+ fd .Close ()
378+ }
379+ if err != nil && p .cmd .SysProcAttr .UseCgroupFD {
380+ logrus .Debugf ("exec with CLONE_INTO_CGROUP failed: %v; retrying without" , err )
381+ // SysProcAttr.CgroupFD is never used when UseCgroupFD is unset.
382+ p .cmd .SysProcAttr .UseCgroupFD = false
383+ err = p .startWithCPUAffinity ()
384+ }
385+
253386 // Close the child-side of the pipes (controlled by child).
254387 p .comm .closeChild ()
255388 if err != nil {
@@ -277,28 +410,8 @@ func (p *setnsProcess) start() (retErr error) {
277410 if err := p .execSetns (); err != nil {
278411 return fmt .Errorf ("error executing setns process: %w" , err )
279412 }
280- for _ , path := range p .cgroupPaths {
281- if err := cgroups .WriteCgroupProc (path , p .pid ()); err != nil && ! p .rootlessCgroups {
282- // On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
283- // https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
284- // Try to join the cgroup of InitProcessPid.
285- if cgroups .IsCgroup2UnifiedMode () && p .initProcessPid != 0 {
286- initProcCgroupFile := fmt .Sprintf ("/proc/%d/cgroup" , p .initProcessPid )
287- initCg , initCgErr := cgroups .ParseCgroupFile (initProcCgroupFile )
288- if initCgErr == nil {
289- if initCgPath , ok := initCg ["" ]; ok {
290- initCgDirpath := filepath .Join (fs2 .UnifiedMountpoint , initCgPath )
291- logrus .Debugf ("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)" ,
292- p .pid (), p .cgroupPaths , err , initCg , initCgDirpath )
293- // NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
294- err = cgroups .WriteCgroupProc (initCgDirpath , p .pid ())
295- }
296- }
297- }
298- if err != nil {
299- return fmt .Errorf ("error adding pid %d to cgroups: %w" , p .pid (), err )
300- }
301- }
413+ if err := p .addIntoCgroup (); err != nil {
414+ return err
302415 }
303416 // Set final CPU affinity right after the process is moved into container's cgroup.
304417 if err := p .setFinalCPUAffinity (); err != nil {
0 commit comments