Skip to content

Commit 342ed8e

Browse files
author
jinda.ljd
committed
use clone3 for exec process creation to reduce cgroup lock contention
Currently, runc exec creates the child process by first cloning it and then writing its PID into cgroup.procs. Under high container density and numerous exec probes, this approach leads to heavy contention on the cgroup_threadgroup_rwsem read-write lock, potentially causing a system hang. This change uses the clone3 system call within the setnsProcess.start function to place the child into the cgroup as part of the clone operation itself (assuming cgroup v2 is in use). Doing so avoids the need to write PIDs to cgroup.procs directly, thereby bypassing the requirement to take the write lock and reducing the risk of lock contention.
1 parent b04031d commit 342ed8e

File tree

1 file changed

+28
-1
lines changed

1 file changed

+28
-1
lines changed

libcontainer/process_linux.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"runtime"
1414
"strconv"
1515
"sync"
16+
"syscall"
1617
"time"
1718

1819
"github.com/opencontainers/runtime-spec/specs-go"
@@ -203,6 +204,28 @@ func (p *setnsProcess) start() (retErr error) {
203204

204205
// Get the "before" value of oom kill count.
205206
oom, _ := p.manager.OOMKillCount()
207+
useClone3 := false
208+
if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
209+
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
210+
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
211+
if initCgErr == nil {
212+
if initCgPath, ok := initCg[""]; ok {
213+
useClone3 = true
214+
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
215+
fd, err := os.Open(initCgDirpath)
216+
if err != nil {
217+
return fmt.Errorf("error opening cgroup dir %q: %w", initCgDirpath, err)
218+
}
219+
defer fd.Close()
220+
if p.cmd.SysProcAttr == nil {
221+
p.cmd.SysProcAttr = &syscall.SysProcAttr{}
222+
}
223+
p.cmd.SysProcAttr.UseCgroupFD = true
224+
p.cmd.SysProcAttr.CgroupFD = int(fd.Fd())
225+
}
226+
}
227+
}
228+
206229
err := p.startWithCPUAffinity()
207230
// Close the child-side of the pipes (controlled by child).
208231
p.comm.closeChild()
@@ -232,7 +255,11 @@ func (p *setnsProcess) start() (retErr error) {
232255
return fmt.Errorf("error executing setns process: %w", err)
233256
}
234257
for _, path := range p.cgroupPaths {
235-
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
258+
procPid := p.pid()
259+
if useClone3 {
260+
procPid = -1
261+
}
262+
if err := cgroups.WriteCgroupProc(path, procPid); err != nil && !p.rootlessCgroups {
236263
// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
237264
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
238265
// Try to join the cgroup of InitProcessPid.

0 commit comments

Comments
 (0)