Skip to content

Commit 938693b

Browse files
authored
Merge pull request #4920 from cyphar/1.4-cgroup-clone-into-cgroup
[1.4] runc exec: use CLONE_INTO_CGROUP
2 parents e470a84 + d792f9f commit 938693b

File tree

14 files changed

+243
-66
lines changed

14 files changed

+243
-66
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ require (
1414
github.com/moby/sys/user v0.4.0
1515
github.com/moby/sys/userns v0.1.0
1616
github.com/mrunalp/fileutils v0.5.1
17-
github.com/opencontainers/cgroups v0.0.4
17+
github.com/opencontainers/cgroups v0.0.5
1818
github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0
1919
github.com/opencontainers/selinux v1.12.0
2020
github.com/seccomp/libseccomp-golang v0.11.1

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g
4444
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
4545
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
4646
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
47-
github.com/opencontainers/cgroups v0.0.4 h1:XVj8P/IHVms/j+7eh8ggdkTLAxjz84ZzuFyGoE28DR4=
48-
github.com/opencontainers/cgroups v0.0.4/go.mod h1:s8lktyhlGUqM7OSRL5P7eAW6Wb+kWPNvt4qvVfzA5vs=
47+
github.com/opencontainers/cgroups v0.0.5 h1:DRITAqcOnY0uSBzIpt1RYWLjh5DPDiqUs4fY6Y0ktls=
48+
github.com/opencontainers/cgroups v0.0.5/go.mod h1:oWVzJsKK0gG9SCRBfTpnn16WcGEqDI8PAcpMGbqWxcs=
4949
github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0 h1:RLn0YfUWkiqPGtgUANvJrcjIkCHGRl3jcz/c557M28M=
5050
github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
5151
github.com/opencontainers/selinux v1.12.0 h1:6n5JV4Cf+4y0KNXW48TLj5DwfXpvWlxXplUkdTrmPb8=

libcontainer/container_linux.go

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"io"
88
"os"
99
"os/exec"
10-
"path"
1110
"path/filepath"
1211
"reflect"
1312
"strconv"
@@ -655,40 +654,10 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm
655654
bootstrapData: data,
656655
container: c,
657656
},
658-
cgroupPaths: state.CgroupPaths,
659657
rootlessCgroups: c.config.RootlessCgroups,
660658
intelRdtPath: state.IntelRdtPath,
661659
initProcessPid: state.InitProcessPid,
662660
}
663-
if len(p.SubCgroupPaths) > 0 {
664-
if add, ok := p.SubCgroupPaths[""]; ok {
665-
// cgroup v1: using the same path for all controllers.
666-
// cgroup v2: the only possible way.
667-
for k := range proc.cgroupPaths {
668-
subPath := path.Join(proc.cgroupPaths[k], add)
669-
if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
670-
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
671-
}
672-
proc.cgroupPaths[k] = subPath
673-
}
674-
// cgroup v2: do not try to join init process's cgroup
675-
// as a fallback (see (*setnsProcess).start).
676-
proc.initProcessPid = 0
677-
} else {
678-
// Per-controller paths.
679-
for ctrl, add := range p.SubCgroupPaths {
680-
if val, ok := proc.cgroupPaths[ctrl]; ok {
681-
subPath := path.Join(val, add)
682-
if !strings.HasPrefix(subPath, val) {
683-
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
684-
}
685-
proc.cgroupPaths[ctrl] = subPath
686-
} else {
687-
return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
688-
}
689-
}
690-
}
691-
}
692661
return proc, nil
693662
}
694663

libcontainer/container_linux_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ func (m *mockCgroupManager) Apply(pid int) error {
3232
return nil
3333
}
3434

35+
func (m *mockCgroupManager) AddPid(_ string, _ int) error {
36+
return nil
37+
}
38+
3539
func (m *mockCgroupManager) Set(_ *cgroups.Resources) error {
3640
return nil
3741
}

libcontainer/process_linux.go

Lines changed: 137 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,17 @@ import (
66
"errors"
77
"fmt"
88
"io"
9+
"maps"
910
"net"
1011
"os"
1112
"os/exec"
13+
"path"
1214
"path/filepath"
1315
"runtime"
1416
"strconv"
17+
"strings"
1518
"sync"
19+
"syscall"
1620
"time"
1721

1822
"github.com/opencontainers/runtime-spec/specs-go"
@@ -153,7 +157,6 @@ func (p *containerProcess) wait() (*os.ProcessState, error) { //nolint:unparam
153157

154158
type setnsProcess struct {
155159
containerProcess
156-
cgroupPaths map[string]string
157160
rootlessCgroups bool
158161
intelRdtPath string
159162
initProcessPid int
@@ -244,12 +247,142 @@ func (p *setnsProcess) setFinalCPUAffinity() error {
244247
return nil
245248
}
246249

250+
func (p *setnsProcess) addIntoCgroupV1() error {
251+
if sub, ok := p.process.SubCgroupPaths[""]; ok || len(p.process.SubCgroupPaths) == 0 {
252+
// Either same sub-cgroup for all paths, or no sub-cgroup.
253+
err := p.manager.AddPid(sub, p.pid())
254+
if err != nil && !p.rootlessCgroups {
255+
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
256+
}
257+
return nil
258+
}
259+
260+
// Per-controller sub-cgroup paths. Not supported by AddPid (or systemd),
261+
// so we have to calculate and check all sub-cgroup paths, and write
262+
// directly to cgroupfs.
263+
paths := maps.Clone(p.manager.GetPaths())
264+
for ctrl, sub := range p.process.SubCgroupPaths {
265+
base, ok := paths[ctrl]
266+
if !ok {
267+
return fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
268+
}
269+
cgPath := path.Join(base, sub)
270+
if !strings.HasPrefix(cgPath, base) {
271+
return fmt.Errorf("bad sub cgroup path: %s", sub)
272+
}
273+
paths[ctrl] = cgPath
274+
}
275+
276+
for _, path := range paths {
277+
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
278+
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
279+
}
280+
}
281+
282+
return nil
283+
}
284+
285+
func (p *setnsProcess) addIntoCgroupV2() error {
286+
sub := p.process.SubCgroupPaths[""]
287+
err := p.manager.AddPid(sub, p.pid())
288+
if err != nil && !p.rootlessCgroups {
289+
// On cgroup v2 + nesting + domain controllers, adding to initial cgroup may fail with EBUSY.
290+
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
291+
// Try to join the cgroup of InitProcessPid, unless sub-cgroup is explicitly set.
292+
if p.initProcessPid != 0 && sub == "" {
293+
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
294+
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
295+
if initCgErr == nil {
296+
if initCgPath, ok := initCg[""]; ok {
297+
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
298+
logrus.Debugf("adding pid %d to cgroup failed (%v), attempting to join %s",
299+
p.pid(), err, initCgDirpath)
300+
// NOTE: initCgDirpath is not guaranteed to exist because we didn't pause the container.
301+
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
302+
}
303+
}
304+
}
305+
if err != nil {
306+
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
307+
}
308+
}
309+
310+
return nil
311+
}
312+
313+
func (p *setnsProcess) addIntoCgroup() error {
314+
if p.cmd.SysProcAttr.UseCgroupFD {
315+
// We've used cgroupfd successfully, so the process is
316+
// already in the proper cgroup, nothing to do here.
317+
return nil
318+
}
319+
if cgroups.IsCgroup2UnifiedMode() {
320+
return p.addIntoCgroupV2()
321+
}
322+
return p.addIntoCgroupV1()
323+
}
324+
325+
// prepareCgroupFD sets up p.cmd to use clone3 with CLONE_INTO_CGROUP
326+
// to join cgroup early, in p.cmd.Start. Returns an *os.File which
327+
// must be closed by the caller after p.cmd.Start returns.
328+
func (p *setnsProcess) prepareCgroupFD() (*os.File, error) {
329+
if !cgroups.IsCgroup2UnifiedMode() {
330+
return nil, nil
331+
}
332+
333+
base := p.manager.Path("")
334+
if base == "" { // No cgroup to join.
335+
return nil, nil
336+
}
337+
sub := ""
338+
if p.process.SubCgroupPaths != nil {
339+
sub = p.process.SubCgroupPaths[""]
340+
}
341+
cgroup := path.Join(base, sub)
342+
if !strings.HasPrefix(cgroup, base) {
343+
return nil, fmt.Errorf("bad sub cgroup path: %s", sub)
344+
}
345+
346+
fd, err := cgroups.OpenFile(base, sub, unix.O_PATH|unix.O_DIRECTORY|unix.O_CLOEXEC)
347+
if err != nil {
348+
if p.rootlessCgroups {
349+
return nil, nil
350+
}
351+
return nil, fmt.Errorf("can't open cgroup: %w", err)
352+
}
353+
354+
logrus.Debugf("using CLONE_INTO_CGROUP %q", cgroup)
355+
if p.cmd.SysProcAttr == nil {
356+
p.cmd.SysProcAttr = &syscall.SysProcAttr{}
357+
}
358+
p.cmd.SysProcAttr.UseCgroupFD = true
359+
p.cmd.SysProcAttr.CgroupFD = int(fd.Fd())
360+
361+
return fd, nil
362+
}
363+
247364
func (p *setnsProcess) start() (retErr error) {
248365
defer p.comm.closeParent()
249366

367+
fd, err := p.prepareCgroupFD()
368+
if err != nil {
369+
return err
370+
}
371+
250372
// Get the "before" value of oom kill count.
251373
oom, _ := p.manager.OOMKillCount()
252-
err := p.startWithCPUAffinity()
374+
375+
err = p.startWithCPUAffinity()
376+
if fd != nil {
377+
fd.Close()
378+
}
379+
if err != nil && p.cmd.SysProcAttr.UseCgroupFD {
380+
logrus.Debugf("exec with CLONE_INTO_CGROUP failed: %v; retrying without", err)
381+
// SysProcAttr.CgroupFD is never used when UseCgroupFD is unset.
382+
p.cmd.SysProcAttr.UseCgroupFD = false
383+
err = p.startWithCPUAffinity()
384+
}
385+
253386
// Close the child-side of the pipes (controlled by child).
254387
p.comm.closeChild()
255388
if err != nil {
@@ -277,28 +410,8 @@ func (p *setnsProcess) start() (retErr error) {
277410
if err := p.execSetns(); err != nil {
278411
return fmt.Errorf("error executing setns process: %w", err)
279412
}
280-
for _, path := range p.cgroupPaths {
281-
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
282-
// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
283-
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
284-
// Try to join the cgroup of InitProcessPid.
285-
if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
286-
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
287-
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
288-
if initCgErr == nil {
289-
if initCgPath, ok := initCg[""]; ok {
290-
initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
291-
logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
292-
p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
293-
// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
294-
err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
295-
}
296-
}
297-
}
298-
if err != nil {
299-
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
300-
}
301-
}
413+
if err := p.addIntoCgroup(); err != nil {
414+
return err
302415
}
303416
// Set final CPU affinity right after the process is moved into container's cgroup.
304417
if err := p.setFinalCPUAffinity(); err != nil {

script/setup_rootless.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@ ssh-keygen -t ecdsa -N "" -f "$HOME/.ssh/rootless.key"
1212
sudo mkdir -p -m 0700 /home/rootless/.ssh
1313
sudo cp "$HOME/.ssh/rootless.key" /home/rootless/.ssh/id_ecdsa
1414
sudo cp "$HOME/.ssh/rootless.key.pub" /home/rootless/.ssh/authorized_keys
15-
sudo chown -R rootless.rootless /home/rootless
15+
sudo chown -R rootless:rootless /home/rootless

tests/integration/exec.bats

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -226,17 +226,17 @@ function check_exec_debug() {
226226
# Check we can't join parent cgroup.
227227
runc exec --cgroup ".." test_busybox cat /proc/self/cgroup
228228
[ "$status" -ne 0 ]
229-
[[ "$output" == *" .. is not a sub cgroup path"* ]]
229+
[[ "$output" == *"bad sub cgroup path"* ]]
230230

231231
# Check we can't join non-existing subcgroup.
232232
runc exec --cgroup nonexistent test_busybox cat /proc/self/cgroup
233233
[ "$status" -ne 0 ]
234-
[[ "$output" == *" adding pid "*"/nonexistent/cgroup.procs: no such file "* ]]
234+
[[ "$output" == *" adding pid "*"o such file or directory"* ]]
235235

236236
# Check we can't join non-existing subcgroup (for a particular controller).
237237
runc exec --cgroup cpu:nonexistent test_busybox cat /proc/self/cgroup
238238
[ "$status" -ne 0 ]
239-
[[ "$output" == *" adding pid "*"/nonexistent/cgroup.procs: no such file "* ]]
239+
[[ "$output" == *" adding pid "*"o such file or directory"* ]]
240240

241241
# Check we can't specify non-existent controller.
242242
runc exec --cgroup whaaat:/ test_busybox true
@@ -277,12 +277,12 @@ function check_exec_debug() {
277277
# Check we can't join parent cgroup.
278278
runc exec --cgroup ".." test_busybox cat /proc/self/cgroup
279279
[ "$status" -ne 0 ]
280-
[[ "$output" == *" .. is not a sub cgroup path"* ]]
280+
[[ "$output" == *"bad sub cgroup path"* ]]
281281

282282
# Check we can't join non-existing subcgroup.
283283
runc exec --cgroup nonexistent test_busybox cat /proc/self/cgroup
284284
[ "$status" -ne 0 ]
285-
[[ "$output" == *" adding pid "*"/nonexistent/cgroup.procs: no such file "* ]]
285+
[[ "$output" == *" cgroup"*"o such file or directory"* ]]
286286

287287
# Check we can join top-level cgroup (implicit).
288288
runc exec test_busybox grep '^0::/$' /proc/self/cgroup
@@ -318,7 +318,7 @@ function check_exec_debug() {
318318
# Check that --cgroup / disables the init cgroup fallback.
319319
runc exec --cgroup / test_busybox true
320320
[ "$status" -ne 0 ]
321-
[[ "$output" == *" adding pid "*" to cgroups"*"/cgroup.procs: device or resource busy"* ]]
321+
[[ "$output" == *" adding pid "*" to cgroups"*"evice or resource busy"* ]]
322322

323323
# Check that explicit --cgroup foobar works.
324324
runc exec --cgroup foobar test_busybox grep '^0::/foobar$' /proc/self/cgroup

vendor/github.com/opencontainers/cgroups/cgroups.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/opencontainers/cgroups/fs/fs.go

Lines changed: 29 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)