Skip to content

Commit 6e848b2

Browse files
committed
cmd, pkg/nvidia: Enable the proprietary NVIDIA driver
This uses the NVIDIA Container Toolkit [1] to generate a Container Device Interface specification [2] on the host during the 'enter' and 'run' commands. The specification is saved as JSON in the runtime directories at /run/toolbox or $XDG_RUNTIME_DIR/toolbox to make it available to the Toolbx container's entry point. The environment variables in the specification are directly passed to 'podman exec', while the hooks and mounts are handled by the entry point. Toolbx containers already have access to all the devices in the host operating system's /dev, and containers share the kernel space driver with the host. So, this is only about making the user space driver available to the container. It's done by bind mounting the files mentioned in the generated CDI specification from the host to the container, and then updating the container's dynamic linker cache. This neither depends on 'nvidia-ctk cdi generate' to generate the Container Device Interface specification nor on 'podman create --device' to consume it. The main problem with nvidia-ctk and 'podman create' is that the specification must be saved in /etc/cdi or /var/run/cdi, both of which require root access, for it to be visible to 'podman create --device'. Toolbx containers are often used rootless, so requiring root privileges for hardware support, something that's not necessary on the host, will be a problem. Secondly, updating the toolbox(1) binary won't let existing containers use the proprietary NVIDIA driver, because 'podman create' only affects new containers. Therefore, toolbox(1) uses the Go APIs used by 'nvidia-ctk cdi generate' and 'podman create --device' to generate, save, load and apply the CDI specification itself. This removes the need for root privileges due to /etc/cdi or /var/run/cdi, and makes the driver available to existing containers. Until Bats 1.10.0, 'run --keep-empty-lines' had a bug where it counted the trailing newline on the last line as a separate line [3]. 
However, Bats 1.10.0 is only available in Fedora >= 39 and is absent from Fedora 38. Based on an idea from Ievgen Popovych. References: [1] https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/ and https://github.com/NVIDIA/nvidia-container-toolkit; [2] https://github.com/cncf-tags/container-device-interface; [3] Bats commit 6648e2143bffb933 (bats-core/bats-core@6648e2143bffb933, bats-core/bats-core#708). See also: #116
1 parent ef98adb commit 6e848b2

23 files changed

+1070
-11
lines changed

src/cmd/initContainer.go

Lines changed: 182 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,12 @@ import (
3030
"github.com/containers/toolbox/pkg/shell"
3131
"github.com/containers/toolbox/pkg/utils"
3232
"github.com/fsnotify/fsnotify"
33+
"github.com/google/renameio/v2"
3334
"github.com/sirupsen/logrus"
3435
"github.com/spf13/cobra"
3536
"golang.org/x/sys/unix"
37+
"tags.cncf.io/container-device-interface/pkg/cdi"
38+
"tags.cncf.io/container-device-interface/specs-go"
3639
)
3740

3841
var (
@@ -264,6 +267,36 @@ func initContainer(cmd *cobra.Command, args []string) error {
264267
return err
265268
}
266269

270+
uidString := strconv.Itoa(initContainerFlags.uid)
271+
targetUser, err := user.LookupId(uidString)
272+
if err != nil {
273+
return fmt.Errorf("failed to look up user ID %s: %w", uidString, err)
274+
}
275+
276+
cdiFileForNvidia, err := getCDIFileForNvidia(targetUser)
277+
if err != nil {
278+
return err
279+
}
280+
281+
logrus.Debugf("Loading Container Device Interface for NVIDIA from file %s", cdiFileForNvidia)
282+
283+
cdiSpecForNvidia, err := loadCDISpecFrom(cdiFileForNvidia)
284+
if err != nil {
285+
if errors.Is(err, os.ErrNotExist) {
286+
logrus.Debugf("Loading Container Device Interface for NVIDIA: file %s not found",
287+
cdiFileForNvidia)
288+
} else {
289+
logrus.Debugf("Loading Container Device Interface for NVIDIA: failed: %s", err)
290+
return errors.New("failed to load Container Device Interface for NVIDIA")
291+
}
292+
}
293+
294+
if cdiSpecForNvidia != nil {
295+
if err := applyCDISpecForNvidia(cdiSpecForNvidia); err != nil {
296+
return err
297+
}
298+
}
299+
267300
if utils.PathExists("/etc/krb5.conf.d") && !utils.PathExists("/etc/krb5.conf.d/kcm_default_ccache") {
268301
logrus.Debug("Setting KCM as the default Kerberos credential cache")
269302

@@ -338,12 +371,6 @@ func initContainer(cmd *cobra.Command, args []string) error {
338371

339372
logrus.Debug("Finished initializing container")
340373

341-
uidString := strconv.Itoa(initContainerFlags.uid)
342-
targetUser, err := user.LookupId(uidString)
343-
if err != nil {
344-
return fmt.Errorf("failed to look up user ID %s: %w", uidString, err)
345-
}
346-
347374
toolboxRuntimeDirectory, err := utils.GetRuntimeDirectory(targetUser)
348375
if err != nil {
349376
return err
@@ -404,6 +431,83 @@ func initContainerHelp(cmd *cobra.Command, args []string) {
404431
}
405432
}
406433

434+
func applyCDISpecForNvidia(spec *specs.Spec) error {
435+
if spec == nil {
436+
panic("spec not specified")
437+
}
438+
439+
logrus.Debug("Applying Container Device Interface for NVIDIA")
440+
441+
for _, mount := range spec.ContainerEdits.Mounts {
442+
if err := (&cdi.Mount{Mount: mount}).Validate(); err != nil {
443+
logrus.Debugf("Applying Container Device Interface for NVIDIA: invalid mount: %s", err)
444+
return errors.New("invalid mount in Container Device Interface for NVIDIA")
445+
}
446+
447+
if mount.Type == "" {
448+
mount.Type = "bind"
449+
}
450+
451+
if mount.Type != "bind" {
452+
logrus.Debugf("Applying Container Device Interface for NVIDIA: unknown mount type %s",
453+
mount.Type)
454+
continue
455+
}
456+
457+
flags := strings.Join(mount.Options, ",")
458+
hostPath := filepath.Join(string(filepath.Separator), "run", "host", mount.HostPath)
459+
if err := mountBind(mount.ContainerPath, hostPath, flags); err != nil {
460+
logrus.Debugf("Applying Container Device Interface for NVIDIA: %s", err)
461+
return errors.New("failed to apply mount from Container Device Interface for NVIDIA")
462+
}
463+
}
464+
465+
for _, hook := range spec.ContainerEdits.Hooks {
466+
if err := (&cdi.Hook{Hook: hook}).Validate(); err != nil {
467+
logrus.Debugf("Applying Container Device Interface for NVIDIA: invalid hook: %s", err)
468+
return errors.New("invalid hook in Container Device Interface for NVIDIA")
469+
}
470+
471+
if hook.HookName != cdi.CreateContainerHook {
472+
logrus.Debugf("Applying Container Device Interface for NVIDIA: unknown hook name %s",
473+
hook.HookName)
474+
continue
475+
}
476+
477+
if len(hook.Args) < 3 ||
478+
hook.Args[0] != "nvidia-ctk" ||
479+
hook.Args[1] != "hook" ||
480+
hook.Args[2] != "update-ldcache" {
481+
logrus.Debugf("Applying Container Device Interface for NVIDIA: unknown hook arguments")
482+
continue
483+
}
484+
485+
var folderFlag bool
486+
var folders []string
487+
hookArgs := hook.Args[3:]
488+
489+
for _, hookArg := range hookArgs {
490+
if hookArg == "--folder" {
491+
folderFlag = true
492+
continue
493+
}
494+
495+
if folderFlag {
496+
folders = append(folders, hookArg)
497+
}
498+
499+
folderFlag = false
500+
}
501+
502+
if err := ldConfig("toolbx-nvidia.conf", folders); err != nil {
503+
logrus.Debugf("Applying Container Device Interface for NVIDIA: %s", err)
504+
return errors.New("failed to update ldcache for Container Device Interface for NVIDIA")
505+
}
506+
}
507+
508+
return nil
509+
}
510+
407511
func configureUsers(targetUserUid int, targetUser, targetUserHome, targetUserShell string, homeLink bool) error {
408512
if homeLink {
409513
if err := redirectPath("/home", "/var/home", true); err != nil {
@@ -517,6 +621,73 @@ func handleFileSystemEvent(event fsnotify.Event) {
517621
}
518622
}
519623

624+
func ldConfig(configFileBase string, dirs []string) error {
625+
logrus.Debug("Updating dynamic linker cache")
626+
627+
var args []string
628+
629+
if !utils.PathExists("/etc/ld.so.cache") {
630+
logrus.Debug("Updating dynamic linker cache: no /etc/ld.so.cache found")
631+
args = append(args, "-N")
632+
}
633+
634+
if utils.PathExists("/etc/ld.so.conf.d") {
635+
if len(dirs) > 0 {
636+
var builder strings.Builder
637+
builder.WriteString("# Written by Toolbx\n")
638+
builder.WriteString("# https://containertoolbx.org/\n")
639+
builder.WriteString("\n")
640+
641+
configured := make(map[string]struct{})
642+
643+
for _, dir := range dirs {
644+
if _, ok := configured[dir]; ok {
645+
continue
646+
}
647+
648+
configured[dir] = struct{}{}
649+
builder.WriteString(dir)
650+
builder.WriteString("\n")
651+
}
652+
653+
dirConfigString := builder.String()
654+
dirConfigBytes := []byte(dirConfigString)
655+
configFile := filepath.Join("/etc/ld.so.conf.d", configFileBase)
656+
if err := renameio.WriteFile(configFile, dirConfigBytes, 0644); err != nil {
657+
logrus.Debugf("Updating dynamic linker cache: failed to update configuration: %s", err)
658+
return errors.New("failed to update dynamic linker cache configuration")
659+
}
660+
}
661+
} else {
662+
logrus.Debug("Updating dynamic linker cache: no /etc/ld.so.conf.d found")
663+
args = append(args, dirs...)
664+
}
665+
666+
if err := shell.Run("ldconfig", nil, nil, nil, args...); err != nil {
667+
logrus.Debugf("Updating dynamic linker cache: failed: %s", err)
668+
return errors.New("failed to update dynamic linker cache")
669+
}
670+
671+
return nil
672+
}
673+
674+
func loadCDISpecFrom(path string) (*specs.Spec, error) {
675+
data, err := os.ReadFile(path)
676+
if err != nil {
677+
return nil, err
678+
}
679+
680+
spec, err := cdi.ParseSpec(data)
681+
if err != nil {
682+
return nil, err
683+
}
684+
if spec == nil {
685+
return nil, errors.New("missing data")
686+
}
687+
688+
return spec, nil
689+
}
690+
520691
func mountBind(containerPath, source, flags string) error {
521692
fi, err := os.Stat(source)
522693
if err != nil {
@@ -537,6 +708,11 @@ func mountBind(containerPath, source, flags string) error {
537708
} else if fileMode.IsRegular() {
538709
logrus.Debugf("Creating regular file %s", containerPath)
539710

711+
containerPathDir := filepath.Dir(containerPath)
712+
if err := os.MkdirAll(containerPathDir, 0755); err != nil {
713+
return fmt.Errorf("failed to create directory %s: %w", containerPathDir, err)
714+
}
715+
540716
containerPathFile, err := os.Create(containerPath)
541717
if err != nil && !os.IsExist(err) {
542718
return fmt.Errorf("failed to create regular file %s: %w", containerPath, err)

src/cmd/root.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"strings"
2727
"syscall"
2828

29+
"github.com/containers/toolbox/pkg/nvidia"
2930
"github.com/containers/toolbox/pkg/podman"
3031
"github.com/containers/toolbox/pkg/utils"
3132
"github.com/containers/toolbox/pkg/version"
@@ -382,6 +383,7 @@ func setUpLoggers() error {
382383
logrus.SetLevel(logLevel)
383384

384385
if rootFlags.verbose > 1 {
386+
nvidia.SetLogLevel(logLevel)
385387
rootFlags.logPodman = true
386388
}
387389

src/cmd/run.go

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package cmd
1919
import (
2020
"bufio"
2121
"context"
22+
"encoding/json"
2223
"errors"
2324
"fmt"
2425
"io"
@@ -28,15 +29,18 @@ import (
2829
"strings"
2930
"time"
3031

32+
"github.com/containers/toolbox/pkg/nvidia"
3133
"github.com/containers/toolbox/pkg/podman"
3234
"github.com/containers/toolbox/pkg/shell"
3335
"github.com/containers/toolbox/pkg/term"
3436
"github.com/containers/toolbox/pkg/utils"
3537
"github.com/fsnotify/fsnotify"
3638
"github.com/go-logfmt/logfmt"
39+
"github.com/google/renameio/v2"
3740
"github.com/sirupsen/logrus"
3841
"github.com/spf13/cobra"
3942
"golang.org/x/sys/unix"
43+
"tags.cncf.io/container-device-interface/specs-go"
4044
)
4145

4246
type collectEntryPointErrorFunc func(err error)
@@ -273,9 +277,31 @@ func runCommand(container string,
273277
return err
274278
}
275279

280+
var cdiEnviron []string
281+
282+
cdiSpecForNvidia, err := nvidia.GenerateCDISpec()
283+
if err != nil {
284+
if !errors.Is(err, nvidia.ErrPlatformUnsupported) {
285+
return err
286+
}
287+
} else {
288+
cdiEnviron = append(cdiEnviron, cdiSpecForNvidia.ContainerEdits.Env...)
289+
}
290+
276291
startContainerTimestamp := time.Unix(-1, 0)
277292

278293
if entryPointPID <= 0 {
294+
if cdiSpecForNvidia != nil {
295+
cdiFileForNvidia, err := getCDIFileForNvidia(currentUser)
296+
if err != nil {
297+
return err
298+
}
299+
300+
if err := saveCDISpecTo(cdiSpecForNvidia, cdiFileForNvidia); err != nil {
301+
return err
302+
}
303+
}
304+
279305
startContainerTimestamp = time.Now()
280306

281307
logrus.Debugf("Starting container %s", container)
@@ -317,6 +343,7 @@ func runCommand(container string,
317343
if err := runCommandWithFallbacks(container,
318344
preserveFDs,
319345
command,
346+
cdiEnviron,
320347
emitEscapeSequence,
321348
fallbackToBash); err != nil {
322349
return err
@@ -327,7 +354,7 @@ func runCommand(container string,
327354

328355
func runCommandWithFallbacks(container string,
329356
preserveFDs uint,
330-
command []string,
357+
command, environ []string,
331358
emitEscapeSequence, fallbackToBash bool) error {
332359

333360
logrus.Debug("Checking if 'podman exec' supports disabling the detach keys")
@@ -340,6 +367,12 @@ func runCommandWithFallbacks(container string,
340367
}
341368

342369
envOptions := utils.GetEnvOptionsForPreservedVariables()
370+
for _, env := range environ {
371+
logrus.Debugf("%s", env)
372+
envOption := "--env=" + env
373+
envOptions = append(envOptions, envOption)
374+
}
375+
343376
preserveFDsString := fmt.Sprint(preserveFDs)
344377

345378
var stderr io.Writer
@@ -828,6 +861,36 @@ func isUsePollingSet() bool {
828861
return true
829862
}
830863

864+
func saveCDISpecTo(spec *specs.Spec, path string) error {
865+
if path == "" {
866+
panic("path not specified")
867+
}
868+
869+
if spec == nil {
870+
panic("spec not specified")
871+
}
872+
873+
logrus.Debugf("Saving Container Device Interface to file %s", path)
874+
875+
if extension := filepath.Ext(path); extension != ".json" {
876+
panicMsg := fmt.Sprintf("path has invalid extension %s", extension)
877+
panic(panicMsg)
878+
}
879+
880+
data, err := json.MarshalIndent(spec, "", " ")
881+
if err != nil {
882+
logrus.Debugf("Saving Container Device Interface: failed to marshal JSON: %s", err)
883+
return errors.New("failed to marshal Container Device Interface to JSON")
884+
}
885+
886+
if err := renameio.WriteFile(path, data, 0644); err != nil {
887+
logrus.Debugf("Saving Container Device Interface: failed to write file: %s", err)
888+
return errors.New("failed to write Container Device Interface to file")
889+
}
890+
891+
return nil
892+
}
893+
831894
func showEntryPointLog(line string) error {
832895
var logLevel logrus.Level
833896
var logLevelFound bool

0 commit comments

Comments
 (0)