From a302c81a04f127d9d52ffac04b0732fdd924168a Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Mon, 15 Sep 2025 15:53:58 +0200 Subject: [PATCH 1/6] storage/internal/tempdir: add StageAddition() Add a new function to stage additions. This should be used to extract the layer content into a temp directory without holding the storage lock and then under the lock just rename the directory into the final location to reduce the lock contention. Signed-off-by: Paul Holzinger --- storage/internal/tempdir/tempdir.go | 41 ++++++++++++++++ storage/internal/tempdir/tempdir_test.go | 60 ++++++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/storage/internal/tempdir/tempdir.go b/storage/internal/tempdir/tempdir.go index 6522c45d18..c5b001f8dd 100644 --- a/storage/internal/tempdir/tempdir.go +++ b/storage/internal/tempdir/tempdir.go @@ -91,6 +91,30 @@ type TempDir struct { counter uint64 } +// StageAddition is a temporary object which holds the information of where to +// put the data into and then use Commit() to move the data into the final location. +type StageAddition struct { + // Path is the temporary path. The path is not created so caller must create + // a file or directory on it in order to use Commit(). The path is only valid + // until Commit() is called or until the TempDir instance Cleanup() method is used. + Path string +} + +// CommitFunc is a function type that can be returned by operations +// which need to perform the commit operation later. +type CommitFunc func(destination string) error + +// Commit the staged content into its final destination by using os.Rename(). +// That means the dest must be on the same on the same fs as the root directory +// that was given to NewTempDir() and the dest must not exist yet. +// Commit must only be called once per instance returned from the +// StageAddition() call. +func (s *StageAddition) Commit(destination string) error { + err := os.Rename(s.Path, destination) + s.Path = "" // invalidate Path to avoid reuse + return err +} + // CleanupTempDirFunc is a function type that can be returned by operations // which need to perform cleanup actions later. type CleanupTempDirFunc func() error @@ -190,6 +214,23 @@ func NewTempDir(rootDir string) (*TempDir, error) { return td, nil } +// StageAddition creates a new temporary path that is returned as field in the StageAddition +// struct. The returned type has a type a the Commit() function to move the content from +// the temporary location to the final one. +// +// The caller MUST ensure .Cleanup() is called after Commit() otherwise the staged content +// will be deleted and the move will fail. +// If the TempDir has been cleaned up already, this method will return an error. +func (td *TempDir) StageAddition() (*StageAddition, error) { + if td.tempDirLock == nil { + return nil, fmt.Errorf("temp dir instance not initialized or already cleaned up") + } + fileName := fmt.Sprintf("%d-", td.counter) + "addition" + tmpAddPath := filepath.Join(td.tempDirPath, fileName) + td.counter++ + return &StageAddition{Path: tmpAddPath}, nil +} + // StageDeletion moves the specified file into the instance's temporary directory. // The temporary directory must already exist (created during NewTempDir). // Files are renamed with a counter-based prefix (e.g., "0-filename", "1-filename") to ensure uniqueness. diff --git a/storage/internal/tempdir/tempdir_test.go b/storage/internal/tempdir/tempdir_test.go index a556950524..dbfbe3ac1a 100644 --- a/storage/internal/tempdir/tempdir_test.go +++ b/storage/internal/tempdir/tempdir_test.go @@ -271,3 +271,63 @@ func TestTempDirFileNaming(t *testing.T) { assert.True(t, found, "Expected file %s not found", expectedName) } } + +func TestStageAddition(t *testing.T) { + rootDir := t.TempDir() + td, err := NewTempDir(rootDir) + require.NoError(t, err) + t.Cleanup(func() { + assert.NoError(t, td.Cleanup()) + }) + + sa1, err := td.StageAddition() + require.NoError(t, err) + // Path should not be created by StageAddition. + assert.NoFileExists(t, sa1.Path) + + // ensure we can create file + f, err := os.Create(sa1.Path) + require.NoError(t, err) + require.NoError(t, f.Close()) + + // need to use a dest file which does not exist yet + dest := filepath.Join(t.TempDir(), "file1") + + err = sa1.Commit(dest) + require.NoError(t, err) + assert.FileExists(t, dest) + assert.NoFileExists(t, sa1.Path) + + // now test the same with a directory + sa2, err := td.StageAddition() + require.NoError(t, err) + // Path should not be created by StageAddition. + assert.NoDirExists(t, sa2.Path) + + // ensure we can create a directory + err = os.Mkdir(sa2.Path, 0o755) + require.NoError(t, err) + + // need to use a dest which does not exist yet + dest = filepath.Join(t.TempDir(), "dir") + + err = sa2.Commit(dest) + require.NoError(t, err) + assert.DirExists(t, dest) + assert.NoDirExists(t, sa2.Path) + + // Commit the same stage addition struct again should error + err = sa2.Commit(dest) + require.Error(t, err) + + // Cleanup() should cleanup the temp paths from StageAddition + sa3, err := td.StageAddition() + require.NoError(t, err) + + err = os.Mkdir(sa3.Path, 0o755) + require.NoError(t, err) + + err = td.Cleanup() + require.NoError(t, err) + assert.NoDirExists(t, sa3.Path) +} From 8332a27f61590332e29537a0c35919344c6aa79b Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Wed, 8 Oct 2025 12:40:56 +0200 Subject: [PATCH 2/6] storage: simplify ApplyDiff() in overlay driver It is not clear to me when it will hit the code path there, by normal layer creation we always pass a valid parent so this branch is never reached AFAICT. Let's remove it and see if all tests still pass in podman, buildah and others... Signed-off-by: Paul Holzinger --- storage/drivers/overlay/overlay.go | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/storage/drivers/overlay/overlay.go b/storage/drivers/overlay/overlay.go index c08e060466..5e894216c1 100644 --- a/storage/drivers/overlay/overlay.go +++ b/storage/drivers/overlay/overlay.go @@ -2371,16 +2371,6 @@ func (d *Driver) DifferTarget(id string) (string, error) { // ApplyDiff applies the new layer into a root func (d *Driver) ApplyDiff(id, parent string, options graphdriver.ApplyDiffOpts) (size int64, err error) { - if !d.isParent(id, parent) { - if d.options.ignoreChownErrors { - options.IgnoreChownErrors = d.options.ignoreChownErrors - } - if d.options.forceMask != nil { - options.ForceMask = d.options.forceMask - } - return d.naiveDiff.ApplyDiff(id, parent, options) - } - idMappings := options.Mappings if idMappings == nil { idMappings = &idtools.IDMappings{} From ff03216a4a77a61424f682ab2dbe6e03ba3bd1dd Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Fri, 10 Oct 2025 14:34:40 +0200 Subject: [PATCH 3/6] overlay: add StartStagingDiffToApply() Add a function to apply the diff into a tmporary directory so we can do that unlcoked and only rename under the lock. Signed-off-by: Paul Holzinger --- storage/drivers/driver.go | 9 +++ storage/drivers/overlay/overlay.go | 86 ++++++++++++++++++++++--- storage/drivers/overlay/overlay_test.go | 3 + 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/storage/drivers/driver.go b/storage/drivers/driver.go index fed80751b5..e178ea0f67 100644 --- a/storage/drivers/driver.go +++ b/storage/drivers/driver.go @@ -299,6 +299,15 @@ type DriverWithDiffer interface { DifferTarget(id string) (string, error) } +// ApplyDiffStaging is an interface for driver who can apply the diff without holding the main storage lock. +// This API is experimental and can be changed without bumping the major version number. +type ApplyDiffStaging interface { + // StartStagingDiffToApply applies the layer in a temporary directory. This can be done without holding the storage lock. + StartStagingDiffToApply(options ApplyDiffOpts) (tempdir.CleanupTempDirFunc, *tempdir.StageAddition, int64, error) + // CommitStagedLayer commits the staged layer from StartStagingDiffToApply(). This must be done while the storage lock. + CommitStagedLayer(id string, commit *tempdir.StageAddition) error +} + // Capabilities defines a list of capabilities a driver may implement. // These capabilities are not required; however, they do determine how a // graphdriver can be used. diff --git a/storage/drivers/overlay/overlay.go b/storage/drivers/overlay/overlay.go index 5e894216c1..559bd55669 100644 --- a/storage/drivers/overlay/overlay.go +++ b/storage/drivers/overlay/overlay.go @@ -2369,21 +2369,91 @@ func (d *Driver) DifferTarget(id string) (string, error) { return d.getDiffPath(id) } -// ApplyDiff applies the new layer into a root -func (d *Driver) ApplyDiff(id, parent string, options graphdriver.ApplyDiffOpts) (size int64, err error) { - idMappings := options.Mappings - if idMappings == nil { - idMappings = &idtools.IDMappings{} +// StartStagingDiffToApply applies the new layer into a temporary directory. +// It returns a CleanupTempDirFunc which can nil or set regardless if the function return an error or not. +// CommitFunc is only set when there is no error returned and the int64 value returns the size of the layer. +// +// This API is experimental and can be changed without bumping the major version number. +func (d *Driver) StartStagingDiffToApply(options graphdriver.ApplyDiffOpts) (tempdir.CleanupTempDirFunc, *tempdir.StageAddition, int64, error) { + // FIXME: how to consolidate with d.getTempDirRoot(id) if we don't have the id? + tempDirRoot := filepath.Join(d.homeDirForImageStore(), tempDirName) + t, err := tempdir.NewTempDir(tempDirRoot) + if err != nil { + return nil, nil, -1, err + } + + sa, err := t.StageAddition() + if err != nil { + return nil, nil, -1, err + } + + size, err := d.applyDiff(sa.Path, options) + if err != nil { + return t.Cleanup, nil, -1, err + } + + return t.Cleanup, sa, size, nil +} + +// CommitStagedLayer that was created with StartStagingDiffToApply(). +// +// This API is experimental and can be changed without bumping the major version number. +func (d *Driver) CommitStagedLayer(id string, sa *tempdir.StageAddition) error { + applyDir, err := d.getDiffPath(id) + if err != nil { + return err + } + + // FIXME: Is there a better way to do this? + stat, err := system.Stat(applyDir) + if err != nil { + return err } + if err := os.Chmod(sa.Path, os.FileMode(stat.Mode())); err != nil { + return err + } + + if err := os.Chown(sa.Path, int(stat.UID()), int(stat.GID())); err != nil { + return err + } + if d.options.forceMask != nil { + st, err := idtools.GetContainersOverrideXattr(applyDir) + if err != nil { + return err + } + if err := idtools.SetContainersOverrideXattr(sa.Path, st); err != nil { + return err + } + } + + // The os.Rename() function used by CommitFunc errors when the target directory already + // exists, as such delete the dir. + if err := os.Remove(applyDir); err != nil { + return err + } + + return sa.Commit(applyDir) +} +// ApplyDiff applies the new layer into a root +func (d *Driver) ApplyDiff(id, parent string, options graphdriver.ApplyDiffOpts) (size int64, err error) { applyDir, err := d.getDiffPath(id) if err != nil { return 0, err } + return d.applyDiff(applyDir, options) +} + +// ApplyDiff applies the new layer into a root +func (d *Driver) applyDiff(target string, options graphdriver.ApplyDiffOpts) (size int64, err error) { + idMappings := options.Mappings + if idMappings == nil { + idMappings = &idtools.IDMappings{} + } - logrus.Debugf("Applying tar in %s", applyDir) + logrus.Debugf("Applying tar in %s", target) // Overlay doesn't need the parent id to apply the diff - if err := untar(options.Diff, applyDir, &archive.TarOptions{ + if err := untar(options.Diff, target, &archive.TarOptions{ UIDMaps: idMappings.UIDs(), GIDMaps: idMappings.GIDs(), IgnoreChownErrors: d.options.ignoreChownErrors, @@ -2394,7 +2464,7 @@ func (d *Driver) ApplyDiff(id, parent string, options graphdriver.ApplyDiffOpts) return 0, err } - return directory.Size(applyDir) + return directory.Size(target) } func (d *Driver) getComposefsData(id string) string { diff --git a/storage/drivers/overlay/overlay_test.go b/storage/drivers/overlay/overlay_test.go index dea86bb5b6..d1e96e7f14 100644 --- a/storage/drivers/overlay/overlay_test.go +++ b/storage/drivers/overlay/overlay_test.go @@ -17,6 +17,9 @@ import ( const driverName = "overlay" +// check that Driver correctly implements the ApplyDiffTemporary interface +var _ graphdriver.ApplyDiffStaging = &Driver{} + func init() { // Do not sure chroot to speed run time and allow archive // errors or hangs to be debugged directly from the test process. From a6fa507d8b41c6aac3ea4cefa5a93ff260f61de3 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Tue, 21 Oct 2025 15:48:39 +0200 Subject: [PATCH 4/6] storage: don't buffer tar split file in memory I cannot see any reason why we should buffer the full tar split content in memory before writing it. That layer is still mark partial at this point and the store is locked so there is no concurrent access either thus we do not need the atomic rename here. Signed-off-by: Paul Holzinger --- storage/layers.go | 48 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/storage/layers.go b/storage/layers.go index 64d3f5c72c..e1024c9369 100644 --- a/storage/layers.go +++ b/storage/layers.go @@ -31,6 +31,7 @@ import ( "go.podman.io/storage/pkg/ioutils" "go.podman.io/storage/pkg/lockfile" "go.podman.io/storage/pkg/mount" + "go.podman.io/storage/pkg/pools" "go.podman.io/storage/pkg/stringid" "go.podman.io/storage/pkg/system" "go.podman.io/storage/pkg/tarlog" @@ -2398,6 +2399,13 @@ func (r *layerStore) ApplyDiff(to string, diff io.Reader) (size int64, err error return r.applyDiffWithOptions(to, nil, diff) } +func createTarSplitFile(r *layerStore, layerID string) (*os.File, error) { + if err := os.MkdirAll(filepath.Dir(r.tspath(layerID)), 0o700); err != nil { + return nil, err + } + return os.OpenFile(r.tspath(layerID), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) +} + // Requires startWriting. func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, diff io.Reader) (size int64, err error) { if !r.lockfile.IsReadWrite() { @@ -2442,13 +2450,19 @@ func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, compressedCounter := ioutils.NewWriteCounter(compressedWriter) defragmented = io.TeeReader(defragmented, compressedCounter) - tsdata := bytes.Buffer{} + tarSplitFile, err := createTarSplitFile(r, layer.ID) + if err != nil { + return -1, err + } + defer tarSplitFile.Close() + tarSplitWriter := pools.BufioWriter32KPool.Get(tarSplitFile) + uidLog := make(map[uint32]struct{}) gidLog := make(map[uint32]struct{}) var uncompressedCounter *ioutils.WriteCounter size, err = func() (int64, error) { // A scope for defer - compressor, err := pgzip.NewWriterLevel(&tsdata, pgzip.BestSpeed) + compressor, err := pgzip.NewWriterLevel(tarSplitWriter, pgzip.BestSpeed) if err != nil { return -1, err } @@ -2496,12 +2510,13 @@ func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, return -1, err } - if err := os.MkdirAll(filepath.Dir(r.tspath(layer.ID)), 0o700); err != nil { - return -1, err + if err := tarSplitWriter.Flush(); err != nil { + return -1, fmt.Errorf("failed to flush tar split writer buffer: %w", err) } - if err := ioutils.AtomicWriteFile(r.tspath(layer.ID), tsdata.Bytes(), 0o600); err != nil { - return -1, err + if err := tarSplitFile.Sync(); err != nil { + return -1, fmt.Errorf("sync tar split file: %w", err) } + if compressedDigester != nil { compressedDigest = compressedDigester.Digest() } @@ -2597,10 +2612,16 @@ func (r *layerStore) applyDiffFromStagingDirectory(id string, diffOutput *driver } if diffOutput.TarSplit != nil { - tsdata := bytes.Buffer{} - compressor, err := pgzip.NewWriterLevel(&tsdata, pgzip.BestSpeed) + tarSplitFile, err := createTarSplitFile(r, layer.ID) if err != nil { - compressor = pgzip.NewWriter(&tsdata) + return err + } + defer tarSplitFile.Close() + tarSplitWriter := pools.BufioWriter32KPool.Get(tarSplitFile) + + compressor, err := pgzip.NewWriterLevel(tarSplitWriter, pgzip.BestSpeed) + if err != nil { + compressor = pgzip.NewWriter(tarSplitWriter) } if _, err := diffOutput.TarSplit.Seek(0, io.SeekStart); err != nil { return err @@ -2614,11 +2635,12 @@ func (r *layerStore) applyDiffFromStagingDirectory(id string, diffOutput *driver return err } compressor.Close() - if err := os.MkdirAll(filepath.Dir(r.tspath(layer.ID)), 0o700); err != nil { - return err + + if err := tarSplitWriter.Flush(); err != nil { + return fmt.Errorf("failed to flush tar split writer buffer: %w", err) } - if err := ioutils.AtomicWriteFile(r.tspath(layer.ID), tsdata.Bytes(), 0o600); err != nil { - return err + if err := tarSplitFile.Sync(); err != nil { + return fmt.Errorf("sync tar split file: %w", err) } } for k, v := range diffOutput.BigData { From 361278b423a78cda7a82aeb2d9371033a741ba6f Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Wed, 22 Oct 2025 18:21:04 +0200 Subject: [PATCH 5/6] storage: rework applyDiffWithOptions() Split it into multiple function to make it reusable without having a layer and so that it can be used unlocked see the following commits. Signed-off-by: Paul Holzinger --- storage/layers.go | 150 +++++++++++++++++++++++++++++----------------- 1 file changed, 94 insertions(+), 56 deletions(-) diff --git a/storage/layers.go b/storage/layers.go index e1024c9369..5f543ea144 100644 --- a/storage/layers.go +++ b/storage/layers.go @@ -203,6 +203,17 @@ type stagedLayerOptions struct { DiffOptions *drivers.ApplyDiffWithDifferOpts } +type applyDiffResult struct { + compressedDigest digest.Digest + compressedSize int64 + compressionType archive.Compression + uncompressedDigest digest.Digest + uncompressedSize int64 + size int64 + uids []uint32 + gids []uint32 +} + // roLayerStore wraps a graph driver, adding the ability to refer to layers by // name, and keeping track of parent-child relationships, along with a list of // all known layers. @@ -2406,37 +2417,27 @@ func createTarSplitFile(r *layerStore, layerID string) (*os.File, error) { return os.OpenFile(r.tspath(layerID), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) } -// Requires startWriting. -func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, diff io.Reader) (size int64, err error) { - if !r.lockfile.IsReadWrite() { - return -1, fmt.Errorf("not allowed to modify layer contents at %q: %w", r.layerdir, ErrStoreIsReadOnly) - } - - layer, ok := r.lookup(to) - if !ok { - return -1, ErrLayerUnknown - } - +func applyDiff(layerOptions *LayerOptions, diff io.Reader, tarSplitFile *os.File, applyDriverFunc func(io.Reader) (int64, error)) (*applyDiffResult, error) { header := make([]byte, 10240) n, err := diff.Read(header) if err != nil && err != io.EOF { - return -1, err + return nil, err } compression := archive.DetectCompression(header[:n]) defragmented := io.MultiReader(bytes.NewReader(header[:n]), diff) - // Decide if we need to compute digests - var compressedDigest, uncompressedDigest digest.Digest // = "" + result := applyDiffResult{} + var compressedDigester, uncompressedDigester digest.Digester // = nil if layerOptions != nil && layerOptions.OriginalDigest != "" && layerOptions.OriginalDigest.Algorithm() == digest.Canonical { - compressedDigest = layerOptions.OriginalDigest + result.compressedDigest = layerOptions.OriginalDigest } else { compressedDigester = digest.Canonical.Digester() } if layerOptions != nil && layerOptions.UncompressedDigest != "" && layerOptions.UncompressedDigest.Algorithm() == digest.Canonical { - uncompressedDigest = layerOptions.UncompressedDigest + result.uncompressedDigest = layerOptions.UncompressedDigest } else if compression != archive.Uncompressed { uncompressedDigester = digest.Canonical.Digester() } @@ -2450,18 +2451,13 @@ func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, compressedCounter := ioutils.NewWriteCounter(compressedWriter) defragmented = io.TeeReader(defragmented, compressedCounter) - tarSplitFile, err := createTarSplitFile(r, layer.ID) - if err != nil { - return -1, err - } - defer tarSplitFile.Close() tarSplitWriter := pools.BufioWriter32KPool.Get(tarSplitFile) uidLog := make(map[uint32]struct{}) gidLog := make(map[uint32]struct{}) var uncompressedCounter *ioutils.WriteCounter - size, err = func() (int64, error) { // A scope for defer + size, err := func() (int64, error) { // A scope for defer compressor, err := pgzip.NewWriterLevel(tarSplitWriter, pgzip.BestSpeed) if err != nil { return -1, err @@ -2495,63 +2491,105 @@ func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, if err != nil { return -1, err } - options := drivers.ApplyDiffOpts{ - Diff: payload, - Mappings: r.layerMappings(layer), - MountLabel: layer.MountLabel, - } - size, err := r.driver.ApplyDiff(layer.ID, layer.Parent, options) - if err != nil { - return -1, err - } - return size, err + + return applyDriverFunc(payload) }() if err != nil { - return -1, err + return nil, err } if err := tarSplitWriter.Flush(); err != nil { - return -1, fmt.Errorf("failed to flush tar split writer buffer: %w", err) - } - if err := tarSplitFile.Sync(); err != nil { - return -1, fmt.Errorf("sync tar split file: %w", err) + return nil, fmt.Errorf("failed to flush tar split writer buffer: %w", err) } if compressedDigester != nil { - compressedDigest = compressedDigester.Digest() + result.compressedDigest = compressedDigester.Digest() } if uncompressedDigester != nil { - uncompressedDigest = uncompressedDigester.Digest() + result.uncompressedDigest = uncompressedDigester.Digest() } - if uncompressedDigest == "" && compression == archive.Uncompressed { - uncompressedDigest = compressedDigest + if result.uncompressedDigest == "" && compression == archive.Uncompressed { + result.uncompressedDigest = result.compressedDigest } - updateDigestMap(&r.bycompressedsum, layer.CompressedDigest, compressedDigest, layer.ID) - layer.CompressedDigest = compressedDigest if layerOptions != nil && layerOptions.OriginalDigest != "" && layerOptions.OriginalSize != nil { - layer.CompressedSize = *layerOptions.OriginalSize + result.compressedSize = *layerOptions.OriginalSize } else { - layer.CompressedSize = compressedCounter.Count + result.compressedSize = compressedCounter.Count } - updateDigestMap(&r.byuncompressedsum, layer.UncompressedDigest, uncompressedDigest, layer.ID) - layer.UncompressedDigest = uncompressedDigest - layer.UncompressedSize = uncompressedCounter.Count - layer.CompressionType = compression - layer.UIDs = make([]uint32, 0, len(uidLog)) + result.uncompressedSize = uncompressedCounter.Count + result.compressionType = compression + + result.uids = make([]uint32, 0, len(uidLog)) for uid := range uidLog { - layer.UIDs = append(layer.UIDs, uid) + result.uids = append(result.uids, uid) } - slices.Sort(layer.UIDs) - layer.GIDs = make([]uint32, 0, len(gidLog)) + slices.Sort(result.uids) + result.gids = make([]uint32, 0, len(gidLog)) for gid := range gidLog { - layer.GIDs = append(layer.GIDs, gid) + result.gids = append(result.gids, gid) } - slices.Sort(layer.GIDs) + slices.Sort(result.gids) + + result.size = size + + return &result, err +} + +// Requires startWriting. +func (r *layerStore) applyDiffWithOptions(to string, layerOptions *LayerOptions, diff io.Reader) (int64, error) { + if !r.lockfile.IsReadWrite() { + return -1, fmt.Errorf("not allowed to modify layer contents at %q: %w", r.layerdir, ErrStoreIsReadOnly) + } + + layer, ok := r.lookup(to) + if !ok { + return -1, ErrLayerUnknown + } + + tarSplitFile, err := createTarSplitFile(r, layer.ID) + if err != nil { + return -1, err + } + defer tarSplitFile.Close() + + result, err := applyDiff(layerOptions, diff, tarSplitFile, func(payload io.Reader) (int64, error) { + options := drivers.ApplyDiffOpts{ + Diff: payload, + Mappings: r.layerMappings(layer), + MountLabel: layer.MountLabel, + } + return r.driver.ApplyDiff(layer.ID, layer.Parent, options) + }) + if err != nil { + return -1, err + } + + if err := tarSplitFile.Sync(); err != nil { + return -1, fmt.Errorf("sync tar split file: %w", err) + } + + applyDiffResultToLayer(r, layer, layerOptions, result) err = r.saveFor(layer) - return size, err + return result.size, err +} + +func applyDiffResultToLayer(r *layerStore, layer *Layer, layerOptions *LayerOptions, result *applyDiffResult) { + updateDigestMap(&r.bycompressedsum, layer.CompressedDigest, result.compressedDigest, layer.ID) + layer.CompressedDigest = result.compressedDigest + if layerOptions != nil && layerOptions.OriginalDigest != "" && layerOptions.OriginalSize != nil { + layer.CompressedSize = *layerOptions.OriginalSize + } else { + layer.CompressedSize = result.compressedSize + } + updateDigestMap(&r.byuncompressedsum, layer.UncompressedDigest, result.uncompressedDigest, layer.ID) + layer.UncompressedDigest = result.uncompressedDigest + layer.UncompressedSize = result.uncompressedSize + layer.CompressionType = result.compressionType + layer.UIDs = result.uids + layer.GIDs = result.gids } // Requires (startReading or?) startWriting. From e60d33903cfe6bb3f6d68f638432dd26c5244597 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Wed, 22 Oct 2025 20:13:54 +0200 Subject: [PATCH 6/6] storage: support staged addition of layers The extracting of the tar under the store lock is a bottleneck as many concurrent processes might hold the locks for a long time on big layers. To address this move the layer extraction before we take the locks if possible. Currently this only work when using the overlay driver as the implementation requires driver specifc details in order for a rename() to work. Signed-off-by: Paul Holzinger --- storage/layers.go | 125 ++++++++++++++++++++++++++++++++++++++++++---- storage/store.go | 32 +++++++++--- storage/userns.go | 2 +- 3 files changed, 141 insertions(+), 18 deletions(-) diff --git a/storage/layers.go b/storage/layers.go index 5f543ea144..ae84dbc8ca 100644 --- a/storage/layers.go +++ b/storage/layers.go @@ -199,8 +199,26 @@ type DiffOptions struct { // stagedLayerOptions are the options passed to .create to populate a staged // layer type stagedLayerOptions struct { + // These are used via the zstd:chunked pull paths DiffOutput *drivers.DriverWithDifferOutput DiffOptions *drivers.ApplyDiffWithDifferOpts + + // stagedLayerExtraction is used by the normal tar layer extraction. + stagedLayerExtraction *maybeStagedLayerExtraction +} + +// maybeStagedLayerExtraction is a helper to encapsulate details around extracting +// a layer potentially before we even take a look if the driver implements the +// ApplyDiffStaging interface. +type maybeStagedLayerExtraction struct { + // diff contains the tar archive, can be compressed + diff io.Reader + staging drivers.ApplyDiffStaging + result *applyDiffResult + + cleanupFuncs []tempdir.CleanupTempDirFunc + stagedTarSplit *tempdir.StageAddition + stagedLayer *tempdir.StageAddition } type applyDiffResult struct { @@ -300,7 +318,7 @@ type rwLayerStore interface { // underlying drivers do not themselves distinguish between writeable // and read-only layers. Returns the new layer structure and the size of the // diff which was applied to its parent to initialize its contents. - create(id string, parent *Layer, names []string, mountLabel string, options map[string]string, moreOptions *LayerOptions, writeable bool, diff io.Reader, slo *stagedLayerOptions) (*Layer, int64, error) + create(id string, parent *Layer, names []string, mountLabel string, options map[string]string, moreOptions *LayerOptions, writeable bool, slo *stagedLayerOptions) (*Layer, int64, error) // updateNames modifies names associated with a layer based on (op, names). updateNames(id string, names []string, op updateNameOperation) error @@ -1391,7 +1409,7 @@ func (r *layerStore) pickStoreLocation(volatile, writeable bool) layerLocations } // Requires startWriting. -func (r *layerStore) create(id string, parentLayer *Layer, names []string, mountLabel string, options map[string]string, moreOptions *LayerOptions, writeable bool, diff io.Reader, slo *stagedLayerOptions) (layer *Layer, size int64, err error) { +func (r *layerStore) create(id string, parentLayer *Layer, names []string, mountLabel string, options map[string]string, moreOptions *LayerOptions, writeable bool, slo *stagedLayerOptions) (layer *Layer, size int64, err error) { if moreOptions == nil { moreOptions = &LayerOptions{} } @@ -1580,15 +1598,28 @@ func (r *layerStore) create(id string, parentLayer *Layer, names []string, mount } size = -1 - if diff != nil { - if size, err = r.applyDiffWithOptions(layer.ID, moreOptions, diff); err != nil { - cleanupFailureContext = "applying layer diff" - return nil, -1, err - } - } else if slo != nil { - if err := r.applyDiffFromStagingDirectory(layer.ID, slo.DiffOutput, slo.DiffOptions); err != nil { - cleanupFailureContext = "applying staged directory diff" - return nil, -1, err + if slo != nil { + if slo.stagedLayerExtraction != nil { + if slo.stagedLayerExtraction.result != nil { + // The layer is staged, just commit it and update the metadata. + if err := slo.stagedLayerExtraction.commitLayer(r, layer.ID); err != nil { + cleanupFailureContext = "committing staged layer diff" + return nil, -1, err + } + applyDiffResultToLayer(r, layer, moreOptions, slo.stagedLayerExtraction.result) + } else { + // The diff was not staged, apply it now here instead. + if size, err = r.applyDiffWithOptions(layer.ID, moreOptions, slo.stagedLayerExtraction.diff); err != nil { + cleanupFailureContext = "applying layer diff" + return nil, -1, err + } + } + } else { + // staging logic for the chunked pull path + if err := r.applyDiffFromStagingDirectory(layer.ID, slo.DiffOutput, slo.DiffOptions); err != nil { + cleanupFailureContext = "applying staged directory diff" + return nil, -1, err + } } } else { // applyDiffWithOptions() would have updated r.bycompressedsum @@ -2417,6 +2448,78 @@ func createTarSplitFile(r *layerStore, layerID string) (*os.File, error) { return os.OpenFile(r.tspath(layerID), os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) } +// newMaybeStagedLayerExtraction initlaizes a new maybeStagedLayerExtraction. The caller +// must call .cleanup() to remove any temporary files. +func newMaybeStagedLayerExtraction(diff io.Reader, driver drivers.Driver) *maybeStagedLayerExtraction { + m := &maybeStagedLayerExtraction{ + diff: diff, + } + if d, ok := driver.(drivers.ApplyDiffStaging); ok { + m.staging = d + } + return m +} + +func (sl *maybeStagedLayerExtraction) cleanup() error { + return tempdir.CleanupTemporaryDirectories(sl.cleanupFuncs...) +} + +// stageWithUnlockedStore stages the layer content without needing the store locked. +// If the driver does not support stage addition then this is a NOP and does nothing. +func (sl *maybeStagedLayerExtraction) stageWithUnlockedStore(r *layerStore, layerOptions *LayerOptions) error { + if sl.staging == nil { + // driver does not implement stage addition + return nil + } + td, err := tempdir.NewTempDir(filepath.Join(r.layerdir, tempDirPath)) + if err != nil { + return err + } + sl.cleanupFuncs = append(sl.cleanupFuncs, td.Cleanup) + + stageTarSplit, err := td.StageAddition() + if err != nil { + return err + } + sl.stagedTarSplit = stageTarSplit + + f, err := os.OpenFile(stageTarSplit.Path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) + if err != nil { + return err + } + defer f.Close() + + result, err := applyDiff(layerOptions, sl.diff, f, func(payload io.Reader) (int64, error) { + cleanup, stageLayer, size, err := sl.staging.StartStagingDiffToApply(drivers.ApplyDiffOpts{ + Diff: payload, + Mappings: idtools.NewIDMappingsFromMaps(layerOptions.UIDMap, layerOptions.GIDMap), + // FIXME: What to do here? We have no lock and assigned label yet. + // Overlayfs should not need it anyway so this seems fine for now. + MountLabel: "", + }) + sl.cleanupFuncs = append(sl.cleanupFuncs, cleanup) + sl.stagedLayer = stageLayer + return size, err + }) + if err != nil { + return err + } + + sl.result = result + return nil +} + +// commitLayer() commits the content that was staged in stageWithUnlockedStore() +// +// Requires startWriting. +func (sl *maybeStagedLayerExtraction) commitLayer(r *layerStore, layerID string) error { + err := sl.stagedTarSplit.Commit(r.tspath(layerID)) + if err != nil { + return err + } + return sl.staging.CommitStagedLayer(layerID, sl.stagedLayer) +} + func applyDiff(layerOptions *LayerOptions, diff io.Reader, tarSplitFile *os.File, applyDriverFunc func(io.Reader) (int64, error)) (*applyDiffResult, error) { header := make([]byte, 10240) n, err := diff.Read(header) diff --git a/storage/store.go b/storage/store.go index 84019a4947..7933e95b9b 100644 --- a/storage/store.go +++ b/storage/store.go @@ -1452,7 +1452,7 @@ func (s *store) canUseShifting(uidmap, gidmap []idtools.IDMap) bool { // On entry: // - rlstore must be locked for writing // - rlstores MUST NOT be locked -func (s *store) putLayer(rlstore rwLayerStore, rlstores []roLayerStore, id, parent string, names []string, mountLabel string, writeable bool, lOptions *LayerOptions, diff io.Reader, slo *stagedLayerOptions) (*Layer, int64, error) { +func (s *store) putLayer(rlstore rwLayerStore, rlstores []roLayerStore, id, parent string, names []string, mountLabel string, writeable bool, lOptions *LayerOptions, slo *stagedLayerOptions) (*Layer, int64, error) { var parentLayer *Layer var options LayerOptions if lOptions != nil { @@ -1533,7 +1533,7 @@ func (s *store) putLayer(rlstore rwLayerStore, rlstores []roLayerStore, id, pare GIDMap: copySlicePreferringNil(gidMap), } } - return rlstore.create(id, parentLayer, names, mountLabel, nil, &options, writeable, diff, slo) + return rlstore.create(id, parentLayer, names, mountLabel, nil, &options, writeable, slo) } func (s *store) PutLayer(id, parent string, names []string, mountLabel string, writeable bool, lOptions *LayerOptions, diff io.Reader) (*Layer, int64, error) { @@ -1541,11 +1541,31 @@ func (s *store) PutLayer(id, parent string, names []string, mountLabel string, w if err != nil { return nil, -1, err } + + var slo *stagedLayerOptions + + if diff != nil { + m := newMaybeStagedLayerExtraction(diff, s.graphDriver) + defer func() { + if err := m.cleanup(); err != nil { + logrus.Errorf("Error cleaning up temporary directories: %v", err) + } + }() + // FIXME: type case should be safe for now but really there should be a better way to do this + err = m.stageWithUnlockedStore(rlstore.(*layerStore), lOptions) + if err != nil { + return nil, -1, err + } + slo = &stagedLayerOptions{ + stagedLayerExtraction: m, + } + } + if err := rlstore.startWriting(); err != nil { return nil, -1, err } defer rlstore.stopWriting() - return s.putLayer(rlstore, rlstores, id, parent, names, mountLabel, writeable, lOptions, diff, nil) + return s.putLayer(rlstore, rlstores, id, parent, names, mountLabel, writeable, lOptions, slo) } func (s *store) CreateLayer(id, parent string, names []string, mountLabel string, writeable bool, options *LayerOptions) (*Layer, error) { @@ -1753,7 +1773,7 @@ func (s *store) imageTopLayerForMapping(image *Image, ristore roImageStore, rlst } } layerOptions.TemplateLayer = layer.ID - mappedLayer, _, err := rlstore.create("", parentLayer, nil, layer.MountLabel, nil, &layerOptions, false, nil, nil) + mappedLayer, _, err := rlstore.create("", parentLayer, nil, layer.MountLabel, nil, &layerOptions, false, nil) if err != nil { return nil, fmt.Errorf("creating an ID-mapped copy of layer %q: %w", layer.ID, err) } @@ -1924,7 +1944,7 @@ func (s *store) CreateContainer(id string, names []string, image, layer, metadat options.Flags[mountLabelFlag] = mountLabel } - clayer, _, err := rlstore.create(layer, imageTopLayer, nil, mlabel, options.StorageOpt, layerOptions, true, nil, nil) + clayer, _, err := rlstore.create(layer, imageTopLayer, nil, mlabel, options.StorageOpt, layerOptions, true, nil) if err != nil { return nil, err } @@ -3186,7 +3206,7 @@ func (s *store) ApplyStagedLayer(args ApplyStagedLayerOptions) (*Layer, error) { DiffOutput: args.DiffOutput, DiffOptions: args.DiffOptions, } - layer, _, err = s.putLayer(rlstore, rlstores, args.ID, args.ParentLayer, args.Names, args.MountLabel, args.Writeable, args.LayerOptions, nil, &slo) + layer, _, err = s.putLayer(rlstore, rlstores, args.ID, args.ParentLayer, args.Names, args.MountLabel, args.Writeable, args.LayerOptions, &slo) return layer, err } diff --git a/storage/userns.go b/storage/userns.go index b3d76a31bf..91bfc27c90 100644 --- a/storage/userns.go +++ b/storage/userns.go @@ -197,7 +197,7 @@ outer: // We need to create a temporary layer so we can mount it and lookup the // maximum IDs used. - clayer, _, err := rlstore.create("", topLayer, nil, "", nil, layerOptions, false, nil, nil) + clayer, _, err := rlstore.create("", topLayer, nil, "", nil, layerOptions, false, nil) if err != nil { return 0, err }