Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions cmd/entire/cli/checkpoint/shadow_ref.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
package checkpoint

import (
"context"
"errors"
"fmt"
"math/rand/v2"
"os"
"os/exec"
"path/filepath"
"strings"
"time"

"github.com/entireio/cli/cmd/entire/cli/internal/flock"

"github.com/go-git/go-git/v6/plumbing"
)

// ErrShadowRefBusy is returned by casUpdateShadowBranchRef when the ref has
// moved since the caller read it. Callers retry with a fresh parent.
var ErrShadowRefBusy = errors.New("shadow branch ref moved (CAS mismatch)")

// shadowRefMaxRetries bounds the WriteTemporary retry loop. With the
// per-shadow-branch flock held, our own writers never collide; this budget
// is purely a safety net against an external `git update-ref` writer that
// repeatedly beats us to the ref.
const shadowRefMaxRetries = 16

// shadowRefMaxJitter is the upper bound for randomized backoff between CAS
// retries. Random jitter avoids thundering-herd retry patterns when many
// sessions hit the same shadow branch simultaneously.
const shadowRefMaxJitter = 8 * time.Millisecond

// repoDirs returns the worktree root and git common dir for the store's
// repository. Callers use the worktree root as cmd.Dir for git invocations
// and the common dir to locate filesystem paths (lock files, loose objects)
// — both without depending on the process cwd.
func (s *GitStore) repoDirs(ctx context.Context) (worktreeRoot, commonDir string, err error) {
wt, err := s.repo.Worktree()
if err != nil {
return "", "", fmt.Errorf("open worktree: %w", err)
}
worktreeRoot = wt.Filesystem().Root()
if worktreeRoot == "" {
return "", "", errors.New("repository worktree filesystem has no root path")
}
commonDir, err = resolveGitCommonDir(ctx, s.repo)
if err != nil {
return "", "", err
}
return worktreeRoot, commonDir, nil
}

// casUpdateShadowBranchRef atomically updates a shadow branch ref via
// `git update-ref <ref> <new> <old>`. Pass plumbing.ZeroHash as expectedHash
// to require the ref to NOT exist (first-checkpoint case).
//
// repoRoot is used as cmd.Dir so the update targets the same repository as
// the rest of WriteTemporary (i.e. s.repo) regardless of the process cwd.
//
// Returns ErrShadowRefBusy when git reports the ref moved since expectedHash
// was observed; callers retry with a fresh parent. Any other failure is
// returned wrapped.
//
// Why shell out: git's ref-locking is the canonical cross-process atomic
// CAS — go-git's CheckAndSetReference doesn't interoperate with native git's
// .lock files, and shadow branches can be touched concurrently by separate
// `entire` hook processes.
func casUpdateShadowBranchRef(ctx context.Context, repoRoot, branchName string, newHash, expectedHash plumbing.Hash) error {
refName := "refs/heads/" + branchName

// All-zeros OID with the repo's object-format width means "must not
// exist". SHA-1 repos want 40 zeros, SHA-256 repos want 64; mirror
// newHash's hex width so we pick the right one without an extra git call.
newValue := newHash.String()
oldValue := strings.Repeat("0", newHash.HexSize())
if expectedHash != plumbing.ZeroHash {
oldValue = expectedHash.String()
}

cmd := exec.CommandContext(ctx, "git", "update-ref", refName, newValue, oldValue)
cmd.Dir = repoRoot
// Force English diagnostics so the CAS-conflict pattern match below
// isn't defeated by a translated stderr message in a non-C locale.
cmd.Env = append(os.Environ(), "LC_ALL=C", "LANG=C")
output, err := cmd.CombinedOutput()
Comment thread
pjbgf marked this conversation as resolved.
if err == nil {
return nil
}

out := string(output)
// Git's CAS-failure messages: "cannot lock ref ..." (covers both
// "is at X but expected Y" and "reference already exists" for the
// zero-OID case). Other failures propagate.
if strings.Contains(out, "cannot lock ref") || strings.Contains(out, "but expected") {
return ErrShadowRefBusy
}
return fmt.Errorf("git update-ref %s: %s: %w", refName, strings.TrimSpace(out), err)
Comment thread
pjbgf marked this conversation as resolved.
}

// shadowRefBackoff sleeps for a small random jitter before the next CAS
// retry. After several retries the upper bound doubles to slow the
// thundering herd further. Respects context cancellation.
func shadowRefBackoff(ctx context.Context, attempt int) error {
base := shadowRefMaxJitter
if attempt > 4 {
base *= 2
}
// Add a 1ms floor so the chosen sleep is always non-trivial, even when
// rand.Int64N happens to return 0.
d := time.Duration(rand.Int64N(int64(base))) + time.Millisecond //nolint:gosec // jitter, not security-sensitive
select {
case <-time.After(d):
return nil
case <-ctx.Done():
return ctx.Err() //nolint:wrapcheck // canonical context cancellation
}
Comment thread
pjbgf marked this conversation as resolved.
}

// shadowBranchLockPath returns the per-shadow-branch flock file path. Lock
// files live in <git-common-dir>/entire-shadow-locks/ so they don't pollute
// the session-state directory. Branch names are slash-escaped because the
// shadow-branch convention "entire/<hash>" would otherwise nest directories.
func shadowBranchLockPath(commonDir, branchName string) (string, error) {
lockDir := filepath.Join(commonDir, "entire-shadow-locks")
if err := os.MkdirAll(lockDir, 0o750); err != nil {
return "", fmt.Errorf("create shadow lock directory: %w", err)
}
safe := strings.ReplaceAll(branchName, "/", "_")
return filepath.Join(lockDir, safe+".lock"), nil
}

// withShadowBranchFlock acquires the per-shadow-branch flock, runs fn, and
// releases the flock. Serializes all WriteTemporary callers that target the
// same shadow branch — across goroutines AND across processes — so the CAS
// in casUpdateShadowBranchRef only sees external writers as contention.
//
// commonDir is the git common directory (from s.repoDirs); it locates the
// lock file independently of the process cwd.
func withShadowBranchFlock(commonDir, branchName string, fn func() error) error {
path, err := shadowBranchLockPath(commonDir, branchName)
if err != nil {
return err
}
release, err := flock.Acquire(path)
if err != nil {
return fmt.Errorf("acquire shadow flock %s: %w", branchName, err)
}
defer release()
return fn()
}

// tryDeleteLooseObject best-effort removes a loose object file. Used to
// clean up dangling commits created during a CAS-losing attempt. Failures
// (e.g. object already packed by a concurrent gc, or never written as a
// loose object) are ignored — the object will be picked up by the next gc
// pass either way.
func tryDeleteLooseObject(commonDir string, hash plumbing.Hash) {
h := hash.String()
if len(h) < 3 {
return
}
_ = os.Remove(filepath.Join(commonDir, "objects", h[:2], h[2:]))
}
191 changes: 123 additions & 68 deletions cmd/entire/cli/checkpoint/temporary.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,21 +68,9 @@ func (s *GitStore) WriteTemporary(ctx context.Context, opts WriteTemporaryOption
// Get shadow branch name
shadowBranchName := ShadowBranchNameForCommit(opts.BaseCommit, opts.WorktreeID)

// Get or create shadow branch
parentHash, baseTreeHash, err := s.getOrCreateShadowBranch(shadowBranchName)
if err != nil {
return WriteTemporaryResult{}, fmt.Errorf("failed to get shadow branch: %w", err)
}

// Get the last checkpoint's tree hash for deduplication
var lastTreeHash plumbing.Hash
if parentHash != plumbing.ZeroHash {
if lastCommit, err := s.repo.CommitObject(parentHash); err == nil {
lastTreeHash = lastCommit.TreeHash
}
}

// Collect all files to include
// Collect file lists once — the worktree state is stable across retries,
// so this work doesn't repeat per CAS attempt. Tree-building (which
// depends on the parent tree) is what re-runs inside the retry loop.
var allFiles []string
var allDeletedFiles []string
if opts.IsFirstCheckpoint {
Expand Down Expand Up @@ -110,39 +98,86 @@ func (s *GitStore) WriteTemporary(ctx context.Context, opts WriteTemporaryOption
allDeletedFiles = opts.DeletedFiles
}

// Build tree with changes
treeHash, err := s.buildTreeWithChanges(ctx, baseTreeHash, allFiles, allDeletedFiles, opts.MetadataDir, opts.MetadataDirAbs)
commitMsg := trailers.FormatShadowCommit(opts.CommitMessage, opts.MetadataDir, opts.SessionID)

repoRoot, commonDir, err := s.repoDirs(ctx)
if err != nil {
return WriteTemporaryResult{}, fmt.Errorf("failed to build tree: %w", err)
}
return WriteTemporaryResult{}, fmt.Errorf("failed to resolve repo dirs: %w", err)
}

var result WriteTemporaryResult
// withShadowBranchFlock serializes all writers targeting this shadow
// branch — across goroutines and across processes — so the inner CAS
// only sees contention from external `git update-ref` callers (rare).
err = withShadowBranchFlock(commonDir, shadowBranchName, func() error {
// Tiny CAS retry budget: with the flock held, races against our own
// code are impossible. Retries cover the pathological case of an
// external writer (a user invoking `git update-ref` manually, etc.).
for attempt := range shadowRefMaxRetries {
parentHash, baseTreeHash, gErr := s.getOrCreateShadowBranch(shadowBranchName)
if gErr != nil {
return fmt.Errorf("failed to get shadow branch: %w", gErr)
}

// Deduplication: skip if tree hash matches the last checkpoint
if lastTreeHash != plumbing.ZeroHash && treeHash == lastTreeHash {
return WriteTemporaryResult{
CommitHash: parentHash,
Skipped: true,
}, nil
}
// Get the last checkpoint's tree hash for deduplication
var lastTreeHash plumbing.Hash
if parentHash != plumbing.ZeroHash {
if lastCommit, lcErr := s.repo.CommitObject(parentHash); lcErr == nil {
lastTreeHash = lastCommit.TreeHash
}
}

// Create checkpoint commit with trailers
commitMsg := trailers.FormatShadowCommit(opts.CommitMessage, opts.MetadataDir, opts.SessionID)
treeHash, tErr := s.buildTreeWithChanges(ctx, baseTreeHash, allFiles, allDeletedFiles, opts.MetadataDir, opts.MetadataDirAbs)
if tErr != nil {
return fmt.Errorf("failed to build tree: %w", tErr)
}

commitHash, err := s.createCommit(ctx, treeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail)
if err != nil {
return WriteTemporaryResult{}, fmt.Errorf("failed to create commit: %w", err)
}
// Deduplication: skip if tree hash matches the current shadow tip.
if lastTreeHash != plumbing.ZeroHash && treeHash == lastTreeHash {
result = WriteTemporaryResult{
CommitHash: parentHash,
Skipped: true,
}
return nil
}

// Update branch reference
refName := plumbing.NewBranchReferenceName(shadowBranchName)
newRef := plumbing.NewHashReference(refName, commitHash)
if err := s.repo.Storer.SetReference(newRef); err != nil {
return WriteTemporaryResult{}, fmt.Errorf("failed to update branch reference: %w", err)
}
commitHash, cErr := s.createCommit(ctx, treeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail)
if cErr != nil {
return fmt.Errorf("failed to create commit: %w", cErr)
}

return WriteTemporaryResult{
CommitHash: commitHash,
Skipped: false,
}, nil
refErr := casUpdateShadowBranchRef(ctx, repoRoot, shadowBranchName, commitHash, parentHash)
if refErr == nil {
result = WriteTemporaryResult{
CommitHash: commitHash,
Skipped: false,
}
return nil
}
if !errors.Is(refErr, ErrShadowRefBusy) {
return fmt.Errorf("failed to update shadow branch reference: %w", refErr)
}
// Our commit is now dangling — best-effort remove it so we don't
// leak loose objects across many losing attempts.
tryDeleteLooseObject(commonDir, commitHash)
if bErr := shadowRefBackoff(ctx, attempt); bErr != nil {
return bErr
}
}
Comment thread
pjbgf marked this conversation as resolved.
// Retry budget exhausted. With the flock held this means an external
// writer beat us shadowRefMaxRetries times in a row — surface it in
// .entire/logs/ so operators can see a stuck shadow branch.
logging.Warn(logging.WithComponent(ctx, "checkpoint"),
"shadow branch CAS retry budget exhausted",
slog.String("shadow_branch", shadowBranchName),
slog.Int("retries", shadowRefMaxRetries),
)
return fmt.Errorf("failed to update shadow branch reference after %d CAS retries: %w", shadowRefMaxRetries, ErrShadowRefBusy)
})
if err != nil {
return WriteTemporaryResult{}, err
}
return result, nil
}

// ReadTemporary reads the latest checkpoint from a shadow branch.
Expand Down Expand Up @@ -257,12 +292,6 @@ func (s *GitStore) WriteTemporaryTask(ctx context.Context, opts WriteTemporaryTa
// Get shadow branch name
shadowBranchName := ShadowBranchNameForCommit(opts.BaseCommit, opts.WorktreeID)

// Get or create shadow branch
parentHash, baseTreeHash, err := s.getOrCreateShadowBranch(shadowBranchName)
if err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to get shadow branch: %w", err)
}

// Collect all files to include in the commit.
// Filter out gitignored files — subagent transcripts may report files like .env
// that exist on disk but are gitignored. Without filtering, secrets would leak
Expand All @@ -272,32 +301,58 @@ func (s *GitStore) WriteTemporaryTask(ctx context.Context, opts WriteTemporaryTa
candidateFiles = append(candidateFiles, opts.NewFiles...)
allFiles := filterGitIgnoredFiles(ctx, s.repo, candidateFiles)

// Build new tree with code changes (no metadata dir yet)
newTreeHash, err := s.buildTreeWithChanges(ctx, baseTreeHash, allFiles, opts.DeletedFiles, "", "")
repoRoot, commonDir, err := s.repoDirs(ctx)
if err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to build tree: %w", err)
return plumbing.ZeroHash, fmt.Errorf("failed to resolve repo dirs: %w", err)
}

// Add task metadata to tree
newTreeHash, err = s.addTaskMetadataToTree(ctx, newTreeHash, opts)
if err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to add task metadata: %w", err)
}
var resultHash plumbing.Hash
err = withShadowBranchFlock(commonDir, shadowBranchName, func() error {
for attempt := range shadowRefMaxRetries {
parentHash, baseTreeHash, gErr := s.getOrCreateShadowBranch(shadowBranchName)
if gErr != nil {
return fmt.Errorf("failed to get shadow branch: %w", gErr)
}

// Create the commit
commitHash, err := s.createCommit(ctx, newTreeHash, parentHash, opts.CommitMessage, opts.AuthorName, opts.AuthorEmail)
if err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to create commit: %w", err)
}
newTreeHash, tErr := s.buildTreeWithChanges(ctx, baseTreeHash, allFiles, opts.DeletedFiles, "", "")
if tErr != nil {
return fmt.Errorf("failed to build tree: %w", tErr)
}

// Update shadow branch reference
refName := plumbing.NewBranchReferenceName(shadowBranchName)
ref := plumbing.NewHashReference(refName, commitHash)
if err := s.repo.Storer.SetReference(ref); err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to update shadow branch reference: %w", err)
}
newTreeHash, tErr = s.addTaskMetadataToTree(ctx, newTreeHash, opts)
if tErr != nil {
return fmt.Errorf("failed to add task metadata: %w", tErr)
}

return commitHash, nil
commitHash, cErr := s.createCommit(ctx, newTreeHash, parentHash, opts.CommitMessage, opts.AuthorName, opts.AuthorEmail)
if cErr != nil {
return fmt.Errorf("failed to create commit: %w", cErr)
}

refErr := casUpdateShadowBranchRef(ctx, repoRoot, shadowBranchName, commitHash, parentHash)
if refErr == nil {
resultHash = commitHash
return nil
}
if !errors.Is(refErr, ErrShadowRefBusy) {
return fmt.Errorf("failed to update shadow branch reference: %w", refErr)
}
tryDeleteLooseObject(commonDir, commitHash)
if bErr := shadowRefBackoff(ctx, attempt); bErr != nil {
return bErr
}
}
logging.Warn(logging.WithComponent(ctx, "checkpoint"),
"shadow branch CAS retry budget exhausted (task checkpoint)",
slog.String("shadow_branch", shadowBranchName),
slog.Int("retries", shadowRefMaxRetries),
)
return fmt.Errorf("failed to update shadow branch reference after %d CAS retries: %w", shadowRefMaxRetries, ErrShadowRefBusy)
})
if err != nil {
return plumbing.ZeroHash, err
}
return resultHash, nil
}

// addTaskMetadataToTree adds task checkpoint metadata to a git tree.
Expand Down
Loading
Loading