Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions cmd/entire/cli/checkpoint/flock_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//go:build unix

package checkpoint

import (
"fmt"
"os"
"syscall"
)

// acquireShadowFlock takes an exclusive POSIX advisory lock on path. The
// returned release closes the file (which drops the flock). Callers must call
// release exactly once. The lock file persists between runs — flock state is
// held by the file descriptor, not the inode on disk.
//
// Mirrors strategy.acquireStateFileLock; duplicated rather than imported to
// avoid pulling the strategy package into checkpoint.
func acquireShadowFlock(path string) (release func(), err error) {
f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o600) //nolint:gosec // path built from validated branch name
if err != nil {
return nil, fmt.Errorf("open shadow lock: %w", err)
}
if err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX); err != nil { //nolint:gosec // file descriptors are non-negative; standard Go pattern for syscall.Flock
_ = f.Close()
return nil, fmt.Errorf("flock shadow lock: %w", err)
}
return func() { _ = f.Close() }, nil
}
Comment thread
pjbgf marked this conversation as resolved.
Outdated
32 changes: 32 additions & 0 deletions cmd/entire/cli/checkpoint/flock_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//go:build windows

package checkpoint

import (
"fmt"
"os"

"golang.org/x/sys/windows"
)

// acquireShadowFlock takes an exclusive lock on path via Windows LockFileEx.
// The returned release unlocks and closes the file. Callers must call release
// exactly once.
//
// Mirrors strategy.acquireStateFileLock; duplicated rather than imported to
// avoid pulling the strategy package into checkpoint.
func acquireShadowFlock(path string) (release func(), err error) {
f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o600) //nolint:gosec // path built from validated branch name
if err != nil {
return nil, fmt.Errorf("open shadow lock: %w", err)
}
overlapped := new(windows.Overlapped)
if err := windows.LockFileEx(windows.Handle(f.Fd()), windows.LOCKFILE_EXCLUSIVE_LOCK, 0, 1, 0, overlapped); err != nil {
_ = f.Close()
return nil, fmt.Errorf("lock shadow lock: %w", err)
}
return func() {
_ = windows.UnlockFileEx(windows.Handle(f.Fd()), 0, 1, 0, overlapped)
_ = f.Close()
}, nil
}
126 changes: 126 additions & 0 deletions cmd/entire/cli/checkpoint/shadow_ref.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package checkpoint

import (
"context"
"errors"
"fmt"
"math/rand/v2"
"os"
"os/exec"
"path/filepath"
"strings"
"time"

"github.com/entireio/cli/cmd/entire/cli/session"

"github.com/go-git/go-git/v6/plumbing"
)

// shadowBranchLockPath returns the per-shadow-branch flock file path. Lock
// files live in <git-common-dir>/entire-shadow-locks/ so they don't pollute
// the session-state directory. Branch names are slash-escaped because some
// filesystems disallow nested directory names that mirror the shadow branch
// hierarchy (entire/<hash>).
func shadowBranchLockPath(ctx context.Context, branchName string) (string, error) {
commonDir, err := session.GetGitCommonDir(ctx)
if err != nil {
return "", fmt.Errorf("get git common dir: %w", err)
}
lockDir := filepath.Join(commonDir, "entire-shadow-locks")
if err := os.MkdirAll(lockDir, 0o750); err != nil {
return "", fmt.Errorf("create shadow lock directory: %w", err)
}
safe := strings.ReplaceAll(branchName, "/", "_")
return filepath.Join(lockDir, safe+".lock"), nil
}

// withShadowBranchFlock acquires the per-shadow-branch flock, runs fn, and
// releases the flock. Serializes all WriteTemporary callers that target the
// same shadow branch — across goroutines AND across processes — so the CAS
// in casUpdateShadowBranchRef only sees external writers as contention.
func withShadowBranchFlock(ctx context.Context, branchName string, fn func() error) error {
path, err := shadowBranchLockPath(ctx, branchName)
if err != nil {
return err
}
release, err := acquireShadowFlock(path)
if err != nil {
return fmt.Errorf("acquire shadow flock %s: %w", branchName, err)
}
defer release()
return fn()
}

// ErrShadowRefBusy is returned by casUpdateShadowBranchRef when the ref has
// moved since the caller read it. Callers retry with a fresh parent.
var ErrShadowRefBusy = errors.New("shadow branch ref moved (CAS mismatch)")

// shadowRefMaxRetries bounds the WriteTemporary retry loop so a pathologically
// hot shadow branch can't hang a hook indefinitely. 16 is well above the
// number of concurrent sessions we've ever observed; the wins-required-to-
// finish distribution is bounded by N (number of concurrent writers).
const shadowRefMaxRetries = 16

// shadowRefMaxJitter is the upper bound for randomized backoff between CAS
// retries. Random jitter avoids thundering-herd retry patterns when many
// sessions hit the same shadow branch simultaneously.
const shadowRefMaxJitter = 8 * time.Millisecond

// zeroOID is the all-zeros object id that git accepts as the <oldvalue>
// argument to `git update-ref` to mean "must not exist".
const zeroOID = "0000000000000000000000000000000000000000"

// casUpdateShadowBranchRef atomically updates a shadow branch ref via
// `git update-ref <ref> <new> <old>`. Pass plumbing.ZeroHash as expectedHash
// to require the ref to NOT exist (first-checkpoint case).
//
// Returns ErrShadowRefBusy when git reports the ref moved since expectedHash
// was observed; callers retry with a fresh parent. Any other failure is
// returned wrapped.
//
// Why shell out: git's ref-locking is the canonical cross-process atomic
// CAS — go-git's CheckAndSetReference doesn't interoperate with native git's
// .lock files, and shadow branches can be touched concurrently by separate
// `entire` hook processes.
func casUpdateShadowBranchRef(ctx context.Context, branchName string, newHash, expectedHash plumbing.Hash) error {
refName := "refs/heads/" + branchName

oldValue := zeroOID
if expectedHash != plumbing.ZeroHash {
oldValue = expectedHash.String()
}

cmd := exec.CommandContext(ctx, "git", "update-ref", refName, newHash.String(), oldValue)
Comment thread
pjbgf marked this conversation as resolved.
Outdated
output, err := cmd.CombinedOutput()
Comment thread
pjbgf marked this conversation as resolved.
if err == nil {
return nil
}

out := string(output)
// Git's CAS-failure messages: "cannot lock ref ..." (covers both
// "is at X but expected Y" and "reference already exists" for the
// zero-OID case). Other failures propagate.
if strings.Contains(out, "cannot lock ref") || strings.Contains(out, "but expected") {
return ErrShadowRefBusy
}
return fmt.Errorf("git update-ref %s: %s: %w", refName, strings.TrimSpace(out), err)
Comment thread
pjbgf marked this conversation as resolved.
}

// shadowRefBackoff sleeps for a small random jitter before the next CAS
// retry. After several retries the jitter doubles to slow the thundering
// herd further. Respects context cancellation.
func shadowRefBackoff(ctx context.Context, attempt int) error {
d := time.Duration(rand.Int64N(int64(shadowRefMaxJitter))) //nolint:gosec // jitter, not security-sensitive
if attempt > 4 {
d *= 2
}
if d <= 0 {
d = time.Millisecond
}
select {
case <-time.After(d):
return nil
case <-ctx.Done():
return ctx.Err() //nolint:wrapcheck // canonical context cancellation
}
Comment thread
pjbgf marked this conversation as resolved.
}
170 changes: 99 additions & 71 deletions cmd/entire/cli/checkpoint/temporary.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,21 +68,9 @@ func (s *GitStore) WriteTemporary(ctx context.Context, opts WriteTemporaryOption
// Get shadow branch name
shadowBranchName := ShadowBranchNameForCommit(opts.BaseCommit, opts.WorktreeID)

// Get or create shadow branch
parentHash, baseTreeHash, err := s.getOrCreateShadowBranch(shadowBranchName)
if err != nil {
return WriteTemporaryResult{}, fmt.Errorf("failed to get shadow branch: %w", err)
}

// Get the last checkpoint's tree hash for deduplication
var lastTreeHash plumbing.Hash
if parentHash != plumbing.ZeroHash {
if lastCommit, err := s.repo.CommitObject(parentHash); err == nil {
lastTreeHash = lastCommit.TreeHash
}
}

// Collect all files to include
// Collect file lists once — the worktree state is stable across retries,
// so this work doesn't repeat per CAS attempt. Tree-building (which
// depends on the parent tree) is what re-runs inside the retry loop.
var allFiles []string
var allDeletedFiles []string
if opts.IsFirstCheckpoint {
Expand Down Expand Up @@ -110,39 +98,70 @@ func (s *GitStore) WriteTemporary(ctx context.Context, opts WriteTemporaryOption
allDeletedFiles = opts.DeletedFiles
}

// Build tree with changes
treeHash, err := s.buildTreeWithChanges(ctx, baseTreeHash, allFiles, allDeletedFiles, opts.MetadataDir, opts.MetadataDirAbs)
if err != nil {
return WriteTemporaryResult{}, fmt.Errorf("failed to build tree: %w", err)
}
commitMsg := trailers.FormatShadowCommit(opts.CommitMessage, opts.MetadataDir, opts.SessionID)

// Deduplication: skip if tree hash matches the last checkpoint
if lastTreeHash != plumbing.ZeroHash && treeHash == lastTreeHash {
return WriteTemporaryResult{
CommitHash: parentHash,
Skipped: true,
}, nil
}
var result WriteTemporaryResult
// withShadowBranchFlock serializes all writers targeting this shadow
// branch — across goroutines and across processes — so the inner CAS
// only sees contention from external `git update-ref` callers (rare).
err := withShadowBranchFlock(ctx, shadowBranchName, func() error {
// Tiny CAS retry budget: with the flock held, races against our own
// code are impossible. Retries cover the pathological case of an
// external writer (a user invoking `git update-ref` manually, etc.).
for attempt := range shadowRefMaxRetries {
parentHash, baseTreeHash, gErr := s.getOrCreateShadowBranch(shadowBranchName)
if gErr != nil {
return fmt.Errorf("failed to get shadow branch: %w", gErr)
}

// Create checkpoint commit with trailers
commitMsg := trailers.FormatShadowCommit(opts.CommitMessage, opts.MetadataDir, opts.SessionID)
// Get the last checkpoint's tree hash for deduplication
var lastTreeHash plumbing.Hash
if parentHash != plumbing.ZeroHash {
if lastCommit, lcErr := s.repo.CommitObject(parentHash); lcErr == nil {
lastTreeHash = lastCommit.TreeHash
}
}

commitHash, err := s.createCommit(ctx, treeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail)
if err != nil {
return WriteTemporaryResult{}, fmt.Errorf("failed to create commit: %w", err)
}
treeHash, tErr := s.buildTreeWithChanges(ctx, baseTreeHash, allFiles, allDeletedFiles, opts.MetadataDir, opts.MetadataDirAbs)
if tErr != nil {
return fmt.Errorf("failed to build tree: %w", tErr)
}

// Update branch reference
refName := plumbing.NewBranchReferenceName(shadowBranchName)
newRef := plumbing.NewHashReference(refName, commitHash)
if err := s.repo.Storer.SetReference(newRef); err != nil {
return WriteTemporaryResult{}, fmt.Errorf("failed to update branch reference: %w", err)
}
// Deduplication: skip if tree hash matches the current shadow tip.
if lastTreeHash != plumbing.ZeroHash && treeHash == lastTreeHash {
result = WriteTemporaryResult{
CommitHash: parentHash,
Skipped: true,
}
return nil
}

return WriteTemporaryResult{
CommitHash: commitHash,
Skipped: false,
}, nil
commitHash, cErr := s.createCommit(ctx, treeHash, parentHash, commitMsg, opts.AuthorName, opts.AuthorEmail)
if cErr != nil {
return fmt.Errorf("failed to create commit: %w", cErr)
}

refErr := casUpdateShadowBranchRef(ctx, shadowBranchName, commitHash, parentHash)
Comment thread
pjbgf marked this conversation as resolved.
Outdated
if refErr == nil {
result = WriteTemporaryResult{
CommitHash: commitHash,
Skipped: false,
}
return nil
}
if !errors.Is(refErr, ErrShadowRefBusy) {
return fmt.Errorf("failed to update shadow branch reference: %w", refErr)
}
if bErr := shadowRefBackoff(ctx, attempt); bErr != nil {
return bErr
}
}
Comment thread
pjbgf marked this conversation as resolved.
return fmt.Errorf("failed to update shadow branch reference after %d CAS retries: %w", shadowRefMaxRetries, ErrShadowRefBusy)
})
if err != nil {
return WriteTemporaryResult{}, err
}
return result, nil
}

// ReadTemporary reads the latest checkpoint from a shadow branch.
Expand Down Expand Up @@ -257,12 +276,6 @@ func (s *GitStore) WriteTemporaryTask(ctx context.Context, opts WriteTemporaryTa
// Get shadow branch name
shadowBranchName := ShadowBranchNameForCommit(opts.BaseCommit, opts.WorktreeID)

// Get or create shadow branch
parentHash, baseTreeHash, err := s.getOrCreateShadowBranch(shadowBranchName)
if err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to get shadow branch: %w", err)
}

// Collect all files to include in the commit.
// Filter out gitignored files — subagent transcripts may report files like .env
// that exist on disk but are gitignored. Without filtering, secrets would leak
Expand All @@ -272,32 +285,47 @@ func (s *GitStore) WriteTemporaryTask(ctx context.Context, opts WriteTemporaryTa
candidateFiles = append(candidateFiles, opts.NewFiles...)
allFiles := filterGitIgnoredFiles(ctx, s.repo, candidateFiles)

// Build new tree with code changes (no metadata dir yet)
newTreeHash, err := s.buildTreeWithChanges(ctx, baseTreeHash, allFiles, opts.DeletedFiles, "", "")
if err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to build tree: %w", err)
}
var resultHash plumbing.Hash
err := withShadowBranchFlock(ctx, shadowBranchName, func() error {
for attempt := range shadowRefMaxRetries {
parentHash, baseTreeHash, gErr := s.getOrCreateShadowBranch(shadowBranchName)
if gErr != nil {
return fmt.Errorf("failed to get shadow branch: %w", gErr)
}

// Add task metadata to tree
newTreeHash, err = s.addTaskMetadataToTree(ctx, newTreeHash, opts)
if err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to add task metadata: %w", err)
}
newTreeHash, tErr := s.buildTreeWithChanges(ctx, baseTreeHash, allFiles, opts.DeletedFiles, "", "")
if tErr != nil {
return fmt.Errorf("failed to build tree: %w", tErr)
}

// Create the commit
commitHash, err := s.createCommit(ctx, newTreeHash, parentHash, opts.CommitMessage, opts.AuthorName, opts.AuthorEmail)
if err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to create commit: %w", err)
}
newTreeHash, tErr = s.addTaskMetadataToTree(ctx, newTreeHash, opts)
if tErr != nil {
return fmt.Errorf("failed to add task metadata: %w", tErr)
}

// Update shadow branch reference
refName := plumbing.NewBranchReferenceName(shadowBranchName)
ref := plumbing.NewHashReference(refName, commitHash)
if err := s.repo.Storer.SetReference(ref); err != nil {
return plumbing.ZeroHash, fmt.Errorf("failed to update shadow branch reference: %w", err)
}
commitHash, cErr := s.createCommit(ctx, newTreeHash, parentHash, opts.CommitMessage, opts.AuthorName, opts.AuthorEmail)
if cErr != nil {
return fmt.Errorf("failed to create commit: %w", cErr)
}

return commitHash, nil
refErr := casUpdateShadowBranchRef(ctx, shadowBranchName, commitHash, parentHash)
if refErr == nil {
resultHash = commitHash
return nil
}
if !errors.Is(refErr, ErrShadowRefBusy) {
return fmt.Errorf("failed to update shadow branch reference: %w", refErr)
}
if bErr := shadowRefBackoff(ctx, attempt); bErr != nil {
return bErr
}
}
return fmt.Errorf("failed to update shadow branch reference after %d CAS retries: %w", shadowRefMaxRetries, ErrShadowRefBusy)
})
if err != nil {
return plumbing.ZeroHash, err
}
return resultHash, nil
}

// addTaskMetadataToTree adds task checkpoint metadata to a git tree.
Expand Down
Loading
Loading