Skip to content

Commit 2121abe

Browse files
authored
Merge pull request #382 from SiaFoundation/nate/stuck-syncers
Fixed an issue where fully connected peers could get stuck after reorgs
2 parents 0044651 + b61c30c commit 2121abe

File tree

6 files changed

+162
-13
lines changed

6 files changed

+162
-13
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
default: patch
3+
---
4+
5+
# Fixed an issue where fully connected peers could get stuck on a stale chain after a reorg.

miner.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"go.sia.tech/core/consensus"
77
"go.sia.tech/core/types"
88
"go.sia.tech/coreutils/chain"
9+
"lukechampine.com/frand"
910
)
1011

1112
// FindBlockNonce attempts to find a nonce for b that meets the PoW target.
@@ -48,6 +49,9 @@ retry:
4849
if childHeight >= cs.Network.HardforkV2.AllowHeight {
4950
b.V2 = &types.V2BlockData{
5051
Height: childHeight,
52+
Transactions: []types.V2Transaction{
53+
{ArbitraryData: frand.Bytes(12)}, // to ensure unique block ID
54+
},
5155
}
5256
}
5357

syncer/parallel_sync.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ func (s *Syncer) parallelSync(ctx context.Context, cs consensus.State, headers [
8787
headers := headers[req.base.Height-cs.Index.Height:][:req.numBlocks]
8888
for i := range blocks {
8989
if blocks[i].ID() != headers[i].ID() {
90-
s.ban(p, errors.New("sent blocks that do not match header chain"))
90+
// note: this is not necessarily a ban-worthy offense, as it could
91+
// be caused by a peer on a fork that could be valid.
9192
return Resp{req: req, peer: p, err: errors.New("peer returned blocks that do not match header chain")}
9293
}
9394
}

syncer/syncer.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,7 @@ func (s *Syncer) syncLoop(ctx context.Context) error {
711711
if err := s.parallelSync(ctx, r.cs, r.headers); err != nil {
712712
s.log.Warn("sync failed", zap.Stringer("peer", r.peer), zap.Error(err))
713713
}
714+
go s.relayV2Header(r.headers[len(r.headers)-1], r.peer)
714715
}
715716
}
716717
}

syncer/syncer_test.go

Lines changed: 149 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99

1010
"go.sia.tech/core/gateway"
1111
"go.sia.tech/core/types"
12+
"go.sia.tech/coreutils"
1213
"go.sia.tech/coreutils/chain"
1314
"go.sia.tech/coreutils/syncer"
1415
"go.sia.tech/coreutils/testutil"
@@ -17,7 +18,47 @@ import (
1718
"go.uber.org/zap/zaptest"
1819
)
1920

20-
func newTestSyncer(t testing.TB, name string, log *zap.Logger) (*syncer.Syncer, *chain.Manager) {
21+
// helper to wait for all provided chain managers to be synced
22+
func synced(t *testing.T, cm ...*chain.Manager) {
23+
t.Helper()
24+
25+
var heights []uint64
26+
for range 100 {
27+
heights = heights[:0]
28+
heights = append(heights, cm[0].Tip().Height)
29+
allEqual := true
30+
for _, c := range cm[1:] {
31+
heights = append(heights, c.Tip().Height)
32+
if c.Tip() != cm[0].Tip() {
33+
allEqual = false
34+
}
35+
}
36+
if allEqual {
37+
return
38+
}
39+
time.Sleep(100 * time.Millisecond)
40+
}
41+
t.Fatalf("tips are not equal: %v", heights)
42+
}
43+
44+
// helper to mine blocks on cm and broadcast to syncer s
45+
func mineBlocks(t *testing.T, s *syncer.Syncer, cm *chain.Manager, n int) {
46+
t.Helper()
47+
for range n {
48+
b, ok := coreutils.MineBlock(cm, types.VoidAddress, time.Second)
49+
if !ok {
50+
t.Fatal("failed to mine block")
51+
} else if err := cm.AddBlocks([]types.Block{b}); err != nil {
52+
t.Fatal(err)
53+
}
54+
if b.V2 != nil {
55+
// error is ignored, best effort relay
56+
s.BroadcastV2BlockOutline(gateway.OutlineBlock(b, cm.PoolTransactions(), cm.V2PoolTransactions()))
57+
}
58+
}
59+
}
60+
61+
func newTestSyncer(t testing.TB, opts ...syncer.Option) (*syncer.Syncer, *chain.Manager) {
2162
n, genesis := testutil.Network()
2263
store, tipState1, err := chain.NewDBStore(chain.NewMemDB(), n, genesis, nil)
2364
if err != nil {
@@ -33,22 +74,23 @@ func newTestSyncer(t testing.TB, name string, log *zap.Logger) (*syncer.Syncer,
3374
l.Close()
3475
})
3576

77+
opts = append([]syncer.Option{syncer.WithSyncInterval(100 * time.Millisecond)}, opts...)
3678
s := syncer.New(l, cm, testutil.NewEphemeralPeerStore(), gateway.Header{
3779
GenesisID: genesis.ID(),
3880
UniqueID: gateway.GenerateUniqueID(),
3981
NetAddress: l.Addr().String(),
40-
}, syncer.WithLogger(log.Named(name)), syncer.WithSyncInterval(100*time.Millisecond))
82+
}, opts...)
4183
go s.Run()
4284
return s, cm
4385
}
4486

4587
func TestSyncer(t *testing.T) {
4688
log := zaptest.NewLogger(t)
4789

48-
s1, cm1 := newTestSyncer(t, "syncer1", log)
90+
s1, cm1 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer1")))
4991
defer s1.Close()
5092

51-
s2, cm2 := newTestSyncer(t, "syncer2", log)
93+
s2, cm2 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer2")))
5294
defer s2.Close()
5395

5496
// mine enough blocks to test both v1 and v2 regimes
@@ -89,10 +131,10 @@ func (es evilManager) BlocksForHistory(history []types.BlockID, maxBlocks uint64
89131
func TestSyncWithBadPeer(t *testing.T) {
90132
log := zaptest.NewLogger(t)
91133

92-
s1, cm1 := newTestSyncer(t, "syncer1", log)
134+
s1, cm1 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer1")))
93135
defer s1.Close()
94136

95-
s2, cm2 := newTestSyncer(t, "syncer2", log)
137+
s2, cm2 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer2")))
96138
defer s2.Close()
97139

98140
// mine enough blocks to test both v1 and v2 regimes
@@ -141,7 +183,7 @@ func TestSyncWithBadPeer(t *testing.T) {
141183
func TestSyncerConnectAfterClose(t *testing.T) {
142184
log := zaptest.NewLogger(t)
143185

144-
s, _ := newTestSyncer(t, "syncer1", log)
186+
s, _ := newTestSyncer(t, syncer.WithLogger(log.Named("syncer1")))
145187
if err := s.Close(); err != nil {
146188
t.Fatal(err)
147189
} else if _, err := s.Connect(context.Background(), "localhost:1234"); !errors.Is(err, threadgroup.ErrClosed) {
@@ -162,10 +204,10 @@ func hashEq(a, b types.EncoderTo) bool {
162204
func TestSendCheckpoint(t *testing.T) {
163205
log := zaptest.NewLogger(t)
164206

165-
s1, cm1 := newTestSyncer(t, "syncer1", log)
207+
s1, cm1 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer1")))
166208
defer s1.Close()
167209

168-
s2, _ := newTestSyncer(t, "syncer2", log)
210+
s2, _ := newTestSyncer(t, syncer.WithLogger(log.Named("syncer2")))
169211
defer s2.Close()
170212

171213
// mine above v2 hardfork height
@@ -190,7 +232,7 @@ func TestInstantSync(t *testing.T) {
190232
n, genesis := testutil.Network()
191233
log := zap.NewNop()
192234

193-
s, cm := newTestSyncer(t, "syncer", log)
235+
s, cm := newTestSyncer(t, syncer.WithLogger(log.Named("syncer")))
194236
defer s.Close()
195237

196238
// mine a few blocks above v2 hardfork height
@@ -257,10 +299,10 @@ func TestInstantSync(t *testing.T) {
257299
func TestSendHeaders(t *testing.T) {
258300
log := zaptest.NewLogger(t)
259301

260-
s1, cm1 := newTestSyncer(t, "syncer1", log)
302+
s1, cm1 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer1")))
261303
defer s1.Close()
262304

263-
s2, cm2 := newTestSyncer(t, "syncer2", log)
305+
s2, cm2 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer2")))
264306
defer s2.Close()
265307
cs := cm2.TipState()
266308

@@ -279,3 +321,98 @@ func TestSendHeaders(t *testing.T) {
279321
t.Fatalf("expected 10 remaining headers, got %d", rem)
280322
}
281323
}
324+
325+
func TestSyncerReorg(t *testing.T) {
326+
log := zaptest.NewLogger(t)
327+
328+
s1, cm1 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer1")))
329+
defer s1.Close()
330+
331+
// s2 must only be able to sync from s1 to force reorg propagation
332+
s2, cm2 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer2")), syncer.WithSyncInterval(100*time.Millisecond), syncer.WithMaxInboundPeers(1), syncer.WithMaxOutboundPeers(0))
333+
defer s2.Close()
334+
335+
s3, cm3 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer3")))
336+
defer s3.Close()
337+
338+
// connect s1 and s2
339+
if _, err := s1.Connect(context.Background(), s2.Addr()); err != nil {
340+
t.Fatal(err)
341+
}
342+
log.Debug("connected s1 and s2")
343+
344+
// mine above the v2 require height
345+
mineBlocks(t, s1, cm1, int(cm1.TipState().Network.HardforkV2.RequireHeight+10))
346+
347+
// apply cm1 blocks manually to cm3 to simulate a synced node
348+
_, applied, err := cm1.UpdatesSince(types.ChainIndex{}, 1000)
349+
if err != nil {
350+
t.Fatalf("failed to get updates since genesis: %v", err)
351+
}
352+
for _, cau := range applied {
353+
if err := cm3.AddBlocks([]types.Block{cau.Block}); err != nil {
354+
t.Fatalf("failed to apply block at height %d: %v", cau.Block.V2.Height, err)
355+
}
356+
}
357+
358+
// check that all three nodes are at the same tip
359+
synced(t, cm1, cm2, cm3)
360+
361+
// mine conflicting chains on cm1 and cm3
362+
mineBlocks(t, s1, cm1, 1)
363+
mineBlocks(t, s3, cm3, 5)
364+
365+
// connect s1 and s3, triggering a reorg on cm1 and cm2
366+
if _, err := s1.Connect(context.Background(), s3.Addr()); err != nil {
367+
t.Fatal(err)
368+
}
369+
log.Debug("syncer peers", zap.Int("s1", len(s1.Peers())), zap.Int("s2", len(s2.Peers())), zap.Int("s3", len(s3.Peers())))
370+
synced(t, cm1, cm2, cm3)
371+
}
372+
373+
func TestParallelSyncReorgSplit(t *testing.T) {
374+
log := zaptest.NewLogger(t)
375+
376+
s1, cm1 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer1")), syncer.WithSyncInterval(100*time.Millisecond))
377+
defer s1.Close()
378+
379+
// s2 and s3 should not be able to connect to each other
380+
s2, cm2 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer2")), syncer.WithMaxInboundPeers(1), syncer.WithSyncInterval(100*time.Millisecond))
381+
defer s2.Close()
382+
383+
s3, cm3 := newTestSyncer(t, syncer.WithLogger(log.Named("syncer3")), syncer.WithMaxOutboundPeers(1), syncer.WithSyncInterval(100*time.Millisecond))
384+
defer s3.Close()
385+
386+
// mine after the v2 hardfork height
387+
testutil.MineBlocks(t, cm2, types.VoidAddress, int(cm2.TipState().Network.HardforkV2.RequireHeight+10))
388+
389+
// apply cm2 blocks manually to cm3 to simulate a synced node
390+
_, applied, err := cm2.UpdatesSince(types.ChainIndex{}, 1000)
391+
if err != nil {
392+
t.Fatalf("failed to get updates: %v", err)
393+
}
394+
for _, cau := range applied {
395+
if err := cm3.AddBlocks([]types.Block{cau.Block}); err != nil {
396+
t.Fatalf("failed to apply block: %v", err)
397+
}
398+
}
399+
400+
// create a split on cm2 and cm3
401+
testutil.MineBlocks(t, cm2, types.VoidAddress, 5)
402+
testutil.MineBlocks(t, cm3, types.VoidAddress, 6)
403+
404+
// Verify they've diverged
405+
if cm2.Tip() == cm3.Tip() {
406+
t.Fatal("chains should have diverged")
407+
}
408+
409+
// Connect s1 to both s2 and s3
410+
// s1 will get headers from s3 (longer chain) and may ask s2 for blocks
411+
if _, err := s1.Connect(context.Background(), s2.Addr()); err != nil {
412+
t.Fatal(err)
413+
}
414+
if _, err := s1.Connect(context.Background(), s3.Addr()); err != nil {
415+
t.Fatal(err)
416+
}
417+
synced(t, cm1, cm2, cm3)
418+
}

wallet/wallet_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2159,6 +2159,7 @@ func TestSplitUTXO(t *testing.T) {
21592159
if err != nil {
21602160
t.Fatal(err)
21612161
}
2162+
defer w.Close()
21622163

21632164
largestUTXO := cm.TipState().BlockReward().Sub(w.RecommendedFee().Mul64(estimatedTxnSize)) // miner fee is subtracted
21642165
// fund the wallet

0 commit comments

Comments
 (0)