@@ -588,54 +588,60 @@ func (r *raft) send(m pb.Message) {
588
588
}
589
589
}
590
590
591
- // sendAppend sends an append RPC with new entries (if any) and the
592
- // current commit index to the given peer.
593
- func (r * raft ) sendAppend (to uint64 ) {
594
- r .maybeSendAppend (to , true )
595
- }
596
-
597
- // maybeSendAppend sends an append RPC with new entries to the given peer,
598
- // if necessary. Returns true if a message was sent. The sendIfEmpty
599
- // argument controls whether messages with no entries will be sent
600
- // ("empty" messages are useful to convey updated Commit indexes, but
601
- // are undesirable when we're sending multiple messages in a batch).
591
+ // sendAppend sends an append RPC with new entries to the given peer, if
592
+ // necessary. Returns true if a message was sent.
602
593
//
603
- // TODO(pav-kv): make invocation of maybeSendAppend stateless. The Progress
604
- // struct contains all the state necessary for deciding whether to send a
605
- // message.
606
- func (r * raft ) maybeSendAppend (to uint64 , sendIfEmpty bool ) bool {
607
- pr := r .trk .Progress [to ]
608
- if pr .IsPaused () {
594
+ // This may send an empty append message (with no entries) if replication to
595
+ // this follower is throttled, or there are no new entries but the commit index
596
+ // for the follower can be bumped.
597
+ func (r * raft ) sendAppend (to uint64 , pr * tracker.Progress ) bool {
598
+ if pr .State == tracker .StateProbe {
599
+ return ! pr .MsgAppFlowPaused && r .maybeSendAppend (to , pr )
600
+ } else if pr .State != tracker .StateReplicate {
609
601
return false
610
- }
602
+ } // only StateReplicate below
603
+
604
+ // If there are any pending entries and the inflight tracking is not
605
+ // saturated, send a regular append message (or snapshot).
606
+ if pr .Next <= r .raftLog .lastIndex () && ! pr .Inflights .Full () {
607
+ return r .maybeSendAppend (to , pr )
608
+ }
609
+ // NB: the commit index is periodically sent in the heartbeat messages, so
610
+ // technically we don't need the CanBumpCommit clause here to guarantee commit
611
+ // index convergence on the follower. However, sending it via MsgApp here
612
+ // allows faster (no heartbeat interval delay) convergence in some cases.
613
+ if pr .CanBumpCommit (r .raftLog .committed ) {
614
+ return r .maybeSendEmptyAppend (to , pr )
615
+ }
616
+ // In a throttled StateReplicate, send an empty append message if we haven't
617
+ // done so recently.
618
+ //
619
+ // We must send periodic appends so that eventually the follower either
620
+ // accepts or rejects it. If we don't do so, replication can stall if all the
621
+ // in-flight appends are lost/dropped.
622
+ return ! pr .MsgAppFlowPaused && pr .Match < r .raftLog .lastIndex () &&
623
+ r .maybeSendEmptyAppend (to , pr )
624
+ }
611
625
626
+ // maybeSendAppend sends a non-empty append message to the given follower. It
627
+ // may send a snapshot instead if the required section of the log is no longer
628
+ // available in this leader's log. Returns true if a message was sent.
629
+ func (r * raft ) maybeSendAppend (to uint64 , pr * tracker.Progress ) bool {
630
+ // TODO(pav-kv): when pr.Next is updated, we always know the term of entry
631
+ // pr.Next-1, because the previous append message contains it. We should store
632
+ // (Next-1, Term) in Progress, instead of just Next. Then we don't have to
633
+ // fetch the term here, and may avoid an unnecessary snapshot.
612
634
prevIndex := pr .Next - 1
613
635
prevTerm , err := r .raftLog .term (prevIndex )
614
636
if err != nil {
615
637
// The log probably got truncated at >= pr.Next, so we can't catch up the
616
638
// follower log anymore. Send a snapshot instead.
617
639
return r .maybeSendSnapshot (to , pr )
618
640
}
619
-
620
- var ents []pb.Entry
621
- // In a throttled StateReplicate only send empty MsgApp, to ensure progress.
622
- // Otherwise, if we had a full Inflights and all inflight messages were in
623
- // fact dropped, replication to that follower would stall. Instead, an empty
624
- // MsgApp will eventually reach the follower (heartbeats responses prompt the
625
- // leader to send an append), allowing it to be acked or rejected, both of
626
- // which will clear out Inflights.
627
- if pr .State != tracker .StateReplicate || ! pr .Inflights .Full () {
628
- ents , err = r .raftLog .entries (pr .Next , r .maxMsgSize )
629
- }
630
- if len (ents ) == 0 && ! sendIfEmpty {
631
- return false
632
- }
633
- // TODO(pav-kv): move this check up to where err is returned.
641
+ ents , err := r .raftLog .entries (pr .Next , r .maxMsgSize )
634
642
if err != nil { // send a snapshot if we failed to get the entries
635
643
return r .maybeSendSnapshot (to , pr )
636
644
}
637
-
638
- // Send the actual MsgApp otherwise, and update the progress accordingly.
639
645
r .send (pb.Message {
640
646
To : to ,
641
647
Type : pb .MsgApp ,
@@ -649,6 +655,29 @@ func (r *raft) maybeSendAppend(to uint64, sendIfEmpty bool) bool {
649
655
return true
650
656
}
651
657
658
+ func (r * raft ) maybeSendEmptyAppend (to uint64 , pr * tracker.Progress ) bool {
659
+ // TODO(pav-kv): when pr.Next is updated, we always know the term of entry
660
+ // pr.Next-1, because the append message contains it. Store (Next-1, Term) in
661
+ // Progress, instead of just Next. Then we don't have to fetch the term and
662
+ // send a potentially unnecessary snapshot here.
663
+ prevTerm , err := r .raftLog .term (pr .Next - 1 )
664
+ if err != nil {
665
+ // The log probably got truncated at >= pr.Next, so we can't catch up the
666
+ // follower log anymore. Send a snapshot instead.
667
+ return r .maybeSendSnapshot (to , pr )
668
+ }
669
+ r .send (pb.Message {
670
+ To : to ,
671
+ Type : pb .MsgApp ,
672
+ Index : pr .Next - 1 ,
673
+ LogTerm : prevTerm ,
674
+ Commit : r .raftLog .committed ,
675
+ })
676
+ pr .SentEntries (0 , 0 )
677
+ pr .SentCommit (r .raftLog .committed )
678
+ return true
679
+ }
680
+
652
681
// maybeSendSnapshot fetches a snapshot from Storage, and sends it to the given
653
682
// node. Returns true iff the snapshot message has been emitted successfully.
654
683
func (r * raft ) maybeSendSnapshot (to uint64 , pr * tracker.Progress ) bool {
@@ -700,11 +729,11 @@ func (r *raft) sendHeartbeat(to uint64, ctx []byte) {
700
729
// bcastAppend sends RPC, with entries to all peers that are not up-to-date
701
730
// according to the progress recorded in r.trk.
702
731
func (r * raft ) bcastAppend () {
703
- r .trk .Visit (func (id uint64 , _ * tracker.Progress ) {
732
+ r .trk .Visit (func (id uint64 , pr * tracker.Progress ) {
704
733
if id == r .id {
705
734
return
706
735
}
707
- r .sendAppend (id )
736
+ r .sendAppend (id , pr )
708
737
})
709
738
}
710
739
@@ -1482,7 +1511,7 @@ func stepLeader(r *raft, m pb.Message) error {
1482
1511
if pr .State == tracker .StateReplicate {
1483
1512
pr .BecomeProbe ()
1484
1513
}
1485
- r .sendAppend (m .From )
1514
+ r .sendAppend (m .From , pr )
1486
1515
}
1487
1516
} else {
1488
1517
// We want to update our tracking if the response updates our
@@ -1521,21 +1550,13 @@ func stepLeader(r *raft, m pb.Message) error {
1521
1550
// to respond to pending read index requests
1522
1551
releasePendingReadIndexMessages (r )
1523
1552
r .bcastAppend ()
1524
- } else if r .id != m .From && pr .CanBumpCommit (r .raftLog .committed ) {
1525
- // This node may be missing the latest commit index, so send it.
1526
- // NB: this is not strictly necessary because the periodic heartbeat
1527
- // messages deliver commit indices too. However, a message sent now
1528
- // may arrive earlier than the next heartbeat fires.
1529
- r .sendAppend (m .From )
1530
1553
}
1531
- // We've updated flow control information above, which may
1532
- // allow us to send multiple (size-limited) in-flight messages
1533
- // at once (such as when transitioning from probe to
1534
- // replicate, or when freeTo() covers multiple messages). If
1535
- // we have more entries to send, send as many messages as we
1536
- // can (without sending empty messages for the commit index)
1554
+ // We've updated flow control information above, which may allow us to
1555
+ // send multiple (size-limited) in-flight messages at once (such as when
1556
+ // transitioning from StateProbe to StateReplicate). Send as many
1557
+ // messages as we can.
1537
1558
if r .id != m .From {
1538
- for r .maybeSendAppend (m .From , false /* sendIfEmpty */ ) {
1559
+ for r .sendAppend (m .From , pr ) {
1539
1560
}
1540
1561
}
1541
1562
// Transfer leadership is in progress.
@@ -1562,9 +1583,7 @@ func stepLeader(r *raft, m pb.Message) error {
1562
1583
// Note that StateSnapshot typically satisfies pr.Match < lastIndex, but
1563
1584
// sendAppend returns false for any state other than StateProbe and
1564
1585
// StateReplicate, so it is a no-op.
1565
- if pr .Match < r .raftLog .lastIndex () || pr .State == tracker .StateProbe {
1566
- r .sendAppend (m .From )
1567
- }
1586
+ r .sendAppend (m .From , pr )
1568
1587
1569
1588
if r .readOnly .option != ReadOnlySafe || len (m .Context ) == 0 {
1570
1589
return nil
@@ -1634,7 +1653,8 @@ func stepLeader(r *raft, m pb.Message) error {
1634
1653
r .sendTimeoutNow (leadTransferee )
1635
1654
r .logger .Infof ("%x sends MsgTimeoutNow to %x immediately as %x already has up-to-date log" , r .id , leadTransferee , leadTransferee )
1636
1655
} else {
1637
- r .sendAppend (leadTransferee )
1656
+ pr .MsgAppFlowPaused = false // force a MsgApp even if paused
1657
+ r .sendAppend (leadTransferee , pr )
1638
1658
}
1639
1659
}
1640
1660
return nil
@@ -1985,21 +2005,14 @@ func (r *raft) switchToConfig(cfg tracker.Config, trk tracker.ProgressMap) pb.Co
1985
2005
return cs
1986
2006
}
1987
2007
1988
- if r .maybeCommit () {
1989
- // If the configuration change means that more entries are committed now,
1990
- // broadcast/append to everyone in the updated config.
1991
- r .bcastAppend ()
1992
- } else {
1993
- // Otherwise, still probe the newly added replicas; there's no reason to
1994
- // let them wait out a heartbeat interval (or the next incoming
1995
- // proposal).
1996
- r .trk .Visit (func (id uint64 , pr * tracker.Progress ) {
1997
- if id == r .id {
1998
- return
1999
- }
2000
- r .maybeSendAppend (id , false /* sendIfEmpty */ )
2001
- })
2002
- }
2008
+ // If the configuration change means that more entries are committed now,
2009
+ // broadcast/append to everyone in the updated config.
2010
+ //
2011
+ // Otherwise, still probe the newly added replicas; there's no reason to let
2012
+ // them wait out a heartbeat interval (or the next incoming proposal).
2013
+ r .maybeCommit ()
2014
+ r .bcastAppend ()
2015
+
2003
2016
// If the leadTransferee was removed or demoted, abort the leadership transfer.
2004
2017
if _ , tOK := r .trk .Config .Voters .IDs ()[r .leadTransferee ]; ! tOK && r .leadTransferee != 0 {
2005
2018
r .abortLeaderTransfer ()
0 commit comments