@@ -588,24 +588,24 @@ func (r *raft) send(m pb.Message) {
588
588
}
589
589
}
590
590
591
- // sendAppend sends an append RPC with new entries (if any) and the
592
- // current commit index to the given peer.
593
- func (r * raft ) sendAppend (to uint64 ) {
594
- r .maybeSendAppend (to , true )
595
- }
596
-
597
- // maybeSendAppend sends an append RPC with new entries to the given peer,
598
- // if necessary. Returns true if a message was sent. The sendIfEmpty
599
- // argument controls whether messages with no entries will be sent
600
- // ("empty" messages are useful to convey updated Commit indexes, but
601
- // are undesirable when we're sending multiple messages in a batch).
591
+ // maybeSendAppend sends an append RPC with log entries (if any) that are not
592
+ // yet known to be replicated in the given peer's log, as well as the current
593
+ // commit index. Usually it sends a MsgApp message, but in some cases (e.g. the
594
+ // log has been compacted) it can send a MsgSnap.
595
+ //
596
+ // In some cases, the MsgApp message can have zero entries, and yet be sent.
597
+ // When the follower log is not fully up-to-date, we must send a MsgApp
598
+ // periodically so that eventually the flow is either accepted or rejected. Not
599
+ // doing so can result in a replication stall in cases when a MsgApp is dropped.
602
600
//
603
- // TODO(pav-kv): make invocation of maybeSendAppend stateless. The Progress
604
- // struct contains all the state necessary for deciding whether to send a
605
- // message.
606
- func (r * raft ) maybeSendAppend (to uint64 , sendIfEmpty bool ) bool {
601
+ // Returns true if a message was sent, or false otherwise. A message is not sent
602
+ // if the follower log and commit index are up-to-date, the flow is paused (for
603
+ // reasons like in-flight limits), or the message could not be constructed.
604
+ func (r * raft ) maybeSendAppend (to uint64 ) bool {
607
605
pr := r .trk .Progress [to ]
608
- if pr .IsPaused () {
606
+
607
+ last , commit := r .raftLog .lastIndex (), r .raftLog .committed
608
+ if ! pr .ShouldSendMsgApp (last , commit ) {
609
609
return false
610
610
}
611
611
@@ -617,35 +617,25 @@ func (r *raft) maybeSendAppend(to uint64, sendIfEmpty bool) bool {
617
617
return r .maybeSendSnapshot (to , pr )
618
618
}
619
619
620
- var ents []pb.Entry
621
- // In a throttled StateReplicate only send empty MsgApp, to ensure progress.
622
- // Otherwise, if we had a full Inflights and all inflight messages were in
623
- // fact dropped, replication to that follower would stall. Instead, an empty
624
- // MsgApp will eventually reach the follower (heartbeats responses prompt the
625
- // leader to send an append), allowing it to be acked or rejected, both of
626
- // which will clear out Inflights.
627
- if pr .State != tracker .StateReplicate || ! pr .Inflights .Full () {
628
- ents , err = r .raftLog .entries (pr .Next , r .maxMsgSize )
629
- }
630
- if len (ents ) == 0 && ! sendIfEmpty {
631
- return false
632
- }
633
- // TODO(pav-kv): move this check up to where err is returned.
634
- if err != nil { // send a snapshot if we failed to get the entries
635
- return r .maybeSendSnapshot (to , pr )
620
+ var entries []pb.Entry
621
+ if pr .CanSendEntries (last ) {
622
+ if entries , err = r .raftLog .entries (pr .Next , r .maxMsgSize ); err != nil {
623
+ // Send a snapshot if we failed to get the entries.
624
+ return r .maybeSendSnapshot (to , pr )
625
+ }
636
626
}
637
627
638
- // Send the actual MsgApp otherwise , and update the progress accordingly.
628
+ // Send the MsgApp, and update the progress accordingly.
639
629
r .send (pb.Message {
640
630
To : to ,
641
631
Type : pb .MsgApp ,
642
632
Index : prevIndex ,
643
633
LogTerm : prevTerm ,
644
- Entries : ents ,
645
- Commit : r . raftLog . committed ,
634
+ Entries : entries ,
635
+ Commit : commit ,
646
636
})
647
- pr .SentEntries (len (ents ), uint64 (payloadsSize (ents )))
648
- pr .SentCommit (r . raftLog . committed )
637
+ pr .SentEntries (len (entries ), uint64 (payloadsSize (entries )))
638
+ pr .SentCommit (commit )
649
639
return true
650
640
}
651
641
@@ -704,7 +694,7 @@ func (r *raft) bcastAppend() {
704
694
if id == r .id {
705
695
return
706
696
}
707
- r .sendAppend (id )
697
+ r .maybeSendAppend (id )
708
698
})
709
699
}
710
700
@@ -1482,7 +1472,7 @@ func stepLeader(r *raft, m pb.Message) error {
1482
1472
if pr .State == tracker .StateReplicate {
1483
1473
pr .BecomeProbe ()
1484
1474
}
1485
- r .sendAppend (m .From )
1475
+ r .maybeSendAppend (m .From )
1486
1476
}
1487
1477
} else {
1488
1478
// We want to update our tracking if the response updates our
@@ -1523,21 +1513,13 @@ func stepLeader(r *raft, m pb.Message) error {
1523
1513
// to respond to pending read index requests
1524
1514
releasePendingReadIndexMessages (r )
1525
1515
r .bcastAppend ()
1526
- } else if r .id != m .From && pr .CanBumpCommit (r .raftLog .committed ) {
1527
- // This node may be missing the latest commit index, so send it.
1528
- // NB: this is not strictly necessary because the periodic heartbeat
1529
- // messages deliver commit indices too. However, a message sent now
1530
- // may arrive earlier than the next heartbeat fires.
1531
- r .sendAppend (m .From )
1532
1516
}
1533
- // We've updated flow control information above, which may
1534
- // allow us to send multiple (size-limited) in-flight messages
1535
- // at once (such as when transitioning from probe to
1536
- // replicate, or when freeTo() covers multiple messages). If
1537
- // we have more entries to send, send as many messages as we
1538
- // can (without sending empty messages for the commit index)
1517
+ // We've updated flow control information above, which may allow us to
1518
+ // send multiple (size-limited) in-flight messages at once (such as when
1519
+ // transitioning from probe to replicate, or when freeTo() covers
1520
+ // multiple messages). Send as many messages as we can.
1539
1521
if r .id != m .From {
1540
- for r .maybeSendAppend (m .From , false /* sendIfEmpty */ ) {
1522
+ for r .maybeSendAppend (m .From ) {
1541
1523
}
1542
1524
}
1543
1525
// Transfer leadership is in progress.
@@ -1549,24 +1531,8 @@ func stepLeader(r *raft, m pb.Message) error {
1549
1531
}
1550
1532
case pb .MsgHeartbeatResp :
1551
1533
pr .RecentActive = true
1552
- pr .MsgAppFlowPaused = false
1553
-
1554
- // NB: if the follower is paused (full Inflights), this will still send an
1555
- // empty append, allowing it to recover from situations in which all the
1556
- // messages that filled up Inflights in the first place were dropped. Note
1557
- // also that the outgoing heartbeat already communicated the commit index.
1558
- //
1559
- // If the follower is fully caught up but also in StateProbe (as can happen
1560
- // if ReportUnreachable was called), we also want to send an append (it will
1561
- // be empty) to allow the follower to transition back to StateReplicate once
1562
- // it responds.
1563
- //
1564
- // Note that StateSnapshot typically satisfies pr.Match < lastIndex, but
1565
- // `pr.Paused()` is always true for StateSnapshot, so sendAppend is a
1566
- // no-op.
1567
- if pr .Match < r .raftLog .lastIndex () || pr .State == tracker .StateProbe {
1568
- r .sendAppend (m .From )
1569
- }
1534
+ pr .PauseMsgAppProbes (false )
1535
+ r .maybeSendAppend (m .From )
1570
1536
1571
1537
if r .readOnly .option != ReadOnlySafe || len (m .Context ) == 0 {
1572
1538
return nil
@@ -1636,7 +1602,8 @@ func stepLeader(r *raft, m pb.Message) error {
1636
1602
r .sendTimeoutNow (leadTransferee )
1637
1603
r .logger .Infof ("%x sends MsgTimeoutNow to %x immediately as %x already has up-to-date log" , r .id , leadTransferee , leadTransferee )
1638
1604
} else {
1639
- r .sendAppend (leadTransferee )
1605
+ pr .PauseMsgAppProbes (false )
1606
+ r .maybeSendAppend (leadTransferee )
1640
1607
}
1641
1608
}
1642
1609
return nil
@@ -1984,21 +1951,14 @@ func (r *raft) switchToConfig(cfg tracker.Config, trk tracker.ProgressMap) pb.Co
1984
1951
return cs
1985
1952
}
1986
1953
1987
- if r .maybeCommit () {
1988
- // If the configuration change means that more entries are committed now,
1989
- // broadcast/append to everyone in the updated config.
1990
- r .bcastAppend ()
1991
- } else {
1992
- // Otherwise, still probe the newly added replicas; there's no reason to
1993
- // let them wait out a heartbeat interval (or the next incoming
1994
- // proposal).
1995
- r .trk .Visit (func (id uint64 , pr * tracker.Progress ) {
1996
- if id == r .id {
1997
- return
1998
- }
1999
- r .maybeSendAppend (id , false /* sendIfEmpty */ )
2000
- })
2001
- }
1954
+ r .maybeCommit ()
1955
+ // If the configuration change means that more entries are committed now,
1956
+ // broadcast/append to everyone in the updated config.
1957
+ //
1958
+ // Otherwise, still probe the newly added replicas; there's no reason to let
1959
+ // them wait out a heartbeat interval (or the next incoming proposal).
1960
+ r .bcastAppend ()
1961
+
2002
1962
// If the leadTransferee was removed or demoted, abort the leadership transfer.
2003
1963
if _ , tOK := r .trk .Config .Voters .IDs ()[r .leadTransferee ]; ! tOK && r .leadTransferee != 0 {
2004
1964
r .abortLeaderTransfer ()
0 commit comments