Skip to content

Commit b5403db

Browse files
committed
Descriptive stream & consumer health errors
This should make the output of `healthz` less opaque when there are stream or consumer healthcheck errors. Signed-off-by: Neil Twigg <[email protected]>
1 parent 52df7e6 commit b5403db

File tree

2 files changed

+70
-65
lines changed

2 files changed

+70
-65
lines changed

server/jetstream_cluster.go

+64-59
Original file line numberDiff line numberDiff line change
@@ -445,108 +445,113 @@ func (cc *jetStreamCluster) isStreamCurrent(account, stream string) bool {
445445

446446
// isStreamHealthy will determine if the stream is up to date or very close.
447447
// For R1 it will make sure the stream is present on this server.
448-
func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) bool {
448+
func (js *jetStream) isStreamHealthy(acc *Account, sa *streamAssignment) error {
449449
js.mu.RLock()
450450
s, cc := js.srv, js.cluster
451451
if cc == nil {
452452
// Non-clustered mode
453453
js.mu.RUnlock()
454-
return true
454+
return nil
455455
}
456-
457-
// Pull the group out.
458-
rg := sa.Group
459-
if rg == nil {
456+
if sa == nil || sa.Group == nil {
460457
js.mu.RUnlock()
461-
return false
458+
return fmt.Errorf("stream assignment or group missing")
462459
}
463-
464460
streamName := sa.Config.Name
465-
node := rg.node
461+
node := sa.Group.node
466462
js.mu.RUnlock()
467463

468464
// First look up the stream and make sure it's there.
469465
mset, err := acc.lookupStream(streamName)
470466
if err != nil {
471-
return false
467+
return fmt.Errorf("stream not found")
472468
}
473469

474-
// If R1 we are good.
475-
if node == nil {
476-
return true
477-
}
470+
switch {
471+
case mset.cfg.Replicas <= 1:
472+
return nil // No further checks for R=1 streams
478473

479-
// Here we are a replicated stream.
480-
// First make sure our monitor routine is running.
481-
if !mset.isMonitorRunning() {
482-
return false
483-
}
474+
case node == nil:
475+
return fmt.Errorf("group node missing")
484476

485-
if node.Healthy() {
486-
// Check if we are processing a snapshot and are catching up.
487-
if !mset.isCatchingUp() {
488-
return true
489-
}
490-
} else { // node != nil
491-
if node != mset.raftNode() {
492-
s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName)
493-
node.Delete()
494-
mset.resetClusteredState(nil)
495-
}
477+
case !mset.isMonitorRunning():
478+
return fmt.Errorf("monitor goroutine not running")
479+
480+
case !node.Healthy():
481+
return fmt.Errorf("group node unhealthy")
482+
483+
case mset.isCatchingUp():
484+
return fmt.Errorf("stream catching up")
485+
486+
case node != mset.raftNode():
487+
s.Warnf("Detected stream cluster node skew '%s > %s'", acc.GetName(), streamName)
488+
node.Delete()
489+
mset.resetClusteredState(nil)
490+
return fmt.Errorf("cluster node skew detected")
491+
492+
default:
493+
return nil
496494
}
497-
return false
498495
}
499496

500497
// isConsumerHealthy will determine if the consumer is up to date.
501498
// For R1 it will make sure the consumer is present on this server.
502-
func (js *jetStream) isConsumerHealthy(mset *stream, consumer string, ca *consumerAssignment) bool {
499+
func (js *jetStream) isConsumerHealthy(mset *stream, consumer string, ca *consumerAssignment) error {
503500
if mset == nil {
504-
return false
501+
return fmt.Errorf("stream missing")
505502
}
506-
507503
js.mu.RLock()
508-
cc := js.cluster
504+
s, cc := js.srv, js.cluster
509505
if cc == nil {
510506
// Non-clustered mode
511507
js.mu.RUnlock()
512-
return true
508+
return nil
513509
}
514-
// These are required.
515510
if ca == nil || ca.Group == nil {
516511
js.mu.RUnlock()
517-
return false
512+
return fmt.Errorf("consumer assignment or group missing")
518513
}
519-
s := js.srv
520-
// Capture RAFT node from assignment.
521514
node := ca.Group.node
522515
js.mu.RUnlock()
523516

524517
// Check if not running at all.
525518
o := mset.lookupConsumer(consumer)
526519
if o == nil {
527-
return false
520+
return fmt.Errorf("consumer not found")
528521
}
529522

530-
// Check RAFT node state.
531-
if node == nil || node.Healthy() {
532-
return true
533-
} else if node != nil {
534-
if node != o.raftNode() {
535-
mset.mu.RLock()
536-
accName, streamName := mset.acc.GetName(), mset.cfg.Name
537-
mset.mu.RUnlock()
538-
s.Warnf("Detected consumer cluster node skew '%s > %s > %s'", accName, streamName, consumer)
539-
node.Delete()
540-
o.deleteWithoutAdvisory()
523+
rc, _ := o.replica()
524+
switch {
525+
case rc <= 1:
526+
return nil // No further checks for R=1 consumers
541527

542-
// When we try to restart we nil out the node and reprocess the consumer assignment.
543-
js.mu.Lock()
544-
ca.Group.node = nil
545-
js.mu.Unlock()
546-
js.processConsumerAssignment(ca)
547-
}
528+
case node == nil:
529+
return fmt.Errorf("group node missing")
530+
531+
case !o.isMonitorRunning():
532+
return fmt.Errorf("monitor goroutine not running")
533+
534+
case !node.Healthy():
535+
return fmt.Errorf("group node unhealthy")
536+
537+
case node != mset.raftNode():
538+
mset.mu.RLock()
539+
accName, streamName := mset.acc.GetName(), mset.cfg.Name
540+
mset.mu.RUnlock()
541+
s.Warnf("Detected consumer cluster node skew '%s > %s > %s'", accName, streamName, consumer)
542+
node.Delete()
543+
o.deleteWithoutAdvisory()
544+
545+
// When we try to restart we nil out the node and reprocess the consumer assignment.
546+
js.mu.Lock()
547+
ca.Group.node = nil
548+
js.mu.Unlock()
549+
js.processConsumerAssignment(ca)
550+
return fmt.Errorf("cluster node skew detected")
551+
552+
default:
553+
return nil
548554
}
549-
return false
550555
}
551556

552557
// subjectsOverlap checks all existing stream assignments for the account cross-cluster for subject overlap

server/monitor.go

+6-6
Original file line numberDiff line numberDiff line change
@@ -3690,35 +3690,35 @@ func (s *Server) healthz(opts *HealthzOptions) *HealthStatus {
36903690

36913691
for stream, sa := range asa {
36923692
// Make sure we can look up
3693-
if !js.isStreamHealthy(acc, sa) {
3693+
if err := js.isStreamHealthy(acc, sa); err != nil {
36943694
if !details {
36953695
health.Status = na
3696-
health.Error = fmt.Sprintf("JetStream stream '%s > %s' is not current", accName, stream)
3696+
health.Error = fmt.Sprintf("JetStream stream '%s > %s' is not current: %s", accName, stream, err)
36973697
return health
36983698
}
36993699
health.Errors = append(health.Errors, HealthzError{
37003700
Type: HealthzErrorStream,
37013701
Account: accName,
37023702
Stream: stream,
3703-
Error: fmt.Sprintf("JetStream stream '%s > %s' is not current", accName, stream),
3703+
Error: fmt.Sprintf("JetStream stream '%s > %s' is not current: %s", accName, stream, err),
37043704
})
37053705
continue
37063706
}
37073707
mset, _ := acc.lookupStream(stream)
37083708
// Now check consumers.
37093709
for consumer, ca := range sa.consumers {
3710-
if !js.isConsumerHealthy(mset, consumer, ca) {
3710+
if err := js.isConsumerHealthy(mset, consumer, ca); err != nil {
37113711
if !details {
37123712
health.Status = na
3713-
health.Error = fmt.Sprintf("JetStream consumer '%s > %s > %s' is not current", acc, stream, consumer)
3713+
health.Error = fmt.Sprintf("JetStream consumer '%s > %s > %s' is not current: %s", acc, stream, consumer, err)
37143714
return health
37153715
}
37163716
health.Errors = append(health.Errors, HealthzError{
37173717
Type: HealthzErrorConsumer,
37183718
Account: accName,
37193719
Stream: stream,
37203720
Consumer: consumer,
3721-
Error: fmt.Sprintf("JetStream consumer '%s > %s > %s' is not current", acc, stream, consumer),
3721+
Error: fmt.Sprintf("JetStream consumer '%s > %s > %s' is not current: %s", acc, stream, consumer, err),
37223722
})
37233723
}
37243724
}

0 commit comments

Comments
 (0)