Skip to content

Add zone label to ring_members metric. #6900

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
* [ENHANCEMENT] Distributor: Add native histograms max sample size bytes limit validation. #6834
* [ENHANCEMENT] Querier: Support caching parquet labels file in parquet queryable. #6835
* [ENHANCEMENT] Querier: Support query limits in parquet queryable. #6870
* [ENHANCEMENT] Ring: Add zone label to ring_members metric. #6900
* [ENHANCEMENT] Ingester: Add new metric `cortex_ingester_push_errors_total` to track reasons for ingester request failures. #6901
* [BUGFIX] Ingester: Avoid error or early throttling when READONLY ingesters are present in the ring. #6517
* [BUGFIX] Ingester: Fix labelset data race condition. #6573
Expand Down
28 changes: 21 additions & 7 deletions pkg/ring/ring.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ type Ring struct {

// List of zones for which there's at least 1 instance in the ring. This list is guaranteed
// to be sorted alphabetically.
ringZones []string
ringZones []string
previousRingZones []string

// Cache of shuffle-sharded subrings per identifier. Invalidated when topology changes.
// If set to nil, no caching is done (used by tests, and subrings).
Expand Down Expand Up @@ -262,7 +263,7 @@ func NewWithStoreClientAndStrategy(cfg Config, name, key string, store kv.Client
Name: "ring_members",
Help: "Number of members in the ring",
ConstLabels: map[string]string{"name": name}},
[]string{"state"}),
[]string{"state", "zone"}),
totalTokensGauge: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
Name: "ring_tokens_total",
Help: "Number of tokens in the ring",
Expand Down Expand Up @@ -362,6 +363,7 @@ func (r *Ring) updateRingState(ringDesc *Desc) {
r.ringTokensByZone = ringTokensByZone
r.ringInstanceByToken = ringInstanceByToken
r.ringInstanceIdByAddr = ringInstanceByAddr
r.previousRingZones = r.ringZones
r.ringZones = ringZones
r.lastTopologyChange = now
if r.shuffledSubringCache != nil {
Expand Down Expand Up @@ -665,12 +667,19 @@ func (r *Ring) updateRingMetrics(compareResult CompareResult) {
return
}

numByState := map[string]int{}
numByStateByZone := map[string]map[string]int{}
oldestTimestampByState := map[string]int64{}

// Initialized to zero so we emit zero-metrics (instead of not emitting anything)
for _, s := range []string{unhealthy, ACTIVE.String(), LEAVING.String(), PENDING.String(), JOINING.String(), READONLY.String()} {
numByState[s] = 0
numByStateByZone[s] = map[string]int{}
// Also seed zones from the previous ring state so that zones removed since then are reported as zero.
for _, zone := range r.previousRingZones {
numByStateByZone[s][zone] = 0
}
for _, zone := range r.ringZones {
numByStateByZone[s][zone] = 0
}
oldestTimestampByState[s] = 0
}

Expand All @@ -679,14 +688,19 @@ func (r *Ring) updateRingMetrics(compareResult CompareResult) {
if !r.IsHealthy(&instance, Reporting, r.KVClient.LastUpdateTime(r.key)) {
s = unhealthy
}
numByState[s]++
if _, ok := numByStateByZone[s]; !ok {
numByStateByZone[s] = map[string]int{}
}
numByStateByZone[s][instance.Zone]++
if oldestTimestampByState[s] == 0 || instance.Timestamp < oldestTimestampByState[s] {
oldestTimestampByState[s] = instance.Timestamp
}
}

for state, count := range numByState {
r.numMembersGaugeVec.WithLabelValues(state).Set(float64(count))
for state, zones := range numByStateByZone {
for zone, count := range zones {
r.numMembersGaugeVec.WithLabelValues(state, zone).Set(float64(count))
}
}
for state, timestamp := range oldestTimestampByState {
r.oldestTimestampGaugeVec.WithLabelValues(state).Set(float64(timestamp))
Expand Down
166 changes: 142 additions & 24 deletions pkg/ring/ring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3202,12 +3202,12 @@ func TestUpdateMetrics(t *testing.T) {
ring_member_ownership_percent{member="B",name="test"} 0.5000000002328306
# HELP ring_members Number of members in the ring
# TYPE ring_members gauge
ring_members{name="test",state="ACTIVE"} 2
ring_members{name="test",state="JOINING"} 0
ring_members{name="test",state="LEAVING"} 0
ring_members{name="test",state="PENDING"} 0
ring_members{name="test",state="READONLY"} 0
ring_members{name="test",state="Unhealthy"} 0
ring_members{name="test",state="ACTIVE",zone=""} 2
ring_members{name="test",state="JOINING",zone=""} 0
ring_members{name="test",state="LEAVING",zone=""} 0
ring_members{name="test",state="PENDING",zone=""} 0
ring_members{name="test",state="READONLY",zone=""} 0
ring_members{name="test",state="Unhealthy",zone=""} 0
# HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring.
# TYPE ring_oldest_member_timestamp gauge
ring_oldest_member_timestamp{name="test",state="ACTIVE"} 11
Expand All @@ -3230,12 +3230,12 @@ func TestUpdateMetrics(t *testing.T) {
Expected: `
# HELP ring_members Number of members in the ring
# TYPE ring_members gauge
ring_members{name="test",state="ACTIVE"} 2
ring_members{name="test",state="JOINING"} 0
ring_members{name="test",state="LEAVING"} 0
ring_members{name="test",state="PENDING"} 0
ring_members{name="test",state="READONLY"} 0
ring_members{name="test",state="Unhealthy"} 0
ring_members{name="test",state="ACTIVE",zone=""} 2
ring_members{name="test",state="JOINING",zone=""} 0
ring_members{name="test",state="LEAVING",zone=""} 0
ring_members{name="test",state="PENDING",zone=""} 0
ring_members{name="test",state="READONLY",zone=""} 0
ring_members{name="test",state="Unhealthy",zone=""} 0
# HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring.
# TYPE ring_oldest_member_timestamp gauge
ring_oldest_member_timestamp{name="test",state="ACTIVE"} 11
Expand Down Expand Up @@ -3310,12 +3310,12 @@ func TestUpdateMetricsWithRemoval(t *testing.T) {
ring_member_ownership_percent{member="B",name="test"} 0.5000000002328306
# HELP ring_members Number of members in the ring
# TYPE ring_members gauge
ring_members{name="test",state="ACTIVE"} 2
ring_members{name="test",state="JOINING"} 0
ring_members{name="test",state="LEAVING"} 0
ring_members{name="test",state="PENDING"} 0
ring_members{name="test",state="READONLY"} 0
ring_members{name="test",state="Unhealthy"} 0
ring_members{name="test",state="ACTIVE",zone=""} 2
ring_members{name="test",state="JOINING",zone=""} 0
ring_members{name="test",state="LEAVING",zone=""} 0
ring_members{name="test",state="PENDING",zone=""} 0
ring_members{name="test",state="READONLY",zone=""} 0
ring_members{name="test",state="Unhealthy",zone=""} 0
# HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring.
# TYPE ring_oldest_member_timestamp gauge
ring_oldest_member_timestamp{name="test",state="ACTIVE"} 11
Expand Down Expand Up @@ -3347,12 +3347,130 @@ func TestUpdateMetricsWithRemoval(t *testing.T) {
ring_member_ownership_percent{member="A",name="test"} 1
# HELP ring_members Number of members in the ring
# TYPE ring_members gauge
ring_members{name="test",state="ACTIVE"} 1
ring_members{name="test",state="JOINING"} 0
ring_members{name="test",state="LEAVING"} 0
ring_members{name="test",state="PENDING"} 0
ring_members{name="test",state="READONLY"} 0
ring_members{name="test",state="Unhealthy"} 0
ring_members{name="test",state="ACTIVE",zone=""} 1
ring_members{name="test",state="JOINING",zone=""} 0
ring_members{name="test",state="LEAVING",zone=""} 0
ring_members{name="test",state="PENDING",zone=""} 0
ring_members{name="test",state="READONLY",zone=""} 0
ring_members{name="test",state="Unhealthy",zone=""} 0
# HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring.
# TYPE ring_oldest_member_timestamp gauge
ring_oldest_member_timestamp{name="test",state="ACTIVE"} 22
ring_oldest_member_timestamp{name="test",state="JOINING"} 0
ring_oldest_member_timestamp{name="test",state="LEAVING"} 0
ring_oldest_member_timestamp{name="test",state="PENDING"} 0
ring_oldest_member_timestamp{name="test",state="READONLY"} 0
ring_oldest_member_timestamp{name="test",state="Unhealthy"} 0
# HELP ring_tokens_owned The number of tokens in the ring owned by the member
# TYPE ring_tokens_owned gauge
ring_tokens_owned{member="A",name="test"} 2
# HELP ring_tokens_total Number of tokens in the ring
# TYPE ring_tokens_total gauge
ring_tokens_total{name="test"} 2
`))
assert.NoError(t, err)
}

func TestUpdateMetricsWithZone(t *testing.T) {
cfg := Config{
KVStore: kv.Config{},
HeartbeatTimeout: 0, // get healthy stats
ReplicationFactor: 3,
ZoneAwarenessEnabled: true,
DetailedMetricsEnabled: true,
}

registry := prometheus.NewRegistry()

// create the ring to set up metrics, but do not start
ring, err := NewWithStoreClientAndStrategy(cfg, testRingName, testRingKey, &MockClient{}, NewDefaultReplicationStrategy(), registry, log.NewNopLogger())
require.NoError(t, err)

ringDesc := Desc{
Ingesters: map[string]InstanceDesc{
"A": {Addr: "127.0.0.1", Timestamp: 22, Zone: "zone1", Tokens: []uint32{math.MaxUint32 / 6, (math.MaxUint32 / 6) * 4}},
"B": {Addr: "127.0.0.2", Timestamp: 11, Zone: "zone2", Tokens: []uint32{(math.MaxUint32 / 6) * 2, (math.MaxUint32 / 6) * 5}},
"C": {Addr: "127.0.0.3", Timestamp: 33, Zone: "zone3", Tokens: []uint32{(math.MaxUint32 / 6) * 3, math.MaxUint32}},
},
}
ring.updateRingState(&ringDesc)

err = testutil.GatherAndCompare(registry, bytes.NewBufferString(`
# HELP ring_member_ownership_percent The percent ownership of the ring by member
# TYPE ring_member_ownership_percent gauge
ring_member_ownership_percent{member="A",name="test"} 0.3333333332557231
ring_member_ownership_percent{member="B",name="test"} 0.3333333330228925
ring_member_ownership_percent{member="C",name="test"} 0.3333333337213844
# HELP ring_members Number of members in the ring
# TYPE ring_members gauge
ring_members{name="test",state="ACTIVE",zone="zone1"} 1
ring_members{name="test",state="ACTIVE",zone="zone2"} 1
ring_members{name="test",state="ACTIVE",zone="zone3"} 1
ring_members{name="test",state="JOINING",zone="zone1"} 0
ring_members{name="test",state="JOINING",zone="zone2"} 0
ring_members{name="test",state="JOINING",zone="zone3"} 0
ring_members{name="test",state="LEAVING",zone="zone1"} 0
ring_members{name="test",state="LEAVING",zone="zone2"} 0
ring_members{name="test",state="LEAVING",zone="zone3"} 0
ring_members{name="test",state="PENDING",zone="zone1"} 0
ring_members{name="test",state="PENDING",zone="zone2"} 0
ring_members{name="test",state="PENDING",zone="zone3"} 0
ring_members{name="test",state="READONLY",zone="zone1"} 0
ring_members{name="test",state="READONLY",zone="zone2"} 0
ring_members{name="test",state="READONLY",zone="zone3"} 0
ring_members{name="test",state="Unhealthy",zone="zone1"} 0
ring_members{name="test",state="Unhealthy",zone="zone2"} 0
ring_members{name="test",state="Unhealthy",zone="zone3"} 0
# HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring.
# TYPE ring_oldest_member_timestamp gauge
ring_oldest_member_timestamp{name="test",state="ACTIVE"} 11
ring_oldest_member_timestamp{name="test",state="JOINING"} 0
ring_oldest_member_timestamp{name="test",state="LEAVING"} 0
ring_oldest_member_timestamp{name="test",state="PENDING"} 0
ring_oldest_member_timestamp{name="test",state="READONLY"} 0
ring_oldest_member_timestamp{name="test",state="Unhealthy"} 0
# HELP ring_tokens_owned The number of tokens in the ring owned by the member
# TYPE ring_tokens_owned gauge
ring_tokens_owned{member="A",name="test"} 2
ring_tokens_owned{member="B",name="test"} 2
ring_tokens_owned{member="C",name="test"} 2
# HELP ring_tokens_total Number of tokens in the ring
# TYPE ring_tokens_total gauge
ring_tokens_total{name="test"} 6
`))
require.NoError(t, err)

ringDescNew := Desc{
Ingesters: map[string]InstanceDesc{
"A": {Addr: "127.0.0.1", Timestamp: 22, Zone: "zone1", Tokens: []uint32{math.MaxUint32 / 6, (math.MaxUint32 / 6) * 4}},
},
}
ring.updateRingState(&ringDescNew)

err = testutil.GatherAndCompare(registry, bytes.NewBufferString(`
# HELP ring_member_ownership_percent The percent ownership of the ring by member
# TYPE ring_member_ownership_percent gauge
ring_member_ownership_percent{member="A",name="test"} 1
# HELP ring_members Number of members in the ring
# TYPE ring_members gauge
ring_members{name="test",state="ACTIVE",zone="zone1"} 1
ring_members{name="test",state="ACTIVE",zone="zone2"} 0
ring_members{name="test",state="ACTIVE",zone="zone3"} 0
ring_members{name="test",state="JOINING",zone="zone1"} 0
ring_members{name="test",state="JOINING",zone="zone2"} 0
ring_members{name="test",state="JOINING",zone="zone3"} 0
ring_members{name="test",state="LEAVING",zone="zone1"} 0
ring_members{name="test",state="LEAVING",zone="zone2"} 0
ring_members{name="test",state="LEAVING",zone="zone3"} 0
ring_members{name="test",state="PENDING",zone="zone1"} 0
ring_members{name="test",state="PENDING",zone="zone2"} 0
ring_members{name="test",state="PENDING",zone="zone3"} 0
ring_members{name="test",state="READONLY",zone="zone1"} 0
ring_members{name="test",state="READONLY",zone="zone2"} 0
ring_members{name="test",state="READONLY",zone="zone3"} 0
ring_members{name="test",state="Unhealthy",zone="zone1"} 0
ring_members{name="test",state="Unhealthy",zone="zone2"} 0
ring_members{name="test",state="Unhealthy",zone="zone3"} 0
# HELP ring_oldest_member_timestamp Timestamp of the oldest member in the ring.
# TYPE ring_oldest_member_timestamp gauge
ring_oldest_member_timestamp{name="test",state="ACTIVE"} 22
Expand Down
Loading