Skip to content

Commit 4ad14e0

Browse files
authored
add sensor for down nodes (#11592)
1 parent be454f6 commit 4ad14e0

File tree

5 files changed

+25
-0
lines changed

5 files changed

+25
-0
lines changed

ydb/core/mind/hive/hive_impl.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1723,6 +1723,22 @@ void THive::UpdateCounterTabletChannelHistorySize() {
17231723
}
17241724
}
17251725

1726+
// Adds the signed delta `nodesDownDiff` to the NHive::COUNTER_NODES_DOWN
// simple counter. Does nothing when TabletCounters is null.
void THive::UpdateCounterNodesDown(i64 nodesDownDiff) {
    if (TabletCounters == nullptr) {
        return;
    }
    auto& nodesDown = TabletCounters->Simple()[NHive::COUNTER_NODES_DOWN];
    // Read-modify-write: the counter is accessed only through Get()/Set() here.
    nodesDown.Set(nodesDown.Get() + nodesDownDiff);
}
1733+
1734+
// Adds the signed delta `nodesFrozenDiff` to the NHive::COUNTER_NODES_FROZEN
// simple counter. Does nothing when TabletCounters is null.
void THive::UpdateCounterNodesFrozen(i64 nodesFrozenDiff) {
    if (TabletCounters == nullptr) {
        return;
    }
    auto& nodesFrozen = TabletCounters->Simple()[NHive::COUNTER_NODES_FROZEN];
    // Read-modify-write: the counter is accessed only through Get()/Set() here.
    nodesFrozen.Set(nodesFrozen.Get() + nodesFrozenDiff);
}
1741+
17261742
void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
17271743
TabletMoveHistory.PushBack(moveInfo);
17281744
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);

ydb/core/mind/hive/hive_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -679,6 +679,8 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
679679
void UpdateCounterTabletsStarting(i64 tabletsStartingDiff);
680680
void UpdateCounterPingQueueSize();
681681
void UpdateCounterTabletChannelHistorySize();
682+
void UpdateCounterNodesDown(i64 nodesDownDiff);
683+
void UpdateCounterNodesFrozen(i64 nodesFrozenDiff);
682684
void RecordTabletMove(const TTabletMoveInfo& info);
683685
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
684686
void ProcessBootQueue();

ydb/core/mind/hive/node_info.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ void TNodeInfo::SendReconnect(const TActorId& local) {
360360
}
361361

362362
void TNodeInfo::SetDown(bool down) {
363+
Hive.UpdateCounterNodesDown(static_cast<i64>(down) - static_cast<i64>(Down));
363364
Down = down;
364365
if (Down) {
365366
Hive.ObjectDistributions.RemoveNode(*this);
@@ -370,6 +371,7 @@ void TNodeInfo::SetDown(bool down) {
370371
}
371372

372373
void TNodeInfo::SetFreeze(bool freeze) {
374+
Hive.UpdateCounterNodesFrozen(static_cast<i64>(freeze) - static_cast<i64>(Freeze));
373375
Freeze = freeze;
374376
if (Freeze) {
375377
for (const auto& [state, tablets] : Tablets) {

ydb/core/mind/hive/tx__load_everything.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,9 @@ class TTxLoadEverything : public TTransactionBase<THive> {
326326
// That was not persisted to avoid issues with downgrades
327327
node.Down = true;
328328
}
329+
if (node.Down) {
330+
Self->UpdateCounterNodesDown(+1);
331+
}
329332
if (nodeRowset.HaveValue<Schema::Node::Location>()) {
330333
auto location = nodeRowset.GetValue<Schema::Node::Location>();
331334
if (location.HasDataCenter()) {

ydb/core/protos/counters_hive.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ enum ESimpleCounters {
3434
COUNTER_NODES_RECOMMENDED = 24 [(CounterOpts) = {Name: "NodesRecommended"}];
3535
COUNTER_NODES_RECOMMENDED_DRY_RUN = 25 [(CounterOpts) = {Name: "NodesRecommendedDryRun"}];
3636
COUNTER_AVG_CPU_UTILIZATION = 26 [(CounterOpts) = {Name: "AvgCPUUtilization"}];
37+
COUNTER_NODES_DOWN = 27 [(CounterOpts) = {Name: "NodesDown"}];
38+
COUNTER_NODES_FROZEN = 28 [(CounterOpts) = {Name: "NodesFrozen"}];
3739
}
3840

3941
enum ECumulativeCounters {

0 commit comments

Comments
 (0)