From 082c5e0cc1a676a18d9b4eadb638209719293f51 Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Sun, 3 Mar 2024 20:01:21 +0200 Subject: [PATCH] chore: add oom stats to /metrics (#2680) * chore: add oom stats to /metrics Expose oom/cmd errors when we reject executing a command if we reached OOM state (controlled by oom_deny_ratio flag). Expose oom/insert errors when we do not insert a new key or do not grow a dashtable (controlled by table_growth_margin). Move OOM command check to a place that covers all types of transactions - including multi and squashing transactions. --------- Signed-off-by: Roman Gershman --- src/server/main_service.cc | 24 +++++++++++++----------- src/server/server_family.cc | 7 +++++++ src/server/server_state.cc | 3 ++- src/server/server_state.h | 3 +++ 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/server/main_service.cc b/src/server/main_service.cc index 60a87983d167..46c1cf245f2f 100644 --- a/src/server/main_service.cc +++ b/src/server/main_service.cc @@ -957,7 +957,19 @@ static optional VerifyConnectionAclStatus(const CommandId* cid, optional Service::VerifyCommandExecution(const CommandId* cid, const ConnectionContext* cntx, CmdArgList tail_args) { - // TODO: Move OOM check here + ServerState& etl = *ServerState::tlocal(); + + if ((cid->opt_mask() & CO::DENYOOM) && etl.is_master) { + uint64_t start_ns = absl::GetCurrentTimeNanos(); + + uint64_t used_memory = etl.GetUsedMemory(start_ns); + double oom_deny_ratio = GetFlag(FLAGS_oom_deny_ratio); + if (used_memory > (max_memory_limit * oom_deny_ratio)) { + etl.stats.oom_error_cmd_cnt++; + return facade::ErrorReply{kOutOfMemory}; + } + } + return VerifyConnectionAclStatus(cid, cntx, "ACL rules changed between the MULTI and EXEC", tail_args); } @@ -1125,16 +1137,6 @@ void Service::DispatchCommand(CmdArgList args, facade::ConnectionContext* cntx) return cntx->SendSimpleString("QUEUED"); } - if (cid->opt_mask() & CO::DENYOOM && etl.is_master) { - uint64_t start_ns = absl::GetCurrentTimeNanos(); - - uint64_t used_memory = etl.GetUsedMemory(start_ns); - double oom_deny_ratio = GetFlag(FLAGS_oom_deny_ratio); - if (used_memory > (max_memory_limit * oom_deny_ratio)) { - return cntx->reply_builder()->SendError(kOutOfMemory); - } - } - // Create command transaction intrusive_ptr dist_trans; diff --git a/src/server/server_family.cc b/src/server/server_family.cc index bc1609b1fc6a..0b9d0e307931 100644 --- a/src/server/server_family.cc +++ b/src/server/server_family.cc @@ -1007,6 +1007,13 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) { &resp->body()); AppendMetricWithoutLabels("memory_max_bytes", "", max_memory_limit, MetricType::GAUGE, &resp->body()); + + if (m.events.insertion_rejections | m.coordinator_stats.oom_error_cmd_cnt) { + AppendMetricValue("oom_errors_total", m.events.insertion_rejections, {"type"}, {"insert"}, + &resp->body()); + AppendMetricValue("oom_errors_total", m.coordinator_stats.oom_error_cmd_cnt, {"type"}, {"cmd"}, + &resp->body()); + } if (sdata_res.has_value()) { size_t rss = sdata_res->vm_rss + sdata_res->hugetlb_pages; AppendMetricWithoutLabels("used_memory_rss_bytes", "", rss, MetricType::GAUGE, &resp->body()); diff --git a/src/server/server_state.cc b/src/server/server_state.cc index 43402c1a4e1a..d117f3be0f4e 100644 --- a/src/server/server_state.cc +++ b/src/server/server_state.cc @@ -48,7 +48,7 @@ auto ServerState::Stats::operator=(Stats&& other) -> Stats& { } ServerState::Stats& ServerState::Stats::Add(unsigned num_shards, const ServerState::Stats& other) { - static_assert(sizeof(Stats) == 14 * 8, "Stats size mismatch"); + static_assert(sizeof(Stats) == 15 * 8, "Stats size mismatch"); for (int i = 0; i < NUM_TX_TYPES; ++i) { this->tx_type_cnt[i] += other.tx_type_cnt[i]; @@ -64,6 +64,7 @@ ServerState::Stats& ServerState::Stats::Add(unsigned num_shards, const ServerSta this->multi_squash_exec_reply_usec += other.multi_squash_exec_reply_usec; this->blocked_on_interpreter += other.blocked_on_interpreter; + this->oom_error_cmd_cnt += other.oom_error_cmd_cnt; if (this->tx_width_freq_arr == nullptr) { this->tx_width_freq_arr = new uint64_t[num_shards]; diff --git a/src/server/server_state.h b/src/server/server_state.h index 79aaa625b8c0..799cd9086c4f 100644 --- a/src/server/server_state.h +++ b/src/server/server_state.h @@ -108,6 +108,9 @@ class ServerState { // public struct - to allow initialization. uint64_t blocked_on_interpreter = 0; + // Number of times we rejected command dispatch due to OOM condition. + uint64_t oom_error_cmd_cnt = 0; + // Array of size of number of shards. // Each entry is how many transactions we had with this width (unique_shard_cnt). uint64_t* tx_width_freq_arr = nullptr;