Skip to content

Commit

Permalink
m/n/kubernetes: add metricsprovider
Browse files Browse the repository at this point in the history
Kubernetes has a metrics provider interface; add an adapter so that these
metrics can be exported through our Prometheus registry. This code exists in
a similar form inside K8s, but there it targets their custom metrics
architecture, not plain Prometheus.

As these metrics are shared across all workqueues we follow K8s in
implementing this with a singleton/global. It's not the prettiest, but
otherwise we may get issues with Prometheus and duplicate metrics.

Change-Id: I0b6d608d14793e44859166a5a59d446c8f662a25
Reviewed-on: https://review.monogon.dev/c/monogon/+/3829
Reviewed-by: Tim Windelschmidt <[email protected]>
Tested-by: Jenkins CI
  • Loading branch information
lorenz committed Feb 11, 2025
1 parent 08fd1cb commit e8beaed
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 1 deletion.
9 changes: 8 additions & 1 deletion build/analysis/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,14 @@ NOGO_CONFIG_OVERRIDES = {
"cgo/": "cgo",
},
},
"haslicense": {
"exclude_files": {
"metropolis/node/kubernetes/metricsprovider": "Multi-Party copyright statement (#376)",
"external/": "third_party",
"bazel-out/": "generated_output",
"cgo/": "cgo",
},
},
}

# All analyzers that should be disabled for external, generated or cgo code.
Expand All @@ -215,7 +223,6 @@ DISABLED_FOR_EXTERNAL_CODE = [
"hash",
"errcmp",
"gofmt",
"haslicense",
] + ALL_STATICCHECK_ANALYZERS

# We override the variable with itself unioned with the other
Expand Down
2 changes: 2 additions & 0 deletions metropolis/node/kubernetes/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ go_library(
"//metropolis/node/core/curator/watcher",
"//metropolis/node/core/identity",
"//metropolis/node/core/localstorage",
"//metropolis/node/core/metrics",
"//metropolis/node/core/network",
"//metropolis/node/kubernetes/authproxy",
"//metropolis/node/kubernetes/clusternet",
"//metropolis/node/kubernetes/metricsprovider",
"//metropolis/node/kubernetes/metricsproxy",
"//metropolis/node/kubernetes/nfproxy",
"//metropolis/node/kubernetes/pki",
Expand Down
12 changes: 12 additions & 0 deletions metropolis/node/kubernetes/metricsprovider/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library")

# Go library target for the metricsprovider package. It builds the single
# source file metricsprovider.go and depends on the Prometheus Go client and
# the client-go workqueue package.
go_library(
name = "metricsprovider",
srcs = ["metricsprovider.go"],
importpath = "source.monogon.dev/metropolis/node/kubernetes/metricsprovider",
visibility = ["//visibility:public"],
deps = [
"@com_github_prometheus_client_golang//prometheus",
"@io_k8s_client_go//util/workqueue",
],
)
114 changes: 114 additions & 0 deletions metropolis/node/kubernetes/metricsprovider/metricsprovider.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Copyright The Monogon Project Authors.
// Copyright 2019 The Kubernetes Authors.
// SPDX-License-Identifier: Apache-2.0

// Package metricsprovider provides a Prometheus registry for code in K8s
// client-go that is capable of providing metrics. Currently it registers
// itself as a metrics backend for workqueues only; more backends can be
// added in the future. The registry with all the metrics is available as
// `Registry`.
package metricsprovider

import (
"github.com/prometheus/client_golang/prometheus"
"k8s.io/client-go/util/workqueue"
)

// WorkQueueSubsystem is the Prometheus subsystem shared by every workqueue
// metric defined in this package.
const WorkQueueSubsystem = "workqueue"

// Names (keys) of the individual workqueue metrics within
// WorkQueueSubsystem.
const (
	// DepthKey names the gauge tracking the current queue depth.
	DepthKey = "depth"

	// AddsKey names the counter of adds handled by a queue.
	AddsKey = "adds_total"

	// QueueLatencyKey names the histogram of the time an item waits in a
	// queue before being requested.
	QueueLatencyKey = "queue_duration_seconds"

	// WorkDurationKey names the histogram of the time spent processing an
	// item.
	WorkDurationKey = "work_duration_seconds"

	// UnfinishedWorkKey names the gauge of seconds of in-progress work that
	// has not yet been observed by the work duration histogram.
	UnfinishedWorkKey = "unfinished_work_seconds"

	// LongestRunningProcessorKey names the gauge of seconds the longest
	// running processor has been running.
	LongestRunningProcessorKey = "longest_running_processor_seconds"

	// RetriesKey names the counter of retries handled by a queue.
	RetriesKey = "retries_total"
)

// Registry is the Prometheus registry into which this package's init
// function registers all of its workqueue metric vectors. It is created
// empty at package initialization.
var Registry = prometheus.NewRegistry()

// Workqueue metric vectors. Every vector is placed in WorkQueueSubsystem and
// has a single variable label, "name", whose value is supplied by the
// promProvider constructor methods.
var (
	depth = newQueueGaugeVec(DepthKey, "Current depth of workqueue")

	adds = newQueueCounterVec(AddsKey, "Total number of adds handled by workqueue")

	latency = newQueueHistogramVec(QueueLatencyKey,
		"How long in seconds an item stays in the workqueue before being requested.")

	workDuration = newQueueHistogramVec(WorkDurationKey,
		"How long in seconds processing an item from workqueue takes.")

	unfinished = newQueueGaugeVec(UnfinishedWorkKey,
		"How many seconds of work has done that "+
			"is in progress and hasn't been observed by work_duration. Large "+
			"values indicate stuck threads. One can deduce the number of stuck "+
			"threads by observing the rate at which this increases.")

	longestRunningProcessor = newQueueGaugeVec(LongestRunningProcessorKey,
		"How many seconds has the longest running "+
			"processor for workqueue been running.")

	retries = newQueueCounterVec(RetriesKey, "Total number of retries handled by workqueue")
)

// newQueueGaugeVec builds a gauge vector named key in WorkQueueSubsystem,
// labelled by "name".
func newQueueGaugeVec(key, help string) *prometheus.GaugeVec {
	opts := prometheus.GaugeOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      key,
		Help:      help,
	}
	return prometheus.NewGaugeVec(opts, []string{"name"})
}

// newQueueCounterVec builds a counter vector named key in
// WorkQueueSubsystem, labelled by "name".
func newQueueCounterVec(key, help string) *prometheus.CounterVec {
	opts := prometheus.CounterOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      key,
		Help:      help,
	}
	return prometheus.NewCounterVec(opts, []string{"name"})
}

// newQueueHistogramVec builds a histogram vector named key in
// WorkQueueSubsystem, labelled by "name". Each vector gets its own slice of
// ten exponential buckets starting at 10e-9 (10ns) and growing by a factor
// of 10, so the largest finite bucket bound is 10 seconds.
func newQueueHistogramVec(key, help string) *prometheus.HistogramVec {
	opts := prometheus.HistogramOpts{
		Subsystem: WorkQueueSubsystem,
		Name:      key,
		Help:      help,
		Buckets:   prometheus.ExponentialBuckets(10e-9, 10, 10),
	}
	return prometheus.NewHistogramVec(opts, []string{"name"})
}

func init() {
Registry.MustRegister(depth, adds, latency, workDuration, unfinished, longestRunningProcessor, retries)
workqueue.SetProvider(&promProvider{})
}

// promProvider is the workqueue metrics provider installed by init. It holds
// no state of its own: each of its methods hands out a child of one of the
// package-level metric vectors.
type promProvider struct{}

// NewDepthMetric returns the child of the depth gauge vector whose "name"
// label is set to name.
func (promProvider) NewDepthMetric(name string) workqueue.GaugeMetric {
	gauge := depth.WithLabelValues(name)
	return gauge
}

// NewAddsMetric returns the child of the adds counter vector whose "name"
// label is set to name.
func (promProvider) NewAddsMetric(name string) workqueue.CounterMetric {
	counter := adds.WithLabelValues(name)
	return counter
}

// NewLatencyMetric returns the child of the queue latency histogram vector
// whose "name" label is set to name.
func (promProvider) NewLatencyMetric(name string) workqueue.HistogramMetric {
	observer := latency.WithLabelValues(name)
	return observer
}

// NewWorkDurationMetric returns the child of the work duration histogram
// vector whose "name" label is set to name.
func (promProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
	observer := workDuration.WithLabelValues(name)
	return observer
}

// NewUnfinishedWorkSecondsMetric returns the child of the unfinished work
// gauge vector whose "name" label is set to name.
func (promProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric {
	gauge := unfinished.WithLabelValues(name)
	return gauge
}

// NewLongestRunningProcessorSecondsMetric returns the child of the longest
// running processor gauge vector whose "name" label is set to name.
func (promProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric {
	gauge := longestRunningProcessor.WithLabelValues(name)
	return gauge
}

// NewRetriesMetric returns the child of the retries counter vector whose
// "name" label is set to name.
func (promProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
	counter := retries.WithLabelValues(name)
	return counter
}
4 changes: 4 additions & 0 deletions metropolis/node/kubernetes/service_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@ import (
"source.monogon.dev/metropolis/node"
oclusternet "source.monogon.dev/metropolis/node/core/clusternet"
"source.monogon.dev/metropolis/node/core/localstorage"
"source.monogon.dev/metropolis/node/core/metrics"
"source.monogon.dev/metropolis/node/core/network"
"source.monogon.dev/metropolis/node/kubernetes/clusternet"
"source.monogon.dev/metropolis/node/kubernetes/metricsprovider"
"source.monogon.dev/metropolis/node/kubernetes/nfproxy"
kpki "source.monogon.dev/metropolis/node/kubernetes/pki"
"source.monogon.dev/metropolis/node/kubernetes/plugins/kvmdevice"
Expand Down Expand Up @@ -56,6 +58,8 @@ func NewWorker(c ConfigWorker) *Worker {
}

func (s *Worker) Run(ctx context.Context) error {
metrics.CoreRegistry.MustRegister(metricsprovider.Registry)
defer metrics.CoreRegistry.Unregister(metricsprovider.Registry)
// Run apiproxy, which load-balances connections from worker components to this
// cluster's api servers. This is necessary as we want to round-robin across all
// available apiservers, and Kubernetes components do not implement client-side
Expand Down

0 comments on commit e8beaed

Please sign in to comment.