Skip to content

[e2ealerting] Add support for custom histogram buckets #204

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions pkg/alerting/receiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"sync"
"time"

"github.com/cortexproject/cortex/pkg/util/flagext"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/gorilla/mux"
Expand All @@ -23,6 +24,8 @@ const (
namespace = "e2ealerting"
)

var defaultHistogramBuckets = []float64{5, 15, 30, 60, 90, 120, 240}

// Receiver implements the Alertmanager webhook receiver. It evaluates the received alerts, finds if the alert holds an annotation with a label of "time", and if it does computes now - time for a total duration.
type Receiver struct {
logger log.Logger
Expand All @@ -42,16 +45,18 @@ type Receiver struct {

// ReceiverConfig implements the configuration for the alertmanager webhook receiver
type ReceiverConfig struct {
RoundtripLabel string
PurgeLookback time.Duration
PurgeInterval time.Duration
CustomHistogramBuckets flagext.StringSliceCSV
PurgeInterval time.Duration
PurgeLookback time.Duration
RoundtripLabel string
}

// RegisterFlags registers the command-line flags for the Alertmanager
// webhook receiver with the provided FlagSet.
func (cfg *ReceiverConfig) RegisterFlags(f *flag.FlagSet) {
	f.StringVar(&cfg.RoundtripLabel, "receiver.e2eduration-label", "", "Label name and value in the form 'name=value' to add for the Histogram that observes latency.")
	f.DurationVar(&cfg.PurgeInterval, "receiver.purge-interval", 15*time.Minute, "How often should we purge the in-memory measured timestamps tracker.")
	f.DurationVar(&cfg.PurgeLookback, "receiver.purge-lookback", 2*time.Hour, "Period at which measured timestamps will remain in-memory.")
	// The flag is a CSV list; values must not contain spaces because each
	// element is parsed with strconv.ParseFloat, which does not trim them.
	f.Var(&cfg.CustomHistogramBuckets, "receiver.custom-histogram-buckets", "Comma-separated list of custom histogram buckets, e.g. '0.5,1,10'. Defaults to 5,15,30,60,90,120,240.")
}

// NewReceiver returns an alertmanager webhook receiver
Expand Down Expand Up @@ -88,12 +93,27 @@ func NewReceiver(cfg ReceiverConfig, log log.Logger, reg prometheus.Registerer)
Help: "Total number of failed evaluations made by the webhook receiver.",
})

var buckets []float64

if len(cfg.CustomHistogramBuckets) == 0 {
buckets = defaultHistogramBuckets
} else {
buckets = make([]float64, len(cfg.CustomHistogramBuckets))
for i, v := range cfg.CustomHistogramBuckets {
n, err := strconv.ParseFloat(v, 64)
if err != nil {
panic(err)
}
buckets[i] = n
}
}

r.roundtripDuration = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: "webhook_receiver",
Name: "end_to_end_duration_seconds",
Help: "Time spent (in seconds) from scraping a metric to receiving an alert.",
Buckets: []float64{5, 15, 30, 60, 90, 120, 240},
Buckets: buckets,
ConstLabels: lbl,
}, []string{"alertname"})

Expand All @@ -106,6 +126,11 @@ func NewReceiver(cfg ReceiverConfig, log log.Logger, reg prometheus.Registerer)
// RegisterRoutes registers the receiver API routes with the provided router.
func (r *Receiver) RegisterRoutes(router *mux.Router) {
	// HandleFunc(...).Methods(...) is equivalent to
	// Path(...).Methods(...).Handler(http.HandlerFunc(...)).
	router.HandleFunc("/api/v1/receiver", r.measureLatency).Methods(http.MethodPost)
	router.HandleFunc("/health", r.health).Methods(http.MethodGet)
}

// health answers every request with 200 OK and an empty body. It is wired to
// GET /health in RegisterRoutes; presumably intended as a liveness probe
// endpoint — TODO(review): confirm with the deployment manifests.
func (r *Receiver) health(w http.ResponseWriter, req *http.Request) {
	w.WriteHeader(http.StatusOK)
}

func (r *Receiver) measureLatency(w http.ResponseWriter, req *http.Request) {
Expand Down
117 changes: 117 additions & 0 deletions pkg/alerting/receiver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"

Expand All @@ -17,6 +18,25 @@ import (
"github.com/stretchr/testify/require"
)

// Test_healthCheck ensures the /health endpoint answers GET requests with 200 OK.
func Test_healthCheck(t *testing.T) {
	recv, err := NewReceiver(
		ReceiverConfig{PurgeInterval: 1 * time.Hour},
		log.NewNopLogger(),
		prometheus.NewRegistry(),
	)
	require.NoError(t, err)

	rtr := mux.NewRouter()
	recv.RegisterRoutes(rtr)

	resp := httptest.NewRecorder()
	rtr.ServeHTTP(resp, httptest.NewRequest(http.MethodGet, "/health", nil))

	require.Equal(t, http.StatusOK, resp.Code)
}

func Test_measureLatency(t *testing.T) {
tc := []struct {
name string
Expand Down Expand Up @@ -94,3 +114,100 @@ func Test_measureLatency(t *testing.T) {
})
}
}

// Test_measureLatencyCustomBuckets verifies that the receiver tracks alert
// timestamps as usual and that the end-to-end duration histogram is registered
// with the user-supplied buckets instead of the defaults.
func Test_measureLatencyCustomBuckets(t *testing.T) {
	tc := []struct {
		name    string
		alerts  template.Data
		err     error
		tracked []float64
	}{
		{
			name: "with alerts to track",
			alerts: template.Data{
				Alerts: template.Alerts{
					template.Alert{
						Labels:      template.KV{model.AlertNameLabel: "e2ealertingAlwaysFiring"},
						Annotations: template.KV{"time": "1.604069614e+09"},
						Status:      string(model.AlertFiring),
					},
					template.Alert{
						Labels:      template.KV{model.AlertNameLabel: "e2ealertingAlwaysFiring"},
						Annotations: template.KV{"time": "1.604069615e+09"},
						Status:      string(model.AlertFiring),
					},
				},
			},
			tracked: []float64{1604069614.00, 1604069615.00},
		},
		{
			name: "with alerts that don't have a time annotation or alertname label it ignores them",
			alerts: template.Data{
				Alerts: template.Alerts{
					template.Alert{
						Labels:      template.KV{model.AlertNameLabel: "e2ealertingAlwaysFiring"},
						Annotations: template.KV{"time": "1.604069614e+09"},
						Status:      string(model.AlertFiring),
					},
					template.Alert{
						Labels: template.KV{model.AlertNameLabel: "e2ealertingAlwaysFiring"},
						Status: string(model.AlertFiring),
					},
					template.Alert{
						Annotations: template.KV{"time": "1.604069614e+09"},
						Status:      string(model.AlertFiring),
					},
				},
			},
			tracked: []float64{1604069614.00},
		},
	}

	for _, tt := range tc {
		t.Run(tt.name, func(t *testing.T) {
			reg := prometheus.NewRegistry()
			r, err := NewReceiver(
				ReceiverConfig{
					PurgeInterval:          1 * time.Hour,
					CustomHistogramBuckets: []string{"0.5", "1", "10"},
				},
				log.NewNopLogger(),
				reg,
			)
			require.NoError(t, err)

			router := mux.NewRouter()
			r.RegisterRoutes(router)

			b, err := json.Marshal(tt.alerts)
			require.NoError(t, err)

			req := httptest.NewRequest(http.MethodPost, "/api/v1/receiver", bytes.NewBuffer(b))
			w := httptest.NewRecorder()

			router.ServeHTTP(w, req)

			require.Equal(t, http.StatusOK, w.Code)
			require.Equal(t, len(tt.tracked), len(r.timestamps))
			for _, timestamp := range tt.tracked {
				_, exists := r.timestamps[timestamp]
				require.True(t, exists, fmt.Sprintf("time %f is not tracked", timestamp))
			}

			metrics, err := reg.Gather()
			require.NoError(t, err)

			// Guard against a vacuous pass: the bucket assertions below only
			// run if the histogram family is actually present in the registry.
			found := false
			for _, metricFamily := range metrics {
				// Use the nil-safe protobuf getters instead of dereferencing
				// pointer fields directly.
				if !strings.Contains(metricFamily.GetName(), "end_to_end_duration_seconds") {
					continue
				}
				for _, metric := range metricFamily.GetMetric() {
					found = true
					buckets := metric.GetHistogram().GetBucket()
					require.Len(t, buckets, 3)
					bucketBounds := make([]float64, 0, len(buckets))
					for _, bucket := range buckets {
						bucketBounds = append(bucketBounds, bucket.GetUpperBound())
					}
					require.Equal(t, []float64{0.5, 1, 10}, bucketBounds)
				}
			}
			require.True(t, found, "end_to_end_duration_seconds histogram not found in registry")
		})
	}
}