diff --git a/ctats/README.md b/ctats/README.md index 2a8bddd..7baf584 100644 --- a/ctats/README.md +++ b/ctats/README.md @@ -12,7 +12,7 @@ _noun_ OTEL metrics, despite being an invaluable addition to service telemetry, require an obnoxiously verbose setup and implementation. Ctats isn't -here to provide any new features. Instead in wants to make the current +here to provide any new features. Instead it wants to make the current features more accessible and less painful. ### Step 1: Init OTEL with Clues @@ -36,7 +36,7 @@ func main() { ```go func main() { // ... - ctx, err = ctats.Initialize(ctx) + ctx, err = ctats.Initialize(ctx) // ... } ``` @@ -46,11 +46,11 @@ func main() { ```go func main() { // We're not kidding, this step is purely optional. - ctx, err := ctats.RegisterHistogram( + ctx, err := ctats.RegisterSum( ctx, - "http.server.latency", // Name - "ms", // Unit - "New user additions.", // Description + "http.server.requests", // Name + "1", // Unit + "Incoming HTTP requests by status code.", // Description ) } ``` @@ -59,10 +59,12 @@ func main() { ```go func handler(ctx context.Context) { - //... - ctats.Histogram[int64]("http.server.latency").Record(latency) - //... - + // ... + ctats.Sum[int64]("http.server.requests"). + With("status_code", statusCode). + Inc(ctx) + // ... +} ``` ## How it works @@ -97,13 +99,64 @@ values are `float64`s behind the scenes. Easier to avoid the problem of potential conflicts altogether. What, would you prefer that we panic? -## Corner Case: histogram bucket definitions +## Histograms + +Histograms can be a bit more work than the other types because you have to think +about your data's distribution ahead of time. Sure, you can run with whatever +OTEL uses as the default (15 buckets, scaling exponentially up to 10000), but is that +really the best showcase for your data? Probably not. 
+ +In case you're new to all this business, +[here's a good read](https://signoz.io/blog/opentelemetry-histogram/) +to catch you up with how histograms work under the hood. -You can't define your histogram buckets with Ctats. Why? Because -[OTEL doesn't let you define them at runtime either](https://github.com/open-telemetry/opentelemetry-go/issues/3826). -You'll have to take it up with the package authors, not us. +So, how do you set yourself up for histogram success using ctats? +Just register your buckets on init! Simple as that. -## Sum vs Counter vs Gauge +```go +func main() { + boundaries := ctats.MakeExponentialHistogramBoundaries(1, 60_000, 15, 1) + ctx, err := ctats.RegisterHistogram( + ctx, + "op.latency", + "ms", + "End-to-end operation latency.", + ctats.WithBoundaries(boundaries...), + ) +} + +func handler(ctx context.Context) { + ctats.Histogram[int64]("op.latency").Record(ctx, elapsed) +} +``` + +Registering is optional. You can also pass `WithBoundaries` directly to the +factory and the instrument is created on the first `Record` call. Just keep +in mind that the first creation wins — if the same id was already registered +or recorded against with different boundaries, the new ones are silently +ignored. + +### Picking your boundaries + +Use `MakeExponentialHistogramBoundaries` to generate logarithmically-spaced +buckets. `low` and `high` determine the supported range of your metric. + +The optional `scalingFactor` warps the bucket distribution. At 1 you get +uniform log-spacing. Above 1, more edges cluster near `low` (useful for +latency, where data tends to clump at the low end). Between 0 and 1, more +edges cluster near `high`. Values ≤ 0 default to 1. + +```go +// example: measuring http server latencies in ms up to 60s +boundaries := ctats.MakeExponentialHistogramBoundaries(1, 60_000, 15, 1) + +ctats.Histogram[int64]( + "job.duration", + ctats.WithBoundaries(boundaries...), +).Record(ctx, elapsed) +``` + +## Which metric type should I use? 
Feeling overwhelmed? Not sure which type to pick? Just answer these simple questions and you'll be a master in no time! @@ -111,6 +164,7 @@ these simple questions and you'll be a master in no time! * Sum -> OTEL Counter * Counter -> OTEL UpDownCounter * Gauge -> OTEL Gauge (who knew?) +* Histogram -> OTEL Histogram (surprise!) Do you need `Delta Temporality`? Use a Sum, it's your only option! @@ -119,6 +173,8 @@ Do you need to decrement values? Use a Counter! Do you need have a single threaded, single source of truth? Try a Gauge! +Do you need statistics such as percentiles? Use a Histogram! + Sums are the most foolproof option all around. Plug one in, count away. Counters are nearly as good, if it weren't for the temporality constraint. For monotonically increasing values, diff --git a/ctats/histogram.go b/ctats/histogram.go index 318afcd..4a034b5 100644 --- a/ctats/histogram.go +++ b/ctats/histogram.go @@ -3,6 +3,7 @@ package ctats import ( "context" "log" + "math" "github.com/pkg/errors" "go.opentelemetry.io/otel/metric" @@ -11,12 +12,89 @@ import ( "github.com/alcionai/clues/internal/node" ) +// MakeExponentialHistogramBoundaries returns count boundaries spaced logarithmically +// between low and high (both inclusive). For background on explicit bucket histograms +// and how boundaries map to OTel buckets, see the OTel metrics SDK spec +// (navigate to the "Explicit Bucket Histogram Aggregation" section): +// https://opentelemetry.io/docs/specs/otel/metrics/sdk/ +// +// scalingFactor warps the position distribution between low and high. At 1, +// positions are uniformly log-spaced — constant growth ratio. Values above 1 +// pack more buckets toward low (useful when data clusters at the low end). +// Values between 0 and 1 pack more buckets toward high. Values ≤ 0 are invalid +// and default to 1. 
+//
+// A log-spaced series needs count ≥ 2 and 0 < low < high < +Inf. When any of
+// those does not hold (NaN bounds included) the formula would emit NaN, Inf,
+// or a non-increasing series, so the function returns just [low, high].
+//
+// Interior boundaries are rounded to the nearest whole number; low and high
+// are kept exactly as given. Two consequences of the rounding:
+//   - a large scalingFactor or a narrow range can collapse neighbouring
+//     boundaries into duplicates, e.g. (10, 1000, 5, 5) → [10 10 12 30 1000];
+//   - ranges that start below 1 (e.g. 0.005–10 s) lose their small boundaries
+//     to 0, so express such metrics in a finer unit (ms instead of s).
+//
+// NOTE(review): the OTel Go SDK is believed to reject explicit boundaries that
+// are not strictly increasing — confirm, and de-duplicate before registering
+// if that is the case.
+//
+// Example:
+//
+//	MakeExponentialHistogramBoundaries(1, 60_000, 15, 1)
+//	// → [1 2 5 11 23 51 112 245 537 1179 2588 5679 12461 27344 60000]
+//
+//	MakeExponentialHistogramBoundaries(10, 1000, 5, 0.5)
+//	// → [10 100 260 540 1000] (denser at high end)
+//
+//	MakeExponentialHistogramBoundaries(10, 1000, 5, 1)
+//	// → [10 32 100 316 1000] (uniform log-spacing)
+//
+//	MakeExponentialHistogramBoundaries(10, 1000, 5, 2)
+//	// → [10 13 32 133 1000] (denser at low end)
+func MakeExponentialHistogramBoundaries(
+	low, high float64,
+	count int,
+	scalingFactor float64,
+) []float64 {
+	if scalingFactor <= 0 {
+		scalingFactor = 1
+	}
+
+	// Anything other than 0 < low < high < +Inf makes the formula below yield
+	// NaN, Inf, or a non-increasing series. Written as a positive test so that
+	// NaN inputs fail it as well.
+	validRange := low > 0 && high > low && !math.IsInf(high, 1)
+	if count < 2 || !validRange {
+		return []float64{low, high}
+	}
+
+	b := make([]float64, count)
+
+	for i := range b {
+		// t walks from 0 to 1 across the slice; scalingFactor bends that walk.
+		t := math.Pow(float64(i)/float64(count-1), scalingFactor)
+		b[i] = math.Round(low * math.Pow(high/low, t))
+	}
+
+	b[0] = low        // guarantee exact floor, no rounding drift
+	b[count-1] = high // guarantee exact ceiling, no rounding drift
+
+	return b
+}
+
+// histogramCfg collects the settings applied by HistogramOptions.
+type histogramCfg struct {
+	// boundaries holds the explicit bucket boundaries; empty means none were
+	// requested.
+	boundaries []float64
+}
+
+// appendOpts returns opts extended with the OTel histogram options implied by
+// c. Explicit bucket boundaries are appended only when at least one is set.
+func (c histogramCfg) appendOpts(
+	opts ...metric.Float64HistogramOption,
+) []metric.Float64HistogramOption {
+	if len(c.boundaries) > 0 {
+		opts = append(opts, metric.WithExplicitBucketBoundaries(c.boundaries...))
+	}
+
+	return opts
+}
+
+// HistogramOption customizes the configuration of a histogram.
+type HistogramOption func(*histogramCfg)
+
+// WithBoundaries sets explicit bucket boundaries on the histogram.
+// Boundaries are passed to the OTel SDK at instrument creation time and are
+// ignored if a matching MeterProvider View is already configured.
+//
+// The values are copied when the option is built, so the caller may reuse or
+// modify its slice afterwards without affecting the histogram.
+func WithBoundaries(boundaries ...float64) HistogramOption {
+	// Appending onto a nil slice copies the values and keeps "no boundaries"
+	// represented as nil.
+	owned := append([]float64(nil), boundaries...)
+
+	return func(c *histogramCfg) {
+		c.boundaries = owned
+	}
+}
+
 // getOrCreateHistogram attempts to retrieve a histogram from the
 // context with the given ID. If it is unable to find a histogram
 // with that ID, a new histogram is generated.
func getOrCreateHistogram( ctx context.Context, id string, + cfg histogramCfg, ) (recorder, error) { id = formatID(id) b := fromCtx(ctx) @@ -36,7 +114,10 @@ func getOrCreateHistogram( return nil, cluerr.Stack(errNoNodeInCtx) } - hist, err := nc.OTELMeter().Float64Histogram(id) + opts := cfg.appendOpts() + + // register the histogram + hist, err := nc.OTELMeter().Float64Histogram(id, opts...) if err != nil { return nil, errors.Wrap(err, "making new histogram") } @@ -50,17 +131,19 @@ func getOrCreateHistogram( // RegisterHistogram introduces a new histogram with the given unit and description. // If RegisterHistogram is not called before updating a metric value, a histogram with -// no unit or description is created. If RegisterHistogram is called for an ID that +// no unit or description is created. If RegisterHistogram is called for an ID that // has already been registered, it no-ops. func RegisterHistogram( ctx context.Context, - // all lowercase, period delimited id of the histogram. Ex: "http.response.status_code" + // all lowercase, period delimited id of the histogram. Ex: "http.response.size" id string, // (optional) the unit of measurement. Ex: "byte", "kB", "fnords" unit string, // (optional) a short description about the metric. // Ex: "number of times we saw the fnords". 
description string, + // (optional) histogram specific options + opts ...HistogramOption, ) (context.Context, error) { id = formatID(id) @@ -82,18 +165,24 @@ func RegisterHistogram( return ctx, errors.New("no clues in ctx") } - opts := []metric.Float64HistogramOption{} + var cfg histogramCfg + for _, o := range opts { + o(&cfg) + } + + var metricHistogramOpts []metric.Float64HistogramOption if len(description) > 0 { - opts = append(opts, metric.WithDescription(description)) + metricHistogramOpts = append(metricHistogramOpts, metric.WithDescription(description)) } if len(unit) > 0 { - opts = append(opts, metric.WithUnit(unit)) + metricHistogramOpts = append(metricHistogramOpts, metric.WithUnit(unit)) } - // register the histogram - hist, err := nc.OTELMeter().Float64Histogram(id, opts...) + metricHistogramOpts = cfg.appendOpts(metricHistogramOpts...) + + hist, err := nc.OTELMeter().Float64Histogram(id, metricHistogramOpts...) if err != nil { return ctx, errors.Wrap(err, "creating histogram") } @@ -107,17 +196,23 @@ func RegisterHistogram( // If a Histogram instance has been registered for that ID, the // registered instance will be used. If not, a new instance // will get generated. -func Histogram[N number](id string) histogram[N] { - return histogram[N]{base: base{id: formatID(id)}} +func Histogram[N number](id string, opts ...HistogramOption) histogram[N] { + hgm := histogram[N]{base: base{id: formatID(id)}} + for _, o := range opts { + o(&hgm.histogramCfg) + } + + return hgm } // histogram provides access to the factory functions. type histogram[N number] struct { base + histogramCfg } func (c histogram[N]) With(kvs ...any) histogram[N] { - return histogram[N]{base: c.with(kvs...)} + return histogram[N]{base: c.with(kvs...), histogramCfg: c.histogramCfg} } type recorder interface { @@ -128,9 +223,9 @@ type noopRecorder struct{} func (n noopRecorder) Record(context.Context, float64, ...metric.RecordOption) {} -// Add increments the histogram by n. 
n can be negative. +// Record records the measurement of n in the histogram. func (c histogram[number]) Record(ctx context.Context, n number) { - hist, err := getOrCreateHistogram(ctx, c.getID()) + hist, err := getOrCreateHistogram(ctx, c.getID(), c.histogramCfg) if err != nil { log.Printf("err getting histogram: %+v\n", err) return diff --git a/ctats/histogram_test.go b/ctats/histogram_test.go index 7aba6b3..2c809ee 100644 --- a/ctats/histogram_test.go +++ b/ctats/histogram_test.go @@ -7,6 +7,10 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.opentelemetry.io/otel/attribute" + sdkMetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + + "github.com/alcionai/clues/internal/node" ) func TestHistogram(t *testing.T) { @@ -102,3 +106,268 @@ func TestHistogramWithDoesNotMutateBase(t *testing.T) { assert.Equal(t, attrs, withAttrs.getOTELKVAttrs()) assert.Len(t, second.getOTELKVAttrs(), 2) } + +// --------------------------------------------------------------------------- +// MakeExponentialHistogramBoundaries +// --------------------------------------------------------------------------- + +func TestBoundaries(t *testing.T) { + testCases := []struct { + name string + got []float64 + want []float64 + }{ + { + name: "count less than 2 returns [min, max]", + got: MakeExponentialHistogramBoundaries(1, 100, 1, 0), + want: []float64{1, 100}, + }, + { + name: "count less than 2 ignores scaling factor", + got: MakeExponentialHistogramBoundaries(1, 100, 1, 5), + want: []float64{1, 100}, + }, + { + name: "scaling factor 0 is invalid - defaults to 1", + got: MakeExponentialHistogramBoundaries(10, 1000, 5, 0), + want: []float64{10, 32, 100, 316, 1000}, + }, + { + name: "negative scaling factor is invalid - defaults to 1", + got: MakeExponentialHistogramBoundaries(10, 1000, 5, -3), + want: []float64{10, 32, 100, 316, 1000}, + }, + { + name: "15 buckets, uniform log-spacing", + got: 
MakeExponentialHistogramBoundaries(1, 60_000, 15, 1), + want: []float64{ + 1, 2, 5, 11, 23, 51, 112, 245, + 537, 1179, 2588, 5679, 12461, 27344, 60000, + }, + }, + { + name: "20 buckets, uniform log-spacing", + got: MakeExponentialHistogramBoundaries(1, 60_000, 20, 1), + want: []float64{ + 1, 2, 3, 6, 10, 18, 32, 58, 103, 183, + 327, 584, 1042, 1859, 3317, 5919, 10561, 18845, 33626, 60000, + }, + }, + { + name: "scaling factor 0.5: finer resolution at high end, min and max preserved", + got: MakeExponentialHistogramBoundaries(10, 1000, 5, 0.5), + want: []float64{10, 100, 260, 540, 1000}, + }, + { + name: "5 buckets, uniform log-spacing", + got: MakeExponentialHistogramBoundaries(10, 1000, 5, 1), + want: []float64{10, 32, 100, 316, 1000}, + }, + { + name: "scaling factor 2: finer resolution at low end, min and max preserved", + got: MakeExponentialHistogramBoundaries(10, 1000, 5, 2), + want: []float64{10, 13, 32, 133, 1000}, + }, + { + name: "scaling factor 3: even finer resolution at low end", + got: MakeExponentialHistogramBoundaries(10, 1000, 5, 3), + want: []float64{10, 11, 18, 70, 1000}, + }, + { + name: "scaling factor 5: extreme skewness, first intermediate bucket saturates to min", + got: MakeExponentialHistogramBoundaries(10, 1000, 5, 5), + want: []float64{10, 10, 12, 30, 1000}, + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + assert.Equal(t, test.want, test.got) + }) + } +} + +// --------------------------------------------------------------------------- +// WithBoundaries option +// --------------------------------------------------------------------------- + +func TestHistogramWithBoundariesOption(t *testing.T) { + want := []float64{1, 10, 100, 1000} + + h := Histogram[int64]("bounds.hist", WithBoundaries(want...)) + + assert.Equal(t, want, h.boundaries) +} + +func TestHistogramWithBoundariesPreservedByWith(t *testing.T) { + boundaries := []float64{5, 50, 500} + + base := Histogram[int64]("preserve.bounds", 
WithBoundaries(boundaries...)) + child := base.With("key", "val") + + assert.Equal(t, boundaries, base.boundaries, "base boundaries unchanged") + assert.Equal(t, boundaries, child.boundaries, "With must copy boundaries to child") +} + +func TestHistogramWithBoundariesDoesNotMutateBase(t *testing.T) { + boundaries := []float64{1, 2, 3} + + base := Histogram[int64]("nomutate.bounds", WithBoundaries(boundaries...)) + assert.Nil(t, base.getOTELKVAttrs(), "base has no attributes before With") + + child := base.With("k", "v") + + assert.Nil(t, base.getOTELKVAttrs(), "base attributes still nil after With") + assert.Len(t, child.getOTELKVAttrs(), 1) + assert.Equal(t, boundaries, child.boundaries, "child carries boundaries") +} + +func TestHistogramNoBoundariesByDefault(t *testing.T) { + h := Histogram[float64]("no.bounds") + assert.Nil(t, h.boundaries, "no boundaries by default") +} + +func TestHistogramFirstBoundariesWin(t *testing.T) { + first := []float64{1, 10, 100} + second := []float64{500, 1000, 5000} + + testCases := []struct { + name string + setup func(t *testing.T, ctx context.Context) context.Context + }{ + { + name: "first via factory Record", + setup: func(t *testing.T, ctx context.Context) context.Context { + Histogram[int64]("first.wins", WithBoundaries(first...)).Record(ctx, 50) + return ctx + }, + }, + { + name: "first via RegisterHistogram", + setup: func(t *testing.T, ctx context.Context) context.Context { + ctx, err := RegisterHistogram(ctx, "first.wins", "", "", WithBoundaries(first...)) + require.NoError(t, err) + + return ctx + }, + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + reader := sdkMetric.NewManualReader() + ctx := ctatsCtx(t, reader) + ctx = test.setup(t, ctx) + + Histogram[int64]("first.wins", WithBoundaries(second...)).Record(ctx, 50) + + dp := collectHistogram(t, reader, "first.wins") + assert.Equal( + t, float64(100), dp.Bounds[len(dp.Bounds)-1], + "second boundaries ignored, first wins", + ) + 
}) + } +} + +// --------------------------------------------------------------------------- +// Record end-to-end with real OTel MeterProvider +// --------------------------------------------------------------------------- + +// ctatsCtx returns a context wired with a real OTel MeterProvider backed by +// the given ManualReader, suitable for testing ctats.Record end-to-end. +func ctatsCtx(t *testing.T, reader *sdkMetric.ManualReader) context.Context { + t.Helper() + + mp := sdkMetric.NewMeterProvider(sdkMetric.WithReader(reader)) + + t.Cleanup(func() { _ = mp.Shutdown(context.Background()) }) + + otelClient := &node.OTELClient{ + Meter: mp.Meter("ctats-test"), + MeterProvider: mp, + } + + n := &node.Node{OTEL: otelClient} + ctx := node.EmbedInCtx(context.Background(), n) + + ctx, err := Initialize(ctx) + require.NoError(t, err) + + return ctx +} + +// collectHistogram retrieves the first data point for a named histogram from a +// ManualReader snapshot. +func collectHistogram( + t *testing.T, + reader *sdkMetric.ManualReader, + name string, +) metricdata.HistogramDataPoint[float64] { + t.Helper() + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name == name { + h, ok := m.Data.(metricdata.Histogram[float64]) + require.True(t, ok, "metric %q is not a Histogram[float64]", name) + require.NotEmpty(t, h.DataPoints) + + return h.DataPoints[0] + } + } + } + + t.Fatalf("histogram %q not found", name) + + return metricdata.HistogramDataPoint[float64]{} +} + +// TestRecordWithDefaultLatencyBoundaries records a 15,000 ms value through the +// full ctats.Record path. +// +// 15,000 falls between bounds[12]=12,461 and bounds[13]=27,344 → bucket index 13. 
+func TestRecordWithDefaultLatencyBoundaries(t *testing.T) {
+	const id = "op.latency"
+
+	var (
+		rdr    = sdkMetric.NewManualReader()
+		ctx    = ctatsCtx(t, rdr)
+		bounds = MakeExponentialHistogramBoundaries(1, 60_000, 15, 1)
+	)
+
+	Histogram[int64](id, WithBoundaries(bounds...)).Record(ctx, 15_000)
+
+	point := collectHistogram(t, rdr, id)
+
+	topBound := point.Bounds[len(point.Bounds)-1]
+	assert.Equal(t, float64(60_000), topBound, "last boundary is 60,000 ms")
+
+	overflow := point.BucketCounts[len(point.BucketCounts)-1]
+	assert.Equal(t, uint64(0), overflow, "no overflow")
+
+	// 15,000 ms sits between bounds[12]=12,461 and bounds[13]=27,344,
+	// so the single recorded value is expected in bucket 13.
+	assert.Equal(
+		t, uint64(1), point.BucketCounts[13],
+		"15,000 ms lands in bucket 13 (12461–27344 ms)",
+	)
+}
+
+// TestRecordDefaultOTelBoundariesOverflow records 15,000 ms on a histogram
+// created without WithBoundaries and expects the value in the last (overflow)
+// bucket, since the OTel SDK default boundaries stop at 10,000.
+func TestRecordDefaultOTelBoundariesOverflow(t *testing.T) {
+	const id = "op.latency.default"
+
+	rdr := sdkMetric.NewManualReader()
+	ctx := ctatsCtx(t, rdr)
+
+	Histogram[int64](id).Record(ctx, 15_000)
+
+	point := collectHistogram(t, rdr, id)
+
+	ceiling := point.Bounds[len(point.Bounds)-1]
+	assert.Equal(t, float64(10_000), ceiling, "default ceiling is 10,000 ms")
+
+	overflow := point.BucketCounts[len(point.BucketCounts)-1]
+	assert.Equal(t, uint64(1), overflow, "15,000 ms overflows to +Inf")
+}