-
Notifications
You must be signed in to change notification settings - Fork 837
feat(executor): add OTel metrics for the TaskAction controller #7483
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 3 commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
ce47c0f
feat(executor): add OTel metrics for the TaskAction controller
pingsutw 9af0d96
feat(executor): add TaskAction CRD r/w latency; drop server peer attrs
pingsutw 8a14894
refactor(executor): inject meter provider; instrument client instead …
pingsutw 80be43d
refactor(executor): time CRD ops inline instead of via client decorator
pingsutw c44b970
Apply suggestions from code review
pingsutw 865948f
Apply suggestions from code review
pingsutw 849800e
Apply suggestions from code review
pingsutw a78ea87
fix: restore observeCRDSize function signature
pingsutw 1f52ff1
refactor(executor): count active TaskActions from cache indexer (no d…
pingsutw File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| package controller | ||
|
|
||
| import ( | ||
| "context" | ||
|
|
||
| "sigs.k8s.io/controller-runtime/pkg/client" | ||
|
|
||
| flyteorgv1 "github.com/flyteorg/flyte/v2/executor/api/v1" | ||
| ) | ||
|
|
||
| // newInstrumentedClient wraps c so TaskAction CRD operations (Get, Update, and | ||
| // Status().Update) are timed under taskaction.k8s.duration. Operations on other | ||
| // object types pass through untimed. When metrics registration failed (m == nil) | ||
| // it returns c unchanged, so callers can wrap unconditionally. | ||
| // | ||
| // The reconciler embeds the wrapped client, which makes the instrumentation | ||
| // structural: call sites use the idiomatic r.Get/r.Update/r.Status().Update and | ||
| // cannot accidentally bypass the timing. | ||
|
|
||
| func newInstrumentedClient(c client.Client, m *taskActionMetrics) client.Client { | ||
| if m == nil { | ||
| return c | ||
| } | ||
| return &instrumentedClient{Client: c, metrics: m} | ||
| } | ||
|
|
||
| type instrumentedClient struct { | ||
| client.Client | ||
| metrics *taskActionMetrics | ||
| } | ||
|
|
||
| func (c *instrumentedClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error { | ||
| if _, ok := obj.(*flyteorgv1.TaskAction); !ok { | ||
| return c.Client.Get(ctx, key, obj, opts...) | ||
| } | ||
| return c.metrics.timeK8sOp(ctx, opGet, func() error { return c.Client.Get(ctx, key, obj, opts...) }) | ||
| } | ||
|
|
||
| func (c *instrumentedClient) Update(ctx context.Context, obj client.Object, opts ...client.UpdateOption) error { | ||
| if _, ok := obj.(*flyteorgv1.TaskAction); !ok { | ||
| return c.Client.Update(ctx, obj, opts...) | ||
| } | ||
| return c.metrics.timeK8sOp(ctx, opUpdate, func() error { return c.Client.Update(ctx, obj, opts...) }) | ||
| } | ||
|
|
||
| func (c *instrumentedClient) Status() client.SubResourceWriter { | ||
| return &instrumentedStatusWriter{SubResourceWriter: c.Client.Status(), metrics: c.metrics} | ||
| } | ||
|
|
||
| type instrumentedStatusWriter struct { | ||
| client.SubResourceWriter | ||
| metrics *taskActionMetrics | ||
| } | ||
|
|
||
| func (w *instrumentedStatusWriter) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error { | ||
| if _, ok := obj.(*flyteorgv1.TaskAction); !ok { | ||
| return w.SubResourceWriter.Update(ctx, obj, opts...) | ||
| } | ||
| return w.metrics.timeK8sOp(ctx, opStatusUpdate, func() error { return w.SubResourceWriter.Update(ctx, obj, opts...) }) | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
| package controller | ||
|
|
||
| import ( | ||
| "context" | ||
| "encoding/json" | ||
| "time" | ||
|
|
||
| "go.opentelemetry.io/otel/attribute" | ||
| "go.opentelemetry.io/otel/metric" | ||
| "sigs.k8s.io/controller-runtime/pkg/client" | ||
|
|
||
| flyteorgv1 "github.com/flyteorg/flyte/v2/executor/api/v1" | ||
| ) | ||
|
pingsutw marked this conversation as resolved.
Copilot marked this conversation as resolved.
|
||
|
|
||
| const taskActionMeterName = "taskaction-controller" | ||
|
|
||
| // Values for the "op" attribute on taskaction.k8s.duration. | ||
| const ( | ||
| opGet = "get" | ||
| opUpdate = "update" | ||
| opStatusUpdate = "status_update" | ||
| ) | ||
|
|
||
| // taskActionMetrics holds OTel instruments for the TaskAction controller. | ||
| // | ||
| // Reconcile throughput, active workers, and workqueue latency already come from | ||
| // the controller-runtime metrics server, and event-proxy send latency comes from | ||
| // the otelconnect-wrapped events client (rpc_client_duration). This adds what | ||
| // none of those provide: TaskAction CRD count by phase, serialized CRD size, and | ||
| // per-operation Kubernetes API read/write latency for the CRD. | ||
| type taskActionMetrics struct { | ||
| crdSizeBytes metric.Int64Histogram | ||
| crdOpDuration metric.Float64Histogram | ||
| } | ||
|
|
||
| // registerTaskActionMetrics wires the TaskAction OTel meters onto the given meter | ||
| // provider (the executor's, registered in executor/setup.go). The active-by-phase | ||
| // gauge is observed asynchronously by listing TaskActions from the controller cache. | ||
| func registerTaskActionMetrics(provider metric.MeterProvider, c client.Client) (*taskActionMetrics, error) { | ||
| meter := provider.Meter(taskActionMeterName) | ||
|
pingsutw marked this conversation as resolved.
Outdated
|
||
|
|
||
| crdSize, err := meter.Int64Histogram( | ||
| "taskaction.crd.size_bytes", | ||
| metric.WithDescription("Serialized (JSON) size of a TaskAction CRD"), | ||
| metric.WithUnit("By"), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| crdOp, err := meter.Float64Histogram( | ||
| "taskaction.k8s.duration", | ||
| metric.WithDescription("Latency of TaskAction CRD operations against the Kubernetes API, labeled by op (get/update/status_update)"), | ||
| metric.WithUnit("ms"), | ||
| ) | ||
|
pingsutw marked this conversation as resolved.
pingsutw marked this conversation as resolved.
pingsutw marked this conversation as resolved.
|
||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| active, err := meter.Int64ObservableGauge( | ||
| "taskaction.active", | ||
| metric.WithDescription("Number of TaskAction CRDs, labeled by plugin phase"), | ||
| ) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| _, err = meter.RegisterCallback(func(ctx context.Context, o metric.Observer) error { | ||
| var list flyteorgv1.TaskActionList | ||
| if err := c.List(ctx, &list); err != nil { | ||
| return err | ||
| } | ||
|
pingsutw marked this conversation as resolved.
Outdated
pingsutw marked this conversation as resolved.
Outdated
|
||
| counts := make(map[string]int64, 8) | ||
|
pingsutw marked this conversation as resolved.
Outdated
|
||
| for i := range list.Items { | ||
| phase := list.Items[i].Status.PluginPhase | ||
| if phase == "" { | ||
| phase = "Unknown" | ||
| } | ||
| counts[phase]++ | ||
| } | ||
| for phase, n := range counts { | ||
| o.ObserveInt64(active, n, metric.WithAttributes(attribute.String("phase", phase))) | ||
| } | ||
| return nil | ||
| }, active) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| return &taskActionMetrics{crdSizeBytes: crdSize, crdOpDuration: crdOp}, nil | ||
| } | ||
|
|
||
| // observeCRDSize records the serialized size of a TaskAction CRD. No-op if metrics | ||
| // registration failed (m == nil). | ||
| func (m *taskActionMetrics) observeCRDSize(ctx context.Context, ta *flyteorgv1.TaskAction) { | ||
|
pingsutw marked this conversation as resolved.
Outdated
|
||
| if m == nil || m.crdSizeBytes == nil { | ||
| return | ||
| } | ||
| if b, err := json.Marshal(ta); err == nil { | ||
| m.crdSizeBytes.Record(ctx, int64(len(b))) | ||
| } | ||
| } | ||
|
|
||
| // timeK8sOp times a Kubernetes API operation against the TaskAction CRD and records | ||
| // its latency under taskaction.k8s.duration{op,error}. It is a transparent pass-through | ||
| // when metrics registration failed (m == nil), so callers can wrap unconditionally. | ||
| func (m *taskActionMetrics) timeK8sOp(ctx context.Context, op string, fn func() error) error { | ||
| if m == nil || m.crdOpDuration == nil { | ||
| return fn() | ||
| } | ||
| start := time.Now() | ||
| err := fn() | ||
| m.crdOpDuration.Record(ctx, float64(time.Since(start).Microseconds())/1000.0, | ||
| metric.WithAttributes(attribute.String("op", op), attribute.Bool("error", err != nil))) | ||
| return err | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.