Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

api: RequestCost configurations #103

Merged
merged 27 commits into from
Jan 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions api/v1alpha1/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,99 @@ type AIGatewayRouteSpec struct {
// Currently, the filter is only implemented as an external process filter, which might be
// extended to other types of filters in the future. See https://github.com/envoyproxy/ai-gateway/issues/90
FilterConfig *AIGatewayFilterConfig `json:"filterConfig,omitempty"`

// LLMRequestCosts specifies how to capture the cost of the LLM-related request, notably the token usage.
// The AI Gateway filter will capture each specified number and store it in the Envoy's dynamic
// metadata per HTTP request. The metadata namespace is "io.envoy.ai_gateway".
//
// For example, let's say we have the following LLMRequestCosts configuration:
mathetake marked this conversation as resolved.
Show resolved Hide resolved
//
// llmRequestCosts:
// - metadataKey: llm_input_token
// type: InputToken
// - metadataKey: llm_output_token
// type: OutputToken
// - metadataKey: llm_total_token
// type: TotalToken
//
// Then, with the following BackendTrafficPolicy of Envoy Gateway, you can have three
// rate limit buckets for each unique x-user-id header value. One bucket is for the input token,
// another is for the output token, and the last one is for the total token.
// Each bucket will be reduced by the corresponding token usage captured by the AI Gateway filter.
//
// apiVersion: gateway.envoyproxy.io/v1alpha1
// kind: BackendTrafficPolicy
// metadata:
// name: some-example-token-rate-limit
// namespace: default
// spec:
// targetRefs:
// - group: gateway.networking.k8s.io
// kind: HTTPRoute
// name: usage-rate-limit
// rateLimit:
// type: Global
// global:
// rules:
// - clientSelectors:
// # Do the rate limiting based on the x-user-id header.
// - headers:
// - name: x-user-id
// type: Distinct
// limit:
// # Configures the number of "tokens" allowed per hour.
// requests: 10000
// unit: Hour
// cost:
// request:
// from: Number
// # Setting the request cost to zero makes the request path only check the rate limit budget
// # without consuming it.
// number: 0
// # This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
// # The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
// # if the budget is exhausted.
// response:
// from: Metadata
// metadata:
// namespace: io.envoy.ai_gateway
// key: llm_input_token
// - clientSelectors:
// - headers:
// - name: x-user-id
// type: Distinct
// limit:
// requests: 10000
// unit: Hour
// cost:
// request:
// from: Number
// number: 0
// response:
// from: Metadata
// metadata:
// namespace: io.envoy.ai_gateway
// key: llm_output_token
// - clientSelectors:
// - headers:
// - name: x-user-id
// type: Distinct
// limit:
// requests: 10000
// unit: Hour
// cost:
// request:
// from: Number
// number: 0
// response:
// from: Metadata
// metadata:
// namespace: io.envoy.ai_gateway
// key: llm_total_token
//
// +optional
// +kubebuilder:validation:MaxItems=36
LLMRequestCosts []LLMRequestCost `json:"llmRequestCosts,omitempty"`
}

// AIGatewayRouteRule is a rule that defines the routing behavior of the AIGatewayRoute.
Expand Down Expand Up @@ -230,6 +323,9 @@ type AIServiceBackendSpec struct {
//
// +optional
BackendSecurityPolicyRef *gwapiv1.LocalObjectReference `json:"backendSecurityPolicyRef,omitempty"`

// TODO: maybe add backend-level LLMRequestCost configuration that overrides the AIGatewayRoute-level LLMRequestCost.
// That may be useful for the backend that has a different cost calculation logic.
}

// VersionedAPISchema defines the API schema of either AIGatewayRoute (the input) or AIServiceBackend (the output).
Expand Down Expand Up @@ -378,3 +474,42 @@ type AWSOIDCExchangeToken struct {
// which maps to the temporary AWS security credentials exchanged using the authentication token issued by OIDC provider.
AwsRoleArn string `json:"awsRoleArn"`
}

// LLMRequestCost configures each request cost.
type LLMRequestCost struct {
	// MetadataKey is the key of the metadata to store this cost of the request.
	//
	// +kubebuilder:validation:Required
	MetadataKey string `json:"metadataKey"`
	// Type specifies the type of the request cost. The default is "OutputToken",
	// and it uses "output token" as the cost. The other types are "InputToken" and "TotalToken".
	//
	// +kubebuilder:validation:Enum=OutputToken;InputToken;TotalToken
	Type LLMRequestCostType `json:"type"`
	// CELExpression is the CEL expression to calculate the cost of the request.
	// The CEL expression must return an integer value. The CEL expression should be
	// able to access the request headers, model name, backend name, input/output tokens etc.
	//
	// The field is optional, so it is omitted from the serialized form when unset
	// instead of being emitted as an explicit null.
	//
	// +optional
	// +notImplementedHide https://github.com/envoyproxy/ai-gateway/issues/97
	CELExpression *string `json:"celExpression,omitempty"`
}

// LLMRequestCostType specifies the type of the LLMRequestCost.
type LLMRequestCostType string

const (
	// LLMRequestCostTypeInputToken is the cost type of the input token.
	LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken"
	// LLMRequestCostTypeOutputToken is the cost type of the output token.
	LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken"
	// LLMRequestCostTypeTotalToken is the cost type of the total token.
	LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken"
	// LLMRequestCostTypeCEL is for calculating the cost using the CEL expression.
	LLMRequestCostTypeCEL LLMRequestCostType = "CEL"
)

// AIGatewayFilterMetadataNamespace names the metadata namespace used by the ai-gateway filter.
const AIGatewayFilterMetadataNamespace = "io.envoy.ai_gateway"
27 changes: 27 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 32 additions & 12 deletions filterconfig/filterconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ modelNameHeaderKey: x-envoy-ai-gateway-model
// name: OpenAI
// selectedBackendHeaderKey: x-envoy-ai-gateway-selected-backend
// modelNameHeaderKey: x-envoy-ai-gateway-model
// tokenUsageMetadata:
// metadataNamespace: ai_gateway_llm_ns
// llmRequestCosts:
// - metadataKey: token_usage_key
// type: OutputToken
// rules:
Expand Down Expand Up @@ -66,11 +66,12 @@ modelNameHeaderKey: x-envoy-ai-gateway-model
// From Envoy configuration perspective, configuring the header matching based on `x-envoy-ai-gateway-selected-backend` is enough to route the request to the selected backend.
// That is because the matching decision is made by the filter and the selected backend is populated in the header `x-envoy-ai-gateway-selected-backend`.
type Config struct {
// TokenUsageMetadata is the namespace and key to be used in the filter metadata to store the usage token, optional.
// If this is provided, the filter will populate the usage token in the filter metadata at the end of the
// response body processing.
TokenUsageMetadata *TokenUsageMetadata `yaml:"tokenUsageMetadata,omitempty"`
// Schema specifies the API schema of the input format of requests to the filter.
// MetadataNamespace is the namespace of the dynamic metadata to be used by the filter.
MetadataNamespace string `yaml:"namespace"`
// LLMRequestCosts configures the cost of each LLM-related request. Optional. If this is provided, the filter will populate
// the "calculated" cost in the filter metadata at the end of the response body processing.
LLMRequestCosts []LLMRequestCost `yaml:"llmRequestCosts,omitempty"`
// Schema specifies the API schema of the input format of requests to the filter.
Schema VersionedAPISchema `yaml:"schema"`
// ModelNameHeaderKey is the header key to be populated with the model name by the filter.
ModelNameHeaderKey string `yaml:"modelNameHeaderKey"`
Expand All @@ -82,18 +83,37 @@ type Config struct {
Rules []RouteRule `yaml:"rules"`
}

// TokenUsageMetadata is the namespace and key to be used in the filter metadata to store the usage token.
// LLMRequestCost specifies "where" the request cost is stored in the filter metadata as well as
// "how" the cost is calculated. By default, the cost is retrieved from "output token" in the response body.
//
// This can be used to subtract the usage token from the usage quota in the rate limit filter when
// the request completes combined with `apply_on_stream_done` and `hits_addend` fields of
// the rate limit configuration https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#config-route-v3-ratelimit
// which is introduced in Envoy 1.33 (to be released soon as of writing).
type TokenUsageMetadata struct {
// Namespace is the namespace of the metadata.
Namespace string `yaml:"namespace"`
// Key is the key of the metadata.
Key string `yaml:"key"`
type LLMRequestCost struct {
	// MetadataKey is the key of the metadata storing the request cost.
	// It is serialized under "metadataKey", the same key name the
	// AIGatewayRoute API uses for this field.
	MetadataKey string `yaml:"metadataKey"`
	// Type is the kind of the request cost calculation.
	Type LLMRequestCostType `yaml:"type"`
	// CELExpression is the CEL expression to calculate the cost of the request.
	// This is not empty when the Type is LLMRequestCostTypeCELExpression.
	CELExpression string `yaml:"celExpression,omitempty"`
}

// LLMRequestCostType specifies the kind of the request cost calculation.
type LLMRequestCostType string

const (
	// LLMRequestCostTypeOutputToken specifies that the request cost is calculated from the output token.
	LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken"
	// LLMRequestCostTypeInputToken specifies that the request cost is calculated from the input token.
	LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken"
	// LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token.
	LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken"
	// LLMRequestCostTypeCELExpression specifies that the request cost is calculated from the CEL expression.
	LLMRequestCostTypeCELExpression LLMRequestCostType = "CEL"
)

// VersionedAPISchema corresponds to LLMAPISchema in api/v1alpha1/api.go.
type VersionedAPISchema struct {
// Name is the name of the API schema.
Expand Down
12 changes: 7 additions & 5 deletions filterconfig/filterconfig_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@ schema:
name: OpenAI
selectedBackendHeaderKey: x-envoy-ai-gateway-selected-backend
modelNameHeaderKey: x-envoy-ai-gateway-model
tokenUsageMetadata:
namespace: ai_gateway_llm_ns
key: token_usage_key
metadataNamespace: ai_gateway_llm_ns
llmRequestCosts:
- metadataKey: token_usage_key
type: OutputToken
rules:
- backends:
- name: kserve
Expand All @@ -60,8 +61,9 @@ rules:
require.NoError(t, os.WriteFile(configPath, []byte(config), 0o600))
cfg, err := filterconfig.UnmarshalConfigYaml(configPath)
require.NoError(t, err)
require.Equal(t, "ai_gateway_llm_ns", cfg.TokenUsageMetadata.Namespace)
require.Equal(t, "token_usage_key", cfg.TokenUsageMetadata.Key)
require.Equal(t, "ai_gateway_llm_ns", cfg.MetadataNamespace)
require.Equal(t, "token_usage_key", cfg.LLMRequestCosts[0].MetadataKey)
require.Equal(t, "OutputToken", string(cfg.LLMRequestCosts[0].Type))
require.Equal(t, "OpenAI", string(cfg.Schema.Name))
require.Equal(t, "x-envoy-ai-gateway-selected-backend", cfg.SelectedBackendHeaderKey)
require.Equal(t, "x-envoy-ai-gateway-model", cfg.ModelNameHeaderKey)
Expand Down
3 changes: 3 additions & 0 deletions internal/controller/ai_gateway_route.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ func (c *aiGatewayRouteController) reconcileExtProcExtensionPolicy(ctx context.C
Port: &port,
},
}}},
Metadata: &egv1a1.ExtProcMetadata{
WritableNamespaces: []string{aigv1a1.AIGatewayFilterMetadataNamespace},
},
}},
},
}
Expand Down
5 changes: 5 additions & 0 deletions internal/controller/ai_gateway_route_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ func TestAIGatewayRouteController_reconcileExtProcExtensionPolicy(t *testing.T)
for i, target := range extPolicy.Spec.TargetRefs {
require.Equal(t, aiGatewayRoute.Spec.TargetRefs[i].Name, target.Name)
}
require.Equal(t, ownerRef, extPolicy.OwnerReferences)
require.Len(t, extPolicy.Spec.ExtProc, 1)
require.NotNil(t, extPolicy.Spec.ExtProc[0].Metadata)
require.NotEmpty(t, extPolicy.Spec.ExtProc[0].Metadata.WritableNamespaces)
require.Equal(t, aigv1a1.AIGatewayFilterMetadataNamespace, extPolicy.Spec.ExtProc[0].Metadata.WritableNamespaces[0])

// Update the policy.
aiGatewayRoute.Spec.TargetRefs = []gwapiv1a2.LocalPolicyTargetReferenceWithSectionName{
Expand Down
18 changes: 18 additions & 0 deletions internal/controller/sink.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,24 @@ func (c *configSink) updateExtProcConfigMap(aiGatewayRoute *aigv1a1.AIGatewayRou
}
}

ec.MetadataNamespace = aigv1a1.AIGatewayFilterMetadataNamespace
for _, cost := range aiGatewayRoute.Spec.LLMRequestCosts {
fc := filterconfig.LLMRequestCost{MetadataKey: cost.MetadataKey}
switch cost.Type {
case aigv1a1.LLMRequestCostTypeInputToken:
fc.Type = filterconfig.LLMRequestCostTypeInputToken
case aigv1a1.LLMRequestCostTypeOutputToken:
fc.Type = filterconfig.LLMRequestCostTypeOutputToken
case aigv1a1.LLMRequestCostTypeTotalToken:
fc.Type = filterconfig.LLMRequestCostTypeTotalToken
case aigv1a1.LLMRequestCostTypeCEL:
fc.Type = filterconfig.LLMRequestCostTypeCELExpression
default:
return fmt.Errorf("unknown request cost type: %s", cost.Type)
}
ec.LLMRequestCosts = append(ec.LLMRequestCosts, fc)
}

marshaled, err := yaml.Marshal(ec)
if err != nil {
return fmt.Errorf("failed to marshal extproc config: %w", err)
Expand Down
15 changes: 15 additions & 0 deletions internal/controller/sink_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,11 +267,22 @@ func Test_updateExtProcConfigMap(t *testing.T) {
},
},
},
LLMRequestCosts: []aigv1a1.LLMRequestCost{
{
Type: aigv1a1.LLMRequestCostTypeOutputToken,
MetadataKey: "output-token",
},
{
Type: aigv1a1.LLMRequestCostTypeInputToken,
MetadataKey: "input-token",
},
},
},
},
exp: &filterconfig.Config{
Schema: filterconfig.VersionedAPISchema{Name: filterconfig.APISchemaOpenAI, Version: "v123"},
ModelNameHeaderKey: aigv1a1.AIModelHeaderKey,
MetadataNamespace: aigv1a1.AIGatewayFilterMetadataNamespace,
SelectedBackendHeaderKey: selectedBackendHeaderKey,
Rules: []filterconfig.RouteRule{
{
Expand All @@ -285,6 +296,10 @@ func Test_updateExtProcConfigMap(t *testing.T) {
Headers: []filterconfig.HeaderMatch{{Name: aigv1a1.AIModelHeaderKey, Value: "another-ai"}},
},
},
LLMRequestCosts: []filterconfig.LLMRequestCost{
{Type: filterconfig.LLMRequestCostTypeOutputToken, MetadataKey: "output-token"},
{Type: filterconfig.LLMRequestCostTypeInputToken, MetadataKey: "input-token"},
},
},
},
} {
Expand Down
4 changes: 2 additions & 2 deletions internal/extproc/mocks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ type mockTranslator struct {
retHeaderMutation *extprocv3.HeaderMutation
retBodyMutation *extprocv3.BodyMutation
retOverride *extprocv3http.ProcessingMode
retUsedToken uint32
retUsedToken translator.LLMTokenUsage
retErr error
}

Expand All @@ -87,7 +87,7 @@ func (m mockTranslator) ResponseHeaders(headers map[string]string) (headerMutati
}

// ResponseBody implements [translator.Translator.ResponseBody].
func (m mockTranslator) ResponseBody(body io.Reader, _ bool) (headerMutation *extprocv3.HeaderMutation, bodyMutation *extprocv3.BodyMutation, usedToken uint32, err error) {
func (m mockTranslator) ResponseBody(body io.Reader, _ bool) (headerMutation *extprocv3.HeaderMutation, bodyMutation *extprocv3.BodyMutation, tokenUsage translator.LLMTokenUsage, err error) {
if m.expResponseBody != nil {
buf, err := io.ReadAll(body)
require.NoError(m.t, err)
Expand Down
Loading
Loading