api: RequestCost configurations #103

Merged · 27 commits · Jan 18, 2025
89 changes: 89 additions & 0 deletions api/v1alpha1/api.go
@@ -80,6 +80,54 @@ type AIGatewayRouteSpec struct {
// Currently, the filter is only implemented as an external process filter, which might be
// extended to other types of filters in the future. See https://github.com/envoyproxy/ai-gateway/issues/90
FilterConfig *AIGatewayFilterConfig `json:"filterConfig,omitempty"`

// LLMRequestCost specifies the cost of the LLM-related request, notably the token usage.
// The AI Gateway filter will capture this information and store it in Envoy's dynamic
// metadata per HTTP request. The metadata namespace is "io.envoy.ai_gateway",
// and the key is "ai_gateway_route_request_cost".
//
// For example, with the following Envoy Gateway BackendTrafficPolicy,
// the captured value is used for token-usage-based rate limiting:
//
// apiVersion: gateway.envoyproxy.io/v1alpha1
// kind: BackendTrafficPolicy
// metadata:
//   name: some-example-token-rate-limit
//   namespace: default
// spec:
//   targetRefs:
//   - group: gateway.networking.k8s.io
//     kind: HTTPRoute
//     name: usage-rate-limit
//   rateLimit:
//     type: Global
//     global:
//       rules:
//       - clientSelectors:
//           # Do the rate limiting based on the x-user-id header.
//           - headers:
//               - name: x-user-id
//                 type: Exact
//                 value: one
//         limit:
//           # Configures the number of "tokens" allowed per hour.
//           requests: 10000
//           unit: Hour
//         cost:
//           request:
//             from: Number
//             # Setting the request cost to zero means the request path only checks
//             # the rate limit budget without consuming it.
//             number: 0
//           # This specifies the cost of the response retrieved from the dynamic metadata set by the AI Gateway filter.
//           # The extracted value will be used to consume the rate limit budget, and subsequent requests will be rate limited
//           # if the budget is exhausted.
//           response:
//             from: Metadata
//             metadata:
//               namespace: io.envoy.ai_gateway
//               key: ai_gateway_route_request_cost
LLMRequestCost *LLMRequestCost `json:"llmRequestCost,omitempty"`
}

// AIGatewayRouteRule is a rule that defines the routing behavior of the AIGatewayRoute.
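As a side note, here is a minimal Go sketch (not part of this PR) of setting the new field programmatically, assuming the module path github.com/envoyproxy/ai-gateway and the aigv1a1 alias used in the controller changes below; other required spec fields are omitted:

package main

import (
	"fmt"

	aigv1a1 "github.com/envoyproxy/ai-gateway/api/v1alpha1"
)

func main() {
	// Ask the AI Gateway filter to report the output-token count of each
	// request as its cost in Envoy's dynamic metadata.
	route := &aigv1a1.AIGatewayRoute{
		Spec: aigv1a1.AIGatewayRouteSpec{
			LLMRequestCost: &aigv1a1.LLMRequestCost{
				Type: aigv1a1.LLMRequestCostTypeOutputToken,
			},
		},
	}
	fmt.Println(route.Spec.LLMRequestCost.Type) // OutputToken
}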
@@ -230,6 +278,9 @@ type AIServiceBackendSpec struct {
//
// +optional
BackendSecurityPolicyRef *gwapiv1.LocalObjectReference `json:"backendSecurityPolicyRef,omitempty"`

// TODO: maybe add a backend-level LLMRequestCost configuration that overrides the AIGatewayRoute-level LLMRequestCost.
// That may be useful for backends that have different cost calculation logic.
}

// VersionedAPISchema defines the API schema of either AIGatewayRoute (the input) or AIServiceBackend (the output).
@@ -378,3 +429,41 @@ type AWSOIDCExchangeToken struct {
// which maps to the temporary AWS security credentials exchanged using the authentication token issued by OIDC provider.
AwsRoleArn string `json:"awsRoleArn"`
}

// LLMRequestCost configures the request cost.
type LLMRequestCost struct {
// Type specifies the type of the request cost. The default is "OutputToken",
// which uses the number of output tokens as the cost. The other types are "InputToken" and "TotalToken".
//
// +kubebuilder:validation:Enum=OutputToken;InputToken;TotalToken
Type LLMRequestCostType `json:"type"`

// CELExpression is the CEL expression to calculate the cost of the request.
// The CEL expression must return an integer value. The CEL expression should be
// able to access the request headers, model name, backend name, input/output tokens etc.
//
// +optional
// +notImplementedHide https://github.com/envoyproxy/ai-gateway/issues/97
CELExpression *string `json:"celExpression"`
}

// LLMRequestCostType specifies the type of the LLMRequestCost.
type LLMRequestCostType string

const (
// LLMRequestCostTypeInputToken is the cost type of the input token.
LLMRequestCostTypeInputToken LLMRequestCostType = "InputToken"
// LLMRequestCostTypeOutputToken is the cost type of the output token.
LLMRequestCostTypeOutputToken LLMRequestCostType = "OutputToken"
// LLMRequestCostTypeTotalToken is the cost type of the total token.
LLMRequestCostTypeTotalToken LLMRequestCostType = "TotalToken"
// LLMRequestCostTypeCEL is for calculating the cost using the CEL expression.
LLMRequestCostTypeCEL LLMRequestCostType = "CEL"
)

const (
// AIGatewayFilterMetadataNamespace is the namespace for the ai-gateway filter metadata.
AIGatewayFilterMetadataNamespace = "io.envoy.ai_gateway"
// AIGatewayFilterMetadataRequestCostMetadataKey is the key for the request cost metadata.
AIGatewayFilterMetadataRequestCostMetadataKey = "ai_gateway_route_request_cost"
)
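These constants pin down the dynamic-metadata location. A tiny sketch (not part of the diff) to show they line up with the namespace/key referenced by the BackendTrafficPolicy example above, assuming the module path github.com/envoyproxy/ai-gateway:

package main

import (
	"fmt"

	aigv1a1 "github.com/envoyproxy/ai-gateway/api/v1alpha1"
)

func main() {
	// These values must match the rate limit "Metadata" cost source.
	fmt.Println(aigv1a1.AIGatewayFilterMetadataNamespace)              // io.envoy.ai_gateway
	fmt.Println(aigv1a1.AIGatewayFilterMetadataRequestCostMetadataKey) // ai_gateway_route_request_cost
}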
25 changes: 25 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go


43 changes: 34 additions & 9 deletions filterconfig/filterconfig.go
@@ -33,7 +33,7 @@ modelNameHeaderKey: x-envoy-ai-gateway-model
// schema: OpenAI
// selectedBackendHeaderKey: x-envoy-ai-gateway-selected-backend
// modelNameHeaderKey: x-envoy-ai-gateway-model
// tokenUsageMetadata:
// llmRequestCost:
// namespace: ai_gateway_llm_ns
// key: token_usage_key
// rules:
@@ -66,10 +66,9 @@ modelNameHeaderKey: x-envoy-ai-gateway-model
// From Envoy configuration perspective, configuring the header matching based on `x-envoy-ai-gateway-selected-backend` is enough to route the request to the selected backend.
// That is because the matching decision is made by the filter and the selected backend is populated in the header `x-envoy-ai-gateway-selected-backend`.
type Config struct {
// TokenUsageMetadata is the namespace and key to be used in the filter metadata to store the usage token, optional.
// If this is provided, the filter will populate the usage token in the filter metadata at the end of the
// response body processing.
TokenUsageMetadata *TokenUsageMetadata `yaml:"tokenUsageMetadata,omitempty"`
// LLMRequestCost configures the cost of each LLM-related request. Optional. If this is provided, the filter will populate
// the "calculated" cost in the filter metadata at the end of the response body processing.
LLMRequestCost *LLMRequestCost `yaml:"llmRequestCost,omitempty"`
// InputSchema specifies the API schema of the input format of requests to the filter.
InputSchema VersionedAPISchema `yaml:"inputSchema"`
// ModelNameHeaderKey is the header key to be populated with the model name by the filter.
@@ -82,16 +81,42 @@ type Config struct {
Rules []RouteRule `yaml:"rules"`
}

// TokenUsageMetadata is the namespace and key to be used in the filter metadata to store the usage token.
// LLMRequestCost specifies "where" the request cost is stored in the filter metadata as well as
// "how" the cost is calculated. By default, the cost is retrieved from "output token" in the response body.
//
// This can be used to subtract the token usage from the usage quota in the rate limit filter when
// the request completes, combined with the `apply_on_stream_done` and `hits_addend` fields of
// the rate limit configuration https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#config-route-v3-ratelimit
// introduced in Envoy 1.33 (to be released soon as of writing).
type TokenUsageMetadata struct {
// Namespace is the namespace of the metadata.
type LLMRequestCost struct {
// Namespace is the namespace of the metadata storing the request cost.
Namespace string `yaml:"namespace"`
// Key is the key of the metadata.
// Key is the key of the metadata storing the request cost.
Key string `yaml:"key"`
// Type is the kind of the request cost calculation.
Type LLMRequestCostType
// CELExpression is the CEL expression to calculate the cost of the request.
// This is not empty when the Type is LLMRequestCostTypeCELExpression.
CELExpression string `yaml:"celExpression,omitempty"`
}

// LLMRequestCostType specifies the kind of the request cost calculation.
type LLMRequestCostType int

const (
// LLMRequestCostTypeOutputToken specifies that the request cost is calculated from the output token.
LLMRequestCostTypeOutputToken LLMRequestCostType = iota
// LLMRequestCostTypeInputToken specifies that the request cost is calculated from the input token.
LLMRequestCostTypeInputToken
// LLMRequestCostTypeTotalToken specifies that the request cost is calculated from the total token.
LLMRequestCostTypeTotalToken
// LLMRequestCostTypeCELExpression specifies that the request cost is calculated from the CEL expression.
LLMRequestCostTypeCELExpression
)

// String implements fmt.Stringer.
func (k LLMRequestCostType) String() string {
return [...]string{"OutputToken", "InputToken", "TotalToken", "CELExpression"}[k]
}

// VersionedAPISchema corresponds to LLMAPISchema in api/v1alpha1/api.go.
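For illustration (not from the diff), the filter-level cost config that the controller generates for a route using the default OutputToken type can be constructed and inspected like this, assuming the package lives at github.com/envoyproxy/ai-gateway/filterconfig:

package main

import (
	"fmt"

	"github.com/envoyproxy/ai-gateway/filterconfig"
)

func main() {
	// Mirrors what the controller writes into the extproc config: where the
	// cost is stored (Namespace/Key) and how it is calculated (Type).
	fc := &filterconfig.LLMRequestCost{
		Namespace: "io.envoy.ai_gateway",
		Key:       "ai_gateway_route_request_cost",
		Type:      filterconfig.LLMRequestCostTypeOutputToken,
	}
	// LLMRequestCostType implements fmt.Stringer, so this prints "OutputToken".
	fmt.Printf("%s/%s -> %s\n", fc.Namespace, fc.Key, fc.Type)
}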
6 changes: 3 additions & 3 deletions filterconfig/filterconfig_test.go
@@ -33,7 +33,7 @@ inputSchema:
schema: OpenAI
selectedBackendHeaderKey: x-envoy-ai-gateway-selected-backend
modelNameHeaderKey: x-envoy-ai-gateway-model
tokenUsageMetadata:
llmRequestCost:
namespace: ai_gateway_llm_ns
key: token_usage_key
rules:
@@ -60,8 +60,8 @@ rules:
require.NoError(t, os.WriteFile(configPath, []byte(config), 0o600))
cfg, err := filterconfig.UnmarshalConfigYaml(configPath)
require.NoError(t, err)
require.Equal(t, "ai_gateway_llm_ns", cfg.TokenUsageMetadata.Namespace)
require.Equal(t, "token_usage_key", cfg.TokenUsageMetadata.Key)
require.Equal(t, "ai_gateway_llm_ns", cfg.LLMRequestCost.Namespace)
require.Equal(t, "token_usage_key", cfg.LLMRequestCost.Key)
require.Equal(t, "OpenAI", string(cfg.InputSchema.Schema))
require.Equal(t, "x-envoy-ai-gateway-selected-backend", cfg.SelectedBackendHeaderKey)
require.Equal(t, "x-envoy-ai-gateway-model", cfg.ModelNameHeaderKey)
3 changes: 3 additions & 0 deletions internal/controller/ai_gateway_route.go
@@ -141,6 +141,9 @@ func (c *aiGatewayRouteController) reconcileExtProcExtensionPolicy(ctx context.C
Port: &port,
},
}}},
Metadata: &egv1a1.ExtProcMetadata{
WritableNamespaces: []string{aigv1a1.AIGatewayFilterMetadataNamespace},
},
}},
},
}
20 changes: 20 additions & 0 deletions internal/controller/sink.go
@@ -191,6 +191,26 @@ func (c *configSink) updateExtProcConfigMap(aiGatewayRoute *aigv1a1.AIGatewayRou
}
}

if cost := aiGatewayRoute.Spec.LLMRequestCost; cost != nil {
fc := &filterconfig.LLMRequestCost{
Namespace: aigv1a1.AIGatewayFilterMetadataNamespace,
Key: aigv1a1.AIGatewayFilterMetadataRequestCostMetadataKey,
}
switch cost.Type {
case aigv1a1.LLMRequestCostTypeInputToken:
fc.Type = filterconfig.LLMRequestCostTypeInputToken
case aigv1a1.LLMRequestCostTypeOutputToken:
fc.Type = filterconfig.LLMRequestCostTypeOutputToken
case aigv1a1.LLMRequestCostTypeTotalToken:
fc.Type = filterconfig.LLMRequestCostTypeTotalToken
case aigv1a1.LLMRequestCostTypeCEL:
fc.Type = filterconfig.LLMRequestCostTypeCELExpression
default:
return fmt.Errorf("unknown request cost type: %s", cost.Type)
}
ec.LLMRequestCost = fc
}

marshaled, err := yaml.Marshal(ec)
if err != nil {
return fmt.Errorf("failed to marshal extproc config: %w", err)
4 changes: 2 additions & 2 deletions internal/extproc/mocks_test.go
@@ -70,7 +70,7 @@ type mockTranslator struct {
retHeaderMutation *extprocv3.HeaderMutation
retBodyMutation *extprocv3.BodyMutation
retOverride *extprocv3http.ProcessingMode
retUsedToken uint32
retUsedToken translator.LLMTokenUsage
retErr error
}

@@ -87,7 +87,7 @@ func (m mockTranslator) ResponseHeaders(headers map[string]string) (headerMutati
}

// ResponseBody implements [translator.Translator.ResponseBody].
func (m mockTranslator) ResponseBody(body io.Reader, _ bool) (headerMutation *extprocv3.HeaderMutation, bodyMutation *extprocv3.BodyMutation, usedToken uint32, err error) {
func (m mockTranslator) ResponseBody(body io.Reader, _ bool) (headerMutation *extprocv3.HeaderMutation, bodyMutation *extprocv3.BodyMutation, tokenUsage translator.LLMTokenUsage, err error) {
if m.expResponseBody != nil {
buf, err := io.ReadAll(body)
require.NoError(m.t, err)
35 changes: 28 additions & 7 deletions internal/extproc/processor.go
@@ -28,7 +28,7 @@ type processorConfig struct {
ModelNameHeaderKey, selectedBackendHeaderKey string
factories map[filterconfig.VersionedAPISchema]translator.Factory
backendAuthHandlers map[string]backendauth.Handler
tokenUsageMetadata *filterconfig.TokenUsageMetadata
requestCost *filterconfig.LLMRequestCost
}

// ProcessorIface is the interface for the processor.
@@ -56,6 +56,8 @@ type Processor struct {
requestHeaders map[string]string
responseEncoding string
translator translator.Translator
// costs accumulates the token usage of the request during the processing of the response.
costs translator.LLMTokenUsage
}

// ProcessRequestHeaders implements [Processor.ProcessRequestHeaders].
@@ -169,7 +171,7 @@ func (p *Processor) ProcessResponseBody(_ context.Context, body *extprocv3.HttpB
if p.translator == nil {
return &extprocv3.ProcessingResponse{Response: &extprocv3.ProcessingResponse_ResponseBody{}}, nil
}
headerMutation, bodyMutation, usedToken, err := p.translator.ResponseBody(br, body.EndOfStream)
headerMutation, bodyMutation, tokenUsage, err := p.translator.ResponseBody(br, body.EndOfStream)
if err != nil {
return nil, fmt.Errorf("failed to transform response: %w", err)
}
@@ -184,20 +186,39 @@ }
},
},
}
if p.config.tokenUsageMetadata != nil {
resp.DynamicMetadata = buildTokenUsageDynamicMetadata(p.config.tokenUsageMetadata, usedToken)

// TODO: this is coupled with "LLM" specific logic. Once we have another use case, we need to refactor this.
p.costs.InputTokens += tokenUsage.InputTokens
p.costs.OutputTokens += tokenUsage.OutputTokens
p.costs.TotalTokens += tokenUsage.TotalTokens
if body.EndOfStream && p.config.requestCost != nil {
c := p.config.requestCost
var cost uint32
switch c.Type {
case filterconfig.LLMRequestCostTypeInputToken:
cost = tokenUsage.InputTokens
case filterconfig.LLMRequestCostTypeOutputToken:
cost = tokenUsage.OutputTokens
case filterconfig.LLMRequestCostTypeTotalToken:
cost = tokenUsage.TotalTokens
default:
return nil, fmt.Errorf("unknown request cost kind: %s", c.Type)
}
if cost > 0 {
resp.DynamicMetadata = buildRequestCostDynamicMetadata(c.Namespace, c.Key, cost)
}
}
return resp, nil
}

func buildTokenUsageDynamicMetadata(md *filterconfig.TokenUsageMetadata, usage uint32) *structpb.Struct {
func buildRequestCostDynamicMetadata(namespace, key string, cost uint32) *structpb.Struct {
return &structpb.Struct{
Fields: map[string]*structpb.Value{
md.Namespace: {
namespace: {
Kind: &structpb.Value_StructValue{
StructValue: &structpb.Struct{
Fields: map[string]*structpb.Value{
md.Key: {Kind: &structpb.Value_NumberValue{NumberValue: float64(usage)}},
key: {Kind: &structpb.Value_NumberValue{NumberValue: float64(cost)}},
},
},
},
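To make the emitted metadata shape concrete, here is a small sketch (not part of this change) that builds the same structure with structpb and prints its JSON form, assuming a cost of 123 output tokens:

package main

import (
	"fmt"

	"google.golang.org/protobuf/encoding/protojson"
	"google.golang.org/protobuf/types/known/structpb"
)

func main() {
	// The filter nests the cost under the configured namespace/key pair,
	// which is what the rate limit "Metadata" cost source reads back.
	md, err := structpb.NewStruct(map[string]any{
		"io.envoy.ai_gateway": map[string]any{
			"ai_gateway_route_request_cost": 123,
		},
	})
	if err != nil {
		panic(err)
	}
	out, _ := protojson.Marshal(md)
	fmt.Println(string(out)) // {"io.envoy.ai_gateway":{"ai_gateway_route_request_cost":123}}
}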
12 changes: 7 additions & 5 deletions internal/extproc/processor_test.go
@@ -57,13 +57,15 @@ func TestProcessor_ProcessResponseBody(t *testing.T) {
require.ErrorContains(t, err, "test error")
})
t.Run("ok", func(t *testing.T) {
inBody := &extprocv3.HttpBody{Body: []byte("some-body")}
inBody := &extprocv3.HttpBody{Body: []byte("some-body"), EndOfStream: true}
expBodyMut := &extprocv3.BodyMutation{}
expHeadMut := &extprocv3.HeaderMutation{}
mt := &mockTranslator{t: t, expResponseBody: inBody, retBodyMutation: expBodyMut, retHeaderMutation: expHeadMut, retUsedToken: 123}
p := &Processor{translator: mt, config: &processorConfig{tokenUsageMetadata: &filterconfig.TokenUsageMetadata{
Namespace: "ai_gateway_llm_ns", Key: "token_usage",
}}}
mt := &mockTranslator{t: t, expResponseBody: inBody, retBodyMutation: expBodyMut, retHeaderMutation: expHeadMut, retUsedToken: translator.LLMTokenUsage{OutputTokens: 123}}
p := &Processor{translator: mt, config: &processorConfig{
requestCost: &filterconfig.LLMRequestCost{
Namespace: "ai_gateway_llm_ns", Key: "token_usage",
},
}}
res, err := p.ProcessResponseBody(context.Background(), inBody)
require.NoError(t, err)
commonRes := res.Response.(*extprocv3.ProcessingResponse_ResponseBody).ResponseBody.Response
2 changes: 1 addition & 1 deletion internal/extproc/server.go
@@ -70,7 +70,7 @@ func (s *Server[P]) LoadConfig(config *filterconfig.Config) error {
ModelNameHeaderKey: config.ModelNameHeaderKey,
factories: factories,
backendAuthHandlers: backendAuthHandlers,
tokenUsageMetadata: config.TokenUsageMetadata,
requestCost: config.LLMRequestCost,
}
s.config = newConfig // This is racey, but we don't care.
return nil