diff --git a/inference-platforms/README.md b/inference-platforms/README.md
index 92c16b4..f467f9b 100644
--- a/inference-platforms/README.md
+++ b/inference-platforms/README.md
@@ -13,7 +13,8 @@ proxies. Each are observable with OpenTelemetry compatible backends such as
 Elastic Stack.
 
 * [ArchGW](archgw) - [with tracing configuration][archgw]
-* [LiteLLM](litellm) - [with OpenTelemetry logging callbacks][litellm]
+* [Envoy AI Gateway](aigw) - with [OpenTelemetry tracing and metrics][aigw]
+* [LiteLLM](litellm) - with [OpenTelemetry logging callbacks][litellm]
 * [LlamaStack](llama-stack) - with [OpenTelemetry sinks][llama-stack]
 * [OpenResponses](open-responses) - with [OpenTelemetry export][open-responses]
 * [vLLM](vllm) - with [OpenTelemetry POC][vllm] configuration
@@ -104,6 +105,7 @@ To start and use Ollama, do the following:
   - This accepts OpenAI requests for any model on http://localhost:11434/v1
 
 ---
+[aigw]: https://aigateway.envoyproxy.io/docs/cli/aigwrun
 [archgw]: https://docs.archgw.com/guides/observability/tracing.html
 [litellm]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#configuration
 [llama-stack]: https://llama-stack.readthedocs.io/en/latest/building_applications/telemetry.html#telemetry
diff --git a/inference-platforms/aigw/README.md b/inference-platforms/aigw/README.md
new file mode 100644
index 0000000..dac8162
--- /dev/null
+++ b/inference-platforms/aigw/README.md
@@ -0,0 +1,47 @@
+# Envoy AI Gateway
+
+This shows how to use [Envoy AI Gateway][docs] to proxy Ollama, making it
+accessible via an OpenAI compatible API.
+
+The Envoy AI Gateway [YAML configuration](ai-gateway-local.yaml) is processed and
+run by `aigw`, which launches an Envoy proxy to handle requests. OpenTelemetry
+support for GenAI metrics and traces is handled directly in the `aigw` (Go) binary.
+
+OpenTelemetry traces produced by Envoy AI Gateway follow the [OpenInference specification][openinference].
+
+## Prerequisites
+
+Start Ollama and your OpenTelemetry Collector via this repository's [README](../README.md).
+
+## Run Envoy AI Gateway
+
+```bash
+docker compose pull
+docker compose up --force-recreate --remove-orphans
+```
+
+Clean up when finished, like this:
+
+```bash
+docker compose down
+```
+
+## Call Envoy AI Gateway with Python
+
+Once Envoy AI Gateway is running, use [uv][uv] to make an OpenAI request via
+[chat.py](../chat.py):
+
+```bash
+uv run --exact -q --env-file env.local ../chat.py
+```
+
+## Notes
+
+Here are some constraints of the Envoy AI Gateway implementation:
+* Until [this issue][openai-responses] is resolved, don't use `--use-responses-api`.
+
+---
+[docs]: https://aigateway.envoyproxy.io/docs/cli/
+[openinference]: https://github.com/Arize-ai/openinference/tree/main/spec
+[uv]: https://docs.astral.sh/uv/getting-started/installation/
+[openai-responses]: https://github.com/envoyproxy/ai-gateway/issues/980
diff --git a/inference-platforms/aigw/ai-gateway-local.yaml b/inference-platforms/aigw/ai-gateway-local.yaml
new file mode 100644
index 0000000..a20e1aa
--- /dev/null
+++ b/inference-platforms/aigw/ai-gateway-local.yaml
@@ -0,0 +1,113 @@
+# Copyright Envoy AI Gateway Authors
+# SPDX-License-Identifier: Apache-2.0
+# The full text of the Apache license is available in the LICENSE file at
+# the root of the repo.
+
+# Configuration for running Envoy AI Gateway with an OpenAI compatible
+# inference platform, defaulting to Ollama.
+#
+# Override with the OPENAI_HOST and OPENAI_PORT environment variables.
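+# Both variables are substituted into the Backend endpoint (fqdn and port)
+# defined further down in this file.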
+# For example, to use RamaLama you would set OPENAI_PORT=8080.
+#
+# TODO: Remove after https://github.com/envoyproxy/ai-gateway/issues/1211
+apiVersion: gateway.networking.k8s.io/v1
+kind: GatewayClass
+metadata:
+  name: aigw-run
+spec:
+  controllerName: gateway.envoyproxy.io/gatewayclass-controller
+---
+apiVersion: gateway.networking.k8s.io/v1
+kind: Gateway
+metadata:
+  name: aigw-run
+  namespace: default
+spec:
+  gatewayClassName: aigw-run
+  listeners:
+    - name: http
+      protocol: HTTP
+      port: 1975
+  infrastructure:
+    parametersRef:
+      group: gateway.envoyproxy.io
+      kind: EnvoyProxy
+      name: envoy-ai-gateway
+---
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: EnvoyProxy
+metadata:
+  name: envoy-ai-gateway
+  namespace: default
+spec:
+  logging:
+    level:
+      default: error
+---
+apiVersion: aigateway.envoyproxy.io/v1alpha1
+kind: AIGatewayRoute
+metadata:
+  name: aigw-run
+  namespace: default
+spec:
+  parentRefs:
+    - name: aigw-run
+      kind: Gateway
+      group: gateway.networking.k8s.io
+  # Simple rule: route everything to Ollama
+  rules:
+    - matches:
+        - headers:
+            - type: RegularExpression
+              name: x-ai-eg-model
+              value: .*
+      backendRefs:
+        - name: ollama
+          namespace: default
+      timeouts:
+        request: 120s
+---
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: Backend
+metadata:
+  name: ollama
+  namespace: default
+spec:
+  endpoints:
+    # Use fqdn, not ip, to allow changing to host.docker.internal
+    - fqdn:
+        hostname: ${OPENAI_HOST:=127.0.0.1.nip.io} # Resolves to 127.0.0.1
+        port: ${OPENAI_PORT:=11434} # Default to Ollama's port
+---
+apiVersion: aigateway.envoyproxy.io/v1alpha1
+kind: AIServiceBackend
+metadata:
+  name: ollama
+  namespace: default
+spec:
+  timeouts:
+    request: 3m
+  schema:
+    name: OpenAI
+  backendRef:
+    name: ollama
+    kind: Backend
+    group: gateway.envoyproxy.io
+    namespace: default
+---
+# By default, Envoy Gateway sets the buffer limit to 32kiB which is not
+# sufficient for AI workloads. This ClientTrafficPolicy sets the buffer limit
+# to 50MiB as an example.
+# TODO: Remove after https://github.com/envoyproxy/ai-gateway/issues/1212
+apiVersion: gateway.envoyproxy.io/v1alpha1
+kind: ClientTrafficPolicy
+metadata:
+  name: client-buffer-limit
+  namespace: default
+spec:
+  targetRefs:
+    - group: gateway.networking.k8s.io
+      kind: Gateway
+      name: aigw-run
+  connection:
+    bufferLimit: 50Mi
diff --git a/inference-platforms/aigw/docker-compose.yml b/inference-platforms/aigw/docker-compose.yml
new file mode 100644
index 0000000..82131d9
--- /dev/null
+++ b/inference-platforms/aigw/docker-compose.yml
@@ -0,0 +1,32 @@
+services:
+  ollama-pull:
+    image: alpine/ollama
+    container_name: ollama-pull
+    environment:
+      OLLAMA_HOST: localhost:11434 # instead of IP 127.0.0.1
+    env_file:
+      - env.local
+    entrypoint: sh
+    command: -c 'env | grep _MODEL | cut -d= -f2 | xargs -I{} ollama pull {}'
+    extra_hosts: # send localhost traffic to the docker host, e.g. your laptop
+      - "localhost:host-gateway"
+
+  # aigw is the Envoy AI Gateway CLI, a.k.a. standalone mode.
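+  # It reads the mounted /config.yaml (ai-gateway-local.yaml), serves the OpenAI
+  # compatible endpoint on port 1975, and exports OTLP traces and metrics using
+  # the OTEL_* variables from env.local.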
+  aigw:
+    image: envoyproxy/ai-gateway-cli:latest
+    container_name: aigw
+    depends_on:
+      ollama-pull:
+        condition: service_completed_successfully
+    env_file:
+      - env.local
+    environment:
+      - OPENAI_HOST=host.docker.internal
+      - OTEL_EXPORTER_OTLP_ENDPOINT=http://host.docker.internal:4318
+    ports:
+      - "1975:1975" # OpenAI compatible endpoint at /v1
+    extra_hosts: # localhost:host-gateway trick doesn't work with aigw
+      - "host.docker.internal:host-gateway"
+    volumes:
+      - ./ai-gateway-local.yaml:/config.yaml:ro
+    command: ["run", "/config.yaml"]
diff --git a/inference-platforms/aigw/env.local b/inference-platforms/aigw/env.local
new file mode 100644
index 0000000..933b0c6
--- /dev/null
+++ b/inference-platforms/aigw/env.local
@@ -0,0 +1,23 @@
+OPENAI_BASE_URL=http://localhost:1975/v1
+OPENAI_API_KEY=unused
+CHAT_MODEL=qwen3:0.6B
+
+# OpenTelemetry configuration
+OTEL_SERVICE_NAME=aigw
+OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
+OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
+OTEL_TRACES_EXPORTER=otlp
+OTEL_METRICS_EXPORTER=otlp
+
+# Reduce trace and metrics export delay for demo purposes
+OTEL_BSP_SCHEDULE_DELAY=100
+OTEL_METRIC_EXPORT_INTERVAL=100
+# Elastic Stack and otel-tui work best when export only occurs on change (delta)
+OTEL_EXPORTER_OTLP_METRICS_TEMPORALITY_PREFERENCE=delta
+
+
+# Below are default values for span redaction in OpenInference.
+# See https://github.com/Arize-ai/openinference/blob/main/spec/configuration.md
+OPENINFERENCE_HIDE_INPUTS=false
+OPENINFERENCE_HIDE_OUTPUTS=false
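+# Set either of the above to true to redact LLM input or output content from
+# exported spans.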