@@ -30,8 +30,10 @@ materialize_telemetry_configs() {
3030 local otel_cfg=" ${dest} /otel-collector-config.yaml"
3131 local prom_cfg=" ${dest} /prometheus.yml"
3232 local graf_cfg=" ${dest} /grafana-datasources.yaml"
33+ local graf_dash_cfg=" ${dest} /grafana-dashboards.yaml"
34+ local dash_json=" ${dest} /llama-stack-dashboard.json"
3335
34- for asset in " $otel_cfg " " $prom_cfg " " $graf_cfg " ; do
36+ for asset in " $otel_cfg " " $prom_cfg " " $graf_cfg " " $graf_dash_cfg " " $dash_json " ; do
3537 if [ -e " $asset " ]; then
3638 die " Telemetry asset ${asset} already exists; refusing to overwrite"
3739 fi
@@ -103,6 +105,7 @@ datasources:
103105 type: prometheus
104106 access: proxy
105107 url: http://prometheus:9090
108+ uid: prometheus
106109 isDefault: true
107110 editable: true
108111
@@ -112,6 +115,224 @@ datasources:
112115 url: http://jaeger:16686
113116 editable: true
114117EOF
118+ # Grafana dashboard provider: load every dashboard JSON found in the provisioning directory (quoted delimiter, so no shell expansion).
119+ cat << 'EOF' > "$graf_dash_cfg"
120+ apiVersion: 1
121+
122+ providers:
123+   - name: 'Llama Stack'
124+     orgId: 1
125+     folder: ''
126+     type: file
127+     disableDeletion: false
128+     updateIntervalSeconds: 10
129+     allowUiUpdates: true
130+     options:
131+       path: /etc/grafana/provisioning/dashboards
132+ EOF
133+
134+ # Write the Grafana dashboard inline; the quoted delimiter keeps {{...}} legend templates literal, and every panel targets the datasource uid "prometheus".
135+ cat > "$dash_json" << 'DASHBOARD_JSON'
136+ {
137+ "annotations": {
138+ "list": []
139+ },
140+ "editable": true,
141+ "fiscalYearStartMonth": 0,
142+ "graphTooltip": 0,
143+ "id": null,
144+ "links": [],
145+ "liveNow": false,
146+ "panels": [
147+ {
148+ "datasource": {
149+ "type": "prometheus",
150+ "uid": "prometheus"
151+ },
152+ "fieldConfig": {
153+ "defaults": {
154+ "custom": {
155+ "drawStyle": "line",
156+ "lineInterpolation": "linear",
157+ "showPoints": "auto",
158+ "fillOpacity": 10
159+ },
160+ "mappings": [],
161+ "thresholds": {
162+ "mode": "absolute",
163+ "steps": [{"color": "green", "value": null}]
164+ }
165+ }
166+ },
167+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
168+ "id": 1,
169+ "options": {
170+ "legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
171+ "tooltip": {"mode": "multi", "sort": "none"}
172+ },
173+ "targets": [
174+ {
175+ "datasource": {"type": "prometheus", "uid": "prometheus"},
176+ "expr": "llama_stack_completion_tokens_total",
177+ "legendFormat": "{{model_id}} ({{provider_id}})",
178+ "refId": "A"
179+ }
180+ ],
181+ "title": "Completion Tokens",
182+ "type": "timeseries"
183+ },
184+ {
185+ "datasource": {"type": "prometheus", "uid": "prometheus"},
186+ "fieldConfig": {
187+ "defaults": {
188+ "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
189+ "mappings": [],
190+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
191+ }
192+ },
193+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
194+ "id": 2,
195+ "options": {
196+ "legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
197+ "tooltip": {"mode": "multi", "sort": "none"}
198+ },
199+ "targets": [
200+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "llama_stack_prompt_tokens_total", "legendFormat": "Prompt - {{model_id}}", "refId": "A"},
201+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "llama_stack_tokens_total", "legendFormat": "Total - {{model_id}}", "refId": "B"}
202+ ],
203+ "title": "Prompt & Total Tokens",
204+ "type": "timeseries"
205+ },
206+ {
207+ "datasource": {"type": "prometheus", "uid": "prometheus"},
208+ "fieldConfig": {
209+ "defaults": {
210+ "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
211+ "mappings": [],
212+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
213+ "unit": "ms"
214+ }
215+ },
216+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
217+ "id": 3,
218+ "options": {
219+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
220+ "tooltip": {"mode": "multi", "sort": "none"}
221+ },
222+ "targets": [
223+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p95", "refId": "A"},
224+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p99", "refId": "B"}
225+ ],
226+ "title": "HTTP Request Duration (p95, p99)",
227+ "type": "timeseries"
228+ },
229+ {
230+ "datasource": {"type": "prometheus", "uid": "prometheus"},
231+ "fieldConfig": {
232+ "defaults": {
233+ "mappings": [],
234+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
235+ }
236+ },
237+ "gridPos": {"h": 8, "w": 6, "x": 12, "y": 8},
238+ "id": 4,
239+ "options": {
240+ "colorMode": "value",
241+ "graphMode": "area",
242+ "justifyMode": "auto",
243+ "orientation": "auto",
244+ "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
245+ "textMode": "auto"
246+ },
247+ "targets": [
248+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "sum(llama_stack_http_server_duration_milliseconds_count)", "refId": "A"}
249+ ],
250+ "title": "Total Requests",
251+ "type": "stat"
252+ },
253+ {
254+ "datasource": {"type": "prometheus", "uid": "prometheus"},
255+ "fieldConfig": {
256+ "defaults": {
257+ "mappings": [],
258+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
259+ }
260+ },
261+ "gridPos": {"h": 8, "w": 6, "x": 18, "y": 8},
262+ "id": 5,
263+ "options": {
264+ "colorMode": "value",
265+ "graphMode": "none",
266+ "justifyMode": "auto",
267+ "orientation": "auto",
268+ "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
269+ "textMode": "auto"
270+ },
271+ "targets": [
272+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "sum(llama_stack_http_server_active_requests)", "refId": "A"}
273+ ],
274+ "title": "Active Requests",
275+ "type": "stat"
276+ },
277+ {
278+ "datasource": {"type": "prometheus", "uid": "prometheus"},
279+ "fieldConfig": {
280+ "defaults": {
281+ "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
282+ "mappings": [],
283+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
284+ "unit": "reqps"
285+ }
286+ },
287+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
288+ "id": 6,
289+ "options": {
290+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
291+ "tooltip": {"mode": "multi", "sort": "none"}
292+ },
293+ "targets": [
294+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])", "legendFormat": "{{http_target}} - {{http_status_code}}", "refId": "A"}
295+ ],
296+ "title": "Request Rate",
297+ "type": "timeseries"
298+ },
299+ {
300+ "datasource": {"type": "prometheus", "uid": "prometheus"},
301+ "fieldConfig": {
302+ "defaults": {
303+ "custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
304+ "mappings": [],
305+ "thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
306+ "unit": "Bps"
307+ }
308+ },
309+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
310+ "id": 7,
311+ "options": {
312+ "legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
313+ "tooltip": {"mode": "multi", "sort": "none"}
314+ },
315+ "targets": [
316+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])", "legendFormat": "Request", "refId": "A"},
317+ {"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])", "legendFormat": "Response", "refId": "B"}
318+ ],
319+ "title": "Request/Response Sizes",
320+ "type": "timeseries"
321+ }
322+ ],
323+ "refresh": "5s",
324+ "schemaVersion": 38,
325+ "tags": ["llama-stack"],
326+ "templating": {"list": []},
327+ "time": {"from": "now-15m", "to": "now"},
328+ "timepicker": {},
329+ "timezone": "browser",
330+ "title": "Llama Stack Metrics",
331+ "uid": "llama-stack-metrics",
332+ "version": 0,
333+ "weekStart": ""
334+ }
335+ DASHBOARD_JSON
115336}
116337
117338# Cleanup function to remove temporary files
@@ -372,6 +593,8 @@ if [ "$WITH_TELEMETRY" = true ]; then
372593 -e GF_SECURITY_ADMIN_PASSWORD=admin \
373594 -e GF_USERS_ALLOW_SIGN_UP=false \
374595 -v " ${TELEMETRY_ASSETS_DIR} /grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
596+ -v " ${TELEMETRY_ASSETS_DIR} /grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
597+ -v " ${TELEMETRY_ASSETS_DIR} /llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
375598 docker.io/grafana/grafana:11.0.0 > /dev/null 2>&1 ; then
376599 die " Grafana startup failed"
377600 fi
0 commit comments