Skip to content

Commit 1c9a31d

Browse files
authored
chore(telemetry): add grafana dashboards (#3921)
# What does this PR do?
- Adds a Grafana dashboard (vibe-coded).
## Test Plan
<img width="2416" height="1114" alt="image" src="https://github.com/user-attachments/assets/8927aad2-cc14-4a1d-847e-350522cac02f" />
1 parent b7dd3f5 commit 1c9a31d

File tree

5 files changed

+696
-1
lines changed

5 files changed

+696
-1
lines changed

scripts/install.sh

Lines changed: 224 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,10 @@ materialize_telemetry_configs() {
3030
local otel_cfg="${dest}/otel-collector-config.yaml"
3131
local prom_cfg="${dest}/prometheus.yml"
3232
local graf_cfg="${dest}/grafana-datasources.yaml"
33+
local graf_dash_cfg="${dest}/grafana-dashboards.yaml"
34+
local dash_json="${dest}/llama-stack-dashboard.json"
3335

34-
for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg"; do
36+
for asset in "$otel_cfg" "$prom_cfg" "$graf_cfg" "$graf_dash_cfg" "$dash_json"; do
3537
if [ -e "$asset" ]; then
3638
die "Telemetry asset ${asset} already exists; refusing to overwrite"
3739
fi
@@ -103,6 +105,7 @@ datasources:
103105
type: prometheus
104106
access: proxy
105107
url: http://prometheus:9090
108+
uid: prometheus
106109
isDefault: true
107110
editable: true
108111
@@ -112,6 +115,224 @@ datasources:
112115
url: http://jaeger:16686
113116
editable: true
114117
EOF
118+
119+
cat <<'EOF' > "$graf_dash_cfg"
120+
apiVersion: 1
121+
122+
providers:
123+
- name: 'Llama Stack'
124+
orgId: 1
125+
folder: ''
126+
type: file
127+
disableDeletion: false
128+
updateIntervalSeconds: 10
129+
allowUiUpdates: true
130+
options:
131+
path: /etc/grafana/provisioning/dashboards
132+
EOF
133+
134+
# Copy the dashboard JSON inline to avoid line-length issues
135+
cat > "$dash_json" <<'DASHBOARD_JSON'
136+
{
137+
"annotations": {
138+
"list": []
139+
},
140+
"editable": true,
141+
"fiscalYearStartMonth": 0,
142+
"graphTooltip": 0,
143+
"id": null,
144+
"links": [],
145+
"liveNow": false,
146+
"panels": [
147+
{
148+
"datasource": {
149+
"type": "prometheus",
150+
"uid": "prometheus"
151+
},
152+
"fieldConfig": {
153+
"defaults": {
154+
"custom": {
155+
"drawStyle": "line",
156+
"lineInterpolation": "linear",
157+
"showPoints": "auto",
158+
"fillOpacity": 10
159+
},
160+
"mappings": [],
161+
"thresholds": {
162+
"mode": "absolute",
163+
"steps": [{"color": "green", "value": null}]
164+
}
165+
}
166+
},
167+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
168+
"id": 1,
169+
"options": {
170+
"legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
171+
"tooltip": {"mode": "multi", "sort": "none"}
172+
},
173+
"targets": [
174+
{
175+
"datasource": {"type": "prometheus", "uid": "prometheus"},
176+
"expr": "llama_stack_completion_tokens_total",
177+
"legendFormat": "{{model_id}} ({{provider_id}})",
178+
"refId": "A"
179+
}
180+
],
181+
"title": "Completion Tokens",
182+
"type": "timeseries"
183+
},
184+
{
185+
"datasource": {"type": "prometheus", "uid": "prometheus"},
186+
"fieldConfig": {
187+
"defaults": {
188+
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
189+
"mappings": [],
190+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
191+
}
192+
},
193+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
194+
"id": 2,
195+
"options": {
196+
"legend": {"calcs": [], "displayMode": "table", "placement": "bottom", "showLegend": true},
197+
"tooltip": {"mode": "multi", "sort": "none"}
198+
},
199+
"targets": [
200+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "llama_stack_prompt_tokens_total", "legendFormat": "Prompt - {{model_id}}", "refId": "A"},
201+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "llama_stack_tokens_total", "legendFormat": "Total - {{model_id}}", "refId": "B"}
202+
],
203+
"title": "Prompt & Total Tokens",
204+
"type": "timeseries"
205+
},
206+
{
207+
"datasource": {"type": "prometheus", "uid": "prometheus"},
208+
"fieldConfig": {
209+
"defaults": {
210+
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
211+
"mappings": [],
212+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
213+
"unit": "ms"
214+
}
215+
},
216+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
217+
"id": 3,
218+
"options": {
219+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
220+
"tooltip": {"mode": "multi", "sort": "none"}
221+
},
222+
"targets": [
223+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "histogram_quantile(0.95, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p95", "refId": "A"},
224+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "histogram_quantile(0.99, rate(llama_stack_http_server_duration_milliseconds_bucket[5m]))", "legendFormat": "p99", "refId": "B"}
225+
],
226+
"title": "HTTP Request Duration (p95, p99)",
227+
"type": "timeseries"
228+
},
229+
{
230+
"datasource": {"type": "prometheus", "uid": "prometheus"},
231+
"fieldConfig": {
232+
"defaults": {
233+
"mappings": [],
234+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
235+
}
236+
},
237+
"gridPos": {"h": 8, "w": 6, "x": 12, "y": 8},
238+
"id": 4,
239+
"options": {
240+
"colorMode": "value",
241+
"graphMode": "area",
242+
"justifyMode": "auto",
243+
"orientation": "auto",
244+
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
245+
"textMode": "auto"
246+
},
247+
"targets": [
248+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "sum(llama_stack_http_server_duration_milliseconds_count)", "refId": "A"}
249+
],
250+
"title": "Total Requests",
251+
"type": "stat"
252+
},
253+
{
254+
"datasource": {"type": "prometheus", "uid": "prometheus"},
255+
"fieldConfig": {
256+
"defaults": {
257+
"mappings": [],
258+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]}
259+
}
260+
},
261+
"gridPos": {"h": 8, "w": 6, "x": 18, "y": 8},
262+
"id": 5,
263+
"options": {
264+
"colorMode": "value",
265+
"graphMode": "none",
266+
"justifyMode": "auto",
267+
"orientation": "auto",
268+
"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
269+
"textMode": "auto"
270+
},
271+
"targets": [
272+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "sum(llama_stack_http_server_active_requests)", "refId": "A"}
273+
],
274+
"title": "Active Requests",
275+
"type": "stat"
276+
},
277+
{
278+
"datasource": {"type": "prometheus", "uid": "prometheus"},
279+
"fieldConfig": {
280+
"defaults": {
281+
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
282+
"mappings": [],
283+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
284+
"unit": "reqps"
285+
}
286+
},
287+
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
288+
"id": 6,
289+
"options": {
290+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
291+
"tooltip": {"mode": "multi", "sort": "none"}
292+
},
293+
"targets": [
294+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_duration_milliseconds_count[5m])", "legendFormat": "{{http_target}} - {{http_status_code}}", "refId": "A"}
295+
],
296+
"title": "Request Rate",
297+
"type": "timeseries"
298+
},
299+
{
300+
"datasource": {"type": "prometheus", "uid": "prometheus"},
301+
"fieldConfig": {
302+
"defaults": {
303+
"custom": {"drawStyle": "line", "lineInterpolation": "linear", "showPoints": "auto", "fillOpacity": 10},
304+
"mappings": [],
305+
"thresholds": {"mode": "absolute", "steps": [{"color": "green", "value": null}]},
306+
"unit": "Bps"
307+
}
308+
},
309+
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
310+
"id": 7,
311+
"options": {
312+
"legend": {"calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true},
313+
"tooltip": {"mode": "multi", "sort": "none"}
314+
},
315+
"targets": [
316+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_request_size_bytes_sum[5m])", "legendFormat": "Request", "refId": "A"},
317+
{"datasource": {"type": "prometheus", "uid": "prometheus"}, "expr": "rate(llama_stack_http_server_response_size_bytes_sum[5m])", "legendFormat": "Response", "refId": "B"}
318+
],
319+
"title": "Request/Response Sizes",
320+
"type": "timeseries"
321+
}
322+
],
323+
"refresh": "5s",
324+
"schemaVersion": 38,
325+
"tags": ["llama-stack"],
326+
"templating": {"list": []},
327+
"time": {"from": "now-15m", "to": "now"},
328+
"timepicker": {},
329+
"timezone": "browser",
330+
"title": "Llama Stack Metrics",
331+
"uid": "llama-stack-metrics",
332+
"version": 0,
333+
"weekStart": ""
334+
}
335+
DASHBOARD_JSON
115336
}
116337

117338
# Cleanup function to remove temporary files
@@ -372,6 +593,8 @@ if [ "$WITH_TELEMETRY" = true ]; then
372593
-e GF_SECURITY_ADMIN_PASSWORD=admin \
373594
-e GF_USERS_ALLOW_SIGN_UP=false \
374595
-v "${TELEMETRY_ASSETS_DIR}/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml:Z" \
596+
-v "${TELEMETRY_ASSETS_DIR}/grafana-dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:Z" \
597+
-v "${TELEMETRY_ASSETS_DIR}/llama-stack-dashboard.json:/etc/grafana/provisioning/dashboards/llama-stack-dashboard.json:Z" \
375598
docker.io/grafana/grafana:11.0.0 > /dev/null 2>&1; then
376599
die "Grafana startup failed"
377600
fi
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: 1
2+
3+
providers:
4+
- name: 'Llama Stack'
5+
orgId: 1
6+
folder: ''
7+
type: file
8+
disableDeletion: false
9+
updateIntervalSeconds: 10
10+
allowUiUpdates: true
11+
options:
12+
path: /etc/grafana/provisioning/dashboards

scripts/telemetry/grafana-datasources.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ datasources:
55
type: prometheus
66
access: proxy
77
url: http://prometheus:9090
8+
uid: prometheus
89
isDefault: true
910
editable: true
1011

0 commit comments

Comments
 (0)