Dashboard Design Advanced
A monitoring dashboard that nobody looks at is worse than no dashboard at all — it gives a false sense of security. This lesson covers how to design ML dashboards that different stakeholders actually use, with Grafana templates, layout patterns, and SLA tracking strategies.
Dashboard Types by Audience
| Dashboard | Audience | Key Metrics | Refresh Rate | Time Range |
|---|---|---|---|---|
| Executive | VP/Director, PM | Business impact, cost, SLA compliance | 15 min | 7-30 days |
| ML Team | ML engineers, data scientists | Model accuracy, drift, feature health | 1-5 min | 1-7 days |
| On-Call | On-call engineer | Error rates, latency, active alerts, traffic | 30 sec | 1-6 hours |
| Model Deep-Dive | ML engineer investigating | Per-feature drift, prediction distributions, sliced metrics | 5 min | Custom |
Grafana Dashboard Configuration
# Grafana dashboard as code (JSON model for provisioning)
# Save as JSON, deploy via Grafana provisioning or API
# Grafana dashboard as code (JSON model for provisioning)
# Save as JSON, deploy via Grafana provisioning or API
#
# The dict below mirrors Grafana's dashboard JSON model and is serialized
# with json.dumps at the bottom of this script.
GRAFANA_ML_DASHBOARD = {
    "dashboard": {
        "title": "ML Model Health - Production",
        "tags": ["ml", "production", "monitoring"],
        "timezone": "utc",
        "refresh": "1m",  # dashboard auto-refresh interval
        "time": {"from": "now-6h", "to": "now"},  # default visible window
        # Variables for filtering
        "templating": {
            "list": [
                {
                    # $model dropdown, populated from label values on the
                    # ml_predictions_total counter
                    "name": "model",
                    "type": "query",
                    "query": "label_values(ml_predictions_total, model_name)",
                    "refresh": 2,  # On time range change
                    "multi": False
                },
                {
                    # $version choices are scoped to the selected $model
                    "name": "version",
                    "type": "query",
                    "query": "label_values(ml_predictions_total{model_name='$model'}, model_version)",
                    "refresh": 2
                }
            ]
        },
        "panels": [
            # Row 1: Traffic & Health (top-level overview)
            # gridPos uses Grafana's 24-column grid; h/w are grid units.
            {
                "title": "Prediction Volume (QPS)",
                "type": "timeseries",
                "gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
                "targets": [{
                    "expr": "rate(ml_predictions_total{model_name='$model', model_version='$version'}[5m])",
                    "legendFormat": "{{model_name}} {{status}}"
                }],
                "fieldConfig": {
                    "defaults": {
                        # Low traffic is red: zero QPS usually means an outage
                        "thresholds": {
                            "steps": [
                                {"value": 0, "color": "red"},
                                {"value": 10, "color": "yellow"},
                                {"value": 50, "color": "green"}
                            ]
                        }
                    }
                }
            },
            {
                "title": "Error Rate",
                "type": "stat",
                "gridPos": {"h": 4, "w": 4, "x": 8, "y": 0},
                "targets": [{
                    # error-rate ratio: errored predictions / all predictions
                    "expr": (
                        "sum(rate(ml_predictions_total{model_name='$model', "
                        "status='error'}[5m])) / "
                        "sum(rate(ml_predictions_total{model_name='$model'}[5m]))"
                    )
                }],
                "fieldConfig": {
                    "defaults": {
                        "unit": "percentunit",  # 0.01 renders as 1%
                        "thresholds": {
                            "steps": [
                                {"value": 0, "color": "green"},
                                {"value": 0.01, "color": "yellow"},
                                {"value": 0.05, "color": "red"}
                            ]
                        }
                    }
                }
            },
            {
                "title": "P99 Latency",
                "type": "stat",
                "gridPos": {"h": 4, "w": 4, "x": 12, "y": 0},
                "targets": [{
                    "expr": "histogram_quantile(0.99, rate(ml_prediction_duration_seconds_bucket{model_name='$model'}[5m]))",
                }],
                "fieldConfig": {
                    "defaults": {
                        "unit": "s",
                        "thresholds": {
                            "steps": [
                                {"value": 0, "color": "green"},
                                {"value": 1, "color": "yellow"},
                                {"value": 5, "color": "red"}
                            ]
                        }
                    }
                }
            },
            {
                "title": "Active Alerts",
                "type": "stat",
                "gridPos": {"h": 4, "w": 4, "x": 16, "y": 0},
                "targets": [{
                    # ALERTS is a built-in Prometheus series for alert state
                    "expr": "ALERTS{model_name='$model', alertstate='firing'}"
                }],
                "fieldConfig": {
                    "defaults": {
                        "thresholds": {
                            "steps": [
                                {"value": 0, "color": "green"},
                                {"value": 1, "color": "yellow"},
                                {"value": 3, "color": "red"}
                            ]
                        }
                    }
                }
            },
            # Row 2: Model Performance
            {
                # These gauges are published by a periodic evaluation job,
                # not by the serving path (see metric definitions below).
                "title": "Model Accuracy (with Ground Truth)",
                "type": "timeseries",
                "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
                "targets": [
                    {
                        "expr": "ml_model_accuracy{model_name='$model'}",
                        "legendFormat": "Accuracy"
                    },
                    {
                        "expr": "ml_model_precision{model_name='$model'}",
                        "legendFormat": "Precision"
                    },
                    {
                        "expr": "ml_model_recall{model_name='$model'}",
                        "legendFormat": "Recall"
                    }
                ]
            },
            {
                "title": "Prediction Distribution",
                "type": "histogram",
                "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
                "targets": [{
                    # raw histogram buckets, one series per upper bound (le)
                    "expr": "ml_prediction_score_bucket{model_name='$model'}",
                    "legendFormat": "{{le}}"
                }]
            },
            # Row 3: Data Quality & Drift
            {
                "title": "Feature Drift (PSI) - Top Features",
                "type": "bargauge",
                "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
                "targets": [{
                    # show only the 10 most-drifted features
                    "expr": "topk(10, ml_feature_psi{model_name='$model'})",
                    "legendFormat": "{{feature_name}}"
                }],
                "fieldConfig": {
                    "defaults": {
                        # conventional PSI cutoffs: <0.1 stable, 0.1-0.2 watch,
                        # >0.2 significant drift
                        "thresholds": {
                            "steps": [
                                {"value": 0, "color": "green"},
                                {"value": 0.1, "color": "yellow"},
                                {"value": 0.2, "color": "red"}
                            ]
                        }
                    }
                }
            },
            {
                "title": "Feature Null Rates",
                "type": "timeseries",
                "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
                "targets": [{
                    # filter expression hides features with <=1% nulls
                    "expr": "ml_feature_null_rate{model_name='$model'} > 0.01",
                    "legendFormat": "{{feature_name}}"
                }]
            }
        ]
    }
}

# Emit the dashboard model as pretty-printed JSON for provisioning/API upload
import json
print(json.dumps(GRAFANA_ML_DASHBOARD, indent=2))
Prometheus Metrics for ML Models
# Instrument your ML serving code with Prometheus metrics
from prometheus_client import (
Counter, Histogram, Gauge, Summary, start_http_server
)
import time
# --- Define metrics ---
# Module-level metric objects; prometheus_client registers them in the
# default registry on construction, so each must be defined exactly once.
# NOTE(review): Summary and start_http_server are imported above but not used
# here; start_http_server appears only in the commented example at the bottom.

# Traffic
PREDICTIONS_TOTAL = Counter(
    'ml_predictions_total',
    'Total predictions served',
    # status is "success" or "error" (set in predict() below)
    ['model_name', 'model_version', 'status']
)

# Latency
PREDICTION_DURATION = Histogram(
    'ml_prediction_duration_seconds',
    'Time to generate prediction',
    ['model_name'],
    # bucket upper bounds in seconds, 10ms .. 10s
    buckets=[0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]
)

# Model performance (updated periodically by eval job)
MODEL_ACCURACY = Gauge(
    'ml_model_accuracy',
    'Current model accuracy',
    ['model_name', 'model_version']
)
MODEL_PRECISION = Gauge(
    'ml_model_precision',
    'Current model precision',
    ['model_name', 'model_version']
)
MODEL_RECALL = Gauge(
    'ml_model_recall',
    'Current model recall',
    ['model_name', 'model_version']
)

# Data quality
FEATURE_PSI = Gauge(
    'ml_feature_psi',
    'Population Stability Index for feature',
    ['model_name', 'feature_name']
)
FEATURE_NULL_RATE = Gauge(
    'ml_feature_null_rate',
    'Null rate for feature',
    ['model_name', 'feature_name']
)

# Prediction distribution
PREDICTION_SCORE = Histogram(
    'ml_prediction_score',
    'Prediction score distribution',
    ['model_name'],
    # deciles over the [0, 1] score range
    buckets=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
)

# Business metrics
DAILY_COST = Gauge(
    'ml_daily_inference_cost_usd',
    'Daily inference cost in USD',
    ['model_name']
)
# --- Usage in serving code ---
def predict(model_name: str, model_version: str, features: dict) -> dict:
    """Serve one prediction, recording Prometheus metrics on every path.

    Args:
        model_name: Metric label value identifying the model.
        model_version: Metric label value for the predictions counter.
        features: Raw feature payload for the model (unused by the
            placeholder inference below).

    Returns:
        Dict with the binary ``prediction`` and the raw ``score``.

    Raises:
        Re-raises any exception from inference after counting it with
        status="error". Latency is recorded on success AND failure (the
        original version only timed successful requests, which skews the
        latency histogram when errors are slow, e.g. timeouts).
    """
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # with wall-clock adjustments and should not be used for durations.
    start_time = time.perf_counter()
    status = "success"
    try:
        # Your model inference here
        score = 0.85  # placeholder
        prediction = 1 if score > 0.5 else 0
        PREDICTION_SCORE.labels(model_name=model_name).observe(score)
        return {"prediction": prediction, "score": score}
    except Exception:
        status = "error"
        raise
    finally:
        # Record latency and outcome exactly once, regardless of outcome.
        duration = time.perf_counter() - start_time
        PREDICTION_DURATION.labels(model_name=model_name).observe(duration)
        PREDICTIONS_TOTAL.labels(
            model_name=model_name,
            model_version=model_version,
            status=status,
        ).inc()
# Start Prometheus metrics endpoint
# start_http_server(8000) # Expose metrics at :8000/metrics
SLA Tracking for ML Systems
# ML SLA tracking - define and monitor service level objectives
from dataclasses import dataclass
from typing import Dict, List
from datetime import datetime, timedelta
@dataclass
class MLServiceLevelObjective:
    """Define SLOs for ML systems.

    SLO = Service Level Objective (your target)
    SLI = Service Level Indicator (the measurement)
    SLA = Service Level Agreement (contractual commitment)
    """
    name: str                  # human-readable SLO name, e.g. "Availability"
    sli_description: str       # what is being measured
    target: float              # e.g., 0.999 for 99.9%
    measurement_window: str    # "30d", "7d", "24h"
    # Derived from target when omitted. Annotation fixed from the original
    # bare `float` (the default is None, so the field is optional).
    error_budget: "float | None" = None

    def __post_init__(self) -> None:
        # Only derive the budget when the caller did not supply one; the
        # original unconditionally overwrote an explicit error_budget.
        if self.error_budget is None:
            self.error_budget = 1.0 - self.target
# Standard ML SLOs
# A starter catalog: classic serving objectives (availability, latency)
# plus ML-specific ones (model freshness, accuracy, feature freshness).
ML_SLOS = [
    MLServiceLevelObjective(
        name="Availability",
        sli_description="% of requests that return a valid prediction (non-5xx)",
        target=0.999,  # three nines over the window
        measurement_window="30d"
    ),
    MLServiceLevelObjective(
        name="Latency",
        sli_description="% of requests served under 500ms (p95)",
        target=0.95,
        measurement_window="7d"
    ),
    MLServiceLevelObjective(
        name="Freshness",
        sli_description="% of time model is serving predictions from the latest version",
        target=0.99,
        measurement_window="30d"
    ),
    MLServiceLevelObjective(
        name="Accuracy",
        sli_description="Model accuracy remains within 5% of validation baseline",
        target=0.95,
        measurement_window="7d"
    ),
    MLServiceLevelObjective(
        name="Data Freshness",
        sli_description="% of time features are less than 30 minutes old",
        target=0.99,
        measurement_window="7d"
    )
]
class SLATracker:
    """Track SLA compliance over time.

    Keeps raw SLI measurements in memory per SLO and computes compliance
    and error-budget consumption on demand.
    """

    def __init__(self, slos: "List[MLServiceLevelObjective]"):
        """Index SLOs by name and start an empty measurement log for each."""
        self.slos = {slo.name: slo for slo in slos}
        self.measurements: Dict[str, List[dict]] = {
            slo.name: [] for slo in slos
        }

    def record_measurement(self, slo_name: str, value: float,
                           timestamp: "datetime | None" = None) -> None:
        """Record an SLI measurement.

        Args:
            slo_name: Must match an SLO passed to the constructor
                (raises KeyError otherwise).
            value: The SLI reading, on the same scale as the SLO target.
            timestamp: Measurement time; defaults to the current UTC time.
        """
        if timestamp is None:
            # timezone-aware "now" -- datetime.utcnow() is deprecated and
            # produces naive timestamps
            from datetime import timezone
            timestamp = datetime.now(timezone.utc)
        self.measurements[slo_name].append({
            "value": value,
            "timestamp": timestamp.isoformat()
        })

    def get_compliance(self, slo_name: str) -> dict:
        """Check SLA compliance for a given SLO.

        Returns:
            A summary dict with compliance rate, error-budget status, and a
            recommended action, or ``{"status": "no_data"}`` when no
            measurements have been recorded yet.
        """
        slo = self.slos[slo_name]
        measurements = self.measurements[slo_name]
        if not measurements:
            return {"status": "no_data"}
        values = [m["value"] for m in measurements]
        # A measurement "complies" when the SLI meets or beats the target.
        compliant = sum(1 for v in values if v >= slo.target)
        compliance_rate = compliant / len(values)
        # Error budget consumed: fraction of non-compliant measurements.
        total_budget = slo.error_budget
        consumed = 1.0 - compliance_rate
        budget_remaining = max(0, total_budget - consumed)
        budget_pct_remaining = (budget_remaining / total_budget * 100
                                if total_budget > 0 else 100)
        return {
            "slo": slo_name,
            "target": slo.target,
            "current_compliance": round(compliance_rate, 4),
            "meeting_slo": compliance_rate >= slo.target,
            "error_budget_total": round(total_budget, 4),
            "error_budget_remaining": round(budget_remaining, 4),
            "error_budget_pct_remaining": round(budget_pct_remaining, 1),
            "measurements_count": len(values),
            # Escalation policy: freeze changes once the budget is gone.
            "action": "OK" if budget_pct_remaining > 20
                      else "WARNING - budget below 20%"
                      if budget_pct_remaining > 0
                      else "BUDGET EXHAUSTED - freeze changes"
        }
# Example
tracker = SLATracker(ML_SLOS)
import numpy as np
# Simulate 30 days of availability measurements
for _ in range(720):  # hourly for 30 days
    # Each hourly check is "up" (1.0) with probability 99.95%, so the
    # simulated service roughly meets the 99.9% availability target.
    # NOTE: unseeded randomness -- output varies run to run.
    tracker.record_measurement("Availability", np.random.choice(
        [1.0, 0.0], p=[0.9995, 0.0005]
    ))
compliance = tracker.get_compliance("Availability")
print(f"Availability SLO: {compliance['current_compliance']:.4f}")
print(f"Error budget remaining: {compliance['error_budget_pct_remaining']}%")
Dashboard Design Best Practices
The 5-Second Rule: An on-call engineer should be able to determine "is the model healthy right now?" within 5 seconds of looking at the dashboard. Put the most critical health indicators (traffic, error rate, latency, active alerts) in the top row as large stat panels with color-coded thresholds.
Layout Pattern:
- Row 1: Traffic + Health (stat panels: QPS, error rate, latency, active alerts)
- Row 2: Model Performance (accuracy, precision, recall over time + prediction distribution)
- Row 3: Data Quality (feature drift heatmap + null rates)
- Row 4: Infrastructure (GPU/CPU usage, memory, queue depth)
- Row 5: Business Impact (conversion rate, cost, user feedback)
Lilly Tech Systems