Advanced Alerting & Incident Response
Good monitoring is useless without good alerting. ML teams are particularly prone to alert fatigue because ML systems generate more nuanced signals than traditional software. This lesson covers how to design alerts that actually get acted on, build runbooks for ML-specific incidents, and integrate with incident management tools.
Alert Severity Design for ML Systems
| Severity | Response Time | Notification | ML Examples |
|---|---|---|---|
| P0 - Critical | < 15 minutes | PagerDuty page + phone call | Model serving is down, 100% error rate, data pipeline completely stopped |
| P1 - High | < 1 hour | PagerDuty page + Slack | Prediction collapse (single class), p99 latency > 10s, cost runaway |
| P2 - Medium | < 4 hours | Slack channel | Significant data drift (>25% features), accuracy drop > 5%, ground truth pipeline delayed |
| P3 - Low | Next business day | Email / dashboard | Moderate drift, minor latency increase, feature importance shift |
Production Alert Manager
# Production ML alert manager with routing and deduplication
import hashlib
import json
import time
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Callable
from enum import Enum
from dataclasses import dataclass, field
class Severity(Enum):
    """Alert severity tiers, declared from most to least urgent.

    Declaration order is significant: MLAlertManager.get_active_alerts sorts
    by position in this enum, so CRITICAL must stay first.
    """
    CRITICAL = "P0"  # service down / total failure — page immediately
    HIGH = "P1"      # severe degradation — page, respond within the hour
    MEDIUM = "P2"    # notable but not urgent — Slack channel
    LOW = "P3"       # informational — email/dashboard, next business day
@dataclass
class Alert:
    """One concrete alert firing, produced by MLAlertManager.evaluate_rules."""
    alert_id: str    # short md5-derived id, unique per firing
    title: str       # rendered title_template
    description: str # rendered description_template
    severity: Severity
    source: str  # e.g., "drift_detector", "perf_monitor"
    model_name: str
    timestamp: str  # ISO-8601 string captured when the alert fired
    metrics: Dict = field(default_factory=dict)  # metric values at fire time
    runbook_url: str = ""
    dedup_key: str = ""  # "rule_name:model_name" — groups repeat firings
    acknowledged: bool = False
    resolved: bool = False  # set True when the condition clears
@dataclass
class AlertRule:
    """Declarative rule evaluated by MLAlertManager against a metrics dict."""
    name: str
    condition: Callable  # Function that returns True if alert should fire
    severity: Severity
    title_template: str        # str.format template; keys come from metrics + {model}
    description_template: str  # same placeholder rules as title_template
    cooldown_minutes: int = 30  # Don't re-fire within this window
    runbook_url: str = ""
    notify_channels: List[str] = field(default_factory=lambda: ["slack"])
class MLAlertManager:
    """Production alert manager for ML systems.

    Features:
    - Rule-based alert generation
    - Deduplication (don't spam the same alert)
    - Severity-based routing
    - Cooldown periods
    - Alert grouping
    """

    def __init__(self, service_name: str):
        self.service_name = service_name
        self.rules: Dict[str, AlertRule] = {}          # rule name -> rule
        self.active_alerts: Dict[str, Alert] = {}      # dedup_key -> unresolved alert
        self.alert_history: List[Alert] = []           # every alert ever fired
        self.last_fired: Dict[str, datetime] = {}      # dedup_key -> last fire time
        self.notification_handlers: Dict[str, Callable] = {}  # channel -> handler

    def register_rule(self, rule: AlertRule):
        """Register an alert rule (a rule with the same name is replaced)."""
        self.rules[rule.name] = rule

    def register_notification_handler(self, channel: str,
                                      handler: Callable):
        """Register a notification handler (Slack, PagerDuty, etc.)."""
        self.notification_handlers[channel] = handler

    def evaluate_rules(self, metrics: dict, model_name: str):
        """Evaluate all rules against current metrics.

        Call this periodically (e.g., every minute). Note: every
        ``{placeholder}`` in a rule's templates must be a key in ``metrics``
        (plus ``{model}``), otherwise ``str.format`` raises ``KeyError``.
        """
        now = datetime.utcnow()
        for rule_name, rule in self.rules.items():
            dedup_key = f"{rule_name}:{model_name}"
            try:
                should_fire = rule.condition(metrics)
            except Exception as e:
                # Best-effort: one broken condition must not block the rest,
                # but don't swallow it silently either.
                print(f"Rule {rule_name} condition failed: {e}")
                continue
            if not should_fire:
                # Condition cleared — auto-resolve if currently active.
                if dedup_key in self.active_alerts:
                    self._resolve_alert(dedup_key)
                continue
            # Cooldown check. BUG FIX: the original used timedelta.seconds,
            # which is only the seconds *component* (wraps every 24h) — a gap
            # of 1 day + 5 min would wrongly count as 5 min and suppress the
            # alert. total_seconds() is the full elapsed time.
            last = self.last_fired.get(dedup_key)
            if last and (now - last).total_seconds() < rule.cooldown_minutes * 60:
                continue  # Still in cooldown
            # Fire alert
            alert = Alert(
                alert_id=hashlib.md5(
                    f"{dedup_key}:{now.isoformat()}".encode()
                ).hexdigest()[:12],
                title=rule.title_template.format(**metrics,
                                                 model=model_name),
                description=rule.description_template.format(
                    **metrics, model=model_name),
                severity=rule.severity,
                source=rule_name,
                model_name=model_name,
                timestamp=now.isoformat(),
                # Snapshot the metrics so later caller mutation can't rewrite
                # alert history.
                metrics=dict(metrics),
                runbook_url=rule.runbook_url,
                dedup_key=dedup_key,
            )
            self.active_alerts[dedup_key] = alert
            self.alert_history.append(alert)
            self.last_fired[dedup_key] = now
            self._route_alert(alert, rule)

    def _route_alert(self, alert: Alert, rule: AlertRule):
        """Route alert to appropriate channels based on severity."""
        channels = list(rule.notify_channels)
        # P0/P1 always go to PagerDuty, even if the rule forgot to list it.
        if alert.severity in [Severity.CRITICAL, Severity.HIGH]:
            if "pagerduty" not in channels:
                channels.append("pagerduty")
        for channel in channels:
            handler = self.notification_handlers.get(channel)
            if handler:
                try:
                    handler(alert)
                except Exception as e:
                    # A dead channel must not stop delivery to the others.
                    print(f"Failed to send alert to {channel}: {e}")

    def _resolve_alert(self, dedup_key: str):
        """Auto-resolve an alert when its condition clears."""
        if dedup_key in self.active_alerts:
            alert = self.active_alerts[dedup_key]
            alert.resolved = True
            del self.active_alerts[dedup_key]

    def get_active_alerts(self) -> List[Alert]:
        """Get all currently active (unresolved) alerts, most severe first."""
        return sorted(
            self.active_alerts.values(),
            # Sort by the severity's declaration position in the enum.
            key=lambda a: list(Severity).index(a.severity)
        )
# --- Setup Example ---
# Notification handlers
def slack_handler(alert: Alert):
    """Send alert to Slack via webhook."""
    # Colored status dot per severity for quick visual triage in the channel.
    emoji_by_severity = {
        Severity.CRITICAL: "🔴",
        Severity.HIGH: "🟠",
        Severity.MEDIUM: "🟡",
        Severity.LOW: "🔵",
    }
    header = f"{emoji_by_severity[alert.severity]} [{alert.severity.value}] {alert.title}"
    body = "\n".join([header, alert.description, f"Runbook: {alert.runbook_url}"])
    message = {"text": body}
    # requests.post(SLACK_WEBHOOK_URL, json=message)
    print(f"Slack: {json.dumps(message)}")
def pagerduty_handler(alert: Alert):
    """Create PagerDuty incident."""
    # PagerDuty only accepts critical/error/warning; P2 and P3 both map to
    # warning.
    pd_severity = {
        Severity.CRITICAL: "critical",
        Severity.HIGH: "error",
    }.get(alert.severity, "warning")
    event = {
        "routing_key": "YOUR_PAGERDUTY_INTEGRATION_KEY",
        "event_action": "trigger",
        # Reusing our dedup key lets PagerDuty collapse repeat triggers of
        # the same incident.
        "dedup_key": alert.dedup_key,
        "payload": {
            "summary": f"[{alert.severity.value}] {alert.title}",
            "source": alert.source,
            "severity": pd_severity,
            "custom_details": alert.metrics,
        },
    }
    # requests.post("https://events.pagerduty.com/v2/enqueue", json=event)
    print(f"PagerDuty: {event['payload']['summary']}")
# Wire it all up
# A single manager instance routes every rule's alerts through the channel
# handlers registered here.
manager = MLAlertManager("ml-platform")
manager.register_notification_handler("slack", slack_handler)
manager.register_notification_handler("pagerduty", pagerduty_handler)
Alert Rules for Common ML Scenarios
# Common ML alert rules - copy and adapt for your system
# NOTE: every {placeholder} in a title/description template must be a key in
# the metrics dict passed to evaluate_rules (plus {model}).

# Rule 1: Prediction volume drop
# Fires when observed volume falls below half the expected hourly rate.
manager.register_rule(AlertRule(
    name="prediction_volume_drop",
    condition=lambda m: m.get("predictions_last_hour", 0) <
        m.get("expected_predictions_per_hour", 100) * 0.5,
    severity=Severity.HIGH,
    title_template="Prediction volume dropped 50%+ for {model}",
    description_template=(
        "Expected ~{expected_predictions_per_hour}/hr, "
        "got {predictions_last_hour}. "
        "Check if upstream data pipeline is running."
    ),
    cooldown_minutes=30,
    runbook_url="https://wiki.internal/runbooks/low-prediction-volume",
    notify_channels=["slack", "pagerduty"]
))

# Rule 2: Data drift detected
# Fires when at least 25% of monitored features show significant drift.
manager.register_rule(AlertRule(
    name="significant_data_drift",
    condition=lambda m: m.get("drifted_feature_count", 0) >=
        m.get("total_features", 1) * 0.25,
    severity=Severity.MEDIUM,
    title_template="Significant data drift: {drifted_feature_count} features for {model}",
    description_template=(
        "{drifted_feature_count}/{total_features} features show "
        "significant drift (PSI > 0.2). Top drifted: {top_drifted_features}. "
        "Review drift dashboard and upstream data sources."
    ),
    cooldown_minutes=60,
    runbook_url="https://wiki.internal/runbooks/data-drift",
    notify_channels=["slack"]
))

# Rule 3: Model accuracy degradation
# Fires when current accuracy falls more than 5% below baseline.
manager.register_rule(AlertRule(
    name="accuracy_degradation",
    condition=lambda m: (m.get("current_accuracy", 1.0) <
        m.get("baseline_accuracy", 1.0) * 0.95),
    severity=Severity.HIGH,
    title_template="Model accuracy dropped >5% for {model}",
    description_template=(
        "Current accuracy: {current_accuracy:.3f}, "
        "baseline: {baseline_accuracy:.3f}. "
        "Check data drift, feature pipeline, and recent deployments."
    ),
    cooldown_minutes=60,
    runbook_url="https://wiki.internal/runbooks/accuracy-drop",
    notify_channels=["slack", "pagerduty"]
))

# Rule 4: Latency spike
# Fixed 5-second P99 threshold; short cooldown since latency needs fast action.
manager.register_rule(AlertRule(
    name="latency_spike",
    condition=lambda m: m.get("p99_latency_ms", 0) > 5000,
    severity=Severity.HIGH,
    title_template="P99 latency >5s for {model}",
    description_template=(
        "P99 latency: {p99_latency_ms}ms (threshold: 5000ms). "
        "Check model serving infrastructure, GPU memory, batch sizes."
    ),
    cooldown_minutes=15,
    runbook_url="https://wiki.internal/runbooks/latency-spike",
    notify_channels=["slack", "pagerduty"]
))

# Rule 5: LLM cost runaway
# Fires at 80% of the daily budget — early warning, before budget is blown.
manager.register_rule(AlertRule(
    name="llm_cost_runaway",
    condition=lambda m: m.get("daily_cost_usd", 0) >
        m.get("daily_budget_usd", 500) * 0.8,
    severity=Severity.HIGH,
    title_template="LLM cost approaching budget for {model}",
    description_template=(
        "Daily cost: ${daily_cost_usd:.2f} / "
        "${daily_budget_usd:.2f} budget (80%+ consumed). "
        "Check for retry loops, prompt bloat, or traffic spikes."
    ),
    cooldown_minutes=60,
    runbook_url="https://wiki.internal/runbooks/llm-cost",
    notify_channels=["slack", "pagerduty"]
))
ML Incident Runbook Template
# ML Incident Runbook Template - copy for each model
# Rendered via str.format: every {placeholder} below must be supplied by the
# caller. The content is intentionally unindented so the rendered Markdown
# keeps its headings and code fences at column 0.
RUNBOOK_TEMPLATE = """
# ML Incident Runbook: {model_name}
## Owner: {team} | Escalation: {escalation_contact}
## Quick Reference
- Model endpoint: {endpoint}
- Dashboard: {dashboard_url}
- Feature store: {feature_store_url}
- Training pipeline: {training_pipeline_url}
## Triage Steps (do these FIRST, in order)
### 1. Is the model serving traffic? (30 seconds)
- Check: `curl -s {endpoint}/health | jq .status`
- If DOWN: escalate to infra team immediately (P0)
- If UP: continue to step 2
### 2. Is prediction volume normal? (1 minute)
- Check dashboard: {dashboard_url}
- Compare current QPS to same time yesterday
- If volume dropped >50%: check upstream data pipeline
- If volume is normal: continue to step 3
### 3. Are predictions reasonable? (2 minutes)
- Check prediction distribution on dashboard
- If one class >95%: PREDICTION COLLAPSE
-> Rollback to previous model version: {rollback_command}
-> Investigate feature pipeline
- If distribution looks normal: continue to step 4
### 4. Check data quality (5 minutes)
- Review drift dashboard for top drifted features
- Check null rates: any feature >5% nulls?
- Check feature freshness: any features stale >30 min?
- If data issues found: fix upstream, then evaluate model
### 5. Check model performance (5 minutes)
- Review accuracy/precision/recall on dashboard
- Compare to baseline (accuracy: {baseline_accuracy})
- If degraded >5%: evaluate if retrain is needed
## Rollback Procedure
```bash
# Quick rollback to previous model version
kubectl set image deployment/{deployment_name} \\
model={previous_model_image}
# Verify rollback
kubectl rollout status deployment/{deployment_name}
curl -s {endpoint}/health | jq .model_version
```
## Escalation
- L1 (ML Engineer on-call): {l1_contact}
- L2 (ML Platform team): {l2_contact}
- L3 (VP Engineering): {l3_contact} (P0 only, after 30 min)
"""
# Render the template for one concrete model to sanity-check the placeholders.
runbook_fields = {
    "model_name": "fraud-detector-v2",
    "team": "ml-platform",
    "escalation_contact": "#ml-incidents Slack channel",
    "endpoint": "https://ml.internal/fraud/v2",
    "dashboard_url": "https://grafana.internal/d/fraud-v2",
    "feature_store_url": "https://feast.internal/fraud-features",
    "training_pipeline_url": "https://airflow.internal/fraud-train",
    "rollback_command": "kubectl rollout undo deployment/fraud-v2",
    "baseline_accuracy": 0.95,
    "deployment_name": "fraud-v2",
    "previous_model_image": "ml-registry/fraud:v2.2",
    "l1_contact": "@ml-oncall",
    "l2_contact": "@ml-platform-team",
    "l3_contact": "@vp-eng",
}
print(RUNBOOK_TEMPLATE.format(**runbook_fields))
Reducing Alert Fatigue
The Alert Fatigue Problem: When engineers get too many alerts, they start ignoring all of them — including the critical ones. ML systems are especially prone to alert fatigue because drift and performance metrics generate constant noise.
| Strategy | How It Helps | Implementation |
|---|---|---|
| Deduplication | One alert per issue, not one per minute | Use dedup_key, alert only on state change (fire/resolve) |
| Cooldown periods | Prevent re-alerting during investigation | 30-60 min cooldown per alert type |
| Alert grouping | Related alerts become one notification | Group by model or data pipeline |
| Progressive severity | Start low, escalate if unresolved | P3 at 15 min, P2 at 30 min, P1 at 60 min |
| Weekly review | Identify and fix noisy alerts | Track alert-to-action ratio; delete alerts that are never acted on |
| Business hours routing | Only page for things that truly can't wait | P2/P3 go to Slack only; P0/P1 page |
Lilly Tech Systems