Step 2: Model Performance Views
Build interactive Plotly charts for accuracy over time, confusion matrix heatmaps, feature importance bar charts, and precision-recall curves — all wired to your data connectors.
Overview
This lesson creates the views/model_metrics.py module with four main visualizations that give ML engineers instant insight into how their models are performing.
Chart 1: Accuracy Over Time
The most important view — how model accuracy changes day to day. Degradation here often signals data drift or upstream data quality issues.
# views/model_metrics.py
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
def render_accuracy_over_time(df):
    """
    Line chart of daily accuracy per model, with a 90% reference line.

    Args:
        df: DataFrame with columns [date, model, accuracy]
    """
    st.subheader("Accuracy Over Time")

    fig = px.line(
        df,
        x="date",
        y="accuracy",
        color="model",
        title="Model Accuracy Trend",
        labels={"accuracy": "Accuracy", "date": "Date", "model": "Model"},
        template="plotly_dark",
    )

    # Dashed red line at 90% so dips below the threshold stand out.
    fig.add_hline(
        y=0.90,
        line_dash="dash",
        line_color="red",
        annotation_text="90% threshold",
        annotation_position="top left",
    )

    # Horizontal legend above the plot; percent ticks on a fixed 70-100% band
    # so day-to-day drift is visible at a consistent scale.
    fig.update_layout(
        height=400,
        margin={"l": 20, "r": 20, "t": 50, "b": 20},
        legend={"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1},
        yaxis={"range": [0.70, 1.0], "tickformat": ".1%"},
        hovermode="x unified",
    )
    st.plotly_chart(fig, use_container_width=True)
Chart 2: Confusion Matrix Heatmap
A confusion matrix shows where the model makes correct and incorrect predictions. This is essential for understanding class-level performance.
def render_confusion_matrix(model_name="fraud-detector-v2"):
    """
    Heatmap showing prediction vs actual class distribution.

    Uses synthetic data for demonstration; in production, compute the
    matrix from prediction logs.

    Args:
        model_name: Display name of the model shown in the section header.
    """
    st.subheader(f"Confusion Matrix: {model_name}")

    classes = ["Legitimate", "Suspicious", "Fraud"]
    n_classes = len(classes)

    # Synthetic matrix: strong diagonal (correct predictions), light
    # off-diagonal noise (misclassifications). Vectorized draw instead of
    # a nested Python loop.
    matrix = np.random.randint(10, 150, size=(n_classes, n_classes))
    np.fill_diagonal(matrix, np.random.randint(800, 2000, size=n_classes))

    fig = go.Figure(data=go.Heatmap(
        z=matrix,
        x=classes,
        y=classes,
        colorscale="Blues",
        text=matrix,
        texttemplate="%{text}",
        textfont={"size": 16},
        hovertemplate="Predicted: %{x}<br>Actual: %{y}<br>Count: %{z}<extra></extra>",
    ))
    fig.update_layout(
        title="Prediction vs Actual",
        xaxis_title="Predicted Class",
        yaxis_title="Actual Class",
        height=400,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
    )
    st.plotly_chart(fig, use_container_width=True)

    # Per-class precision/recall. Rows are actual classes and columns are
    # predicted classes, so column sums give the precision denominator and
    # row sums give the recall denominator.
    cols = st.columns(n_classes)  # scales with len(classes), not hard-coded 3
    for idx, (col, cls) in enumerate(zip(cols, classes)):
        tp = matrix[idx, idx]
        fp = matrix[:, idx].sum() - tp  # predicted as cls, actually another class
        fn = matrix[idx, :].sum() - tp  # actually cls, predicted as another class
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        with col:
            st.metric(f"{cls} Precision", f"{precision:.1%}")
            st.metric(f"{cls} Recall", f"{recall:.1%}")
Chart 3: Feature Importance
Understanding which features drive predictions helps debug model behavior and catch upstream data pipeline issues.
def render_feature_importance(model_name="fraud-detector-v2"):
    """
    Horizontal bar chart of per-feature importance scores.

    In production, load the scores from model artifacts or SHAP values.

    Args:
        model_name: Display name of the model shown in the section header.
    """
    st.subheader(f"Feature Importance: {model_name}")

    # Synthetic feature importance data; a Dirichlet draw sums to 1, so
    # scaling by 100 yields percentage contributions.
    features = [
        "transaction_amount", "time_since_last_txn", "merchant_category",
        "device_fingerprint", "ip_country_match", "card_present",
        "avg_daily_txns", "account_age_days", "txn_frequency_1h",
        "amount_vs_avg_ratio",
    ]
    scores = np.random.dirichlet(np.ones(len(features)) * 2) * 100

    # Sort ascending so the most important feature renders as the top bar.
    order = np.argsort(scores)
    feat_names = [features[i] for i in order]
    feat_values = [scores[i] for i in order]

    fig = go.Figure(go.Bar(
        x=feat_values,
        y=feat_names,
        orientation="h",
        marker={
            "color": feat_values,
            "colorscale": "Viridis",
            "showscale": False,
        },
        text=[f"{v:.1f}%" for v in feat_values],
        textposition="outside",
    ))
    # Extra right margin and 30% x-axis headroom keep the outside labels visible.
    fig.update_layout(
        title="Feature Importance (% contribution)",
        xaxis_title="Importance (%)",
        height=450,
        template="plotly_dark",
        margin={"l": 20, "r": 80, "t": 50, "b": 20},
        xaxis={"range": [0, max(feat_values) * 1.3]},
    )
    st.plotly_chart(fig, use_container_width=True)
Chart 4: Multi-Metric Comparison
Compare accuracy, precision, recall, and F1 across all models in a single grouped bar chart:
def render_metric_comparison(df):
    """
    Grouped bar chart comparing accuracy, precision, recall, and F1
    across models, using each model's row from the most recent date.
    """
    st.subheader("Model Comparison: Latest Metrics")

    # Keep only the rows from the newest date in the dataset.
    newest = df[df["date"] == df["date"].max()].copy()

    # Wide -> long reshape so Plotly can group the bars by metric.
    long_df = pd.melt(
        newest,
        id_vars=["model"],
        value_vars=["accuracy", "precision", "recall", "f1_score"],
        var_name="metric",
        value_name="value",
    )

    fig = px.bar(
        long_df,
        x="model",
        y="value",
        color="metric",
        barmode="group",
        title="Latest Performance by Model",
        labels={"value": "Score", "model": "Model", "metric": "Metric"},
        template="plotly_dark",
        color_discrete_sequence=["#6366f1", "#22c55e", "#f59e0b", "#ef4444"],
    )
    # Start the y-axis at 50% so differences between strong models are legible.
    fig.update_layout(
        height=400,
        margin={"l": 20, "r": 20, "t": 50, "b": 20},
        yaxis={"range": [0.5, 1.0], "tickformat": ".0%"},
        legend={"orientation": "h", "yanchor": "bottom", "y": 1.02, "xanchor": "right", "x": 1},
    )
    st.plotly_chart(fig, use_container_width=True)
Wiring It Into the App
Update the page routing in app.py to use the new views:
# In app.py, update the Model Performance section:
# In app.py, update the Model Performance section:
from views.model_metrics import (
    render_accuracy_over_time,
    render_confusion_matrix,
    render_feature_importance,
    render_metric_comparison,
)
from connectors import get_model_metrics

# Inside the main() function, replace the Model Performance branch:
elif page == "Model Performance":
    # Last 30 days of metrics for every model (DataFrame from the connector).
    metrics_df = get_model_metrics(days=30)

    # Model selector in sidebar — drives the two per-model panels below;
    # the trend and comparison charts always show all models.
    models = metrics_df["model"].unique().tolist()
    selected_model = st.sidebar.selectbox("Select Model", models)

    # Render all charts: full-width trend, then two side-by-side per-model
    # panels, then the full-width cross-model comparison.
    render_accuracy_over_time(metrics_df)
    col1, col2 = st.columns(2)
    with col1:
        render_confusion_matrix(selected_model)
    with col2:
        render_feature_importance(selected_model)
    render_metric_comparison(metrics_df)
st.columns() creates side-by-side panels. Putting the confusion matrix and feature importance side by side saves vertical space and makes comparisons easier. On mobile, Streamlit automatically stacks columns vertically.
Latency Distribution Chart (Bonus)
Add a histogram showing inference latency distribution to spot performance outliers:
def render_latency_distribution(df, model_name):
    """Overlaid histograms of P50 and P99 inference latency for one model."""
    st.subheader(f"Latency Distribution: {model_name}")

    subset = df[df["model"] == model_name]

    fig = go.Figure()
    # One semi-transparent trace per percentile series, overlaid.
    for column, label, color in (
        ("latency_p50_ms", "P50 Latency", "#6366f1"),
        ("latency_p99_ms", "P99 Latency", "#ef4444"),
    ):
        fig.add_trace(go.Histogram(
            x=subset[column],
            name=label,
            opacity=0.7,
            marker_color=color,
        ))

    fig.update_layout(
        barmode="overlay",
        title="Inference Latency Distribution (ms)",
        xaxis_title="Latency (ms)",
        yaxis_title="Frequency",
        height=350,
        template="plotly_dark",
        margin={"l": 20, "r": 20, "t": 50, "b": 20},
    )
    st.plotly_chart(fig, use_container_width=True)

    # Headline medians beneath the chart for quick scanning.
    col1, col2 = st.columns(2)
    with col1:
        st.metric("Median P50", f"{subset['latency_p50_ms'].median():.1f} ms")
    with col2:
        st.metric("Median P99", f"{subset['latency_p99_ms'].median():.1f} ms")
What's Next
The model performance views are complete. In the next lesson, we will build Step 3: Data Drift Monitoring — visualizing feature distribution changes with PSI and KS test plots, setting up drift alert thresholds, and triggering notifications when distributions shift.
Lilly Tech Systems