Intermediate

Step 2: Model Performance Views

Build interactive Plotly charts for accuracy over time, confusion matrix heatmaps, feature importance bar charts, and precision-recall curves — all wired to your data connectors.

Overview

This lesson creates the views/model_metrics.py module with four main visualizations that give ML engineers instant insight into how their models are performing.

Chart 1: Accuracy Over Time

The most important view — how model accuracy changes day to day. Degradation here often signals data drift or upstream data quality issues.

# views/model_metrics.py
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np


def render_accuracy_over_time(df):
    """
    Plot per-model accuracy as a time-series line chart.

    A dashed red reference line marks the 90% accuracy threshold so
    degradation (often a symptom of data drift) is easy to spot.

    Args:
        df: DataFrame with columns [date, model, accuracy]
    """
    st.subheader("Accuracy Over Time")

    axis_labels = {"accuracy": "Accuracy", "date": "Date", "model": "Model"}
    fig = px.line(
        df,
        x="date",
        y="accuracy",
        color="model",
        title="Model Accuracy Trend",
        labels=axis_labels,
        template="plotly_dark",
    )

    # Dashed reference line at the 90% accuracy threshold
    fig.add_hline(
        y=0.90,
        line_dash="dash",
        line_color="red",
        annotation_text="90% threshold",
        annotation_position="top left",
    )

    layout_opts = dict(
        height=400,
        margin=dict(l=20, r=20, t=50, b=20),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        yaxis=dict(range=[0.70, 1.0], tickformat=".1%"),
        hovermode="x unified",
    )
    fig.update_layout(**layout_opts)

    st.plotly_chart(fig, use_container_width=True)

Chart 2: Confusion Matrix Heatmap

A confusion matrix shows where the model makes correct and incorrect predictions. This is essential for understanding class-level performance.

def render_confusion_matrix(model_name="fraud-detector-v2"):
    """
    Render a confusion-matrix heatmap plus per-class precision/recall.

    Rows of the matrix are actual classes, columns are predicted classes.
    Uses synthetic counts for demonstration; in production, compute the
    matrix from prediction logs instead.

    Args:
        model_name: Model identifier shown in the section header.
    """
    st.subheader(f"Confusion Matrix: {model_name}")

    classes = ["Legitimate", "Suspicious", "Fraud"]
    n_classes = len(classes)

    # Synthetic matrix: small off-diagonal noise (misclassifications),
    # then overwrite the diagonal with large counts (correct predictions).
    matrix = np.random.randint(10, 150, size=(n_classes, n_classes))
    np.fill_diagonal(matrix, np.random.randint(800, 2000, size=n_classes))

    fig = go.Figure(data=go.Heatmap(
        z=matrix,
        x=classes,
        y=classes,
        colorscale="Blues",
        text=matrix,
        texttemplate="%{text}",
        textfont={"size": 16},
        hovertemplate="Predicted: %{x}<br>Actual: %{y}<br>Count: %{z}<extra></extra>",
    ))

    fig.update_layout(
        title="Prediction vs Actual",
        xaxis_title="Predicted Class",
        yaxis_title="Actual Class",
        height=400,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
    )

    st.plotly_chart(fig, use_container_width=True)

    # Per-class precision/recall derived from the matrix.
    # One column per class, so this keeps working if `classes` changes.
    for idx, (col, cls) in enumerate(zip(st.columns(n_classes), classes)):
        tp = matrix[idx, idx]
        fp = matrix[:, idx].sum() - tp  # predicted as cls, actually another class
        fn = matrix[idx, :].sum() - tp  # actually cls, predicted as another class
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        with col:
            st.metric(f"{cls} Precision", f"{precision:.1%}")
            st.metric(f"{cls} Recall", f"{recall:.1%}")

Chart 3: Feature Importance

Understanding which features drive predictions helps debug model behavior and catch upstream data pipeline issues.

def render_feature_importance(model_name="fraud-detector-v2"):
    """
    Render a horizontal bar chart of feature importance scores.

    Importance values are synthetic (Dirichlet-sampled percentages);
    in production, load them from model artifacts or SHAP values.

    Args:
        model_name: Model identifier shown in the section header.
    """
    st.subheader(f"Feature Importance: {model_name}")

    # Synthetic feature importance data
    features = [
        "transaction_amount", "time_since_last_txn", "merchant_category",
        "device_fingerprint", "ip_country_match", "card_present",
        "avg_daily_txns", "account_age_days", "txn_frequency_1h",
        "amount_vs_avg_ratio",
    ]

    scores = np.random.dirichlet(np.ones(len(features)) * 2) * 100
    # Ascending sort so the most important feature lands at the top
    # of the horizontal bar chart.
    ranked = sorted(zip(features, scores), key=lambda pair: pair[1])
    feat_names = [name for name, _ in ranked]
    feat_values = [score for _, score in ranked]

    bar = go.Bar(
        x=feat_values,
        y=feat_names,
        orientation="h",
        marker=dict(
            color=feat_values,
            colorscale="Viridis",
            showscale=False,
        ),
        text=[f"{v:.1f}%" for v in feat_values],
        textposition="outside",
    )
    fig = go.Figure(bar)

    # Extra right margin + padded x-range so the outside labels fit.
    fig.update_layout(
        title="Feature Importance (% contribution)",
        xaxis_title="Importance (%)",
        height=450,
        template="plotly_dark",
        margin=dict(l=20, r=80, t=50, b=20),
        xaxis=dict(range=[0, max(feat_values) * 1.3]),
    )

    st.plotly_chart(fig, use_container_width=True)

Chart 4: Multi-Metric Comparison

Compare accuracy, precision, recall, and F1 across all models in a single grouped bar chart:

def render_metric_comparison(df):
    """
    Compare accuracy, precision, recall, and F1 across all models.

    Filters the frame to its most recent date, reshapes wide -> long,
    and draws one grouped bar per (model, metric) pair.

    Args:
        df: DataFrame with columns
            [date, model, accuracy, precision, recall, f1_score]
    """
    st.subheader("Model Comparison: Latest Metrics")

    # Keep only the rows from the most recent date in the frame.
    latest = df[df["date"] == df["date"].max()].copy()

    # Long format: one row per (model, metric) so px.bar can group them.
    melted = latest.melt(
        id_vars=["model"],
        value_vars=["accuracy", "precision", "recall", "f1_score"],
        var_name="metric",
        value_name="value",
    )

    fig = px.bar(
        melted,
        x="model",
        y="value",
        color="metric",
        barmode="group",
        title="Latest Performance by Model",
        labels={"value": "Score", "model": "Model", "metric": "Metric"},
        template="plotly_dark",
        color_discrete_sequence=["#6366f1", "#22c55e", "#f59e0b", "#ef4444"],
    )

    # Zoom the y-axis into the 50-100% band where scores actually live.
    fig.update_layout(
        height=400,
        margin=dict(l=20, r=20, t=50, b=20),
        yaxis=dict(range=[0.5, 1.0], tickformat=".0%"),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )

    st.plotly_chart(fig, use_container_width=True)

Wiring It Into the App

Update the page routing in app.py to use the new views:

# In app.py, update the Model Performance section:
from views.model_metrics import (
    render_accuracy_over_time,
    render_confusion_matrix,
    render_feature_importance,
    render_metric_comparison,
)
from connectors import get_model_metrics

# Inside the main() function, replace the Model Performance branch:
# NOTE: this is a fragment of app.py's page router — `page`, `st`, and the
# preceding if/elif chain are defined there, so the bare `elif` is only
# valid once spliced into that context.
elif page == "Model Performance":
    # Pull the last 30 days of per-model metrics from the connectors layer.
    metrics_df = get_model_metrics(days=30)

    # Model selector in sidebar
    models = metrics_df["model"].unique().tolist()
    selected_model = st.sidebar.selectbox("Select Model", models)

    # Render all charts
    # The trend and comparison charts use the full frame (all models);
    # the two middle panels render only the model picked in the sidebar.
    render_accuracy_over_time(metrics_df)

    col1, col2 = st.columns(2)
    with col1:
        render_confusion_matrix(selected_model)
    with col2:
        render_feature_importance(selected_model)

    render_metric_comparison(metrics_df)
💡
Layout tip: Streamlit's st.columns() creates side-by-side panels. Putting the confusion matrix and feature importance side by side saves vertical space and makes comparisons easier. On mobile, Streamlit automatically stacks columns vertically.

Latency Distribution Chart (Bonus)

Add a histogram showing inference latency distribution to spot performance outliers:

def render_latency_distribution(df, model_name):
    """
    Render overlaid histograms of P50 and P99 inference latency.

    Args:
        df: DataFrame with columns
            [model, latency_p50_ms, latency_p99_ms, ...]
        model_name: Which model's rows to plot.
    """
    st.subheader(f"Latency Distribution: {model_name}")

    subset = df[df["model"] == model_name]

    # Overlaid, semi-transparent histograms: indigo = typical (P50),
    # red = tail (P99). Outliers show up as a long red tail.
    fig = go.Figure()
    for column, label, color in (
        ("latency_p50_ms", "P50 Latency", "#6366f1"),
        ("latency_p99_ms", "P99 Latency", "#ef4444"),
    ):
        fig.add_trace(go.Histogram(
            x=subset[column],
            name=label,
            opacity=0.7,
            marker_color=color,
        ))

    fig.update_layout(
        barmode="overlay",
        title="Inference Latency Distribution (ms)",
        xaxis_title="Latency (ms)",
        yaxis_title="Frequency",
        height=350,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
    )

    st.plotly_chart(fig, use_container_width=True)

    # Summary stats
    for col, (label, column) in zip(
        st.columns(2),
        [("Median P50", "latency_p50_ms"), ("Median P99", "latency_p99_ms")],
    ):
        with col:
            st.metric(label, f"{subset[column].median():.1f} ms")

What's Next

The model performance views are complete. In the next lesson, we will build Step 3: Data Drift Monitoring — visualizing feature distribution changes with PSI and KS test plots, setting up drift alert thresholds, and triggering notifications when distributions shift.