Advanced

Step 4: Cost & Infrastructure

Build GPU utilization gauges, API cost breakdowns, inference latency percentile charts, and resource usage time series — the financial and operational view of your ML systems.

Why Cost Tracking Matters

ML infrastructure is expensive. A single GPU instance can cost $1–$3 per hour. Without visibility, costs spiral quickly. This view helps teams:

  • Identify underutilized GPUs that can be downsized or turned off
  • Track API costs per model and per endpoint
  • Monitor inference latency for SLA compliance
  • Forecast monthly spending based on trends

GPU Utilization Gauges

Create views/cost_tracker.py starting with GPU utilization gauge charts:

# views/cost_tracker.py
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np


def _utilization_gauge(value, title, bar_color, reference, steps, threshold):
    """
    Build a single gauge+number+delta Plotly indicator figure.

    Args:
        value: Current reading as a percentage (0-100).
        title: Gauge title text.
        bar_color: Hex color for the gauge bar.
        reference: Baseline for the delta readout; values above it render
            red (over-utilized), below it green.
        steps: List of {"range": [lo, hi], "color": ...} background bands.
        threshold: Percentage at which the red threshold line is drawn.

    Returns:
        A configured plotly.graph_objects.Figure.
    """
    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=value,
        title={"text": title},
        number={"suffix": "%"},
        delta={
            "reference": reference,
            "increasing": {"color": "#ef4444"},
            "decreasing": {"color": "#22c55e"},
        },
        gauge={
            "axis": {"range": [0, 100], "tickwidth": 1},
            "bar": {"color": bar_color},
            "bgcolor": "#1e293b",
            "borderwidth": 0,
            "steps": steps,
            "threshold": {
                "line": {"color": "#ef4444", "width": 3},
                "thickness": 0.75,
                "value": threshold,
            },
        },
    ))
    fig.update_layout(height=300, template="plotly_dark",
                      margin=dict(l=20, r=20, t=50, b=20))
    return fig


def render_gpu_gauges(cost_df):
    """
    Gauge charts showing current GPU utilization and memory usage.

    Reads the most recent row of cost_df; expects numeric columns
    "gpu_utilization" and "gpu_memory_pct" expressed as fractions in [0, 1].
    The input frame is not modified.
    """
    st.subheader("GPU Utilization (Current)")

    # Guard: tail(1).iloc[0] raises IndexError on an empty frame.
    if cost_df.empty:
        st.info("No GPU metrics available yet.")
        return

    # Latest reading, converted from fraction to percent for display.
    latest = cost_df.tail(1).iloc[0]
    gpu_util = latest["gpu_utilization"] * 100
    gpu_mem = latest["gpu_memory_pct"] * 100

    col1, col2 = st.columns(2)

    with col1:
        st.plotly_chart(
            _utilization_gauge(
                gpu_util,
                "GPU Compute Utilization",
                "#6366f1",
                reference=70,
                steps=[
                    {"range": [0, 30], "color": "#22c55e20"},
                    {"range": [30, 70], "color": "#f59e0b20"},
                    {"range": [70, 100], "color": "#ef444420"},
                ],
                threshold=85,
            ),
            use_container_width=True,
        )

    with col2:
        st.plotly_chart(
            _utilization_gauge(
                gpu_mem,
                "GPU Memory Usage",
                "#8b5cf6",
                reference=60,
                steps=[
                    {"range": [0, 40], "color": "#22c55e20"},
                    {"range": [40, 75], "color": "#f59e0b20"},
                    {"range": [75, 100], "color": "#ef444420"},
                ],
                threshold=90,
            ),
            use_container_width=True,
        )

API Cost Breakdown

Track costs by time period with a stacked area chart and daily totals:

def render_api_costs(cost_df):
    """
    Area chart of daily API costs over time, plus KPI cards and a projected
    monthly spend.

    Expects cost_df with columns "timestamp", "api_cost_usd", and
    "api_calls". The input frame is not modified.
    """
    st.subheader("API Cost Tracking")

    # Work on a copy so we don't mutate the caller's DataFrame by adding
    # the derived "date" column in place.
    df = cost_df.copy()
    df["date"] = pd.to_datetime(df["timestamp"]).dt.date
    daily = df.groupby("date").agg({
        "api_cost_usd": "sum",
        "api_calls": "sum",
    }).reset_index()

    # KPI values computed up front; mean() of an empty frame is NaN, which
    # would otherwise render as "$nan" in the metric and the forecast.
    total_cost = daily["api_cost_usd"].sum()
    avg_daily = daily["api_cost_usd"].mean() if not daily.empty else 0.0
    total_calls = daily["api_calls"].sum()

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total (30d)", f"${total_cost:.2f}")
    with col2:
        st.metric("Avg Daily", f"${avg_daily:.2f}")
    with col3:
        st.metric("Total API Calls", f"{total_calls:,}")
    with col4:
        cost_per_call = total_cost / total_calls if total_calls > 0 else 0
        st.metric("Cost per Call", f"${cost_per_call:.4f}")

    # Daily cost chart
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=daily["date"],
        y=daily["api_cost_usd"],
        fill="tozeroy",
        name="Daily API Cost",
        line=dict(color="#6366f1"),
        fillcolor="rgba(99, 102, 241, 0.2)",
    ))

    # Trend line: a degree-1 polyfit needs at least two points, otherwise
    # numpy raises / produces a degenerate fit.
    if len(daily) >= 2:
        x_numeric = np.arange(len(daily))
        z = np.polyfit(x_numeric, daily["api_cost_usd"].values, 1)
        trend = np.polyval(z, x_numeric)
        fig.add_trace(go.Scatter(
            x=daily["date"],
            y=trend,
            name="Trend",
            line=dict(color="#ef4444", dash="dash"),
        ))

    fig.update_layout(
        title="Daily API Cost (USD)",
        xaxis_title="Date",
        yaxis_title="Cost (USD)",
        height=400,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        hovermode="x unified",
    )

    st.plotly_chart(fig, use_container_width=True)

    # Simple straight-line projection from the daily average.
    monthly_forecast = avg_daily * 30
    st.info(f"Projected monthly cost at current rate: **${monthly_forecast:.2f}**")

Inference Latency Charts

Latency percentiles help identify performance degradation and SLA violations:

def render_latency_charts(cost_df):
    """
    Latency time series with P50, P95, and P99 percentile bands, an SLA
    reference line at 200 ms, and SLA compliance metrics.

    Expects cost_df with columns "timestamp" and "inference_latency_ms".
    The input frame is not modified.
    """
    st.subheader("Inference Latency")

    # Work on a copy so the derived "hour" column doesn't leak into the
    # caller's DataFrame. Note lowercase "h": the uppercase "H" frequency
    # alias is deprecated since pandas 2.2.
    df = cost_df.copy()
    df["hour"] = pd.to_datetime(df["timestamp"]).dt.floor("h")
    hourly = df.groupby("hour")["inference_latency_ms"].agg(
        p50="median",
        p95=lambda x: np.percentile(x, 95),
        p99=lambda x: np.percentile(x, 99),
        mean="mean",
    ).reset_index()

    # Traces are added worst-first so each "tonexty" fill shades the band
    # between consecutive percentiles.
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p99"],
        name="P99", line=dict(color="#ef4444"),
        fill=None,
    ))
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p95"],
        name="P95", line=dict(color="#f59e0b"),
        fill="tonexty", fillcolor="rgba(239, 68, 68, 0.1)",
    ))
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p50"],
        name="P50 (Median)", line=dict(color="#22c55e"),
        fill="tonexty", fillcolor="rgba(245, 158, 11, 0.1)",
    ))

    # SLA threshold line
    fig.add_hline(
        y=200,
        line_dash="dash",
        line_color="#ef4444",
        annotation_text="SLA: 200ms",
        annotation_position="top right",
    )

    fig.update_layout(
        title="Inference Latency Percentiles (ms)",
        xaxis_title="Time",
        yaxis_title="Latency (ms)",
        height=400,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        hovermode="x unified",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )

    st.plotly_chart(fig, use_container_width=True)

    # SLA compliance: share of requests at or under the 200 ms threshold.
    total_requests = len(df)
    sla_violations = int((df["inference_latency_ms"] > 200).sum())
    sla_pct = ((total_requests - sla_violations) / total_requests * 100) if total_requests > 0 else 100

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("SLA Compliance", f"{sla_pct:.2f}%")
    with col2:
        st.metric("SLA Violations", f"{sla_violations:,}")
    with col3:
        st.metric("Avg Latency", f"{df['inference_latency_ms'].mean():.1f} ms")

GPU Utilization Time Series

Track GPU usage patterns over time to identify idle periods and peak usage:

def render_gpu_timeline(cost_df):
    """
    Time series of hourly-averaged GPU compute and memory utilization,
    plus an under/over-utilization advisory.

    Expects cost_df with columns "timestamp", "gpu_utilization", and
    "gpu_memory_pct" (fractions in [0, 1]). The input frame is not
    modified.
    """
    st.subheader("GPU Utilization Over Time")

    # Copy before adding the derived "hour" column so the caller's frame is
    # untouched. Lowercase "h": the uppercase "H" alias is deprecated since
    # pandas 2.2.
    df = cost_df.copy()
    df["hour"] = pd.to_datetime(df["timestamp"]).dt.floor("h")
    hourly = df.groupby("hour").agg({
        "gpu_utilization": "mean",
        "gpu_memory_pct": "mean",
    }).reset_index()

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["gpu_utilization"] * 100,
        name="GPU Compute",
        line=dict(color="#6366f1"),
        fill="tozeroy",
        fillcolor="rgba(99, 102, 241, 0.15)",
    ))

    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["gpu_memory_pct"] * 100,
        name="GPU Memory",
        line=dict(color="#8b5cf6", dash="dot"),
    ))

    fig.update_layout(
        title="GPU Utilization (%)",
        xaxis_title="Time",
        yaxis_title="Utilization (%)",
        height=350,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        yaxis=dict(range=[0, 100]),
        hovermode="x unified",
    )

    st.plotly_chart(fig, use_container_width=True)

    # Utilization advisory; skipped on an empty frame, where mean() is NaN
    # and neither comparison below would fire meaningfully.
    if df.empty:
        return
    avg_util = df["gpu_utilization"].mean() * 100
    if avg_util < 30:
        st.warning(f"Average GPU utilization is only {avg_util:.1f}%. Consider downsizing to a smaller instance to save costs.")
    elif avg_util > 85:
        st.error(f"Average GPU utilization is {avg_util:.1f}%. Consider scaling up to avoid latency spikes.")

Wiring It Into the App

Update app.py to connect the cost tracking views:

# In app.py, add cost tracking:
# NOTE: this is a fragment — the `elif` below belongs inside main()'s
# existing page-dispatch if/elif chain; it will not compile standalone.
from views.cost_tracker import (
    render_gpu_gauges,
    render_api_costs,
    render_latency_charts,
    render_gpu_timeline,
)
from connectors import get_cost_data

# Inside main(), replace the Cost & Infrastructure branch:
elif page == "Cost & Infrastructure":
    # One shared 30-day metrics frame feeds all four views below.
    cost_df = get_cost_data(days=30)

    # Horizontal rules visually separate the dashboard sections.
    render_gpu_gauges(cost_df)
    st.markdown("---")
    render_api_costs(cost_df)
    st.markdown("---")
    render_latency_charts(cost_df)
    st.markdown("---")
    render_gpu_timeline(cost_df)
💡
Cost optimization tip: If your GPU utilization is consistently below 30%, you are overspending. Consider using spot/preemptible instances for batch inference, or switching to CPU-only instances for models that do not need GPU acceleration. The savings can be 60-80%.

What's Next

The cost and infrastructure views are complete. In the next lesson, we will build Step 5: Interactive Features — dynamic filters, date range pickers, model comparison views, CSV/PDF export, and drill-down capabilities.