Advanced

Step 4: Cost & Infrastructure

Build GPU utilization gauges, API cost breakdowns, inference latency percentile charts, and resource usage time series — the financial and operational view of your ML systems.

Why Cost Tracking Matters

ML infrastructure is expensive. A single GPU instance can cost $1–$3 per hour. Without visibility, costs spiral quickly. This view helps teams:

  • Identify underutilized GPUs that can be downsized or turned off
  • Track API costs per model and per endpoint
  • Monitor inference latency for SLA compliance
  • Forecast monthly spending based on trends

GPU Utilization Gauges

Create views/cost_tracker.py starting with GPU utilization gauge charts:

# views/cost_tracker.py
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np


def _utilization_gauge(value, title, bar_color, reference, steps, threshold):
    """
    Build a single gauge+number+delta Plotly indicator figure.

    Args:
        value: Current reading as a percentage (0-100).
        title: Gauge title text.
        bar_color: Hex color for the gauge bar.
        reference: Baseline for the delta readout; values above it render
            red (over-utilized), below it green.
        steps: List of {"range": [lo, hi], "color": ...} background bands.
        threshold: Percentage at which the red threshold line is drawn.

    Returns:
        A configured plotly.graph_objects.Figure.
    """
    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=value,
        title={"text": title},
        number={"suffix": "%"},
        delta={
            "reference": reference,
            "increasing": {"color": "#ef4444"},
            "decreasing": {"color": "#22c55e"},
        },
        gauge={
            "axis": {"range": [0, 100], "tickwidth": 1},
            "bar": {"color": bar_color},
            "bgcolor": "#1e293b",
            "borderwidth": 0,
            "steps": steps,
            "threshold": {
                "line": {"color": "#ef4444", "width": 3},
                "thickness": 0.75,
                "value": threshold,
            },
        },
    ))
    fig.update_layout(height=300, template="plotly_dark",
                      margin=dict(l=20, r=20, t=50, b=20))
    return fig


def render_gpu_gauges(cost_df):
    """
    Gauge charts showing current GPU utilization and memory usage.

    Reads the most recent row of cost_df; expects numeric columns
    "gpu_utilization" and "gpu_memory_pct" expressed as fractions in [0, 1].
    The input frame is not modified.
    """
    st.subheader("GPU Utilization (Current)")

    # Guard: tail(1).iloc[0] raises IndexError on an empty frame.
    if cost_df.empty:
        st.info("No GPU metrics available yet.")
        return

    # Latest reading, converted from fraction to percent for display.
    latest = cost_df.tail(1).iloc[0]
    gpu_util = latest["gpu_utilization"] * 100
    gpu_mem = latest["gpu_memory_pct"] * 100

    col1, col2 = st.columns(2)

    with col1:
        st.plotly_chart(
            _utilization_gauge(
                gpu_util,
                "GPU Compute Utilization",
                "#6366f1",
                reference=70,
                steps=[
                    {"range": [0, 30], "color": "#22c55e20"},
                    {"range": [30, 70], "color": "#f59e0b20"},
                    {"range": [70, 100], "color": "#ef444420"},
                ],
                threshold=85,
            ),
            use_container_width=True,
        )

    with col2:
        st.plotly_chart(
            _utilization_gauge(
                gpu_mem,
                "GPU Memory Usage",
                "#8b5cf6",
                reference=60,
                steps=[
                    {"range": [0, 40], "color": "#22c55e20"},
                    {"range": [40, 75], "color": "#f59e0b20"},
                    {"range": [75, 100], "color": "#ef444420"},
                ],
                threshold=90,
            ),
            use_container_width=True,
        )

API Cost Breakdown

Track costs by time period with a stacked area chart and daily totals:

def render_api_costs(cost_df):
    """
    Area chart of daily API costs over time, plus KPI cards and a projected
    monthly spend.

    Expects cost_df with columns "timestamp", "api_cost_usd", and
    "api_calls". The input frame is not modified.
    """
    st.subheader("API Cost Tracking")

    # Work on a copy so we don't mutate the caller's DataFrame by adding
    # the derived "date" column in place.
    df = cost_df.copy()
    df["date"] = pd.to_datetime(df["timestamp"]).dt.date
    daily = df.groupby("date").agg({
        "api_cost_usd": "sum",
        "api_calls": "sum",
    }).reset_index()

    # KPI values computed up front; mean() of an empty frame is NaN, which
    # would otherwise render as "$nan" in the metric and the forecast.
    total_cost = daily["api_cost_usd"].sum()
    avg_daily = daily["api_cost_usd"].mean() if not daily.empty else 0.0
    total_calls = daily["api_calls"].sum()

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total (30d)", f"${total_cost:.2f}")
    with col2:
        st.metric("Avg Daily", f"${avg_daily:.2f}")
    with col3:
        st.metric("Total API Calls", f"{total_calls:,}")
    with col4:
        cost_per_call = total_cost / total_calls if total_calls > 0 else 0
        st.metric("Cost per Call", f"${cost_per_call:.4f}")

    # Daily cost chart
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=daily["date"],
        y=daily["api_cost_usd"],
        fill="tozeroy",
        name="Daily API Cost",
        line=dict(color="#6366f1"),
        fillcolor="rgba(99, 102, 241, 0.2)",
    ))

    # Trend line: a degree-1 polyfit needs at least two points, otherwise
    # numpy raises / produces a degenerate fit.
    if len(daily) >= 2:
        x_numeric = np.arange(len(daily))
        z = np.polyfit(x_numeric, daily["api_cost_usd"].values, 1)
        trend = np.polyval(z, x_numeric)
        fig.add_trace(go.Scatter(
            x=daily["date"],
            y=trend,
            name="Trend",
            line=dict(color="#ef4444", dash="dash"),
        ))

    fig.update_layout(
        title="Daily API Cost (USD)",
        xaxis_title="Date",
        yaxis_title="Cost (USD)",
        height=400,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        hovermode="x unified",
    )

    st.plotly_chart(fig, use_container_width=True)

    # Simple straight-line projection from the daily average.
    monthly_forecast = avg_daily * 30
    st.info(f"Projected monthly cost at current rate: **${monthly_forecast:.2f}**")

Inference Latency Charts

Latency percentiles help identify performance degradation and SLA violations:

def render_latency_charts(cost_df):
    """
    Latency time series with P50, P95, and P99 percentile bands, an SLA
    reference line at 200 ms, and SLA compliance metrics.

    Expects cost_df with columns "timestamp" and "inference_latency_ms".
    The input frame is not modified.
    """
    st.subheader("Inference Latency")

    # Work on a copy so the derived "hour" column doesn't leak into the
    # caller's DataFrame. Note lowercase "h": the uppercase "H" frequency
    # alias is deprecated since pandas 2.2.
    df = cost_df.copy()
    df["hour"] = pd.to_datetime(df["timestamp"]).dt.floor("h")
    hourly = df.groupby("hour")["inference_latency_ms"].agg(
        p50="median",
        p95=lambda x: np.percentile(x, 95),
        p99=lambda x: np.percentile(x, 99),
        mean="mean",
    ).reset_index()

    # Traces are added worst-first so each "tonexty" fill shades the band
    # between consecutive percentiles.
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p99"],
        name="P99", line=dict(color="#ef4444"),
        fill=None,
    ))
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p95"],
        name="P95", line=dict(color="#f59e0b"),
        fill="tonexty", fillcolor="rgba(239, 68, 68, 0.1)",
    ))
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p50"],
        name="P50 (Median)", line=dict(color="#22c55e"),
        fill="tonexty", fillcolor="rgba(245, 158, 11, 0.1)",
    ))

    # SLA threshold line
    fig.add_hline(
        y=200,
        line_dash="dash",
        line_color="#ef4444",
        annotation_text="SLA: 200ms",
        annotation_position="top right",
    )

    fig.update_layout(
        title="Inference Latency Percentiles (ms)",
        xaxis_title="Time",
        yaxis_title="Latency (ms)",
        height=400,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        hovermode="x unified",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )

    st.plotly_chart(fig, use_container_width=True)

    # SLA compliance: share of requests at or under the 200 ms threshold.
    total_requests = len(df)
    sla_violations = int((df["inference_latency_ms"] > 200).sum())
    sla_pct = ((total_requests - sla_violations) / total_requests * 100) if total_requests > 0 else 100

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("SLA Compliance", f"{sla_pct:.2f}%")
    with col2:
        st.metric("SLA Violations", f"{sla_violations:,}")
    with col3:
        st.metric("Avg Latency", f"{df['inference_latency_ms'].mean():.1f} ms")

GPU Utilization Time Series

Track GPU usage patterns over time to identify idle periods and peak usage:

def render_gpu_timeline(cost_df):
    """
    Time series of hourly-averaged GPU compute and memory utilization,
    plus an under/over-utilization advisory.

    Expects cost_df with columns "timestamp", "gpu_utilization", and
    "gpu_memory_pct" (fractions in [0, 1]). The input frame is not
    modified.
    """
    st.subheader("GPU Utilization Over Time")

    # Copy before adding the derived "hour" column so the caller's frame is
    # untouched. Lowercase "h": the uppercase "H" alias is deprecated since
    # pandas 2.2.
    df = cost_df.copy()
    df["hour"] = pd.to_datetime(df["timestamp"]).dt.floor("h")
    hourly = df.groupby("hour").agg({
        "gpu_utilization": "mean",
        "gpu_memory_pct": "mean",
    }).reset_index()

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["gpu_utilization"] * 100,
        name="GPU Compute",
        line=dict(color="#6366f1"),
        fill="tozeroy",
        fillcolor="rgba(99, 102, 241, 0.15)",
    ))

    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["gpu_memory_pct"] * 100,
        name="GPU Memory",
        line=dict(color="#8b5cf6", dash="dot"),
    ))

    fig.update_layout(
        title="GPU Utilization (%)",
        xaxis_title="Time",
        yaxis_title="Utilization (%)",
        height=350,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        yaxis=dict(range=[0, 100]),
        hovermode="x unified",
    )

    st.plotly_chart(fig, use_container_width=True)

    # Utilization advisory; skipped on an empty frame, where mean() is NaN
    # and neither comparison below would fire meaningfully.
    if df.empty:
        return
    avg_util = df["gpu_utilization"].mean() * 100
    if avg_util < 30:
        st.warning(f"Average GPU utilization is only {avg_util:.1f}%. Consider downsizing to a smaller instance to save costs.")
    elif avg_util > 85:
        st.error(f"Average GPU utilization is {avg_util:.1f}%. Consider scaling up to avoid latency spikes.")

Wiring It Into the App

Update app.py to connect the cost tracking views:

# In app.py, add cost tracking:
# NOTE: this is a fragment — the `elif` below belongs inside main()'s
# existing page-dispatch if/elif chain; it will not compile standalone.
from views.cost_tracker import (
    render_gpu_gauges,
    render_api_costs,
    render_latency_charts,
    render_gpu_timeline,
)
from connectors import get_cost_data

# Inside main(), replace the Cost & Infrastructure branch:
elif page == "Cost & Infrastructure":
    # One shared 30-day metrics frame feeds all four views below.
    cost_df = get_cost_data(days=30)

    # Horizontal rules visually separate the dashboard sections.
    render_gpu_gauges(cost_df)
    st.markdown("---")
    render_api_costs(cost_df)
    st.markdown("---")
    render_latency_charts(cost_df)
    st.markdown("---")
    render_gpu_timeline(cost_df)
💡
Cost optimization tip: If your GPU utilization is consistently below 30%, you are overspending. Consider using spot/preemptible instances for batch inference, or switching to CPU-only instances for models that do not need GPU acceleration. The savings can be 60-80%.

What's Next

The cost and infrastructure views are complete. In the next lesson, we will build Step 5: Interactive Features — dynamic filters, date range pickers, model comparison views, CSV/PDF export, and drill-down capabilities.