Step 4: Cost & Infrastructure
Build GPU utilization gauges, API cost breakdowns, inference latency percentile charts, and resource usage time series — the financial and operational view of your ML systems.
Why Cost Tracking Matters
ML infrastructure is expensive. A single GPU instance can cost $1-3 per hour. Without visibility, costs spiral quickly. This view helps teams:
- Identify underutilized GPUs that can be downsized or turned off
- Track API costs per model and per endpoint
- Monitor inference latency for SLA compliance
- Forecast monthly spending based on trends
GPU Utilization Gauges
Create views/cost_tracker.py starting with GPU utilization gauge charts:
# views/cost_tracker.py
import streamlit as st
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
def render_gpu_gauges(cost_df):
    """
    Render side-by-side gauges for the latest GPU compute and memory readings.

    The last row of ``cost_df`` is used as the current reading (this assumes
    the rows are in chronological order -- confirm against the data source).
    Both columns are multiplied by 100 for display on a 0-100 % axis.

    Args:
        cost_df: DataFrame with ``gpu_utilization`` and ``gpu_memory_pct``
            columns.
    """
    st.subheader("GPU Utilization (Current)")

    # An empty frame has no "latest" row; indexing it would raise IndexError.
    if cost_df.empty:
        st.info("No GPU metrics available yet.")
        return

    # Get the latest readings
    latest = cost_df.iloc[-1]
    gpu_util = latest["gpu_utilization"] * 100
    gpu_mem = latest["gpu_memory_pct"] * 100

    col1, col2 = st.columns(2)
    with col1:
        fig = _gauge_figure(
            value=gpu_util,
            title="GPU Compute Utilization",
            reference=70,
            bar_color="#6366f1",
            warn_at=30,
            crit_at=70,
            threshold=85,
        )
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        fig = _gauge_figure(
            value=gpu_mem,
            title="GPU Memory Usage",
            reference=60,
            bar_color="#8b5cf6",
            warn_at=40,
            crit_at=75,
            threshold=90,
        )
        st.plotly_chart(fig, use_container_width=True)


def _gauge_figure(value, title, reference, bar_color, warn_at, crit_at, threshold):
    """Build one 0-100 % gauge with green/amber/red bands and a red threshold marker."""
    fig = go.Figure(go.Indicator(
        mode="gauge+number+delta",
        value=value,
        title={"text": title},
        number={"suffix": "%"},
        # Rising above the reference is shown in red, falling below it in green.
        delta={
            "reference": reference,
            "increasing": {"color": "#ef4444"},
            "decreasing": {"color": "#22c55e"},
        },
        gauge={
            "axis": {"range": [0, 100], "tickwidth": 1},
            "bar": {"color": bar_color},
            "bgcolor": "#1e293b",
            "borderwidth": 0,
            # The trailing "20" on each colour is the hex alpha channel.
            "steps": [
                {"range": [0, warn_at], "color": "#22c55e20"},
                {"range": [warn_at, crit_at], "color": "#f59e0b20"},
                {"range": [crit_at, 100], "color": "#ef444420"},
            ],
            "threshold": {
                "line": {"color": "#ef4444", "width": 3},
                "thickness": 0.75,
                "value": threshold,
            },
        },
    ))
    fig.update_layout(
        height=300,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
    )
    return fig
API Cost Breakdown
Track costs over time with KPI cards, a filled area chart of daily totals, and a linear trend line:
def render_api_costs(cost_df):
    """
    Render API cost KPI cards, a daily cost area chart with a linear trend
    line, and a simple monthly projection.

    Args:
        cost_df: DataFrame with ``timestamp``, ``api_cost_usd`` and
            ``api_calls`` columns. The frame is not modified.
    """
    st.subheader("API Cost Tracking")

    # Nothing to aggregate or fit; avoid NaN KPIs and a polyfit error.
    if cost_df.empty:
        st.info("No API cost data available yet.")
        return

    # Aggregate to daily costs. assign() works on a copy, so the caller's
    # frame does not silently gain a "date" column.
    daily = (
        cost_df.assign(date=pd.to_datetime(cost_df["timestamp"]).dt.date)
        .groupby("date")
        .agg({"api_cost_usd": "sum", "api_calls": "sum"})
        .reset_index()
    )

    total_cost = daily["api_cost_usd"].sum()
    avg_daily = daily["api_cost_usd"].mean()
    total_calls = daily["api_calls"].sum()
    cost_per_call = total_cost / total_calls if total_calls > 0 else 0

    # KPI cards
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total (30d)", f"${total_cost:.2f}")
    with col2:
        st.metric("Avg Daily", f"${avg_daily:.2f}")
    with col3:
        st.metric("Total API Calls", f"{total_calls:,}")
    with col4:
        st.metric("Cost per Call", f"${cost_per_call:.4f}")

    # Daily cost chart
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=daily["date"],
        y=daily["api_cost_usd"],
        fill="tozeroy",
        name="Daily API Cost",
        line=dict(color="#6366f1"),
        fillcolor="rgba(99, 102, 241, 0.2)",
    ))

    # Add trend line. A degree-1 fit needs at least two points; with fewer
    # the trace is skipped instead of raising or fitting a meaningless line.
    if len(daily) >= 2:
        x_numeric = np.arange(len(daily))
        coeffs = np.polyfit(x_numeric, daily["api_cost_usd"].values, 1)
        fig.add_trace(go.Scatter(
            x=daily["date"],
            y=np.polyval(coeffs, x_numeric),
            name="Trend",
            line=dict(color="#ef4444", dash="dash"),
        ))

    fig.update_layout(
        title="Daily API Cost (USD)",
        xaxis_title="Date",
        yaxis_title="Cost (USD)",
        height=400,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        hovermode="x unified",
    )
    st.plotly_chart(fig, use_container_width=True)

    # Monthly forecast: the observed daily average extrapolated to 30 days.
    monthly_forecast = avg_daily * 30
    st.info(f"Projected monthly cost at current rate: **${monthly_forecast:.2f}**")
Inference Latency Charts
Latency percentiles help identify performance degradation and SLA violations:
def render_latency_charts(cost_df, sla_ms=200):
    """
    Render hourly P50/P95/P99 inference latency with an SLA line, followed by
    SLA compliance metrics.

    Args:
        cost_df: DataFrame with ``timestamp`` and ``inference_latency_ms``
            columns. The frame is not modified.
        sla_ms: Latency threshold in milliseconds. Rows strictly above it
            count as SLA violations. Defaults to 200.
    """
    st.subheader("Inference Latency")

    # Calculate hourly percentiles on a derived copy so the caller's frame is
    # left untouched. "h" replaces the "H" alias deprecated in pandas 2.2.
    hourly = (
        cost_df.assign(hour=pd.to_datetime(cost_df["timestamp"]).dt.floor("h"))
        .groupby("hour")["inference_latency_ms"]
        .agg(
            p50="median",
            # Series.quantile skips NaN like median/mean do; np.percentile
            # would return NaN for any hour containing a missing value.
            p95=lambda s: s.quantile(0.95),
            p99=lambda s: s.quantile(0.99),
            mean="mean",
        )
        .reset_index()
    )

    # Latency time series. Traces are added from highest to lowest percentile
    # so each "tonexty" fill shades the band up to the trace added before it.
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p99"],
        name="P99", line=dict(color="#ef4444"),
        fill=None,
    ))
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p95"],
        name="P95", line=dict(color="#f59e0b"),
        fill="tonexty", fillcolor="rgba(239, 68, 68, 0.1)",
    ))
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["p50"],
        name="P50 (Median)", line=dict(color="#22c55e"),
        fill="tonexty", fillcolor="rgba(245, 158, 11, 0.1)",
    ))

    # SLA threshold line
    fig.add_hline(
        y=sla_ms,
        line_dash="dash",
        line_color="#ef4444",
        annotation_text=f"SLA: {sla_ms}ms",
        annotation_position="top right",
    )
    fig.update_layout(
        title="Inference Latency Percentiles (ms)",
        xaxis_title="Time",
        yaxis_title="Latency (ms)",
        height=400,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        hovermode="x unified",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    st.plotly_chart(fig, use_container_width=True)

    # SLA compliance metric, computed per row rather than per hourly bucket.
    total_requests = len(cost_df)
    sla_violations = int((cost_df["inference_latency_ms"] > sla_ms).sum())
    if total_requests > 0:
        sla_pct = (total_requests - sla_violations) / total_requests * 100
    else:
        # No rows means nothing violated the SLA.
        sla_pct = 100

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("SLA Compliance", f"{sla_pct:.2f}%")
    with col2:
        st.metric("SLA Violations", f"{sla_violations:,}")
    with col3:
        st.metric("Avg Latency", f"{cost_df['inference_latency_ms'].mean():.1f} ms")
GPU Utilization Time Series
Track GPU usage patterns over time to identify idle periods and peak usage:
def render_gpu_timeline(cost_df):
    """
    Render hourly mean GPU compute and memory utilization, followed by a
    right-sizing hint based on overall average compute utilization.

    Args:
        cost_df: DataFrame with ``timestamp``, ``gpu_utilization`` and
            ``gpu_memory_pct`` columns. Both metric columns are multiplied
            by 100 for display. The frame is not modified.
    """
    st.subheader("GPU Utilization Over Time")

    # Bucket by hour on a derived copy so the caller's frame is left
    # untouched. "h" replaces the "H" alias deprecated in pandas 2.2.
    hourly = (
        cost_df.assign(hour=pd.to_datetime(cost_df["timestamp"]).dt.floor("h"))
        .groupby("hour")
        .agg({"gpu_utilization": "mean", "gpu_memory_pct": "mean"})
        .reset_index()
    )

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["gpu_utilization"] * 100,
        name="GPU Compute",
        line=dict(color="#6366f1"),
        fill="tozeroy",
        fillcolor="rgba(99, 102, 241, 0.15)",
    ))
    fig.add_trace(go.Scatter(
        x=hourly["hour"], y=hourly["gpu_memory_pct"] * 100,
        name="GPU Memory",
        line=dict(color="#8b5cf6", dash="dot"),
    ))
    fig.update_layout(
        title="GPU Utilization (%)",
        xaxis_title="Time",
        yaxis_title="Utilization (%)",
        height=350,
        template="plotly_dark",
        margin=dict(l=20, r=20, t=50, b=20),
        yaxis=dict(range=[0, 100]),
        hovermode="x unified",
    )
    st.plotly_chart(fig, use_container_width=True)

    # Utilization insights, based on the mean over all rows (not the hourly
    # buckets). Nothing is shown between 30 % and 85 %.
    avg_util = cost_df["gpu_utilization"].mean() * 100
    if avg_util < 30:
        st.warning(f"Average GPU utilization is only {avg_util:.1f}%. Consider downsizing to a smaller instance to save costs.")
    elif avg_util > 85:
        st.error(f"Average GPU utilization is {avg_util:.1f}%. Consider scaling up to avoid latency spikes.")
Wiring It Into the App
Update app.py to connect the cost tracking views:
# In app.py, add cost tracking:
from views.cost_tracker import (
render_gpu_gauges,
render_api_costs,
render_latency_charts,
render_gpu_timeline,
)
from connectors import get_cost_data
# Inside main(), replace the Cost & Infrastructure branch.
# NOTE: this is a fragment; the `elif` continues main()'s existing
# `if page == ...` chain, which is not shown here.
elif page == "Cost & Infrastructure":
# Fetch the cost data once (days=30) and pass the same frame to every view.
cost_df = get_cost_data(days=30)
# Current GPU gauges first, then API costs, latency, and the GPU timeline,
# each separated by a horizontal rule.
render_gpu_gauges(cost_df)
st.markdown("---")
render_api_costs(cost_df)
st.markdown("---")
render_latency_charts(cost_df)
st.markdown("---")
render_gpu_timeline(cost_df)
What Is Next
The cost and infrastructure views are complete. In the next lesson, we will build Step 5: Interactive Features — dynamic filters, date range pickers, model comparison views, CSV/PDF export, and drill-down capabilities.
Lilly Tech Systems