Cost Monitoring & Budgeting Advanced

You can't optimize what you can't measure. This lesson shows you how to build real-time cost visibility for AI systems -- from Prometheus metrics and Grafana dashboards to anomaly detection and automated budget enforcement. These are the FinOps practices that separate teams who control their AI spend from teams who get surprised by it.

Cost Metrics to Track

| Metric | Granularity | Alert Threshold | Dashboard Panel |
|---|---|---|---|
| Total AI spend | Daily / Monthly | >110% of budget | Single stat + trend line |
| Cost per request | Per endpoint | >2x baseline | Time series by endpoint |
| GPU utilization | Per instance, 1-min | <10% for 30 min | Heatmap by instance |
| API token usage | Per model, per team | >daily budget | Stacked bar by model |
| Cost per user | Per user tier | >revenue per user | Distribution histogram |
| Training job cost | Per experiment | >experiment budget | Table with sortable columns |
| Idle resources | Per instance, 5-min | Any idle GPU | List with action buttons |
| Cost trend (7-day) | Daily | >20% week-over-week | Trend line with forecast |

Prometheus Metrics for AI Cost Tracking

# Prometheus metrics for AI cost observability
# These metrics feed your Grafana dashboards and alerting rules

from dataclasses import dataclass
from typing import Dict

# Metric definitions for Prometheus (text exposition format).
# NOTE: every Prometheus histogram must expose an le="+Inf" bucket whose
# value equals the _count series; quantile math and many scrapers break
# without it, so the sample below includes it.
PROMETHEUS_METRICS = """
# HELP ai_request_cost_dollars Cost per AI inference request in dollars
# TYPE ai_request_cost_dollars histogram
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="0.001"} 0
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="0.01"} 145
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="0.1"} 982
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="1.0"} 1000
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="+Inf"} 1000
ai_request_cost_dollars_sum{model="gpt-4o",team="search"} 8.45
ai_request_cost_dollars_count{model="gpt-4o",team="search"} 1000

# HELP ai_daily_spend_dollars Total AI spend per day per team
# TYPE ai_daily_spend_dollars gauge
ai_daily_spend_dollars{team="search",category="inference"} 245.50
ai_daily_spend_dollars{team="search",category="training"} 0
ai_daily_spend_dollars{team="chatbot",category="inference"} 180.25
ai_daily_spend_dollars{team="chatbot",category="api"} 420.00

# HELP ai_gpu_utilization_percent GPU utilization percentage
# TYPE ai_gpu_utilization_percent gauge
ai_gpu_utilization_percent{instance="i-abc123",gpu="0",type="A10G"} 72.5
ai_gpu_utilization_percent{instance="i-abc123",gpu="1",type="A10G"} 15.2
ai_gpu_utilization_percent{instance="i-def456",gpu="0",type="A100"} 89.1

# HELP ai_api_tokens_total Total tokens consumed by API calls
# TYPE ai_api_tokens_total counter
ai_api_tokens_total{model="gpt-4o",direction="input",team="chatbot"} 15420000
ai_api_tokens_total{model="gpt-4o",direction="output",team="chatbot"} 8210000
ai_api_tokens_total{model="gpt-4o-mini",direction="input",team="search"} 92000000

# HELP ai_budget_utilization_ratio Budget used vs allocated (0-1)
# TYPE ai_budget_utilization_ratio gauge
ai_budget_utilization_ratio{team="search",period="daily"} 0.72
ai_budget_utilization_ratio{team="chatbot",period="daily"} 0.91
ai_budget_utilization_ratio{team="data",period="daily"} 0.45
"""

# Python metrics instrumentation, stored as a string so the lesson can
# print it. In a real service these metric objects live at module scope
# (they must be created exactly once) and track_ai_call() wraps each
# provider API call.
# NOTE(review): `import time` inside the snippet is unused -- presumably
# left over from a latency example; confirm before copying verbatim.
INSTRUMENTATION_CODE = '''
from prometheus_client import Counter, Histogram, Gauge
import time

# Define metrics
ai_request_cost = Histogram(
    'ai_request_cost_dollars',
    'Cost per AI request in dollars',
    ['model', 'team', 'endpoint'],
    buckets=[0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]
)

ai_daily_spend = Gauge(
    'ai_daily_spend_dollars',
    'Total AI spend today',
    ['team', 'category']
)

ai_tokens_total = Counter(
    'ai_api_tokens_total',
    'Total API tokens consumed',
    ['model', 'direction', 'team']
)

ai_gpu_util = Gauge(
    'ai_gpu_utilization_percent',
    'GPU utilization percentage',
    ['instance', 'gpu', 'type']
)

# Instrument an API call
def track_ai_call(model, team, endpoint, input_tokens, output_tokens, cost):
    ai_request_cost.labels(model=model, team=team, endpoint=endpoint).observe(cost)
    ai_tokens_total.labels(model=model, direction="input", team=team).inc(input_tokens)
    ai_tokens_total.labels(model=model, direction="output", team=team).inc(output_tokens)
    ai_daily_spend.labels(team=team, category="api").inc(cost)
'''

# Emit both reference artifacts to stdout for the lesson reader.
for heading, payload in (
    ("Prometheus Metrics Definition:", PROMETHEUS_METRICS),
    ("\nInstrumentation Code:", INSTRUMENTATION_CODE),
):
    print(heading)
    print(payload)

Grafana Dashboard Configuration

# Grafana dashboard JSON for AI Cost Monitoring
# Import this into Grafana to get an instant cost dashboard

import json


def _cost_panel(**spec):
    """Return one Grafana panel definition (kwargs keep key order intact)."""
    return dict(spec)


# Eight panels: total spend, team attribution, per-request cost percentiles,
# GPU waste, token burn, budget pressure, weekly trend, idle-instance cleanup.
GRAFANA_DASHBOARD = {
    "title": "AI Cost Monitoring",
    "panels": [
        _cost_panel(
            title="Total Daily AI Spend",
            type="stat",
            query='sum(ai_daily_spend_dollars)',
            thresholds=[
                {"value": 0, "color": "green"},
                {"value": 500, "color": "yellow"},
                {"value": 1000, "color": "red"}
            ],
            description="Total AI spend today across all teams"
        ),
        _cost_panel(
            title="Spend by Team (Today)",
            type="piechart",
            query='sum by (team) (ai_daily_spend_dollars)',
            description="Cost attribution per team"
        ),
        _cost_panel(
            title="Cost per Request (p50/p95/p99)",
            type="timeseries",
            queries=[
                'histogram_quantile(0.50, rate(ai_request_cost_dollars_bucket[5m]))',
                'histogram_quantile(0.95, rate(ai_request_cost_dollars_bucket[5m]))',
                'histogram_quantile(0.99, rate(ai_request_cost_dollars_bucket[5m]))'
            ],
            description="Cost per request percentiles over time"
        ),
        _cost_panel(
            title="GPU Utilization Heatmap",
            type="heatmap",
            query='ai_gpu_utilization_percent',
            description="GPU utilization across fleet. Red = idle (wasted money)"
        ),
        _cost_panel(
            title="Token Usage by Model",
            type="timeseries",
            query='sum by (model) (rate(ai_api_tokens_total[1h]))',
            description="Token consumption rate per model"
        ),
        _cost_panel(
            title="Budget Utilization",
            type="gauge",
            query='ai_budget_utilization_ratio',
            thresholds=[
                {"value": 0, "color": "green"},
                {"value": 0.8, "color": "yellow"},
                {"value": 0.95, "color": "red"}
            ],
            description="Budget used vs allocated per team"
        ),
        _cost_panel(
            title="7-Day Cost Trend",
            type="timeseries",
            query='sum(ai_daily_spend_dollars) by (team)',
            time_range="7d",
            description="Cost trend with week-over-week comparison"
        ),
        _cost_panel(
            title="Idle GPU Instances",
            type="table",
            query='ai_gpu_utilization_percent < 10',
            columns=["instance", "gpu_type", "utilization", "hourly_cost", "team"],
            description="GPUs below 10% utilization - candidates for shutdown"
        )
    ]
}

print("Grafana Dashboard Configuration:")
print(json.dumps(GRAFANA_DASHBOARD, indent=2))

Cost Anomaly Detection

# AI cost anomaly detection system
import statistics
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from datetime import datetime

@dataclass
class CostAnomalyDetector:
    """Detect unexpected cost spikes before they become $50K surprises.

    Keeps a rolling window of daily costs per team and flags any day whose
    cost deviates from the window mean by more than ``z_score_threshold``
    standard deviations.
    """

    # team name -> rolling list of the most recent daily costs
    history: Dict[str, List[float]] = field(default_factory=dict)
    window_size: int = 14  # days of history retained per team
    z_score_threshold: float = 2.5  # standard deviations for anomaly

    def record_daily_cost(self, team: str, cost: float) -> None:
        """Append today's cost for *team*, trimming to the rolling window."""
        self.history.setdefault(team, []).append(cost)
        # Keep only the most recent window_size days
        self.history[team] = self.history[team][-self.window_size:]

    def check_anomaly(self, team: str, current_cost: float) -> dict:
        """Check if today's cost is anomalous.

        Returns a dict whose ``status`` is ``insufficient_data`` (fewer than
        7 recorded days), ``normal``, or ``ANOMALY`` with diagnostics.
        """
        history = self.history.get(team, [])

        if len(history) < 7:  # need at least a week before stats are trustworthy
            return {"status": "insufficient_data",
                    "message": f"Need {7 - len(history)} more days of data"}

        mean = statistics.mean(history)
        stdev = statistics.stdev(history)

        if stdev == 0:
            # All-identical history: assume 10% of the mean as the spread.
            # `or 1.0` guards the all-zero history case, where mean * 0.1
            # is still 0 and the z-score below would divide by zero.
            stdev = mean * 0.1 or 1.0

        z_score = (current_cost - mean) / stdev

        if abs(z_score) > self.z_score_threshold:
            direction = "SPIKE" if z_score > 0 else "DROP"
            return {
                "status": "ANOMALY",
                "type": direction,
                "severity": "critical" if abs(z_score) > 4 else "warning",
                "current_cost": f"${current_cost:,.2f}",
                "expected_cost": f"${mean:,.2f} +/- ${stdev:,.2f}",
                "z_score": round(z_score, 2),
                # Dollars beyond the normal 2-sigma band (0 if within it)
                "excess_cost": f"${max(0, current_cost - mean - 2*stdev):,.2f}",
                "investigation_hints": [
                    "Check for new deployments or traffic spikes",
                    "Look for runaway batch jobs or loops",
                    "Verify auto-scaling limits are set",
                    "Check for pricing tier changes",
                    f"Team '{team}' daily trend: {[f'${c:.0f}' for c in history[-5:]]}"
                ]
            }

        return {
            "status": "normal",
            "current_cost": f"${current_cost:,.2f}",
            "expected_range": f"${mean - 2*stdev:,.2f} - ${mean + 2*stdev:,.2f}",
            "z_score": round(z_score, 2)
        }

# Alerting rules for cost monitoring.
# Three severity tiers: "critical" pages on-call, "warning" posts to team
# channels, "info" covers scheduled reporting. The "condition" strings are
# human-readable summaries for the lesson, not executable PromQL.
ALERTING_RULES = {
    # Critical: budget blown or statistically extreme spike -- page someone.
    "critical": [
        {
            "name": "daily_budget_exceeded",
            "condition": "ai_budget_utilization_ratio > 1.0",
            "message": "Team {team} has exceeded daily AI budget",
            "action": "Block new requests, notify team lead and finance",
            "channel": ["pagerduty", "slack-finance"]
        },
        {
            "name": "cost_spike_detected",
            "condition": "z_score > 4.0",
            "message": "Anomalous cost spike detected for {team}",
            "action": "Investigate immediately, may indicate runaway job",
            "channel": ["pagerduty", "slack-oncall"]
        }
    ],
    # Warning: early signals worth a human look, routed to Slack only.
    "warning": [
        {
            "name": "budget_80_percent",
            "condition": "ai_budget_utilization_ratio > 0.8",
            "message": "Team {team} at 80% of daily AI budget",
            "action": "Review usage, consider rate limiting",
            "channel": ["slack-team"]
        },
        {
            "name": "idle_gpu_detected",
            "condition": "ai_gpu_utilization_percent < 10 for 30m",
            "message": "GPU {instance} idle for 30+ minutes",
            "action": "Auto-shutdown if non-production",
            "channel": ["slack-infra"]
        },
        {
            "name": "cost_per_request_high",
            "condition": "ai_request_cost_p95 > 2x baseline",
            "message": "Cost per request 2x above baseline for {endpoint}",
            "action": "Check for prompt bloat or model misconfiguration",
            "channel": ["slack-team"]
        }
    ],
    # Info: routine reporting on a schedule, no action required.
    "info": [
        {
            "name": "weekly_cost_report",
            "condition": "every Monday 9am",
            "message": "Weekly AI cost summary",
            "action": "Send report to all team leads",
            "channel": ["email", "slack-engineering"]
        }
    ]
}

# Demo: seed the detector with two weeks of stable history, then probe it
# with a normal day and an obvious spike, and finally dump the alert rules.
detector = CostAnomalyDetector()
baseline = [450, 460, 440, 470, 455, 430, 465, 450, 440, 470, 460, 450, 445, 455]
for day_cost in baseline:
    detector.record_daily_cost("search-team", day_cost)

for label, probe_cost in (("Normal day", 480), ("\nAnomaly detected", 1200)):
    outcome = detector.check_anomaly("search-team", probe_cost)
    print(f"{label}: {outcome}")

print("\n\nALERTING RULES:")
for level, level_rules in ALERTING_RULES.items():
    print(f"\n[{level.upper()}]")
    for entry in level_rules:
        print(f"  {entry['name']}: {entry['condition']}")
        print(f"    Action: {entry['action']}")
        print(f"    Channel: {entry['channel']}")

Team/Project Cost Allocation

# FinOps cost allocation system for AI teams
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class AICostAllocator:
    """Allocate AI costs to teams/projects for accountability.
    Implements showback (visibility) and chargeback (billing)."""

    # Resource tagging requirements
    REQUIRED_TAGS = {
        "team": "Engineering team that owns the resource",
        "project": "Project or product the resource serves",
        "environment": "dev/staging/production",
        "cost_center": "Finance cost center code",
        "owner": "Individual responsible (email)"
    }

    allocations: Dict[str, Dict[str, float]] = field(default_factory=dict)

    def allocate_cost(self, resource_cost: float, tags: dict):
        """Allocate a resource cost to the appropriate team/project."""
        team = tags.get("team", "untagged")
        project = tags.get("project", "untagged")
        key = f"{team}/{project}"

        if key not in self.allocations:
            self.allocations[key] = {
                "compute": 0, "api": 0, "storage": 0,
                "network": 0, "other": 0, "total": 0
            }

        category = tags.get("cost_category", "other")
        self.allocations[key][category] = (
            self.allocations[key].get(category, 0) + resource_cost
        )
        self.allocations[key]["total"] += resource_cost

    def generate_report(self) -> str:
        """Generate cost allocation report."""
        lines = [
            "AI COST ALLOCATION REPORT",
            "=" * 70,
            f"{'Team/Project':<30} {'Compute':>10} {'API':>10} "
            f"{'Storage':>10} {'Total':>10}",
            "-" * 70
        ]

        total = 0
        sorted_allocs = sorted(
            self.allocations.items(),
            key=lambda x: x[1]["total"],
            reverse=True
        )

        for key, costs in sorted_allocs:
            total += costs["total"]
            lines.append(
                f"{key:<30} ${costs['compute']:>9,.0f} ${costs['api']:>9,.0f} "
                f"${costs['storage']:>9,.0f} ${costs['total']:>9,.0f}"
            )

        lines.append("-" * 70)
        lines.append(f"{'TOTAL':<30} ${total:>39,.0f}")

        # Untagged resources warning
        untagged = self.allocations.get("untagged/untagged", {}).get("total", 0)
        if untagged > 0:
            pct = untagged / total * 100
            lines.append(f"\n[WARNING] ${untagged:,.0f} ({pct:.1f}%) "
                         f"of costs are UNTAGGED. Fix resource tagging.")

        return "\n".join(lines)

# Example: feed one month of itemized costs -- including a deliberately
# untagged item -- through the allocator and print the showback report.
allocator = AICostAllocator()

monthly_items = [
    (8500, {"team": "search", "project": "product-search", "cost_category": "compute"}),
    (3200, {"team": "search", "project": "product-search", "cost_category": "api"}),
    (1200, {"team": "search", "project": "product-search", "cost_category": "storage"}),
    (12000, {"team": "chatbot", "project": "customer-support", "cost_category": "api"}),
    (4500, {"team": "chatbot", "project": "customer-support", "cost_category": "compute"}),
    (6800, {"team": "ml-platform", "project": "recommendation", "cost_category": "compute"}),
    (2100, {"team": "ml-platform", "project": "recommendation", "cost_category": "storage"}),
    (3400, {"team": "data", "project": "analytics", "cost_category": "compute"}),
    (1500, {"cost_category": "compute"}),  # Untagged!
]

for amount, resource_tags in monthly_items:
    allocator.allocate_cost(amount, resource_tags)

print(allocator.generate_report())
Key Takeaway: Effective AI cost monitoring requires three components: (1) Instrumentation -- emit cost metrics from every AI service using Prometheus counters and histograms. (2) Visibility -- build Grafana dashboards that show spend by team, model, and endpoint in real-time. (3) Enforcement -- set up anomaly detection and budget alerts that catch runaway costs before they become $50K surprises. Tag every resource with team and project for accurate cost allocation.