Cost Monitoring & Budgeting Advanced

You can't optimize what you can't measure. This lesson shows you how to build real-time cost visibility for AI systems -- from Prometheus metrics and Grafana dashboards to anomaly detection and automated budget enforcement. These are the FinOps practices that separate teams who control their AI spend from teams who get surprised by it.

Cost Metrics to Track

| Metric | Granularity | Alert Threshold | Dashboard Panel |
|---|---|---|---|
| Total AI spend | Daily / Monthly | >110% of budget | Single stat + trend line |
| Cost per request | Per endpoint | >2x baseline | Time series by endpoint |
| GPU utilization | Per instance, 1-min | <10% for 30 min | Heatmap by instance |
| API token usage | Per model, per team | >daily budget | Stacked bar by model |
| Cost per user | Per user tier | >revenue per user | Distribution histogram |
| Training job cost | Per experiment | >experiment budget | Table with sortable columns |
| Idle resources | Per instance, 5-min | Any idle GPU | List with action buttons |
| Cost trend (7-day) | Daily | >20% week-over-week | Trend line with forecast |

Prometheus Metrics for AI Cost Tracking

# Prometheus metrics for AI cost observability
# These metrics feed your Grafana dashboards and alerting rules

from dataclasses import dataclass
from typing import Dict

# Metric definitions for Prometheus (text exposition format).
# NOTE: every Prometheus histogram must expose an le="+Inf" bucket whose
# value equals the _count series; quantile math and many scrapers break
# without it, so the sample below includes it.
PROMETHEUS_METRICS = """
# HELP ai_request_cost_dollars Cost per AI inference request in dollars
# TYPE ai_request_cost_dollars histogram
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="0.001"} 0
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="0.01"} 145
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="0.1"} 982
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="1.0"} 1000
ai_request_cost_dollars_bucket{model="gpt-4o",team="search",le="+Inf"} 1000
ai_request_cost_dollars_sum{model="gpt-4o",team="search"} 8.45
ai_request_cost_dollars_count{model="gpt-4o",team="search"} 1000

# HELP ai_daily_spend_dollars Total AI spend per day per team
# TYPE ai_daily_spend_dollars gauge
ai_daily_spend_dollars{team="search",category="inference"} 245.50
ai_daily_spend_dollars{team="search",category="training"} 0
ai_daily_spend_dollars{team="chatbot",category="inference"} 180.25
ai_daily_spend_dollars{team="chatbot",category="api"} 420.00

# HELP ai_gpu_utilization_percent GPU utilization percentage
# TYPE ai_gpu_utilization_percent gauge
ai_gpu_utilization_percent{instance="i-abc123",gpu="0",type="A10G"} 72.5
ai_gpu_utilization_percent{instance="i-abc123",gpu="1",type="A10G"} 15.2
ai_gpu_utilization_percent{instance="i-def456",gpu="0",type="A100"} 89.1

# HELP ai_api_tokens_total Total tokens consumed by API calls
# TYPE ai_api_tokens_total counter
ai_api_tokens_total{model="gpt-4o",direction="input",team="chatbot"} 15420000
ai_api_tokens_total{model="gpt-4o",direction="output",team="chatbot"} 8210000
ai_api_tokens_total{model="gpt-4o-mini",direction="input",team="search"} 92000000

# HELP ai_budget_utilization_ratio Budget used vs allocated (0-1)
# TYPE ai_budget_utilization_ratio gauge
ai_budget_utilization_ratio{team="search",period="daily"} 0.72
ai_budget_utilization_ratio{team="chatbot",period="daily"} 0.91
ai_budget_utilization_ratio{team="data",period="daily"} 0.45
"""

# Python metrics instrumentation, stored as a string so the lesson can
# print it. In a real service these metric objects live at module scope
# (they must be created exactly once) and track_ai_call() wraps each
# provider API call.
# NOTE(review): `import time` inside the snippet is unused -- presumably
# left over from a latency example; confirm before copying verbatim.
INSTRUMENTATION_CODE = '''
from prometheus_client import Counter, Histogram, Gauge
import time

# Define metrics
ai_request_cost = Histogram(
    'ai_request_cost_dollars',
    'Cost per AI request in dollars',
    ['model', 'team', 'endpoint'],
    buckets=[0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0]
)

ai_daily_spend = Gauge(
    'ai_daily_spend_dollars',
    'Total AI spend today',
    ['team', 'category']
)

ai_tokens_total = Counter(
    'ai_api_tokens_total',
    'Total API tokens consumed',
    ['model', 'direction', 'team']
)

ai_gpu_util = Gauge(
    'ai_gpu_utilization_percent',
    'GPU utilization percentage',
    ['instance', 'gpu', 'type']
)

# Instrument an API call
def track_ai_call(model, team, endpoint, input_tokens, output_tokens, cost):
    ai_request_cost.labels(model=model, team=team, endpoint=endpoint).observe(cost)
    ai_tokens_total.labels(model=model, direction="input", team=team).inc(input_tokens)
    ai_tokens_total.labels(model=model, direction="output", team=team).inc(output_tokens)
    ai_daily_spend.labels(team=team, category="api").inc(cost)
'''

# Emit both reference artifacts to stdout for the lesson reader.
for heading, payload in (
    ("Prometheus Metrics Definition:", PROMETHEUS_METRICS),
    ("\nInstrumentation Code:", INSTRUMENTATION_CODE),
):
    print(heading)
    print(payload)

Grafana Dashboard Configuration

# Grafana dashboard JSON for AI Cost Monitoring
# Import this into Grafana to get an instant cost dashboard

import json


def _cost_panel(**spec):
    """Return one Grafana panel definition (kwargs keep key order intact)."""
    return dict(spec)


# Eight panels: total spend, team attribution, per-request cost percentiles,
# GPU waste, token burn, budget pressure, weekly trend, idle-instance cleanup.
GRAFANA_DASHBOARD = {
    "title": "AI Cost Monitoring",
    "panels": [
        _cost_panel(
            title="Total Daily AI Spend",
            type="stat",
            query='sum(ai_daily_spend_dollars)',
            thresholds=[
                {"value": 0, "color": "green"},
                {"value": 500, "color": "yellow"},
                {"value": 1000, "color": "red"}
            ],
            description="Total AI spend today across all teams"
        ),
        _cost_panel(
            title="Spend by Team (Today)",
            type="piechart",
            query='sum by (team) (ai_daily_spend_dollars)',
            description="Cost attribution per team"
        ),
        _cost_panel(
            title="Cost per Request (p50/p95/p99)",
            type="timeseries",
            queries=[
                'histogram_quantile(0.50, rate(ai_request_cost_dollars_bucket[5m]))',
                'histogram_quantile(0.95, rate(ai_request_cost_dollars_bucket[5m]))',
                'histogram_quantile(0.99, rate(ai_request_cost_dollars_bucket[5m]))'
            ],
            description="Cost per request percentiles over time"
        ),
        _cost_panel(
            title="GPU Utilization Heatmap",
            type="heatmap",
            query='ai_gpu_utilization_percent',
            description="GPU utilization across fleet. Red = idle (wasted money)"
        ),
        _cost_panel(
            title="Token Usage by Model",
            type="timeseries",
            query='sum by (model) (rate(ai_api_tokens_total[1h]))',
            description="Token consumption rate per model"
        ),
        _cost_panel(
            title="Budget Utilization",
            type="gauge",
            query='ai_budget_utilization_ratio',
            thresholds=[
                {"value": 0, "color": "green"},
                {"value": 0.8, "color": "yellow"},
                {"value": 0.95, "color": "red"}
            ],
            description="Budget used vs allocated per team"
        ),
        _cost_panel(
            title="7-Day Cost Trend",
            type="timeseries",
            query='sum(ai_daily_spend_dollars) by (team)',
            time_range="7d",
            description="Cost trend with week-over-week comparison"
        ),
        _cost_panel(
            title="Idle GPU Instances",
            type="table",
            query='ai_gpu_utilization_percent < 10',
            columns=["instance", "gpu_type", "utilization", "hourly_cost", "team"],
            description="GPUs below 10% utilization - candidates for shutdown"
        )
    ]
}

print("Grafana Dashboard Configuration:")
print(json.dumps(GRAFANA_DASHBOARD, indent=2))

Cost Anomaly Detection

# AI cost anomaly detection system
import statistics
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from datetime import datetime

@dataclass
class CostAnomalyDetector:
    """Detect unexpected cost spikes before they become $50K surprises.

    Keeps a rolling window of daily costs per team and flags any day whose
    cost deviates from the window mean by more than ``z_score_threshold``
    standard deviations.
    """

    # team name -> rolling list of the most recent daily costs
    history: Dict[str, List[float]] = field(default_factory=dict)
    window_size: int = 14  # days of history retained per team
    z_score_threshold: float = 2.5  # standard deviations for anomaly

    def record_daily_cost(self, team: str, cost: float) -> None:
        """Append today's cost for *team*, trimming to the rolling window."""
        self.history.setdefault(team, []).append(cost)
        # Keep only the most recent window_size days
        self.history[team] = self.history[team][-self.window_size:]

    def check_anomaly(self, team: str, current_cost: float) -> dict:
        """Check if today's cost is anomalous.

        Returns a dict whose ``status`` is ``insufficient_data`` (fewer than
        7 recorded days), ``normal``, or ``ANOMALY`` with diagnostics.
        """
        history = self.history.get(team, [])

        if len(history) < 7:  # need at least a week before stats are trustworthy
            return {"status": "insufficient_data",
                    "message": f"Need {7 - len(history)} more days of data"}

        mean = statistics.mean(history)
        stdev = statistics.stdev(history)

        if stdev == 0:
            # All-identical history: assume 10% of the mean as the spread.
            # `or 1.0` guards the all-zero history case, where mean * 0.1
            # is still 0 and the z-score below would divide by zero.
            stdev = mean * 0.1 or 1.0

        z_score = (current_cost - mean) / stdev

        if abs(z_score) > self.z_score_threshold:
            direction = "SPIKE" if z_score > 0 else "DROP"
            return {
                "status": "ANOMALY",
                "type": direction,
                "severity": "critical" if abs(z_score) > 4 else "warning",
                "current_cost": f"${current_cost:,.2f}",
                "expected_cost": f"${mean:,.2f} +/- ${stdev:,.2f}",
                "z_score": round(z_score, 2),
                # Dollars beyond the normal 2-sigma band (0 if within it)
                "excess_cost": f"${max(0, current_cost - mean - 2*stdev):,.2f}",
                "investigation_hints": [
                    "Check for new deployments or traffic spikes",
                    "Look for runaway batch jobs or loops",
                    "Verify auto-scaling limits are set",
                    "Check for pricing tier changes",
                    f"Team '{team}' daily trend: {[f'${c:.0f}' for c in history[-5:]]}"
                ]
            }

        return {
            "status": "normal",
            "current_cost": f"${current_cost:,.2f}",
            "expected_range": f"${mean - 2*stdev:,.2f} - ${mean + 2*stdev:,.2f}",
            "z_score": round(z_score, 2)
        }

# Alerting rules for cost monitoring.
# Three severity tiers: "critical" pages on-call, "warning" posts to team
# channels, "info" covers scheduled reporting. The "condition" strings are
# human-readable summaries for the lesson, not executable PromQL.
ALERTING_RULES = {
    # Critical: budget blown or statistically extreme spike -- page someone.
    "critical": [
        {
            "name": "daily_budget_exceeded",
            "condition": "ai_budget_utilization_ratio > 1.0",
            "message": "Team {team} has exceeded daily AI budget",
            "action": "Block new requests, notify team lead and finance",
            "channel": ["pagerduty", "slack-finance"]
        },
        {
            "name": "cost_spike_detected",
            "condition": "z_score > 4.0",
            "message": "Anomalous cost spike detected for {team}",
            "action": "Investigate immediately, may indicate runaway job",
            "channel": ["pagerduty", "slack-oncall"]
        }
    ],
    # Warning: early signals worth a human look, routed to Slack only.
    "warning": [
        {
            "name": "budget_80_percent",
            "condition": "ai_budget_utilization_ratio > 0.8",
            "message": "Team {team} at 80% of daily AI budget",
            "action": "Review usage, consider rate limiting",
            "channel": ["slack-team"]
        },
        {
            "name": "idle_gpu_detected",
            "condition": "ai_gpu_utilization_percent < 10 for 30m",
            "message": "GPU {instance} idle for 30+ minutes",
            "action": "Auto-shutdown if non-production",
            "channel": ["slack-infra"]
        },
        {
            "name": "cost_per_request_high",
            "condition": "ai_request_cost_p95 > 2x baseline",
            "message": "Cost per request 2x above baseline for {endpoint}",
            "action": "Check for prompt bloat or model misconfiguration",
            "channel": ["slack-team"]
        }
    ],
    # Info: routine reporting on a schedule, no action required.
    "info": [
        {
            "name": "weekly_cost_report",
            "condition": "every Monday 9am",
            "message": "Weekly AI cost summary",
            "action": "Send report to all team leads",
            "channel": ["email", "slack-engineering"]
        }
    ]
}

# Demo: seed the detector with two weeks of stable history, then probe it
# with a normal day and an obvious spike, and finally dump the alert rules.
detector = CostAnomalyDetector()
baseline = [450, 460, 440, 470, 455, 430, 465, 450, 440, 470, 460, 450, 445, 455]
for day_cost in baseline:
    detector.record_daily_cost("search-team", day_cost)

for label, probe_cost in (("Normal day", 480), ("\nAnomaly detected", 1200)):
    outcome = detector.check_anomaly("search-team", probe_cost)
    print(f"{label}: {outcome}")

print("\n\nALERTING RULES:")
for level, level_rules in ALERTING_RULES.items():
    print(f"\n[{level.upper()}]")
    for entry in level_rules:
        print(f"  {entry['name']}: {entry['condition']}")
        print(f"    Action: {entry['action']}")
        print(f"    Channel: {entry['channel']}")

Team/Project Cost Allocation

# FinOps cost allocation system for AI teams
from dataclasses import dataclass, field
from typing import Dict, List

@dataclass
class AICostAllocator:
    """Allocate AI costs to teams/projects for accountability.
    Implements showback (visibility) and chargeback (billing)."""

    # Resource tagging requirements
    REQUIRED_TAGS = {
        "team": "Engineering team that owns the resource",
        "project": "Project or product the resource serves",
        "environment": "dev/staging/production",
        "cost_center": "Finance cost center code",
        "owner": "Individual responsible (email)"
    }

    allocations: Dict[str, Dict[str, float]] = field(default_factory=dict)

    def allocate_cost(self, resource_cost: float, tags: dict):
        """Allocate a resource cost to the appropriate team/project."""
        team = tags.get("team", "untagged")
        project = tags.get("project", "untagged")
        key = f"{team}/{project}"

        if key not in self.allocations:
            self.allocations[key] = {
                "compute": 0, "api": 0, "storage": 0,
                "network": 0, "other": 0, "total": 0
            }

        category = tags.get("cost_category", "other")
        self.allocations[key][category] = (
            self.allocations[key].get(category, 0) + resource_cost
        )
        self.allocations[key]["total"] += resource_cost

    def generate_report(self) -> str:
        """Generate cost allocation report."""
        lines = [
            "AI COST ALLOCATION REPORT",
            "=" * 70,
            f"{'Team/Project':<30} {'Compute':>10} {'API':>10} "
            f"{'Storage':>10} {'Total':>10}",
            "-" * 70
        ]

        total = 0
        sorted_allocs = sorted(
            self.allocations.items(),
            key=lambda x: x[1]["total"],
            reverse=True
        )

        for key, costs in sorted_allocs:
            total += costs["total"]
            lines.append(
                f"{key:<30} ${costs['compute']:>9,.0f} ${costs['api']:>9,.0f} "
                f"${costs['storage']:>9,.0f} ${costs['total']:>9,.0f}"
            )

        lines.append("-" * 70)
        lines.append(f"{'TOTAL':<30} ${total:>39,.0f}")

        # Untagged resources warning
        untagged = self.allocations.get("untagged/untagged", {}).get("total", 0)
        if untagged > 0:
            pct = untagged / total * 100
            lines.append(f"\n[WARNING] ${untagged:,.0f} ({pct:.1f}%) "
                         f"of costs are UNTAGGED. Fix resource tagging.")

        return "\n".join(lines)

# Example: feed one month of itemized costs -- including a deliberately
# untagged item -- through the allocator and print the showback report.
allocator = AICostAllocator()

monthly_items = [
    (8500, {"team": "search", "project": "product-search", "cost_category": "compute"}),
    (3200, {"team": "search", "project": "product-search", "cost_category": "api"}),
    (1200, {"team": "search", "project": "product-search", "cost_category": "storage"}),
    (12000, {"team": "chatbot", "project": "customer-support", "cost_category": "api"}),
    (4500, {"team": "chatbot", "project": "customer-support", "cost_category": "compute"}),
    (6800, {"team": "ml-platform", "project": "recommendation", "cost_category": "compute"}),
    (2100, {"team": "ml-platform", "project": "recommendation", "cost_category": "storage"}),
    (3400, {"team": "data", "project": "analytics", "cost_category": "compute"}),
    (1500, {"cost_category": "compute"}),  # Untagged!
]

for amount, resource_tags in monthly_items:
    allocator.allocate_cost(amount, resource_tags)

print(allocator.generate_report())
Key Takeaway: Effective AI cost monitoring requires three components: (1) Instrumentation -- emit cost metrics from every AI service using Prometheus counters and histograms. (2) Visibility -- build Grafana dashboards that show spend by team, model, and endpoint in real-time. (3) Enforcement -- set up anomaly detection and budget alerts that catch runaway costs before they become $50K surprises. Tag every resource with team and project for accurate cost allocation.