API Cost Management (Intermediate)
LLM API costs are the fastest-growing line item in many engineering budgets. A single chatbot feature using GPT-4 can cost $50K+/month at scale. This lesson shows you how to reduce API costs by 50-70% through token optimization, prompt compression, smart routing, semantic caching, and production-ready budget enforcement. Includes a complete cost tracking system you can deploy today.
LLM API Pricing Reference (2025)
| Model | Input $/1M tokens | Output $/1M tokens | Cost per avg request* | Monthly at 100K req/day |
|---|---|---|---|---|
| GPT-4o | $2.50 | $10.00 | $0.0075 | $22,500 |
| GPT-4o-mini | $0.15 | $0.60 | $0.0004 | $1,200 |
| Claude 3.5 Sonnet | $3.00 | $15.00 | $0.0105 | $31,500 |
| Claude 3.5 Haiku | $0.80 | $4.00 | $0.0028 | $8,400 |
| Gemini 1.5 Pro | $1.25 | $5.00 | $0.0038 | $11,250 |
| Gemini 1.5 Flash | $0.075 | $0.30 | $0.0002 | $600 |
| Llama 3 70B (self-hosted) | ~$0.50** | ~$0.50** | $0.0005 | $1,500 |
*Avg request = 1,000 input tokens + 500 output tokens. **Self-hosted costs based on A100 amortized.
Token Optimization Strategies
# Token optimization techniques that reduce API costs immediately
from dataclasses import dataclass
from typing import Dict, List
import re
@dataclass
class TokenOptimizer:
    """Production token optimization for LLM API calls.

    Typical savings: 30-50% token reduction.

    NOTE: "token" counts here are approximated by whitespace-separated
    word counts; use a real tokenizer (e.g. tiktoken) for exact numbers.
    """

    # Filler phrases that add tokens without changing model behavior.
    _FILLER_PATTERNS = (
        r'\bplease\b',
        r'\bkindly\b',
        r'\bmake sure to\b',
        r'\bit is important that\b',
        r'\byou should always\b',
        r'\bremember that\b',
    )

    def optimize_system_prompt(self, prompt: str) -> dict:
        """Compress system prompt without losing effectiveness.

        System prompts are sent with EVERY request - optimizing them
        has the highest ROI.

        Args:
            prompt: Original system prompt text.

        Returns:
            dict with approximate original/optimized token counts, the
            percentage reduction, the compressed prompt, and the list of
            techniques applied.
        """
        original_words = len(prompt.split())
        optimizations = []
        compressed = prompt

        # 1. Remove filler words and redundant instructions.
        #    (Log the technique once, not once per pattern.)
        for pattern in self._FILLER_PATTERNS:
            compressed = re.sub(pattern, '', compressed, flags=re.I)
        optimizations.append("Removed filler words")

        # 2. Collapse whitespace runs (also cleans up gaps left by step 1).
        compressed = re.sub(r'\s+', ' ', compressed).strip()
        optimizations.append("Normalized whitespace")

        # 3. Use abbreviations for repeated terms (case-sensitive, literal).
        compressed = compressed.replace("the user", "user")
        compressed = compressed.replace("the response", "response")
        optimizations.append("Shortened repeated terms")

        optimized_words = len(compressed.split())
        # Guard against an empty prompt to avoid ZeroDivisionError.
        if original_words:
            reduction = (1 - optimized_words / original_words) * 100
        else:
            reduction = 0.0
        return {
            "original_tokens": original_words,  # approximate (word count)
            "optimized_tokens": optimized_words,
            "reduction": f"{reduction:.0f}%",
            "optimized_prompt": compressed,
            "techniques_applied": optimizations
        }

    def truncate_context(self, context: str, max_tokens: int = 2000,
                         strategy: str = "smart") -> str:
        """Reduce context window size while preserving key information.

        Args:
            context: Text to truncate (split on whitespace; words are
                used as a token approximation).
            max_tokens: Maximum number of words to keep.
            strategy: "head" keeps the start, "tail" keeps the end,
                "smart" keeps the first 30% and last 70% of the budget
                with a truncation marker in between. Any other value
                falls back to "head" behavior without a marker.
        """
        words = context.split()
        if len(words) <= max_tokens:
            return context
        if strategy == "head":
            return " ".join(words[:max_tokens])
        elif strategy == "tail":
            return " ".join(words[-max_tokens:])
        elif strategy == "smart":
            # Most relevant info is usually at the beginning and end,
            # so keep 30% of the budget from the head, 70% from the tail.
            head_size = int(max_tokens * 0.3)
            tail_size = max_tokens - head_size
            head = " ".join(words[:head_size])
            tail = " ".join(words[-tail_size:])
            return f"{head}\n[...truncated {len(words) - max_tokens} tokens...]\n{tail}"
        return " ".join(words[:max_tokens])

    def limit_output_tokens(self, task_type: str) -> dict:
        """Return an appropriate max_tokens setting for a task type.

        Most teams use the default (4096) for everything, wasting tokens.
        Unknown task types get a conservative 500-token limit.
        """
        limits = {
            "classification": {"max_tokens": 10, "reason": "Just need a label"},
            "yes_no": {"max_tokens": 5, "reason": "Binary answer"},
            "extraction": {"max_tokens": 200, "reason": "Structured output"},
            "summary": {"max_tokens": 300, "reason": "Concise summary"},
            "short_answer": {"max_tokens": 150, "reason": "Brief response"},
            "analysis": {"max_tokens": 800, "reason": "Detailed but bounded"},
            "code_generation": {"max_tokens": 1500, "reason": "Code with explanation"},
            "creative_writing": {"max_tokens": 2000, "reason": "Longer form content"},
        }
        return limits.get(task_type, {"max_tokens": 500, "reason": "Default safe limit"})
# Demo: shrink a verbose system prompt and project the monthly savings.
optimizer = TokenOptimizer()

# A deliberately bloated prompt full of filler phrases.
bloated_prompt = """
You are a helpful AI assistant. Please make sure to always respond in a
professional and friendly manner. It is important that you provide accurate
information. Remember that you should always cite sources when possible.
You should always format your responses in a clear and organized way.
Please kindly make sure to keep responses concise and to the point.
It is important that you do not make up information. You should always
let the user know if you are not sure about something.
"""

stats = optimizer.optimize_system_prompt(bloated_prompt)
tokens_saved = stats['original_tokens'] - stats['optimized_tokens']
# Scale per-request savings to a month at 100K requests/day.
monthly_tokens_saved = tokens_saved * 100_000 * 30

print("System prompt optimization:")
print(f" Original: ~{stats['original_tokens']} tokens")
print(f" Optimized: ~{stats['optimized_tokens']} tokens")
print(f" Reduction: {stats['reduction']}")
print(" At 100K requests/day, GPT-4o:")
print(f" Monthly token savings: {monthly_tokens_saved:,.0f}")
print(f" Monthly cost savings: ${monthly_tokens_saved * 2.5 / 1_000_000:,.0f}")
Budget Alerts and Rate Limiting
# Production budget enforcement system
import time
from dataclasses import dataclass, field
from typing import Dict, Optional
from datetime import datetime, timedelta
@dataclass
class APIBudgetEnforcer:
    """Enforce spending limits on AI API usage per team/project.

    Prevents $100K surprise bills by blocking requests that would
    exceed a team's configured limits and emitting threshold alerts.

    NOTE(review): spending is a single running total per team with no
    day/month rollover — callers must reset it at period boundaries.
    """

    budgets: Dict[str, dict] = field(default_factory=dict)      # team -> limit config
    spending: Dict[str, float] = field(default_factory=dict)    # team -> accumulated spend ($)
    alerts_sent: Dict[str, list] = field(default_factory=dict)  # team -> thresholds already fired

    def set_budget(self, team: str, daily_limit: float,
                   monthly_limit: float,
                   alert_thresholds: list = None):
        """Set spending limits for a team.

        Args:
            team: Team identifier.
            daily_limit: Maximum daily spend in dollars.
            monthly_limit: Maximum monthly spend in dollars.
            alert_thresholds: Utilization fractions at which to alert;
                defaults to [0.5, 0.8, 0.95, 1.0].
        """
        self.budgets[team] = {
            "daily_limit": daily_limit,
            "monthly_limit": monthly_limit,
            "alert_thresholds": alert_thresholds or [0.5, 0.8, 0.95, 1.0],
            # NOTE: utcnow() is deprecated in Python 3.12+; kept here for
            # the stable naive-UTC timestamp format existing records use.
            "created": datetime.utcnow().isoformat()
        }
        self.spending.setdefault(team, 0.0)
        self.alerts_sent.setdefault(team, [])

    def check_budget(self, team: str, estimated_cost: float) -> dict:
        """Check if a request should be allowed based on budget.

        Returns:
            dict with "allowed"; when blocked it includes reason /
            current_spend / limit / action, otherwise utilization,
            remaining budget, and any newly crossed alerts.
        """
        if team not in self.budgets:
            return {"allowed": True, "warning": "No budget set for team"}
        budget = self.budgets[team]
        current_spend = self.spending.get(team, 0)
        projected = current_spend + estimated_cost
        # Hard stop: daily limit.
        if projected > budget["daily_limit"]:
            return {
                "allowed": False,
                "reason": "daily_limit_exceeded",
                "current_spend": f"${current_spend:.2f}",
                "limit": f"${budget['daily_limit']:.2f}",
                "action": "Request blocked. Contact platform team for limit increase."
            }
        # Hard stop: monthly limit (was stored but never enforced before).
        if projected > budget["monthly_limit"]:
            return {
                "allowed": False,
                "reason": "monthly_limit_exceeded",
                "current_spend": f"${current_spend:.2f}",
                "limit": f"${budget['monthly_limit']:.2f}",
                "action": "Request blocked. Contact platform team for limit increase."
            }
        # Soft alerts: each threshold fires at most once per team.
        # Guard against a zero daily limit to avoid ZeroDivisionError.
        if budget["daily_limit"]:
            utilization = projected / budget["daily_limit"]
        else:
            utilization = 1.0
        alerts = []
        for threshold in budget["alert_thresholds"]:
            if utilization >= threshold and threshold not in self.alerts_sent[team]:
                alerts.append({
                    "level": "critical" if threshold >= 0.95 else "warning",
                    "message": f"Team '{team}' at {utilization:.0%} of daily budget "
                               f"(${projected:.2f} / ${budget['daily_limit']:.2f})"
                })
                self.alerts_sent[team].append(threshold)
        return {
            "allowed": True,
            "utilization": f"{utilization:.0%}",
            "remaining": f"${budget['daily_limit'] - projected:.2f}",
            "alerts": alerts
        }

    def record_spend(self, team: str, cost: float):
        """Add an actual cost to the team's running total."""
        self.spending[team] = self.spending.get(team, 0) + cost
# Example: configure budgets for engineering teams.
enforcer = APIBudgetEnforcer()
enforcer.set_budget("search-team", daily_limit=200, monthly_limit=5000)
enforcer.set_budget("chatbot-team", daily_limit=500, monthly_limit=12000)
enforcer.set_budget("data-team", daily_limit=100, monthly_limit=2500)

# Simulate a stream of $5 requests until one gets blocked.
for attempt in range(1, 51):
    verdict = enforcer.check_budget("search-team", estimated_cost=5.0)
    if not verdict["allowed"]:
        print(f" BLOCKED at request {attempt}: {verdict['reason']}")
        print(f" Spend: {verdict['current_spend']} / {verdict['limit']}")
        break
    enforcer.record_spend("search-team", 5.0)
    for alert in verdict.get("alerts") or []:
        print(f" [{alert['level'].upper()}] {alert['message']}")
Production Cost Tracking System
# Complete API cost tracking system - deploy this today
import time
import json
from dataclasses import dataclass, field
from typing import Dict, List, Optional
from datetime import datetime
@dataclass
class APICostTracker:
    """Track every AI API call's cost with team/project attribution.

    This is the foundation of AI cost management: every call is recorded
    with enough metadata to produce per-team and per-model rollups and a
    finance-ready chargeback report.
    """

    records: List[dict] = field(default_factory=list)  # one dict per recorded call

    # Pricing table in $ per 1M tokens (update monthly).
    PRICING = {
        "gpt-4o": {"input": 2.50, "output": 10.00},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "claude-3.5-sonnet": {"input": 3.00, "output": 15.00},
        "claude-3.5-haiku": {"input": 0.80, "output": 4.00},
        "text-embedding-3-small": {"input": 0.02, "output": 0.0},
    }

    def record_call(self, model: str, input_tokens: int,
                    output_tokens: int, team: str, project: str,
                    feature: str = "", user_id: str = "",
                    cached: bool = False):
        """Record an API call with full cost attribution.

        Models missing from PRICING are priced at $0 (no rates on file).
        Cached requests cost 50% less on input (OpenAI prompt caching).
        Returns the stored record dict.
        """
        rates = self.PRICING.get(model, {"input": 0, "output": 0})
        discount = 0.5 if cached else 1.0
        cost_in = (input_tokens / 1_000_000) * rates["input"] * discount
        cost_out = (output_tokens / 1_000_000) * rates["output"]
        entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "model": model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "input_cost": round(cost_in, 6),
            "output_cost": round(cost_out, 6),
            "total_cost": round(cost_in + cost_out, 6),
            "team": team,
            "project": project,
            "feature": feature,
            "user_id": user_id,
            "cached": cached
        }
        self.records.append(entry)
        return entry

    def summary_by_team(self) -> Dict[str, dict]:
        """Get cost summary grouped by team."""
        by_team: Dict[str, dict] = {}
        for rec in self.records:
            bucket = by_team.setdefault(rec["team"], {
                "total_cost": 0, "total_requests": 0,
                "total_input_tokens": 0, "total_output_tokens": 0,
                "models_used": set()
            })
            bucket["total_cost"] += rec["total_cost"]
            bucket["total_requests"] += 1
            bucket["total_input_tokens"] += rec["input_tokens"]
            bucket["total_output_tokens"] += rec["output_tokens"]
            bucket["models_used"].add(rec["model"])
        for bucket in by_team.values():
            # Sets are not serializable; also derive the per-request average.
            bucket["models_used"] = list(bucket["models_used"])
            bucket["avg_cost_per_request"] = round(
                bucket["total_cost"] / max(1, bucket["total_requests"]), 6
            )
        return by_team

    def summary_by_model(self) -> Dict[str, dict]:
        """Get cost summary grouped by model."""
        by_model: Dict[str, dict] = {}
        for rec in self.records:
            bucket = by_model.setdefault(rec["model"], {"total_cost": 0, "requests": 0})
            bucket["total_cost"] += rec["total_cost"]
            bucket["requests"] += 1
        return by_model

    def chargeback_report(self) -> str:
        """Generate a plain-text chargeback report for finance."""
        by_team = self.summary_by_team()
        grand_total = sum(t["total_cost"] for t in by_team.values())
        ranked = sorted(by_team.items(),
                        key=lambda kv: kv[1]["total_cost"],
                        reverse=True)
        lines = ["AI API CHARGEBACK REPORT", "=" * 50]
        for team, data in ranked:
            share = (data["total_cost"] / grand_total * 100) if grand_total > 0 else 0
            lines.append(
                f" {team:<20} ${data['total_cost']:>10,.4f} "
                f"({share:>5.1f}%) "
                f"{data['total_requests']:>6,} requests "
                f"avg ${data['avg_cost_per_request']:.6f}/req"
            )
        lines.append(f"\n {'TOTAL':<20} ${grand_total:>10,.4f}")
        return "\n".join(lines)
# Example usage: simulate traffic from several teams and print reports.
tracker = APICostTracker()
import random

# (team, project, model, input_tokens, output_tokens) call profiles.
call_profiles = [
    ("search-team", "product-search", "gpt-4o-mini", 300, 100),
    ("search-team", "product-search", "text-embedding-3-small", 500, 0),
    ("chatbot-team", "customer-support", "gpt-4o", 800, 400),
    ("chatbot-team", "customer-support", "gpt-4o-mini", 200, 150),
    ("data-team", "report-gen", "claude-3.5-sonnet", 2000, 1500),
]

for _ in range(100):
    team, project, model, n_in, n_out = random.choice(call_profiles)
    tracker.record_call(model=model, input_tokens=n_in, output_tokens=n_out,
                        team=team, project=project)

print(tracker.chargeback_report())
print("\n\nCost by Model:")
for model, stats in tracker.summary_by_model().items():
    print(f" {model:<30} ${stats['total_cost']:.4f} ({stats['requests']} calls)")
Real-World Warning: A startup's engineer accidentally left a GPT-4 loop running overnight with verbose logging prompts. Each iteration sent 8K tokens of system prompt + logs. By morning: 2.3M API calls, $47,000 bill. Prevention: always set max_tokens, implement rate limits per user/feature, and set daily budget caps with hard enforcement (not just alerts).
Key Takeaway: API cost management requires three layers: (1) Token optimization at the prompt level (30-50% savings), (2) Smart routing to send simple queries to cheap models (40-60% savings), and (3) Budget enforcement to prevent runaway costs. The cost tracking system in this lesson is the minimum viable infrastructure every team using AI APIs should deploy.
Lilly Tech Systems