GPU Cost Optimization (Intermediate)

GPU compute is typically the single largest cost in AI systems, often 50-70% of total infrastructure spend. The difference between a well-optimized GPU strategy and a naive one can be 3-5x in monthly costs. This lesson provides real pricing data, instance selection frameworks, and GPU sharing techniques that production teams use to cut GPU bills dramatically.

GPU Pricing Comparison: AWS vs GCP vs Azure (2025 Prices)

These are real on-demand hourly prices. Spot and reserved pricing follows below.

| GPU | VRAM | AWS ($/hr) | GCP ($/hr) | Azure ($/hr) | Best For |
|---|---|---|---|---|---|
| NVIDIA T4 | 16 GB | $0.526 | $0.35 | $0.526 | Small model inference, dev/test |
| NVIDIA A10G | 24 GB | $1.006 | $1.10 | $1.10 | Medium inference, fine-tuning |
| NVIDIA L4 | 24 GB | $0.725 | $0.70 | N/A | Efficient inference (Ada arch) |
| NVIDIA A100 40GB | 40 GB | $3.67 | $3.67 | $3.40 | Training, large model inference |
| NVIDIA A100 80GB | 80 GB | $5.12 | $5.07 | $4.60 | LLM training, large batch inference |
| NVIDIA H100 | 80 GB | $8.22 | $8.68 | $7.35 | LLM training, high-throughput inference |

On-Demand vs Spot vs Reserved: Decision Framework

# GPU pricing strategy selector
from dataclasses import dataclass
from enum import Enum
from typing import Optional

# Purchasing models offered by the major clouds. The .value strings double as
# the dict keys in GPUPricingAnalysis.recommend() output, so they must not change.
class PricingStrategy(Enum):
    """Cloud GPU purchasing model."""
    ON_DEMAND = "on_demand"
    SPOT = "spot"           # AWS Spot / GCP Preemptible / Azure Spot
    RESERVED_1YR = "reserved_1yr"
    RESERVED_3YR = "reserved_3yr"
    SAVINGS_PLAN = "savings_plan"

@dataclass
class GPUPricingAnalysis:
    """Compare pricing strategies for a GPU workload.

    Given an on-demand hourly price and expected monthly hours, computes the
    projected monthly cost under each PricingStrategy and recommends one based
    on workload type and utilization.
    """

    gpu_type: str
    on_demand_hourly: float
    hours_per_month: float
    workload_type: str  # "training", "inference", "dev"

    # Typical discount percentages (vary by instance/region)
    SPOT_DISCOUNT = 0.60       # 60% off on-demand (can be 50-90%)
    RESERVED_1YR_DISCOUNT = 0.35  # 35% off on-demand
    RESERVED_3YR_DISCOUNT = 0.55  # 55% off on-demand
    SAVINGS_PLAN_DISCOUNT = 0.30  # 30% off on-demand

    def monthly_cost(self, strategy: PricingStrategy) -> float:
        """Projected monthly spend (USD) under the given pricing strategy."""
        discount_for = {
            PricingStrategy.ON_DEMAND: 0,
            PricingStrategy.SPOT: self.SPOT_DISCOUNT,
            PricingStrategy.RESERVED_1YR: self.RESERVED_1YR_DISCOUNT,
            PricingStrategy.RESERVED_3YR: self.RESERVED_3YR_DISCOUNT,
            PricingStrategy.SAVINGS_PLAN: self.SAVINGS_PLAN_DISCOUNT,
        }
        discounted_hourly = self.on_demand_hourly * (1 - discount_for[strategy])
        return discounted_hourly * self.hours_per_month

    def recommend(self) -> dict:
        """Recommend the best pricing strategy for this workload.

        Returns a dict with the chosen strategy, a human-readable reason,
        formatted monthly costs per strategy, and the savings vs on-demand.
        """
        cost_by_strategy = {s.value: self.monthly_cost(s) for s in PricingStrategy}
        baseline = cost_by_strategy["on_demand"]

        # Decision logic: training tolerates interruptions, so spot wins there;
        # otherwise pick by utilization level.
        if self.workload_type == "training":
            chosen = PricingStrategy.SPOT
            reason = ("Training jobs can checkpoint and resume. "
                      "Spot interruptions add 5-15% overhead but save 60%.")
        elif self.hours_per_month >= 600:  # >83% utilization
            chosen = PricingStrategy.RESERVED_1YR
            reason = ("Steady-state inference with >83% utilization. "
                      "1-year reserved saves 35% with no interruption risk.")
        elif self.hours_per_month >= 400:  # 55%+ utilization
            chosen = PricingStrategy.SAVINGS_PLAN
            reason = ("Moderate utilization. Savings Plan gives 30% discount "
                      "with flexibility to change instance types.")
        else:
            chosen = PricingStrategy.ON_DEMAND
            reason = ("Low utilization (<55%). Use on-demand with aggressive "
                      "auto-scaling. Consider serverless inference.")

        saved = baseline - cost_by_strategy[chosen.value]
        return {
            "recommended": chosen.value,
            "reason": reason,
            "monthly_costs": {k: f"${v:,.0f}" for k, v in cost_by_strategy.items()},
            "savings_vs_on_demand": f"${saved:,.0f}/month",
            "annual_savings": f"${saved * 12:,.0f}/year",
        }

# Example: Compare strategies for different workloads
workloads = [
    GPUPricingAnalysis("A100 80GB", 5.12, 720, "training"),    # 24/7 training
    GPUPricingAnalysis("A10G", 1.006, 720, "inference"),        # 24/7 inference
    GPUPricingAnalysis("A10G", 1.006, 300, "inference"),        # Partial inference
    GPUPricingAnalysis("H100", 8.22, 200, "training"),          # Burst training
]

for w in workloads:
    rec = w.recommend()
    print(f"\n{'='*55}")
    print(f"GPU: {w.gpu_type} | {w.hours_per_month}hrs/mo | {w.workload_type}")
    print(f"Recommended: {rec['recommended']}")
    print(f"Reason: {rec['reason']}")
    print(f"Monthly costs by strategy:")
    for strategy, cost in rec['monthly_costs'].items():
        marker = " <-- RECOMMENDED" if strategy == rec['recommended'] else ""
        print(f"  {strategy:<20} {cost}{marker}")
    # FIX: recommend() already suffixes these strings with "/month" and
    # "/year"; appending "/mo" and "/yr" here printed "$X/month/mo".
    print(f"Savings: {rec['savings_vs_on_demand']} | {rec['annual_savings']}")

Right-Sizing GPU Instances

The most common GPU waste: using an A100 when a T4 would do. Here's how to right-size.

# GPU right-sizing benchmark tool
import time
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class GPURightSizer:
    """Find the cheapest GPU tier that satisfies a model's SLOs.

    Given one benchmark run on an A100 80GB, extrapolates latency and
    throughput to each tier via a relative-performance factor, filters by
    VRAM fit and the latency/throughput requirements, and returns viable
    tiers ordered by hourly price.
    """

    model_name: str
    model_size_gb: float
    required_latency_p99_ms: float
    required_throughput_rps: float  # requests per second

    # GPU tier performance estimates (relative to A100)
    GPU_TIERS = {
        "T4":       {"vram_gb": 16, "hourly": 0.526, "relative_perf": 0.25},
        "A10G":     {"vram_gb": 24, "hourly": 1.006, "relative_perf": 0.45},
        "L4":       {"vram_gb": 24, "hourly": 0.725, "relative_perf": 0.50},
        "A100_40":  {"vram_gb": 40, "hourly": 3.670, "relative_perf": 0.85},
        "A100_80":  {"vram_gb": 80, "hourly": 5.120, "relative_perf": 1.00},
        "H100":     {"vram_gb": 80, "hourly": 8.220, "relative_perf": 1.80},
    }

    def find_cheapest_viable(self, a100_latency_ms: float,
                              a100_throughput_rps: float) -> List[dict]:
        """Return viable GPU tiers sorted cheapest-first.

        a100_latency_ms / a100_throughput_rps are measured on an A100 80GB;
        other tiers are estimated by scaling with relative_perf.
        """
        candidates = []

        for tier, spec in self.GPU_TIERS.items():
            # Model must fit in GPU memory; only ~85% of VRAM is usable
            if self.model_size_gb > spec["vram_gb"] * 0.85:
                candidates.append({
                    "gpu": tier,
                    "status": "SKIP - model too large for VRAM",
                    "hourly": spec["hourly"],
                    "monthly": spec["hourly"] * 720,
                })
                continue

            # Extrapolate from the A100 benchmark: latency shrinks and
            # throughput grows proportionally with relative performance.
            perf = spec["relative_perf"]
            latency_est = a100_latency_ms / perf
            throughput_est = a100_throughput_rps * perf

            failures = []
            if latency_est > self.required_latency_p99_ms:
                failures.append(
                    f" - latency {latency_est:.0f}ms > {self.required_latency_p99_ms}ms")
            if throughput_est < self.required_throughput_rps:
                failures.append(
                    f" - throughput {throughput_est:.0f} < {self.required_throughput_rps} rps")
            status = "VIABLE" if not failures else "FAIL" + "".join(failures)

            candidates.append({
                "gpu": tier,
                "status": status,
                "est_latency_ms": round(latency_est, 1),
                "est_throughput_rps": round(throughput_est, 1),
                "hourly": spec["hourly"],
                "monthly": round(spec["hourly"] * 720, 0),
                # hourly / (rps * 3600 / 1000) = cost per 1000 requests
                "cost_per_1k_requests": round(
                    spec["hourly"] / (throughput_est * 3.6), 4
                ) if throughput_est > 0 else None
            })

        # Keep only tiers that pass both checks, cheapest hourly rate first
        viable = [c for c in candidates if c["status"] == "VIABLE"]
        viable.sort(key=lambda c: c["hourly"])
        return viable

# Example: right-size a 7B parameter model
rightsizer = GPURightSizer(
    model_name="llama-7b-chat",
    model_size_gb=14,              # FP16 weights
    required_latency_p99_ms=500,   # 500ms p99
    required_throughput_rps=20,    # 20 requests/sec
)

# Feed in measured A100 benchmark numbers; get back every tier that meets
# the latency/throughput budget, cheapest first.
options = rightsizer.find_cheapest_viable(
    a100_latency_ms=85,     # 85ms on A100
    a100_throughput_rps=45,  # 45 rps on A100
)

print(f"Model: {rightsizer.model_name} ({rightsizer.model_size_gb}GB)")
print(f"Requirements: p99 < {rightsizer.required_latency_p99_ms}ms, "
      f"> {rightsizer.required_throughput_rps} rps\n")

if not options:
    print("No single GPU meets requirements. Consider multi-GPU or model optimization.")
else:
    print("VIABLE GPUs (cheapest first):")
    for row in options:
        print(f"  {row['gpu']:<12} ${row['hourly']:.3f}/hr  "
              f"${row['monthly']:,.0f}/mo  "
              f"latency:{row['est_latency_ms']}ms  "
              f"throughput:{row['est_throughput_rps']}rps  "
              f"$/1K req: ${row['cost_per_1k_requests']:.4f}")
    best, priciest = options[0], options[-1]
    saved_pct = (1 - best['monthly'] / priciest['monthly']) * 100
    print(f"\nBest choice: {best['gpu']} saves "
          f"{saved_pct:.0f}% vs {priciest['gpu']}")

GPU Sharing: MIG and Time-Slicing

A single GPU running one small model wastes 60-80% of its capacity. GPU sharing lets you run multiple workloads on one GPU.

# GPU sharing configuration for Kubernetes
# NVIDIA Multi-Instance GPU (MIG) - A100/H100 only
# Partitions a single GPU into isolated instances

# MIG Configuration for A100 80GB
# FIX: the previous table mixed A100 40GB profile names (3g.20gb, 2g.10gb,
# 1g.5gb) with 80GB capacities and overstated instance counts. Profiles follow
# NVIDIA's <compute-slices>g.<memory>gb naming; the A100 80GB has 7 compute
# slices, so the homogeneous partitions are:
MIG_PROFILES_A100_80GB = {
    "7g.80gb": {"instances": 1, "memory": "80GB", "use_case": "Large LLM inference"},
    "4g.40gb": {"instances": 1, "memory": "40GB", "use_case": "Medium models"},
    "3g.40gb": {"instances": 2, "memory": "40GB each", "use_case": "Small models"},
    "2g.20gb": {"instances": 3, "memory": "20GB each", "use_case": "Tiny models, embeddings"},
    "1g.10gb": {"instances": 7, "memory": "10GB each", "use_case": "Micro services"},
}

# Example: 1 A100 serving 7 different microservices (7 x 1g.10gb slices)
# Cost without MIG: 7 x T4 = 7 x $0.526/hr = $3.68/hr
# Cost with MIG:    1 x A100 = $5.12/hr
# MIG gives hard isolation (dedicated compute and memory slices), unlike
# time-slicing; most cost effective when each workload fits in a single slice

# Kubernetes GPU sharing with time-slicing
# Works with any NVIDIA GPU, no hardware partitioning
# NOTE(review): unlike MIG, time-slicing gives no memory or fault isolation --
# all pods share the full GPU and can interfere with each other.
# The string below holds two YAML documents: the device-plugin ConfigMap that
# advertises each physical GPU as 4 schedulable replicas, and an example Pod
# that requests one of those shared slices.
K8S_TIME_SLICING_CONFIG = """
# nvidia-device-plugin ConfigMap for GPU time-slicing
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-device-plugin
  namespace: kube-system
data:
  config.yaml: |
    version: v1
    sharing:
      timeSlicing:
        renameByDefault: false
        failRequestsGreaterThanOne: false
        resources:
          - name: nvidia.com/gpu
            replicas: 4  # Each physical GPU appears as 4 virtual GPUs

# Pod requesting a time-sliced GPU
---
apiVersion: v1
kind: Pod
metadata:
  name: ml-inference-small
spec:
  containers:
    - name: model-server
      image: model-server:latest
      resources:
        limits:
          nvidia.com/gpu: 1  # Gets 1/4 of a physical GPU
"""

# Cost savings calculator for GPU sharing
def calculate_sharing_savings(
    num_workloads: int,
    workload_gpu_util_pct: float,  # avg utilization per workload
    gpu_type: str = "A10G",
    gpu_hourly: float = 1.006
):
    """Calculate savings from GPU sharing vs dedicated GPUs.

    Args:
        num_workloads: number of independent workloads (must be >= 1).
        workload_gpu_util_pct: average GPU utilization of one workload, in percent.
        gpu_type: GPU model label, echoed back in the result.
        gpu_hourly: on-demand hourly price for one GPU of this type.

    Returns:
        dict with "gpu_type", "without_sharing", "with_sharing", and "savings".

    Raises:
        ValueError: if num_workloads < 1 (previously a ZeroDivisionError at 0).
    """
    import math

    if num_workloads < 1:
        raise ValueError("num_workloads must be >= 1")

    # Without sharing: one dedicated GPU per workload, billed 24/7 (720 h/mo)
    dedicated_cost = num_workloads * gpu_hourly * 720

    # With sharing: pack workloads onto the fewest GPUs that keep each at or
    # below a 70% utilization target. FIX: use ceil() -- the old
    # int(total/70) + 1 over-provisioned at exact multiples (e.g. 140% of
    # aggregate utilization yielded 3 GPUs instead of 2).
    total_util = num_workloads * workload_gpu_util_pct
    gpus_needed = max(1, math.ceil(total_util / 70))
    shared_cost = gpus_needed * gpu_hourly * 720

    savings = dedicated_cost - shared_cost
    savings_pct = (savings / dedicated_cost) * 100

    return {
        "gpu_type": gpu_type,  # echoed so reports are self-describing
        "without_sharing": {
            "gpus": num_workloads,
            "monthly_cost": f"${dedicated_cost:,.0f}"
        },
        "with_sharing": {
            "gpus": gpus_needed,
            "monthly_cost": f"${shared_cost:,.0f}"
        },
        "savings": f"${savings:,.0f}/month ({savings_pct:.0f}%)"
    }

# Example: 6 models each using ~15% GPU
analysis = calculate_sharing_savings(
    num_workloads=6,
    workload_gpu_util_pct=15,
    gpu_type="A10G",
    gpu_hourly=1.006,
)

# Print the before/after packing summary
print("GPU Sharing Savings Analysis:")
print(f"  Without sharing: {analysis['without_sharing']}")
print(f"  With sharing: {analysis['with_sharing']}")
print(f"  Savings: {analysis['savings']}")

Idle GPU Detection and Auto-Scaling

# Idle GPU detector and auto-shutdown system
import datetime
from dataclasses import dataclass, field
from typing import List, Dict

@dataclass
class IdleGPUDetector:
    """Detect and flag idle GPU instances for shutdown or downscaling.

    An instance is "idle" when its utilization is below idle_threshold_pct
    and it has stayed that way for at least idle_duration_minutes.
    """

    idle_threshold_pct: float = 5.0     # GPU util below this = idle
    idle_duration_minutes: int = 30     # Must be idle for this long
    check_interval_minutes: int = 5     # Polling cadence (informational here)

    def analyze_fleet(self, gpu_metrics: List[dict]) -> dict:
        """Analyze GPU fleet for idle instances.

        gpu_metrics format:
        [{"instance_id": "i-xxx", "gpu_type": "A10G",
          "utilization_pct": 3.2, "idle_since": datetime,
          "hourly_cost": 1.006, "team": "ml-search",
          "environment": "dev"}]

        Returns a summary dict: total/idle/active counts, formatted projected
        monthly waste, and one detail entry per confirmed-idle instance.
        """
        idle = []
        active = []
        total_waste = 0.0

        for gpu in gpu_metrics:
            if gpu["utilization_pct"] >= self.idle_threshold_pct:
                active.append(gpu)
                continue

            idle_minutes = self._idle_minutes(gpu.get("idle_since"))
            if idle_minutes < self.idle_duration_minutes:
                # Below the utilization threshold, but not for long enough to
                # call it idle yet -- treat as active
                active.append(gpu)
                continue

            monthly_waste = gpu["hourly_cost"] * 720  # 720 billable hrs/month
            total_waste += monthly_waste
            idle.append({
                "instance_id": gpu["instance_id"],
                "gpu_type": gpu["gpu_type"],
                "utilization": f"{gpu['utilization_pct']:.1f}%",
                "idle_for": f"{idle_minutes:.0f} minutes",
                "monthly_waste": f"${monthly_waste:,.0f}",
                # FIX: .get() instead of [] -- a metrics record missing its
                # ownership tags no longer crashes the whole fleet scan
                # (matches how _recommend_action already reads "environment")
                "team": gpu.get("team", "unknown"),
                "environment": gpu.get("environment", "unknown"),
                "action": self._recommend_action(gpu)
            })

        return {
            "total_gpus": len(gpu_metrics),
            "idle_gpus": len(idle),
            "active_gpus": len(active),
            "monthly_waste": f"${total_waste:,.0f}",
            "idle_instances": idle
        }

    @staticmethod
    def _idle_minutes(idle_since) -> float:
        """Minutes elapsed since idle_since; 0 if unknown.

        FIX: the old utcnow()-based subtraction raised TypeError for
        timezone-aware timestamps (and utcnow() is deprecated). Naive
        timestamps are still assumed to be UTC, preserving old behavior.
        """
        if not idle_since:
            return 0.0
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        if idle_since.tzinfo is None:
            now_utc = now_utc.replace(tzinfo=None)  # naive-to-naive, still UTC
        return (now_utc - idle_since).total_seconds() / 60

    def _recommend_action(self, gpu: dict) -> str:
        """Map the instance's environment tag to a suggested remediation."""
        env = gpu.get("environment", "unknown")
        if env in ("dev", "staging"):
            return "AUTO-SHUTDOWN: non-production, schedule restart"
        elif env == "production":
            return "ALERT: investigate if traffic dropped or scaling issue"
        return "REVIEW: check workload schedule"

# Auto-scaling policy for GPU inference
# Declarative policy consumed by an autoscaler: scale-up reacts on a short
# (5 min) utilization window, scale-down on a longer (15 min) window with a
# bigger cooldown so brief traffic dips don't trigger flapping; the schedule
# section sets a time-of-day floor on instance counts.
GPU_AUTOSCALING_POLICY = {
    "scale_up": {
        "metric": "gpu_utilization_avg_5min",
        "threshold": 70,           # Scale up at 70% util
        "cooldown_seconds": 300,   # Wait 5 min between scale-ups
        "increment": 1,            # Add 1 GPU at a time
        "max_instances": 10
    },
    "scale_down": {
        "metric": "gpu_utilization_avg_15min",
        "threshold": 20,           # Scale down at 20% util
        "cooldown_seconds": 600,   # Wait 10 min (avoid flapping)
        "decrement": 1,
        "min_instances": 1
    },
    "schedule": {
        # Hour ranges are in local time; off-peak wraps past midnight
        "weekday_peak": {"hours": "8-20", "min_instances": 3},
        "weekday_off_peak": {"hours": "20-8", "min_instances": 1},
        "weekend": {"min_instances": 1}
    },
    "estimated_savings": "40-60% vs always-on at peak capacity"
}

# Pretty-print each policy phase for review
print("GPU Auto-Scaling Policy:")
for phase, config in GPU_AUTOSCALING_POLICY.items():
    print(f"\n  {phase}: {config}")
Key Takeaway: GPU cost optimization has three high-impact levers: (1) Right-size your instances -- benchmark on smaller GPUs before defaulting to A100s. (2) Use spot instances for training (60% savings) and reserved instances for steady inference (35% savings). (3) Share GPUs across low-utilization workloads with time-slicing or MIG (50-75% savings). Most teams can cut GPU costs by 40-60% by applying these three strategies.