GPU Cost Optimization Intermediate
GPU compute is typically the single largest cost in AI systems, often 50-70% of total infrastructure spend. The difference between a well-optimized GPU strategy and a naive one can be 3-5x in monthly costs. This lesson provides real pricing data, instance selection frameworks, and GPU sharing techniques that production teams use to cut GPU bills dramatically.
GPU Pricing Comparison: AWS vs GCP vs Azure (2025 Prices)
These are representative on-demand hourly list prices (actual rates vary by region). Spot and reserved pricing follows below.
| GPU | VRAM | AWS ($/hr) | GCP ($/hr) | Azure ($/hr) | Best For |
|---|---|---|---|---|---|
| NVIDIA T4 | 16 GB | $0.526 | $0.35 | $0.526 | Small model inference, dev/test |
| NVIDIA A10G | 24 GB | $1.006 | $1.10 | $1.10 | Medium inference, fine-tuning |
| NVIDIA L4 | 24 GB | $0.725 | $0.70 | N/A | Efficient inference (Ada arch) |
| NVIDIA A100 40GB | 40 GB | $3.67 | $3.67 | $3.40 | Training, large model inference |
| NVIDIA A100 80GB | 80 GB | $5.12 | $5.07 | $4.60 | LLM training, large batch inference |
| NVIDIA H100 | 80 GB | $8.22 | $8.68 | $7.35 | LLM training, high-throughput inference |
On-Demand vs Spot vs Reserved: Decision Framework
# GPU pricing strategy selector
from dataclasses import dataclass
from enum import Enum
from typing import Optional
class PricingStrategy(Enum):
    """Purchasing models for GPU capacity.

    Member values are the string keys used in the per-strategy cost
    dictionaries built by GPUPricingAnalysis.recommend().
    """

    ON_DEMAND = "on_demand"
    SPOT = "spot"  # AWS Spot / GCP Preemptible / Azure Spot
    RESERVED_1YR = "reserved_1yr"
    RESERVED_3YR = "reserved_3yr"
    SAVINGS_PLAN = "savings_plan"
@dataclass
class GPUPricingAnalysis:
    """Compare pricing strategies for a GPU workload.

    Attributes:
        gpu_type: Human-readable GPU name (label only; not used in math).
        on_demand_hourly: On-demand list price in $/hr.
        hours_per_month: Expected GPU-hours per month (720 = 24/7).
        workload_type: "training", "inference", or "dev" -- drives the
            recommendation logic in recommend().
    """

    gpu_type: str
    on_demand_hourly: float
    hours_per_month: float
    workload_type: str  # "training", "inference", "dev"

    # Typical discount fractions vs on-demand (vary by instance/region).
    # NOTE: deliberately unannotated so the dataclass does not treat them
    # as instance fields.
    SPOT_DISCOUNT = 0.60  # 60% off on-demand (can be 50-90%)
    RESERVED_1YR_DISCOUNT = 0.35  # 35% off on-demand
    RESERVED_3YR_DISCOUNT = 0.55  # 55% off on-demand
    SAVINGS_PLAN_DISCOUNT = 0.30  # 30% off on-demand

    def monthly_cost(self, strategy: PricingStrategy) -> float:
        """Return the estimated monthly cost in dollars under `strategy`."""
        discounts = {
            PricingStrategy.ON_DEMAND: 0,
            PricingStrategy.SPOT: self.SPOT_DISCOUNT,
            PricingStrategy.RESERVED_1YR: self.RESERVED_1YR_DISCOUNT,
            PricingStrategy.RESERVED_3YR: self.RESERVED_3YR_DISCOUNT,
            PricingStrategy.SAVINGS_PLAN: self.SAVINGS_PLAN_DISCOUNT,
        }
        hourly = self.on_demand_hourly * (1 - discounts[strategy])
        return hourly * self.hours_per_month

    def recommend(self) -> dict:
        """Recommend the best pricing strategy based on workload.

        Returns a dict with the recommended strategy name, the reasoning,
        formatted per-strategy monthly costs, and the monthly/annual savings
        vs pure on-demand. Savings values are plain dollar strings with no
        unit suffix, so callers can append their own "/mo" or "/yr" label.
        """
        results = {s.value: self.monthly_cost(s) for s in PricingStrategy}
        on_demand_cost = results["on_demand"]

        # Decision logic: interruptible training -> spot; steady inference
        # -> committed pricing; low utilization -> stay on-demand.
        if self.workload_type == "training":
            recommended = PricingStrategy.SPOT
            reason = ("Training jobs can checkpoint and resume. "
                      "Spot interruptions add 5-15% overhead but save 60%.")
        elif self.hours_per_month >= 600:  # >83% utilization (600/720)
            recommended = PricingStrategy.RESERVED_1YR
            reason = ("Steady-state inference with >83% utilization. "
                      "1-year reserved saves 35% with no interruption risk.")
        elif self.hours_per_month >= 400:  # 55%+ utilization
            recommended = PricingStrategy.SAVINGS_PLAN
            reason = ("Moderate utilization. Savings Plan gives 30% discount "
                      "with flexibility to change instance types.")
        else:
            recommended = PricingStrategy.ON_DEMAND
            reason = ("Low utilization (<55%). Use on-demand with aggressive "
                      "auto-scaling. Consider serverless inference.")

        monthly_savings = on_demand_cost - results[recommended.value]
        return {
            "recommended": recommended.value,
            "reason": reason,
            "monthly_costs": {k: f"${v:,.0f}" for k, v in results.items()},
            # BUG FIX: these values previously embedded "/month" and "/year"
            # while the calling print code appends "/mo" and "/yr", producing
            # doubled units like "$2,212/month/mo". Return bare amounts.
            "savings_vs_on_demand": f"${monthly_savings:,.0f}",
            "annual_savings": f"${monthly_savings * 12:,.0f}",
        }
# Example: Compare strategies for different workloads
workloads = [
    GPUPricingAnalysis("A100 80GB", 5.12, 720, "training"),  # 24/7 training
    GPUPricingAnalysis("A10G", 1.006, 720, "inference"),  # 24/7 inference
    GPUPricingAnalysis("A10G", 1.006, 300, "inference"),  # Partial inference
    GPUPricingAnalysis("H100", 8.22, 200, "training"),  # Burst training
]

separator = "=" * 55
for workload in workloads:
    report = workload.recommend()
    recommended_name = report["recommended"]
    print(f"\n{separator}")
    print(f"GPU: {workload.gpu_type} | {workload.hours_per_month}hrs/mo | {workload.workload_type}")
    print(f"Recommended: {recommended_name}")
    print(f"Reason: {report['reason']}")
    print("Monthly costs by strategy:")
    for strategy_name, cost in report["monthly_costs"].items():
        suffix = " <-- RECOMMENDED" if strategy_name == recommended_name else ""
        print(f" {strategy_name:<20} {cost}{suffix}")
    print(f"Savings: {report['savings_vs_on_demand']}/mo | {report['annual_savings']}/yr")
Right-Sizing GPU Instances
The most common GPU waste: using an A100 when a T4 would do. Here's how to right-size.
# GPU right-sizing benchmark tool
import math
import time
from dataclasses import dataclass
from typing import Dict, List
@dataclass
class GPURightSizer:
    """Benchmark-driven GPU tier selection.

    Scales measured A100 benchmark numbers across GPU tiers to find the
    cheapest instance that still meets the workload's latency and
    throughput requirements.
    """

    model_name: str
    model_size_gb: float
    required_latency_p99_ms: float
    required_throughput_rps: float  # requests per second

    # GPU tier performance estimates (relative to A100)
    GPU_TIERS = {
        "T4": {"vram_gb": 16, "hourly": 0.526, "relative_perf": 0.25},
        "A10G": {"vram_gb": 24, "hourly": 1.006, "relative_perf": 0.45},
        "L4": {"vram_gb": 24, "hourly": 0.725, "relative_perf": 0.50},
        "A100_40": {"vram_gb": 40, "hourly": 3.670, "relative_perf": 0.85},
        "A100_80": {"vram_gb": 80, "hourly": 5.120, "relative_perf": 1.00},
        "H100": {"vram_gb": 80, "hourly": 8.220, "relative_perf": 1.80},
    }

    def find_cheapest_viable(self, a100_latency_ms: float,
                             a100_throughput_rps: float) -> List[dict]:
        """Given A100 benchmark results, estimate per-tier performance and
        return only the tiers that meet requirements, cheapest first."""
        candidates = []
        for tier_name, tier in self.GPU_TIERS.items():
            # Model must fit in GPU memory; assume only 85% of VRAM usable.
            if self.model_size_gb > tier["vram_gb"] * 0.85:
                candidates.append({
                    "gpu": tier_name,
                    "status": "SKIP - model too large for VRAM",
                    "hourly": tier["hourly"],
                    "monthly": tier["hourly"] * 720,
                })
                continue

            # Linear scaling from the A100 baseline by relative performance.
            latency = a100_latency_ms / tier["relative_perf"]
            throughput = a100_throughput_rps * tier["relative_perf"]

            problems = []
            if latency > self.required_latency_p99_ms:
                problems.append(f" - latency {latency:.0f}ms > {self.required_latency_p99_ms}ms")
            if throughput < self.required_throughput_rps:
                problems.append(f" - throughput {throughput:.0f} < {self.required_throughput_rps} rps")
            status = ("VIABLE" if not problems else "FAIL") + "".join(problems)

            candidates.append({
                "gpu": tier_name,
                "status": status,
                "est_latency_ms": round(latency, 1),
                "est_throughput_rps": round(throughput, 1),
                "hourly": tier["hourly"],
                "monthly": round(tier["hourly"] * 720, 0),
                # $/1k requests = hourly / (rps * 3600 / 1000)
                "cost_per_1k_requests": (
                    round(tier["hourly"] / (throughput * 3.6), 4)
                    if throughput > 0 else None
                ),
            })

        # Only fully viable tiers are returned, ordered by hourly price.
        return sorted(
            (c for c in candidates if c["status"] == "VIABLE"),
            key=lambda c: c["hourly"],
        )
# Example: right-size a 7B parameter model
sizer = GPURightSizer(
    model_name="llama-7b-chat",
    model_size_gb=14,  # FP16 weights
    required_latency_p99_ms=500,  # 500ms p99
    required_throughput_rps=20,  # 20 requests/sec
)

# A100 benchmark results: 85ms latency, 45 rps throughput
viable = sizer.find_cheapest_viable(a100_latency_ms=85, a100_throughput_rps=45)

print(f"Model: {sizer.model_name} ({sizer.model_size_gb}GB)")
print(f"Requirements: p99 < {sizer.required_latency_p99_ms}ms, "
      f"> {sizer.required_throughput_rps} rps\n")

if not viable:
    print("No single GPU meets requirements. Consider multi-GPU or model optimization.")
else:
    print("VIABLE GPUs (cheapest first):")
    for candidate in viable:
        summary = (
            f" {candidate['gpu']:<12} ${candidate['hourly']:.3f}/hr "
            f"${candidate['monthly']:,.0f}/mo "
            f"latency:{candidate['est_latency_ms']}ms "
            f"throughput:{candidate['est_throughput_rps']}rps "
            f"$/1K req: ${candidate['cost_per_1k_requests']:.4f}"
        )
        print(summary)
    cheapest, most_expensive = viable[0], viable[-1]
    pct_saved = (1 - cheapest["monthly"] / most_expensive["monthly"]) * 100
    print(f"\nBest choice: {cheapest['gpu']} saves "
          f"{pct_saved:.0f}% vs {most_expensive['gpu']}")
GPU Sharing: MIG and Time-Slicing
A single GPU running one small model wastes 60-80% of its capacity. GPU sharing lets you run multiple workloads on one GPU.
# GPU sharing configuration for Kubernetes
# NVIDIA Multi-Instance GPU (MIG) - A100/H100 only
# Partitions a single GPU into isolated instances
# MIG Configuration for A100 80GB
# Slicing profiles for one A100 80GB card. Keys are MIG profile names
# (<compute slices>g.<memory per slice>); "instances" is how many of that
# profile fit on a single physical GPU.
MIG_PROFILES_A100_80GB = {
    "7g.80gb": {"instances": 1, "memory": "80GB", "use_case": "Large LLM inference"},
    "4g.40gb": {"instances": 2, "memory": "40GB each", "use_case": "Medium models"},
    "3g.20gb": {"instances": 3, "memory": "20GB each", "use_case": "Small models"},
    "2g.10gb": {"instances": 4, "memory": "10GB each", "use_case": "Tiny models, embeddings"},
    "1g.5gb": {"instances": 7, "memory": "5GB each", "use_case": "Micro services"},
}
# Example: 1 A100 serving 7 different microservices
# Cost without MIG: 7 x T4 = 7 x $0.526/hr = $3.68/hr
# Cost with MIG: 1 x A100 = $5.12/hr
# But MIG gives better isolation and more VRAM per slice
# Cost effective when you need >16GB VRAM per workload
# Kubernetes GPU sharing with time-slicing
# Works with any NVIDIA GPU, no hardware partitioning
K8S_TIME_SLICING_CONFIG = """
# nvidia-device-plugin ConfigMap for GPU time-slicing
apiVersion: v1
kind: ConfigMap
metadata:
name: nvidia-device-plugin
namespace: kube-system
data:
config.yaml: |
version: v1
sharing:
timeSlicing:
renameByDefault: false
failRequestsGreaterThanOne: false
resources:
- name: nvidia.com/gpu
replicas: 4 # Each physical GPU appears as 4 virtual GPUs
# Pod requesting a time-sliced GPU
---
apiVersion: v1
kind: Pod
metadata:
name: ml-inference-small
spec:
containers:
- name: model-server
image: model-server:latest
resources:
limits:
nvidia.com/gpu: 1 # Gets 1/4 of a physical GPU
"""
# Cost savings calculator for GPU sharing
def calculate_sharing_savings(
    num_workloads: int,
    workload_gpu_util_pct: float,  # avg utilization per workload (0-100)
    gpu_type: str = "A10G",  # label only; kept for backward compatibility
    gpu_hourly: float = 1.006
) -> dict:
    """Calculate monthly savings from GPU sharing vs dedicated GPUs.

    Args:
        num_workloads: Number of workloads to place.
        workload_gpu_util_pct: Average GPU utilization (%) per workload.
        gpu_type: GPU name (informational; not used in the math).
        gpu_hourly: On-demand hourly price of one GPU in dollars.

    Returns:
        Dict comparing GPU counts and monthly cost without vs with sharing,
        plus a formatted savings string.
    """
    # Without sharing: one dedicated GPU per workload, billed 24/7 (720 h/mo).
    dedicated_cost = num_workloads * gpu_hourly * 720

    # With sharing: pack workloads onto the fewest GPUs that keep aggregate
    # utilization at the 70% target per GPU.
    total_util = num_workloads * workload_gpu_util_pct
    # BUG FIX: int(total_util / 70) + 1 over-provisioned by one GPU whenever
    # total_util was an exact multiple of 70 (e.g. 140 -> 3 GPUs instead of 2).
    gpus_needed = max(1, math.ceil(total_util / 70))
    shared_cost = gpus_needed * gpu_hourly * 720

    savings = dedicated_cost - shared_cost
    # Guard num_workloads == 0 (dedicated_cost would be 0).
    savings_pct = (savings / dedicated_cost) * 100 if dedicated_cost else 0.0
    return {
        "without_sharing": {
            "gpus": num_workloads,
            "monthly_cost": f"${dedicated_cost:,.0f}"
        },
        "with_sharing": {
            "gpus": gpus_needed,
            "monthly_cost": f"${shared_cost:,.0f}"
        },
        "savings": f"${savings:,.0f}/month ({savings_pct:.0f}%)"
    }
# Example: 6 models each using ~15% GPU
sharing_report = calculate_sharing_savings(
    num_workloads=6,
    workload_gpu_util_pct=15,
    gpu_type="A10G",
    gpu_hourly=1.006,
)

print("GPU Sharing Savings Analysis:")
for label, key in [("Without sharing", "without_sharing"),
                   ("With sharing", "with_sharing"),
                   ("Savings", "savings")]:
    print(f" {label}: {sharing_report[key]}")
Idle GPU Detection and Auto-Scaling
# Idle GPU detector and auto-shutdown system
import datetime
from dataclasses import dataclass, field
from typing import List, Dict
@dataclass
class IdleGPUDetector:
    """Detect and flag idle GPU instances for shutdown or downscaling."""

    idle_threshold_pct: float = 5.0  # GPU util below this counts as idle
    idle_duration_minutes: int = 30  # must stay idle at least this long
    check_interval_minutes: int = 5

    def analyze_fleet(self, gpu_metrics: List[dict]) -> dict:
        """Split the fleet into idle and active instances and total the waste.

        gpu_metrics format:
        [{"instance_id": "i-xxx", "gpu_type": "A10G",
          "utilization_pct": 3.2, "idle_since": datetime,
          "hourly_cost": 1.006, "team": "ml-search",
          "environment": "dev"}]
        """
        flagged = []
        active_gpus = []
        wasted_monthly = 0

        for record in gpu_metrics:
            # An instance is flagged only when utilization is below the
            # threshold AND it has been idle long enough.
            is_idle = record["utilization_pct"] < self.idle_threshold_pct
            minutes_idle = 0
            if is_idle:
                marked_since = record.get("idle_since")
                if marked_since:
                    elapsed = datetime.datetime.utcnow() - marked_since
                    minutes_idle = elapsed.total_seconds() / 60
                is_idle = minutes_idle >= self.idle_duration_minutes

            if not is_idle:
                active_gpus.append(record)
                continue

            waste = record["hourly_cost"] * 720  # assumes 24/7 billing
            wasted_monthly += waste
            flagged.append({
                "instance_id": record["instance_id"],
                "gpu_type": record["gpu_type"],
                "utilization": f"{record['utilization_pct']:.1f}%",
                "idle_for": f"{minutes_idle:.0f} minutes",
                "monthly_waste": f"${waste:,.0f}",
                "team": record["team"],
                "environment": record["environment"],
                "action": self._recommend_action(record),
            })

        return {
            "total_gpus": len(gpu_metrics),
            "idle_gpus": len(flagged),
            "active_gpus": len(active_gpus),
            "monthly_waste": f"${wasted_monthly:,.0f}",
            "idle_instances": flagged
        }

    def _recommend_action(self, gpu: dict) -> str:
        """Map the instance's environment to a remediation suggestion."""
        environment = gpu.get("environment", "unknown")
        if environment == "production":
            return "ALERT: investigate if traffic dropped or scaling issue"
        if environment in ("dev", "staging"):
            return "AUTO-SHUTDOWN: non-production, schedule restart"
        return "REVIEW: check workload schedule"
# Declarative auto-scaling policy for GPU inference. Scale-up reacts on a
# short 5-min window; scale-down uses a 15-min window and a longer cooldown.
GPU_AUTOSCALING_POLICY = {
    "scale_up": {
        "metric": "gpu_utilization_avg_5min",
        "threshold": 70,  # Scale up at 70% util
        "cooldown_seconds": 300,  # Wait 5 min between scale-ups
        "increment": 1,  # Add 1 GPU at a time
        "max_instances": 10
    },
    "scale_down": {
        "metric": "gpu_utilization_avg_15min",
        "threshold": 20,  # Scale down at 20% util
        "cooldown_seconds": 600,  # Wait 10 min (avoid flapping)
        "decrement": 1,
        "min_instances": 1
    },
    # Scheduled minimum-instance floors by time window.
    "schedule": {
        "weekday_peak": {"hours": "8-20", "min_instances": 3},
        "weekday_off_peak": {"hours": "20-8", "min_instances": 1},
        "weekend": {"min_instances": 1}
    },
    "estimated_savings": "40-60% vs always-on at peak capacity"
}
# Dump each policy section with its configuration dict.
print("GPU Auto-Scaling Policy:")
for policy_name in GPU_AUTOSCALING_POLICY:
    print(f"\n {policy_name}: {GPU_AUTOSCALING_POLICY[policy_name]}")
Key Takeaway: GPU cost optimization has three high-impact levers: (1) Right-size your instances -- benchmark on smaller GPUs before defaulting to A100s. (2) Use spot instances for training (60% savings) and reserved instances for steady inference (35% savings). (3) Share GPUs across low-utilization workloads with time-slicing or MIG (50-75% savings). Most teams can cut GPU costs by 40-60% by applying these three strategies.
Lilly Tech Systems