Prompt Management System
Hard-coded prompts in your source code are technical debt. This lesson shows you how to build a prompt management system that versions, templates, A/B tests, and dynamically constructs prompts — so your team can iterate on prompts without deploying code.
Why You Need Prompt Management
In a typical LLM application, prompts change 10x more often than code. Without a prompt management system, every prompt change requires a code deployment, a PR review, and a release cycle. That is absurd when the change is just tweaking a sentence.
A prompt management system gives you:
- Decoupled iteration: Product managers and prompt engineers can update prompts without code changes
- Version history: Roll back to any previous prompt version when a new one degrades quality
- A/B testing: Compare prompt variants with real traffic to measure which performs better
- Audit trail: Know who changed what prompt and when, critical for regulated industries
Prompt Template System
The foundation of prompt management is a template system that separates prompt structure from prompt data. Here is a production-ready implementation:
import hashlib
import json
from datetime import datetime, timezone
from typing import Any
from dataclasses import dataclass, field
@dataclass
class PromptVersion:
    """A single version of a prompt template.

    Records the template text together with the model settings it was
    written for; `hash` is a content fingerprint derived from the template,
    so two versions with identical text share the same fingerprint.
    """
    version: int  # 1-based version number, assigned by PromptTemplate.add_version
    template: str  # prompt text containing {{variable}} placeholders
    variables: list[str]  # placeholder names that must be supplied at render time
    model: str  # model identifier this version targets
    temperature: float  # sampling temperature to use with this version
    max_tokens: int  # completion token cap to use with this version
    # UTC ISO-8601 timestamp captured when the version object is created.
    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    created_by: str = "system"  # author attribution for the audit trail
    hash: str = ""  # filled in by __post_init__; any passed-in value is overwritten

    def __post_init__(self):
        # 12-hex-char SHA-256 fingerprint of the template text. Because this
        # runs on every construction, the hash is also recomputed when a
        # version is rebuilt from persisted JSON.
        self.hash = hashlib.sha256(self.template.encode()).hexdigest()[:12]
@dataclass
class PromptTemplate:
    """A named prompt with version history.

    Versions are append-only; `active_version` selects which one `current`
    and `render` use. Rolling back simply repoints `active_version`.
    """
    name: str
    description: str
    versions: list["PromptVersion"] = field(default_factory=list)
    active_version: int = 1  # version number currently served
    tags: list[str] = field(default_factory=list)

    @property
    def current(self) -> "PromptVersion":
        """Return the active PromptVersion.

        Raises:
            ValueError: if `active_version` matches no stored version.
                (The original `next(...)` raised a bare StopIteration, which
                is uninformative and can be silently swallowed when the
                property is accessed inside a generator.)
        """
        for candidate in self.versions:
            if candidate.version == self.active_version:
                return candidate
        raise ValueError(
            f"Active version {self.active_version} not found for prompt '{self.name}'"
        )

    def render(self, **kwargs) -> str:
        """Render the active template with the provided variables.

        Raises:
            ValueError: if any declared template variable is missing from kwargs.
        """
        # Hoist the active-version lookup: the original re-scanned the
        # version list via `self.current` for each access.
        active = self.current
        missing = [v for v in active.variables if v not in kwargs]
        if missing:
            raise ValueError(f"Missing template variables: {missing}")
        rendered = active.template
        for key, value in kwargs.items():
            rendered = rendered.replace(f"{{{{{key}}}}}", str(value))
        return rendered

    def add_version(self, template: str, variables: list[str],
                    model: str, temperature: float = 0.7,
                    max_tokens: int = 1024, created_by: str = "system") -> int:
        """Append a new version and return its version number.

        Note: the new version is NOT activated automatically — callers must
        opt in via `active_version`/`rollback`, so a freshly added version
        never reaches traffic accidentally.
        """
        new_version = len(self.versions) + 1
        self.versions.append(PromptVersion(
            version=new_version,
            template=template,
            variables=variables,
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            created_by=created_by,
        ))
        return new_version

    def rollback(self, version: int):
        """Set an existing version as the active version.

        Raises:
            ValueError: if the requested version does not exist.
        """
        if not any(v.version == version for v in self.versions):
            raise ValueError(f"Version {version} does not exist")
        self.active_version = version
Prompt Registry
The prompt registry is where all your prompts live. It is the single source of truth for every prompt in your application. Here is a complete registry implementation with persistence:
class PromptRegistry:
"""Central registry for all prompt templates in the application."""
def __init__(self, storage_path: str = "prompts.json"):
self.storage_path = storage_path
self.prompts: dict[str, PromptTemplate] = {}
self._load()
def register(self, name: str, description: str,
template: str, variables: list[str],
model: str, temperature: float = 0.7,
max_tokens: int = 1024, tags: list[str] = None) -> PromptTemplate:
"""Register a new prompt or add a version to an existing one."""
if name in self.prompts:
prompt = self.prompts[name]
prompt.add_version(template, variables, model, temperature, max_tokens)
else:
prompt = PromptTemplate(name=name, description=description, tags=tags or [])
prompt.add_version(template, variables, model, temperature, max_tokens)
self.prompts[name] = prompt
self._save()
return prompt
def get(self, name: str) -> PromptTemplate:
"""Get a prompt template by name."""
if name not in self.prompts:
raise KeyError(f"Prompt '{name}' not found. Available: {list(self.prompts.keys())}")
return self.prompts[name]
def render(self, name: str, **kwargs) -> str:
"""Render a prompt by name with variables."""
return self.get(name).render(**kwargs)
def list_prompts(self, tag: str = None) -> list[dict]:
"""List all prompts, optionally filtered by tag."""
prompts = self.prompts.values()
if tag:
prompts = [p for p in prompts if tag in p.tags]
return [
{
"name": p.name,
"description": p.description,
"active_version": p.active_version,
"total_versions": len(p.versions),
"tags": p.tags
}
for p in prompts
]
def _save(self):
import dataclasses
data = {name: dataclasses.asdict(p) for name, p in self.prompts.items()}
with open(self.storage_path, "w") as f:
json.dump(data, f, indent=2)
def _load(self):
try:
with open(self.storage_path) as f:
data = json.load(f)
for name, prompt_data in data.items():
versions = [PromptVersion(**v) for v in prompt_data["versions"]]
self.prompts[name] = PromptTemplate(
name=prompt_data["name"],
description=prompt_data["description"],
versions=versions,
active_version=prompt_data["active_version"],
tags=prompt_data.get("tags", [])
)
except FileNotFoundError:
pass
# Usage: Register and use prompts
registry = PromptRegistry()  # loads (or will create) ./prompts.json
# register() also persists the prompt to disk immediately via _save().
registry.register(
    name="customer_support",
    description="Main customer support agent prompt",
    template="""You are a helpful customer support agent for {{company_name}}.
The customer's name is {{customer_name}} and their account tier is {{tier}}.
Previous interactions summary: {{history_summary}}
Guidelines:
- Be empathetic and solution-oriented
- For {{tier}} customers, offer priority escalation
- Never share internal system details
- If you cannot resolve the issue, create a ticket
Customer message: {{message}}""",
    variables=["company_name", "customer_name", "tier", "history_summary", "message"],
    model="gpt-4o",
    temperature=0.3,  # low temperature: support answers should be consistent
    tags=["support", "production"]
)
# Render the prompt with actual values
# (raises ValueError if any declared variable is missing)
prompt = registry.render(
    "customer_support",
    company_name="Acme Corp",
    customer_name="Alice",
    tier="enterprise",
    history_summary="3 previous tickets, all resolved",
    message="My API keys stopped working after the migration"
)
A/B Testing Prompts
A/B testing prompts is how you improve prompt quality with data instead of guesswork. Here is a production A/B testing implementation:
import random
import time
from collections import defaultdict
class PromptABTest:
    """Split live traffic between two versions of a prompt and collect metrics."""

    def __init__(self, prompt_name: str, version_a: int, version_b: int,
                 traffic_split: float = 0.5):
        self.prompt_name = prompt_name
        self.version_a = version_a
        self.version_b = version_b
        # Fraction of traffic routed to version B (0.0 .. 1.0).
        self.traffic_split = traffic_split
        # Per-version lists of recorded metric dicts.
        self.results: dict[int, list[dict]] = defaultdict(list)
        self.created_at = time.time()

    def get_version(self, user_id: str = None) -> int:
        """Assign a request to a variant; identified users get a sticky bucket."""
        if not user_id:
            # Anonymous traffic: plain random split.
            return self.version_b if random.random() < self.traffic_split else self.version_a
        # Consistent hashing: the same (prompt, user) pair always lands in
        # the same 0-99 bucket, so a user never flips between variants.
        digest = hashlib.md5(f"{self.prompt_name}:{user_id}".encode()).hexdigest()
        bucket = int(digest, 16) % 100
        if bucket < self.traffic_split * 100:
            return self.version_b
        return self.version_a

    def record_result(self, version: int, metrics: dict):
        """Store evaluation metrics for one response, stamped with the time."""
        entry = {"timestamp": time.time()}
        entry.update(metrics)
        self.results[version].append(entry)

    def get_stats(self) -> dict:
        """Summarize count, average quality/latency/cost per variant."""
        summary = {}
        for version in (self.version_a, self.version_b):
            records = self.results[version]
            if not records:
                summary[version] = {"count": 0}
                continue
            total = len(records)
            cost_total = sum(r.get("cost_usd", 0) for r in records)
            summary[version] = {
                "count": total,
                "avg_quality": sum(r.get("quality_score", 0) for r in records) / total,
                "avg_latency_ms": sum(r.get("latency_ms", 0) for r in records) / total,
                "avg_cost_usd": cost_total / total,
                "total_cost_usd": cost_total,
            }
        return summary
# Usage
# Send 20% of traffic to the new version 2, the rest to version 1.
ab_test = PromptABTest("customer_support", version_a=1, version_b=2, traffic_split=0.2)
# In your request handler:
version = ab_test.get_version(user_id="user_123")
prompt_template = registry.get("customer_support")
# NOTE(review): this mutates the shared template object inside the registry,
# so concurrent requests would see each other's override — confirm the
# handler is single-threaded, or scope the version choice per request.
prompt_template.active_version = version  # temporarily set version
rendered = prompt_template.render(company_name="Acme", customer_name="Bob",
                                  tier="pro", history_summary="none", message="Help!")
# After getting LLM response, record metrics:
ab_test.record_result(version, {
    "quality_score": 0.85,
    "latency_ms": 450,
    "cost_usd": 0.003,
    "user_satisfaction": 5  # extra keys are stored but not aggregated by get_stats
})
Few-Shot Example Management
Few-shot examples are the most underrated prompt engineering technique. Instead of describing what you want in prose, you show the model examples. The challenge in production is managing, rotating, and selecting examples dynamically.
from dataclasses import dataclass
import numpy as np
@dataclass
class FewShotExample:
    """A single input-output example for few-shot prompting."""
    input_text: str  # example user input
    output_text: str  # the desired model output for input_text
    category: str  # grouping label used by the "diverse" selection strategy
    quality_score: float  # 0-1, based on human evaluation
    embedding: list[float] | None = None  # for semantic selection; None until computed
    usage_count: int = 0  # selection counter; NOTE(review): nothing shown here increments it
class FewShotManager:
    """Manage and select few-shot examples for prompts."""

    def __init__(self):
        # prompt name -> pool of candidate examples
        self.examples: dict[str, list["FewShotExample"]] = defaultdict(list)

    def add_example(self, prompt_name: str, example: "FewShotExample"):
        """Add an example to the pool for the given prompt."""
        self.examples[prompt_name].append(example)

    def select_examples(self, prompt_name: str, query: str = None,
                        n: int = 3, strategy: str = "quality") -> "list[FewShotExample]":
        """Select up to n examples using the specified strategy.

        Strategies:
            "quality"  — highest quality_score first.
            "diverse"  — round-robin across categories, best-quality first.
            "semantic" — most similar to `query` (requires embeddings + query).
        Unknown strategies (or "semantic" without a query) fall back to the
        first n examples in insertion order.
        """
        pool = self.examples.get(prompt_name, [])
        if not pool:
            return []
        if strategy == "quality":
            return sorted(pool, key=lambda e: e.quality_score, reverse=True)[:n]
        if strategy == "diverse":
            return self._select_diverse(pool, n)
        if strategy == "semantic" and query:
            return self._select_semantic(pool, query, n)
        return pool[:n]

    def _select_diverse(self, pool, n):
        """Round-robin across categories, picking best quality within each."""
        by_category = defaultdict(list)
        for ex in pool:
            by_category[ex.category].append(ex)
        selected = []
        categories = list(by_category.keys())
        idx = 0
        # idx < len(pool) bounds the loop even when some categories run dry.
        while len(selected) < n and idx < len(pool):
            cat = categories[idx % len(categories)]
            if by_category[cat]:
                best = max(by_category[cat], key=lambda e: e.quality_score)
                selected.append(best)
                by_category[cat].remove(best)
            idx += 1
        return selected

    def _select_semantic(self, pool, query, n):
        """Rank examples by cosine similarity to the query.

        Fix: the original stashed a transient `_similarity` attribute on each
        stored example, leaking scoring state into the shared pool; scores
        are now kept in a local mapping instead. Examples without an
        embedding score 0.
        """
        # get_embedding / cosine_similarity are assumed to be provided
        # elsewhere in the application — TODO confirm.
        query_emb = get_embedding(query)
        scores = {
            id(ex): (cosine_similarity(query_emb, ex.embedding) if ex.embedding else 0)
            for ex in pool
        }
        return sorted(pool, key=lambda e: scores[id(e)], reverse=True)[:n]

    def format_examples(self, examples: "list[FewShotExample]") -> str:
        """Format selected examples as a text block for prompt injection."""
        parts = []
        for i, ex in enumerate(examples, 1):
            parts.append(f"Example {i}:")
            parts.append(f"Input: {ex.input_text}")
            parts.append(f"Output: {ex.output_text}")
            parts.append("")  # blank line between examples
        return "\n".join(parts)
# Usage
manager = FewShotManager()
# Two curated examples whose outputs demonstrate the exact JSON shape the
# model should imitate for sentiment classification.
manager.add_example("sentiment_classifier", FewShotExample(
    input_text="The product arrived damaged and customer service was unhelpful.",
    output_text='{"sentiment": "negative", "confidence": 0.95, "topics": ["product_quality", "support"]}',
    category="negative",
    quality_score=0.98
))
manager.add_example("sentiment_classifier", FewShotExample(
    input_text="Fast shipping and exactly what I ordered. Will buy again!",
    output_text='{"sentiment": "positive", "confidence": 0.97, "topics": ["shipping", "satisfaction"]}',
    category="positive",
    quality_score=0.95
))
# Select and format examples for a prompt
# ("diverse" pulls one example per category, so both polarities are shown)
examples = manager.select_examples("sentiment_classifier", n=2, strategy="diverse")
examples_text = manager.format_examples(examples)  # ready to splice into a system prompt
Dynamic Prompt Construction
Real production prompts are not static strings. They are dynamically constructed based on user context, conversation state, and system configuration. Here is a pattern for building prompts dynamically:
class DynamicPromptBuilder:
    """Build prompts dynamically from templates, examples, and user context."""

    def __init__(self, registry: "PromptRegistry", few_shot_mgr: "FewShotManager"):
        self.registry = registry
        self.few_shot_mgr = few_shot_mgr

    def build(self, prompt_name: str, context: dict) -> dict:
        """Build a complete prompt with all dynamic components.

        Args:
            prompt_name: name of a registered prompt template.
            context: template variables plus optional control keys
                ("user_message", "num_examples", "example_strategy", "user_tier").

        Returns:
            dict with: system_prompt, user_prompt, model, temperature, max_tokens.
        """
        version = self.registry.get(prompt_name).current

        # Build system prompt with conditional sections.
        system_parts = [version.template]

        # Add few-shot examples if any are registered for this prompt.
        examples = self.few_shot_mgr.select_examples(
            prompt_name,
            query=context.get("user_message", ""),
            n=context.get("num_examples", 3),
            strategy=context.get("example_strategy", "quality"),
        )
        if examples:
            system_parts.append("\nHere are examples of correct responses:")
            system_parts.append(self.few_shot_mgr.format_examples(examples))

        # Time-based context. Use UTC so the injected date is consistent
        # across servers and matches PromptVersion.created_at (the original
        # used naive local time here).
        system_parts.append(f"\nCurrent date: {datetime.now(timezone.utc).strftime('%Y-%m-%d')}")

        # Tier-specific instructions; anything other than enterprise/free adds nothing.
        tier = context.get("user_tier", "free")
        if tier == "enterprise":
            system_parts.append("\nThis is an enterprise customer. Prioritize their request.")
        elif tier == "free":
            system_parts.append("\nFor feature requests, suggest upgrading to a paid plan.")

        # Substitute {{variable}} placeholders from context. Unknown
        # placeholders are left as-is (no missing-variable check here,
        # unlike PromptTemplate.render).
        system_prompt = "\n".join(system_parts)
        for key, value in context.items():
            system_prompt = system_prompt.replace(f"{{{{{key}}}}}", str(value))

        return {
            "system_prompt": system_prompt,
            "user_prompt": context.get("user_message", ""),
            "model": version.model,
            "temperature": version.temperature,
            "max_tokens": version.max_tokens,
        }
# Usage
builder = DynamicPromptBuilder(registry, manager)
# The context dict carries both template variables (company_name, tier, ...)
# and builder control keys (user_message, user_tier, num_examples, example_strategy).
prompt_config = builder.build("customer_support", {
    "company_name": "Acme Corp",
    "customer_name": "Alice",
    "tier": "enterprise",
    "history_summary": "VIP customer, 3 previous positive interactions",
    "message": "I need to increase my API rate limit",
    "user_message": "I need to increase my API rate limit",
    "user_tier": "enterprise",
    "num_examples": 2,
    "example_strategy": "semantic"
})
# Send to LLM
# NOTE(review): `client` is not defined in this snippet — presumably an
# OpenAI-compatible client constructed elsewhere; confirm before running.
response = client.chat.completions.create(
    model=prompt_config["model"],
    temperature=prompt_config["temperature"],
    max_tokens=prompt_config["max_tokens"],
    messages=[
        {"role": "system", "content": prompt_config["system_prompt"]},
        {"role": "user", "content": prompt_config["user_prompt"]}
    ]
)
Key Takeaways
- Prompts change 10x more often than code. A prompt management system decouples prompt iteration from code deployment.
- Version every prompt with a hash and timestamp. You need rollback capability when a new prompt version degrades quality.
- A/B test prompts with deterministic user assignment (consistent hashing), and start with a conservative traffic split — e.g. the 80/20 split used in the example above — before ramping the variant up.
- Few-shot examples are the most effective prompt engineering technique. Use semantic selection to pick examples similar to the current query.
- Dynamic prompt construction assembles prompts from templates, examples, user context, and system configuration at request time.
What Is Next
In the next lesson, we will build the LLM Gateway — the central routing layer that handles multi-provider routing, fallback chains, rate limiting, cost tracking, and semantic caching.
Lilly Tech Systems