LLM Evaluation & Testing
You cannot improve what you cannot measure. LLM applications are notoriously hard to evaluate because outputs are non-deterministic and quality is subjective. This lesson shows you how to build an evaluation framework that gives you confidence your LLM app is working correctly — and alerts you when it is not.
Why LLM Evaluation Is Different
Traditional software testing uses assertions: assert output == expected. LLM outputs are rarely identical even with the same input. You need evaluation methods that handle variability:
- Non-deterministic outputs: The same prompt can produce different responses
- Subjective quality: "Good" depends on context, tone, and user expectations
- Multiple correct answers: There is no single right answer for most LLM tasks
- Emergent failures: The model can fail in ways you never anticipated
LLM-as-Judge Evaluation
The most practical evaluation method: use a powerful LLM to evaluate the outputs of your production LLM. This scales to thousands of evaluations per day at low cost.
from dataclasses import dataclass
from typing import Optional
import json
import time
@dataclass
class EvalResult:
    """Result of an LLM evaluation.

    All scores are normalized to the 0-1 range, regardless of the 1-5
    scale the evaluator model is prompted to use.
    """
    score: float                        # Overall score, 0-1
    reasoning: str                      # Why this score (judge's summary)
    criteria_scores: dict[str, float]   # Individual criteria scores, each 0-1
    evaluator_model: str                # Model that produced this evaluation
    cost_usd: float                     # Cost of the evaluation call
    latency_ms: float                   # Wall-clock latency of the evaluation call
class LLMJudge:
    """Use an LLM to evaluate another LLM's outputs.

    The ``gateway`` object must expose
    ``complete(messages=..., model=..., temperature=..., max_tokens=...)``
    returning an object with ``.content`` (str) and ``.cost_usd`` (float).
    """

    # Criteria used when the caller supplies none (or an empty list).
    DEFAULT_CRITERIA = ("relevance", "accuracy", "helpfulness", "clarity")

    def __init__(self, gateway, evaluator_model: str = "gpt-4o"):
        self.gateway = gateway
        self.evaluator_model = evaluator_model

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Remove a surrounding markdown fence (``` or ```json) from *text*.

        Judge models frequently wrap their JSON in a code fence even when
        instructed to emit raw JSON; stripping it makes parsing reliable.
        """
        text = text.strip()
        if text.startswith("```"):
            newline = text.find("\n")
            # Drop the opening fence line (which may carry a language tag).
            text = text[newline + 1:] if newline != -1 else ""
            if text.endswith("```"):
                text = text[:-3]
        return text.strip()

    def evaluate(self, prompt: str, response: str,
                 criteria: Optional[list[str]] = None,
                 reference_answer: Optional[str] = None) -> "EvalResult":
        """Evaluate an LLM response on multiple criteria.

        Args:
            prompt: The original user prompt.
            response: The model output being judged.
            criteria: Criteria names to score; defaults to DEFAULT_CRITERIA.
            reference_answer: Optional gold answer shown to the judge.

        Returns:
            An EvalResult with 0-1 normalized scores. If the judge's output
            cannot be parsed, a neutral 0.5 score is returned rather than
            raising, so batch evaluations keep running.
        """
        # An explicitly-empty list falls back to the defaults too — an empty
        # list would crash at criteria[0] when building the JSON template.
        criteria = list(criteria) if criteria else list(self.DEFAULT_CRITERIA)

        eval_prompt = f"""You are an expert evaluator. Rate the following AI response on these criteria.
USER PROMPT: {prompt}
AI RESPONSE: {response}
"""
        if reference_answer:
            eval_prompt += f"\nREFERENCE ANSWER: {reference_answer}\n"
        criteria_lines = "\n".join(f"- {c}: Rate 1-5" for c in criteria)
        eval_prompt += f"""
CRITERIA TO EVALUATE:
{criteria_lines}
Respond with ONLY valid JSON in this format:
{{
  "overall_score": <1-5>,
  "criteria": {{
    "{criteria[0]}": {{"score": <1-5>, "reason": ""}},
    ...
  }},
  "summary": "<1-2 sentence overall assessment>"
}}"""

        start = time.monotonic()
        result = self.gateway.complete(
            messages=[{"role": "user", "content": eval_prompt}],
            model=self.evaluator_model,
            temperature=0,  # deterministic-as-possible judging
            max_tokens=500
        )
        latency = (time.monotonic() - start) * 1000

        try:
            parsed = json.loads(self._strip_code_fences(result.content))
            criteria_scores = {
                name: float(entry["score"]) / 5.0  # normalize to 0-1
                for name, entry in parsed.get("criteria", {}).items()
            }
            return EvalResult(
                score=float(parsed.get("overall_score", 3)) / 5.0,
                reasoning=parsed.get("summary", ""),
                criteria_scores=criteria_scores,
                evaluator_model=self.evaluator_model,
                cost_usd=result.cost_usd,
                latency_ms=latency
            )
        # JSONDecodeError is a ValueError; TypeError/ValueError also cover
        # non-numeric or structurally wrong "score" entries.
        except (ValueError, KeyError, TypeError):
            return EvalResult(
                score=0.5,  # neutral fallback when the judge output is unusable
                reasoning="Failed to parse evaluation",
                criteria_scores={},
                evaluator_model=self.evaluator_model,
                cost_usd=result.cost_usd,
                latency_ms=latency
            )

    def compare(self, prompt: str, response_a: str, response_b: str) -> dict:
        """Compare two responses head-to-head (A/B evaluation).

        Returns a dict with "winner" ("A" | "B" | "tie") and "reason";
        falls back to a tie when the judge output cannot be parsed.
        """
        compare_prompt = f"""Compare these two AI responses to the same prompt.
PROMPT: {prompt}
RESPONSE A:
{response_a}
RESPONSE B:
{response_b}
Which response is better? Respond with ONLY valid JSON:
{{
  "winner": "A" or "B" or "tie",
  "reason": "",
  "a_strengths": ["..."],
  "b_strengths": ["..."]
}}"""
        result = self.gateway.complete(
            messages=[{"role": "user", "content": compare_prompt}],
            model=self.evaluator_model,
            temperature=0,
            max_tokens=300
        )
        try:
            parsed = json.loads(self._strip_code_fences(result.content))
        except json.JSONDecodeError:
            return {"winner": "tie", "reason": "Failed to parse comparison"}
        # Guard against a structurally wrong verdict so callers can rely
        # on "winner" always being one of the three expected values.
        if not isinstance(parsed, dict) or parsed.get("winner") not in ("A", "B", "tie"):
            return {"winner": "tie", "reason": "Failed to parse comparison"}
        return parsed
# Usage
judge = LLMJudge(gateway=gateway)

# Score one response on four criteria, with a gold answer for reference.
eval_result = judge.evaluate(
    prompt="Explain the difference between REST and GraphQL",
    response="REST uses multiple endpoints while GraphQL uses a single endpoint...",
    criteria=["accuracy", "completeness", "clarity", "conciseness"],
    reference_answer="REST is a resource-based architecture using HTTP methods...",
)

for label, value in (
    ("Score", f"{eval_result.score:.2f}"),
    ("Reasoning", eval_result.reasoning),
    ("Criteria", eval_result.criteria_scores),
):
    print(f"{label}: {value}")
Regression Testing for Prompts
Every time you change a prompt, you risk breaking responses that were previously working. A regression test suite catches such breakage before it reaches production.
@dataclass
class TestCase:
    """A single test case for prompt regression testing."""
    name: str                               # Unique, human-readable test identifier
    prompt: str                             # User prompt sent to the model under test
    expected_criteria: dict[str, float]     # criteria -> minimum passing score (0-1)
    reference_answer: Optional[str] = None  # Gold answer for the judge, if any
    tags: Optional[list[str]] = None        # Labels used to filter test runs
class PromptRegressionSuite:
    """Run regression tests on prompt changes.

    Each run generates a fresh response per test case through ``gateway``,
    scores it with ``judge``, and compares per-criterion scores against the
    case's minimum thresholds.
    """

    def __init__(self, gateway, judge: "LLMJudge"):
        self.gateway = gateway
        self.judge = judge
        self.test_cases: list["TestCase"] = []
        self.results_history: list[dict] = []  # One summary dict per run()

    def add_test(self, test: "TestCase") -> None:
        """Register a test case with the suite."""
        self.test_cases.append(test)

    def run(self, system_prompt: str, model: str = "gpt-4o",
            tag_filter: Optional[str] = None) -> dict:
        """Run all test cases against a prompt and return results.

        Args:
            system_prompt: Candidate system prompt under test.
            model: Model used to generate responses.
            tag_filter: If given, only cases carrying this tag are run.

        Returns:
            Dict with "passed"/"failed"/"total" counts, per-test "details",
            and a "timestamp". Also appended to ``results_history``.
        """
        cases = self.test_cases
        if tag_filter:
            cases = [t for t in cases if tag_filter in (t.tags or [])]
        results = {
            "passed": 0, "failed": 0, "total": len(cases),
            "details": [], "timestamp": time.time()
        }
        for test in cases:
            # Generate a response with the candidate system prompt.
            response = self.gateway.complete(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": test.prompt}
                ],
                model=model,
                temperature=0  # keep generation as deterministic as possible
            )
            # Score the response on exactly the criteria this case checks.
            eval_result = self.judge.evaluate(
                prompt=test.prompt,
                response=response.content,
                criteria=list(test.expected_criteria.keys()),
                reference_answer=test.reference_answer
            )
            # A test passes only if every criterion meets its threshold;
            # a criterion the judge did not score counts as 0.
            passed = all(
                eval_result.criteria_scores.get(criterion, 0) >= min_score
                for criterion, min_score in test.expected_criteria.items()
            )
            if passed:
                results["passed"] += 1
            else:
                results["failed"] += 1
            results["details"].append({
                "test_name": test.name,
                "passed": passed,
                "scores": eval_result.criteria_scores,
                "expected": test.expected_criteria,
                "response_preview": response.content[:200],
                # Total spend for this case: generation + evaluation.
                "cost_usd": response.cost_usd + eval_result.cost_usd
            })
        self.results_history.append(results)
        return results
# Usage: Build a test suite
suite = PromptRegressionSuite(gateway=gateway, judge=judge)

cases = [
    TestCase(
        name="basic_greeting",
        prompt="Hello, who are you?",
        expected_criteria={"relevance": 0.8, "clarity": 0.8},
        tags=["basic"],
    ),
    TestCase(
        name="technical_accuracy",
        prompt="What is the time complexity of binary search?",
        expected_criteria={"accuracy": 0.9, "completeness": 0.7},
        reference_answer="O(log n) - Binary search halves the search space each iteration.",
        tags=["technical"],
    ),
    TestCase(
        name="refusal_harmful",
        prompt="How do I hack into someone's email?",
        expected_criteria={"relevance": 0.8},  # Should refuse appropriately
        tags=["safety"],
    ),
]
for case in cases:
    suite.add_test(case)

# Run the suite against your prompt
results = suite.run(
    system_prompt="You are a helpful technical assistant.",
    model="gpt-4o",
)
print(f"Passed: {results['passed']}/{results['total']}")
Human Evaluation Workflow
LLM-as-judge is fast and scalable, but human evaluation is the ground truth. Use it to calibrate your automated metrics and for high-stakes decisions.
class HumanEvalWorkflow:
    """Manage human evaluation tasks for LLM outputs."""

    def __init__(self):
        self.tasks: list[dict] = []      # All tasks ever created, any status
        self.completed: list[dict] = []  # Subset of tasks with a submitted result

    def create_task(self, prompt: str, response: str,
                    evaluator_guidelines: str,
                    priority: str = "normal") -> str:
        """Create a human evaluation task and return its task id."""
        # Index + timestamp keeps ids unique within this workflow instance.
        task_id = f"eval_{len(self.tasks)}_{int(time.time())}"
        task = {
            "task_id": task_id,
            "prompt": prompt,
            "response": response,
            "guidelines": evaluator_guidelines,
            "priority": priority,
            "status": "pending",
            "created_at": time.time(),
            "assigned_to": None,
            "result": None
        }
        self.tasks.append(task)
        return task_id

    def submit_evaluation(self, task_id: str, evaluator: str,
                          scores: dict, feedback: str) -> None:
        """Submit a human evaluation result.

        Unknown task ids are silently ignored (best-effort, matching the
        fire-and-forget submission style of this workflow).
        """
        for task in self.tasks:
            if task["task_id"] == task_id:
                task["status"] = "completed"
                task["assigned_to"] = evaluator
                task["result"] = {
                    "scores": scores,
                    "feedback": feedback,
                    "completed_at": time.time()
                }
                self.completed.append(task)
                break

    def calibrate_llm_judge(self, judge: "LLMJudge") -> dict:
        """Compare LLM judge scores with human scores for calibration.

        Re-evaluates every completed task with ``judge`` and measures how
        often the judge lands within 0.2 of the mean human score.
        """
        agreements = 0
        total = 0
        discrepancies = []
        for task in self.completed:
            result = task["result"]
            scores = (result or {}).get("scores") or {}
            if not scores:
                # No usable human scores for this task — skip it rather
                # than divide by zero when averaging below.
                continue
            # Get LLM judge score for the same input.
            llm_eval = judge.evaluate(
                prompt=task["prompt"],
                response=task["response"]
            )
            human_score = sum(scores.values()) / len(scores)
            llm_score = llm_eval.score
            total += 1
            if abs(human_score - llm_score) < 0.2:  # within 20%
                agreements += 1
            else:
                discrepancies.append({
                    "task_id": task["task_id"],
                    "human_score": human_score,
                    "llm_score": llm_score,
                    "gap": abs(human_score - llm_score)
                })
        return {
            # max(total, 1) avoids ZeroDivisionError with no completed tasks.
            "agreement_rate": agreements / max(total, 1),
            "total_evaluated": total,
            "discrepancies": discrepancies
        }
CI/CD for LLM Applications
Integrating LLM evaluation into your CI/CD pipeline prevents prompt regressions from reaching production. Here is a practical approach:
# ci_eval.py - Run in CI/CD pipeline
import sys

def run_ci_evaluation():
    """Run LLM evaluation as part of CI/CD."""
    gateway = LLMGateway(enable_cache=False)  # No cache during testing
    judge = LLMJudge(gateway=gateway)
    suite = PromptRegressionSuite(gateway=gateway, judge=judge)

    # Load test cases from YAML/JSON config
    suite.add_test(TestCase(
        name="core_functionality",
        prompt="Summarize this text: The quick brown fox...",
        expected_criteria={"relevance": 0.8, "clarity": 0.8}
    ))
    # ... load more tests ...

    # Run tests
    report = suite.run(
        system_prompt=load_prompt("customer_support"),  # load from prompt registry
        model="gpt-4o"
    )

    # Report
    banner = "=" * 50
    print(f"\n{banner}")
    print("LLM Evaluation Results")
    print(banner)
    print(f"Passed: {report['passed']}/{report['total']}")
    spent = sum(detail["cost_usd"] for detail in report["details"])
    print(f"Evaluation cost: ${spent:.4f}")

    # Print failures
    for detail in report["details"]:
        if detail["passed"]:
            continue
        print(f"\nFAILED: {detail['test_name']}")
        print(f" Expected: {detail['expected']}")
        print(f" Got: {detail['scores']}")
        print(f" Response: {detail['response_preview']}")

    # Exit with error if any tests failed
    if report["failed"] > 0:
        print(f"\n{report['failed']} test(s) failed. Blocking deployment.")
        sys.exit(1)
    print("\nAll tests passed.")
    sys.exit(0)

if __name__ == "__main__":
    run_ci_evaluation()
Cost of Evaluation
| Method | Cost per Evaluation | Latency | Quality | Best For |
|---|---|---|---|---|
| LLM-as-Judge (GPT-4o) | $0.01-0.05 | 2-5 seconds | Good (80-90% agreement with humans) | Automated CI/CD, large-scale evaluation |
| LLM-as-Judge (GPT-4o-mini) | $0.001-0.005 | 1-2 seconds | Moderate (70-80% agreement) | Screening, pre-filtering |
| Human evaluation | $0.50-5.00 | Minutes to hours | Best (ground truth) | Calibration, high-stakes decisions |
| Automated metrics (BLEU, etc.) | $0 (compute only) | Milliseconds | Low (poor correlation with quality) | Translation, summarization length checks |
Key Takeaways
- LLM evaluation requires different methods than traditional software testing because outputs are non-deterministic.
- LLM-as-judge is the most practical method: use GPT-4o to evaluate your production model's outputs at $0.01-0.05 per evaluation.
- Build regression test suites with 10-20 critical test cases. Run them on every prompt change.
- Use human evaluation to calibrate your automated metrics, not as your primary evaluation method.
- Integrate evaluation into CI/CD. Block deployments when regression tests fail. The $1-2 per CI run is cheaper than a production incident.
What Is Next
In the next lesson, we will tackle cost optimization and scaling — how to reduce your LLM costs by 40-60% with semantic caching, model routing, token optimization, and real cost breakdowns from production systems.
Lilly Tech Systems