Introduction

LLM applications are notoriously hard to test. Outputs are non-deterministic, “correct” is often subjective, and traditional unit tests don’t apply. Yet shipping untested LLM features is risky—prompt changes can break functionality, model updates can degrade quality, and edge cases can embarrass your product. This guide covers practical testing strategies: building evaluation datasets, implementing automated scoring, regression testing for prompt changes, and continuous evaluation in production. These patterns help you ship LLM features with confidence.

Building Evaluation Datasets
from dataclasses import dataclass
from typing import Optional
import json
from pathlib import Path


@dataclass
class TestCase:
    id: str
    input: str
    expected_output: Optional[str] = None
    expected_contains: Optional[list[str]] = None
    expected_not_contains: Optional[list[str]] = None
    metadata: Optional[dict] = None

    def __post_init__(self):
        self.expected_contains = self.expected_contains or []
        self.expected_not_contains = self.expected_not_contains or []
        self.metadata = self.metadata or {}


class EvalDataset:
    """Manage evaluation test cases."""

    def __init__(self, name: str):
        self.name = name
        self.test_cases: list[TestCase] = []

    def add_case(
        self,
        input: str,
        expected_output: Optional[str] = None,
        expected_contains: Optional[list[str]] = None,
        expected_not_contains: Optional[list[str]] = None,
        **metadata
    ) -> "EvalDataset":
        """Add a test case."""
        case = TestCase(
            id=f"{self.name}_{len(self.test_cases)}",
            input=input,
            expected_output=expected_output,
            expected_contains=expected_contains,
            expected_not_contains=expected_not_contains,
            metadata=metadata
        )
        self.test_cases.append(case)
        return self

    def save(self, path: Path):
        """Save dataset to JSON."""
        data = {
            "name": self.name,
            "cases": [
                {
                    "id": c.id,
                    "input": c.input,
                    "expected_output": c.expected_output,
                    "expected_contains": c.expected_contains,
                    "expected_not_contains": c.expected_not_contains,
                    "metadata": c.metadata
                }
                for c in self.test_cases
            ]
        }
        path.parent.mkdir(parents=True, exist_ok=True)  # ensure the target directory exists
        path.write_text(json.dumps(data, indent=2))

    @classmethod
    def load(cls, path: Path) -> "EvalDataset":
        """Load dataset from JSON."""
        data = json.loads(path.read_text())
        dataset = cls(data["name"])
        for case_data in data["cases"]:
            dataset.test_cases.append(TestCase(**case_data))
        return dataset


# Create an evaluation dataset for a summarization task
summarization_eval = EvalDataset("summarization")

summarization_eval.add_case(
    input="The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.",
    expected_contains=["fox", "dog"],
    expected_not_contains=["alphabet"],  # Summary shouldn't include meta-info
    category="short_text"
)

summarization_eval.add_case(
    input="Python 3.12 was released with several new features including improved error messages...",
    expected_contains=["Python", "3.12"],
    category="technical"
)

summarization_eval.save(Path("./evals/summarization.json"))

Automated Evaluation
from dataclasses import dataclass, field
from datetime import datetime
from openai import OpenAI

client = OpenAI()


@dataclass
class EvalResult:
    test_case_id: str
    input: str
    output: str
    passed: bool
    score: float
    details: dict = field(default_factory=dict)
    latency_ms: float = 0.0


@dataclass
class EvalReport:
    dataset_name: str
    timestamp: datetime
    results: list[EvalResult]

    @property
    def pass_rate(self) -> float:
        if not self.results:
            return 0.0
        return sum(1 for r in self.results if r.passed) / len(self.results)

    @property
    def avg_score(self) -> float:
        if not self.results:
            return 0.0
        return sum(r.score for r in self.results) / len(self.results)

    @property
    def avg_latency(self) -> float:
        if not self.results:
            return 0.0
        return sum(r.latency_ms for r in self.results) / len(self.results)

    def summary(self) -> str:
        return f"""
Evaluation Report: {self.dataset_name}
Timestamp: {self.timestamp}
Total Cases: {len(self.results)}
Pass Rate: {self.pass_rate:.1%}
Average Score: {self.avg_score:.2f}
Average Latency: {self.avg_latency:.0f}ms
"""


class LLMEvaluator:
    """Evaluate LLM outputs against test cases."""

    def __init__(self, model: str = "gpt-4o"):
        self.model = model

    def evaluate_case(
        self,
        test_case: TestCase,
        prompt_template: str,
        system_prompt: str = ""
    ) -> EvalResult:
        """Evaluate a single test case."""
        import time

        # Format prompt
        prompt = prompt_template.format(input=test_case.input)

        # Run LLM
        start = time.time()
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        response = client.chat.completions.create(
            model=self.model,
            messages=messages
        )
        latency = (time.time() - start) * 1000
        output = response.choices[0].message.content

        # Evaluate output
        passed, score, details = self._score_output(test_case, output)

        return EvalResult(
            test_case_id=test_case.id,
            input=test_case.input,
            output=output,
            passed=passed,
            score=score,
            details=details,
            latency_ms=latency
        )

    def _score_output(
        self,
        test_case: TestCase,
        output: str
    ) -> tuple[bool, float, dict]:
        """Score the output against expectations."""
        details = {}
        score = 1.0

        # Check expected contains
        if test_case.expected_contains:
            missing = [
                term for term in test_case.expected_contains
                if term.lower() not in output.lower()
            ]
            details["missing_terms"] = missing
            if missing:
                score -= 0.2 * len(missing)

        # Check expected not contains
        if test_case.expected_not_contains:
            found = [
                term for term in test_case.expected_not_contains
                if term.lower() in output.lower()
            ]
            details["unwanted_terms"] = found
            if found:
                score -= 0.3 * len(found)

        # Check exact match if provided
        if test_case.expected_output:
            exact_match = output.strip() == test_case.expected_output.strip()
            details["exact_match"] = exact_match
            if not exact_match:
                score -= 0.5

        score = max(0.0, min(1.0, score))
        passed = score >= 0.7
        return passed, score, details

    def evaluate_dataset(
        self,
        dataset: EvalDataset,
        prompt_template: str,
        system_prompt: str = ""
    ) -> EvalReport:
        """Evaluate entire dataset."""
        results = []
        for case in dataset.test_cases:
            result = self.evaluate_case(case, prompt_template, system_prompt)
            results.append(result)
        return EvalReport(
            dataset_name=dataset.name,
            timestamp=datetime.now(),
            results=results
        )


# Usage
evaluator = LLMEvaluator(model="gpt-4o-mini")
report = evaluator.evaluate_dataset(
    summarization_eval,
    prompt_template="Summarize this text in one sentence:\n\n{input}",
    system_prompt="You are a concise summarizer."
)
print(report.summary())

LLM-as-Judge Evaluation
class LLMJudge:
    """Use an LLM to evaluate outputs."""

    def __init__(self, judge_model: str = "gpt-4o"):
        self.judge_model = judge_model

    def score_output(
        self,
        input_text: str,
        output: str,
        criteria: list[str],
        reference: Optional[str] = None
    ) -> dict:
        """Score output using LLM judge."""
        criteria_text = "\n".join(f"- {c}" for c in criteria)

        reference_section = ""
        if reference:
            reference_section = f"\nReference answer:\n{reference}\n"

        prompt = f"""Evaluate this LLM output based on the given criteria.

Input: {input_text}

Output to evaluate:
{output}
{reference_section}
Evaluation criteria:
{criteria_text}

For each criterion, provide a score from 1-5 and a brief explanation.
Then provide an overall score from 1-5.

Format your response as JSON:
{{
  "criteria_scores": {{
    "criterion_name": {{"score": 1-5, "explanation": "..."}},
    ...
  }},
  "overall_score": 1-5,
  "overall_explanation": "..."
}}"""

        response = client.chat.completions.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    def compare_outputs(
        self,
        input_text: str,
        output_a: str,
        output_b: str,
        criteria: list[str]
    ) -> dict:
        """Compare two outputs and determine which is better."""
        criteria_text = "\n".join(f"- {c}" for c in criteria)

        prompt = f"""Compare these two LLM outputs and determine which is better.

Input: {input_text}

Output A:
{output_a}

Output B:
{output_b}

Evaluation criteria:
{criteria_text}

Provide your analysis as JSON:
{{
  "winner": "A" or "B" or "tie",
  "confidence": 1-5,
  "reasoning": "...",
  "criteria_comparison": {{
    "criterion": {{"winner": "A/B/tie", "explanation": "..."}}
  }}
}}"""

        response = client.chat.completions.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)


# Usage
judge = LLMJudge()

# Score a single output
score = judge.score_output(
    input_text="Explain quantum computing",
    output="Quantum computing uses qubits...",
    criteria=[
        "Accuracy of information",
        "Clarity of explanation",
        "Appropriate level of detail",
        "Engaging writing style"
    ]
)
print(f"Overall score: {score['overall_score']}/5")

# Compare two model outputs
comparison = judge.compare_outputs(
    input_text="Write a product description for a coffee maker",
    output_a="This coffee maker brews great coffee...",
    output_b="Introducing the ultimate coffee experience...",
    criteria=["Persuasiveness", "Clarity", "Feature coverage"]
)
print(f"Winner: Output {comparison['winner']}")

Regression Testing
from pathlib import Path


class PromptRegressionTester:
    """Test for regressions when prompts change."""

    def __init__(self, baseline_dir: Path = Path("./baselines")):
        self.baseline_dir = baseline_dir
        self.baseline_dir.mkdir(exist_ok=True)

    def _get_baseline_path(self, test_name: str) -> Path:
        return self.baseline_dir / f"{test_name}.json"

    def save_baseline(self, test_name: str, report: EvalReport):
        """Save evaluation report as baseline."""
        data = {
            "timestamp": report.timestamp.isoformat(),
            "pass_rate": report.pass_rate,
            "avg_score": report.avg_score,
            "results": [
                {
                    "id": r.test_case_id,
                    "output": r.output,
                    "score": r.score,
                    "passed": r.passed
                }
                for r in report.results
            ]
        }
        self._get_baseline_path(test_name).write_text(json.dumps(data, indent=2))

    def compare_to_baseline(
        self,
        test_name: str,
        current_report: EvalReport,
        regression_threshold: float = 0.05
    ) -> dict:
        """Compare current results to baseline."""
        baseline_path = self._get_baseline_path(test_name)
        if not baseline_path.exists():
            return {
                "status": "no_baseline",
                "message": "No baseline found. Run with --save-baseline first."
            }

        baseline = json.loads(baseline_path.read_text())

        # Compare aggregate metrics
        pass_rate_diff = current_report.pass_rate - baseline["pass_rate"]
        score_diff = current_report.avg_score - baseline["avg_score"]

        # Check for regressions
        regressions = []
        if pass_rate_diff < -regression_threshold:
            regressions.append(
                f"Pass rate dropped: {baseline['pass_rate']:.1%} -> {current_report.pass_rate:.1%}"
            )
        if score_diff < -regression_threshold:
            regressions.append(
                f"Avg score dropped: {baseline['avg_score']:.2f} -> {current_report.avg_score:.2f}"
            )

        # Check individual cases
        baseline_results = {r["id"]: r for r in baseline["results"]}
        for result in current_report.results:
            if result.test_case_id in baseline_results:
                baseline_result = baseline_results[result.test_case_id]
                if baseline_result["passed"] and not result.passed:
                    regressions.append(f"Case {result.test_case_id} now failing")

        return {
            "status": "regression" if regressions else "pass",
            "regressions": regressions,
            "pass_rate_change": pass_rate_diff,
            "score_change": score_diff
        }


# Usage in CI/CD
def run_regression_tests():
    """Run as part of a CI pipeline."""
    evaluator = LLMEvaluator()
    tester = PromptRegressionTester()

    # Load test dataset
    dataset = EvalDataset.load(Path("./evals/summarization.json"))

    # Run evaluation
    report = evaluator.evaluate_dataset(
        dataset,
        prompt_template="Summarize: {input}",
        system_prompt="Be concise."
    )

    # Compare to baseline
    comparison = tester.compare_to_baseline("summarization", report)

    if comparison["status"] == "regression":
        print("REGRESSION DETECTED!")
        for reg in comparison["regressions"]:
            print(f"  - {reg}")
        raise SystemExit(1)  # fail the CI job
    elif comparison["status"] == "no_baseline":
        print("No baseline found. Saving current results as baseline.")
        tester.save_baseline("summarization", report)
    else:
        print(f"All tests passed. Score change: {comparison['score_change']:+.2f}")


run_regression_tests()

Production Monitoring
from collections import deque
from threading import Lock
import random


class ProductionEvaluator:
    """Continuous evaluation in production."""

    def __init__(
        self,
        sample_rate: float = 0.01,  # Evaluate 1% of requests
        window_size: int = 1000
    ):
        self.sample_rate = sample_rate
        self.window_size = window_size
        self.scores: deque = deque(maxlen=window_size)
        self.latencies: deque = deque(maxlen=window_size)
        self.lock = Lock()
        self.judge = LLMJudge()

    def should_evaluate(self) -> bool:
        """Determine if this request should be evaluated."""
        return random.random() < self.sample_rate

    def evaluate_async(
        self,
        input_text: str,
        output: str,
        latency_ms: float,
        criteria: list[str]
    ):
        """Evaluate in the background (call from an async task)."""
        # Get LLM judge score
        result = self.judge.score_output(
            input_text=input_text,
            output=output,
            criteria=criteria
        )
        with self.lock:
            self.scores.append(result["overall_score"])
            self.latencies.append(latency_ms)

    def get_metrics(self) -> dict:
        """Get current production metrics."""
        with self.lock:
            if not self.scores:
                return {"status": "no_data"}
            scores = list(self.scores)
            latencies = list(self.latencies)
        return {
            "sample_count": len(scores),
            "avg_score": sum(scores) / len(scores),
            "min_score": min(scores),
            "max_score": max(scores),
            "avg_latency_ms": sum(latencies) / len(latencies),
            "p95_latency_ms": sorted(latencies)[int(len(latencies) * 0.95)]
        }

    def check_alerts(self, min_score: float = 3.0) -> list[str]:
        """Check for alert conditions."""
        metrics = self.get_metrics()
        alerts = []
        if metrics.get("avg_score", 5) < min_score:
            alerts.append(f"Average score {metrics['avg_score']:.1f} below threshold {min_score}")
        if metrics.get("p95_latency_ms", 0) > 5000:
            alerts.append(f"P95 latency {metrics['p95_latency_ms']:.0f}ms exceeds 5s")
        return alerts


# Integration with FastAPI
from fastapi import FastAPI, BackgroundTasks

app = FastAPI()
prod_evaluator = ProductionEvaluator(sample_rate=0.02)


@app.post("/chat")
async def chat(message: str, background_tasks: BackgroundTasks):
    import time
    start = time.time()

    # Get LLM response
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": message}]
    )
    output = response.choices[0].message.content
    latency = (time.time() - start) * 1000

    # Sample for evaluation
    if prod_evaluator.should_evaluate():
        background_tasks.add_task(
            prod_evaluator.evaluate_async,
            input_text=message,
            output=output,
            latency_ms=latency,
            criteria=["Helpfulness", "Accuracy", "Clarity"]
        )

    return {"response": output}


@app.get("/metrics")
async def metrics():
    return prod_evaluator.get_metrics()

References
- OpenAI Evals: https://github.com/openai/evals
- LangSmith: https://smith.langchain.com/
- Promptfoo: https://promptfoo.dev/
- DeepEval: https://github.com/confident-ai/deepeval
Conclusion
Testing LLM applications requires different approaches than traditional software. Build evaluation datasets that capture your specific use cases—generic benchmarks won’t tell you if your product works. Use multiple evaluation methods: rule-based checks for objective criteria, LLM-as-judge for subjective quality, and human review for critical decisions. Implement regression testing to catch quality drops when prompts change. Monitor production continuously with sampling-based evaluation. The goal isn’t perfect scores—it’s confidence that your LLM features work reliably for users. Start with a small evaluation set covering critical paths, then expand as you learn what breaks. Testing LLMs is an ongoing process, not a one-time task.
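
As a rough illustration of combining those methods, the sketch below layers them: a rule-based check handles the objective criteria, the LLMJudge from earlier scores subjective quality, and anything the judge rates poorly lands in a queue for human review. The route_for_review function, the review_queue list, and the thresholds are hypothetical names and values for this example, not part of the classes above.

# Hypothetical tiered evaluation: rule-based checks first, LLM-as-judge for
# subjective quality, human review as the last resort. `route_for_review` and
# `review_queue` are illustrative, not part of the classes defined earlier.
review_queue: list[dict] = []  # in production this might be a ticketing system or database table

def route_for_review(test_case: TestCase, output: str, judge: LLMJudge) -> str:
    # 1. Rule-based check for objective criteria: required terms must appear.
    missing = [t for t in test_case.expected_contains if t.lower() not in output.lower()]
    if missing:
        return "failed_rules"  # objective failure; no judge call needed

    # 2. LLM-as-judge for subjective quality.
    verdict = judge.score_output(
        input_text=test_case.input,
        output=output,
        criteria=["Accuracy", "Clarity"]
    )
    if verdict["overall_score"] >= 4:
        return "auto_pass"

    # 3. Borderline or low-scoring outputs are queued for a human decision.
    review_queue.append({
        "case_id": test_case.id,
        "output": output,
        "judge_score": verdict["overall_score"]
    })
    return "needs_human_review"

The thresholds here are arbitrary; tune them against your own evaluation data and how much human-review capacity you actually have.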