Introduction
Testing LLM applications is fundamentally different from testing traditional software. Outputs are non-deterministic, quality is subjective, and the space of edge cases is effectively infinite. You can’t simply assert that output equals expected; you need to evaluate whether outputs are good enough across multiple dimensions. Yet many teams skip testing entirely or rely solely on manual spot-checking. This guide covers practical testing strategies: unit tests for prompts and chains, evaluation metrics for quality assessment, regression testing to catch degradation, and continuous evaluation pipelines that run automatically.
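To make the mindset shift concrete, here is a minimal, self-contained sketch. The scorers are deliberately trivial placeholders; the rest of this guide builds real ones (contains checks, LLM-as-judge, regression suites).
# Traditional software: a deterministic assertion on an exact value.
def add(a: int, b: int) -> int:
    return a + b

assert add(2, 2) == 4

# LLM applications: score the output along several dimensions and assert
# that each score clears a threshold, rather than asserting exact equality.
def keyword_coverage(output: str, keywords: list[str]) -> float:
    hits = sum(1 for k in keywords if k.lower() in output.lower())
    return hits / len(keywords) if keywords else 1.0

def length_ok(output: str, min_len: int = 20, max_len: int = 500) -> float:
    return 1.0 if min_len <= len(output) <= max_len else 0.0

output = "Machine learning lets systems learn patterns from data."
scores = {
    "keyword_coverage": keyword_coverage(output, ["machine learning", "data"]),
    "length": length_ok(output),
}
assert all(score >= 0.7 for score in scores.values()), scores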

Unit Testing Prompts
from dataclasses import dataclass
from typing import Any, Callable, Optional
import pytest
import asyncio
@dataclass
class PromptTestCase:
"""A test case for a prompt."""
name: str
input_variables: dict
    expected_contains: Optional[list[str]] = None
    expected_not_contains: Optional[list[str]] = None
    min_length: int = 0
    max_length: float = float('inf')
    custom_validator: Optional[Callable[[str], bool]] = None
class PromptTester:
"""Test prompts for correctness."""
def __init__(self, client: Any, model: str = "gpt-4o-mini"):
self.client = client
self.model = model
async def test_prompt(
self,
prompt_template: str,
test_case: PromptTestCase
) -> tuple[bool, str, list[str]]:
"""Test a prompt with a test case."""
# Format prompt
prompt = prompt_template.format(**test_case.input_variables)
# Get response
response = await self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}]
)
output = response.choices[0].message.content
# Run assertions
failures = []
# Check contains
if test_case.expected_contains:
for expected in test_case.expected_contains:
if expected.lower() not in output.lower():
failures.append(f"Expected to contain: '{expected}'")
# Check not contains
if test_case.expected_not_contains:
for not_expected in test_case.expected_not_contains:
if not_expected.lower() in output.lower():
failures.append(f"Should not contain: '{not_expected}'")
# Check length
if len(output) < test_case.min_length:
failures.append(f"Output too short: {len(output)} < {test_case.min_length}")
if len(output) > test_case.max_length:
failures.append(f"Output too long: {len(output)} > {test_case.max_length}")
# Custom validator
if test_case.custom_validator:
if not test_case.custom_validator(output):
failures.append("Custom validation failed")
passed = len(failures) == 0
return passed, output, failures
async def run_test_suite(
self,
prompt_template: str,
test_cases: list[PromptTestCase]
) -> dict:
"""Run a suite of tests."""
results = []
for test_case in test_cases:
passed, output, failures = await self.test_prompt(prompt_template, test_case)
results.append({
"name": test_case.name,
"passed": passed,
"output_preview": output[:200],
"failures": failures
})
passed_count = sum(1 for r in results if r["passed"])
return {
"total": len(results),
"passed": passed_count,
"failed": len(results) - passed_count,
"results": results
}
# Example pytest integration
class TestSummarizationPrompt:
"""Test cases for summarization prompt."""
@pytest.fixture
def prompt_template(self):
return """Summarize the following text in 2-3 sentences:
{text}
Summary:"""
@pytest.fixture
def tester(self, client):
return PromptTester(client)
@pytest.mark.asyncio
async def test_short_text(self, tester, prompt_template):
test_case = PromptTestCase(
name="short_text",
input_variables={
"text": "The quick brown fox jumps over the lazy dog. This is a simple sentence for testing."
},
min_length=20,
max_length=500
)
passed, output, failures = await tester.test_prompt(prompt_template, test_case)
assert passed, f"Test failed: {failures}"
@pytest.mark.asyncio
async def test_technical_text(self, tester, prompt_template):
test_case = PromptTestCase(
name="technical_text",
input_variables={
"text": "Machine learning is a subset of artificial intelligence that enables systems to learn from data. Neural networks are a key component of deep learning."
},
expected_contains=["machine learning", "artificial intelligence"],
min_length=50
)
passed, output, failures = await tester.test_prompt(prompt_template, test_case)
assert passed, f"Test failed: {failures}"
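The same tester works outside pytest as well, for example in a quick script while iterating on a prompt. A minimal sketch, assuming the OpenAI Python SDK's AsyncOpenAI client with OPENAI_API_KEY set in the environment (any client exposing the same chat.completions.create interface will do):
import asyncio
from openai import AsyncOpenAI  # assumption: openai>=1.x SDK installed

async def main() -> None:
    tester = PromptTester(AsyncOpenAI(), model="gpt-4o-mini")
    template = "Summarize the following text in 2-3 sentences:\n{text}\nSummary:"
    cases = [
        PromptTestCase(
            name="refund_policy",
            input_variables={"text": "Customers may return items within 30 days for a full refund."},
            expected_contains=["refund"],
            expected_not_contains=["I cannot"],
            min_length=20,
            max_length=400,
        ),
    ]
    report = await tester.run_test_suite(template, cases)
    print(f"{report['passed']}/{report['total']} passed")
    for result in report["results"]:
        if not result["passed"]:
            print(result["name"], result["failures"])

if __name__ == "__main__":
    asyncio.run(main())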
Evaluation Metrics
from dataclasses import dataclass
from typing import Any, Optional
import json
@dataclass
class EvaluationResult:
"""Result of an evaluation."""
metric_name: str
score: float # 0-1
    details: Optional[dict] = None
class ExactMatchEvaluator:
"""Evaluate exact match accuracy."""
def evaluate(self, output: str, expected: str) -> EvaluationResult:
"""Check if output exactly matches expected."""
# Normalize whitespace
output_normalized = " ".join(output.split())
expected_normalized = " ".join(expected.split())
score = 1.0 if output_normalized == expected_normalized else 0.0
return EvaluationResult(
metric_name="exact_match",
score=score,
details={"matched": score == 1.0}
)
class ContainsEvaluator:
"""Evaluate if output contains expected elements."""
def evaluate(
self,
output: str,
expected_elements: list[str],
case_sensitive: bool = False
) -> EvaluationResult:
"""Check if output contains all expected elements."""
if not case_sensitive:
output = output.lower()
expected_elements = [e.lower() for e in expected_elements]
found = [e for e in expected_elements if e in output]
score = len(found) / len(expected_elements) if expected_elements else 1.0
return EvaluationResult(
metric_name="contains",
score=score,
details={
"found": found,
"missing": [e for e in expected_elements if e not in found]
}
)
class LLMJudgeEvaluator:
"""Use LLM as a judge for quality evaluation."""
def __init__(self, client: Any, model: str = "gpt-4o"):
self.client = client
self.model = model
async def evaluate_relevance(
self,
query: str,
output: str
) -> EvaluationResult:
"""Evaluate relevance of output to query."""
prompt = f"""Rate how relevant this response is to the query on a scale of 1-5.
Query: {query}
Response: {output[:2000]}
Provide your rating as a JSON object:
{{"score": 1-5, "reason": "brief explanation"}}"""
response = await self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
result = json.loads(response.choices[0].message.content)
return EvaluationResult(
metric_name="relevance",
score=result["score"] / 5, # Normalize to 0-1
details={"raw_score": result["score"], "reason": result["reason"]}
)
async def evaluate_correctness(
self,
output: str,
ground_truth: str
) -> EvaluationResult:
"""Evaluate factual correctness against ground truth."""
prompt = f"""Compare the response to the ground truth and rate factual correctness on a scale of 1-5.
Response: {output[:1500]}
Ground Truth: {ground_truth[:1500]}
Provide your rating as a JSON object:
{{"score": 1-5, "correct_facts": ["list"], "incorrect_facts": ["list"]}}"""
response = await self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
result = json.loads(response.choices[0].message.content)
return EvaluationResult(
metric_name="correctness",
score=result["score"] / 5,
details={
"correct_facts": result.get("correct_facts", []),
"incorrect_facts": result.get("incorrect_facts", [])
}
)
async def evaluate_helpfulness(
self,
query: str,
output: str
) -> EvaluationResult:
"""Evaluate how helpful the response is."""
prompt = f"""Rate how helpful this response is for the user's query on a scale of 1-5.
Consider: Does it answer the question? Is it actionable? Is it complete?
Query: {query}
Response: {output[:2000]}
Provide your rating as a JSON object:
{{"score": 1-5, "strengths": ["list"], "weaknesses": ["list"]}}"""
response = await self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
result = json.loads(response.choices[0].message.content)
return EvaluationResult(
metric_name="helpfulness",
score=result["score"] / 5,
details={
"strengths": result.get("strengths", []),
"weaknesses": result.get("weaknesses", [])
}
)
class CodeEvaluator:
"""Evaluate code generation quality."""
def evaluate_syntax(self, code: str, language: str = "python") -> EvaluationResult:
"""Check if code has valid syntax."""
if language == "python":
try:
import ast
ast.parse(code)
return EvaluationResult(
metric_name="syntax",
score=1.0,
details={"valid": True}
)
except SyntaxError as e:
return EvaluationResult(
metric_name="syntax",
score=0.0,
details={"valid": False, "error": str(e)}
)
return EvaluationResult(
metric_name="syntax",
score=0.5,
details={"message": f"Syntax check not implemented for {language}"}
)
def evaluate_execution(
self,
code: str,
test_cases: list[tuple[dict, Any]]
) -> EvaluationResult:
"""Execute code and check against test cases."""
passed = 0
results = []
for inputs, expected in test_cases:
try:
                # Fresh namespace for exec (note: exec is not a sandbox; run untrusted
                # or model-generated code in an isolated process or container instead)
namespace = {}
exec(code, namespace)
# Find the main function
func_name = None
for name, obj in namespace.items():
if callable(obj) and not name.startswith('_'):
func_name = name
break
if func_name:
result = namespace[func_name](**inputs)
if result == expected:
passed += 1
results.append({"inputs": inputs, "passed": True})
else:
results.append({
"inputs": inputs,
"passed": False,
"expected": expected,
"got": result
})
else:
results.append({"inputs": inputs, "passed": False, "error": "No function found"})
except Exception as e:
results.append({"inputs": inputs, "passed": False, "error": str(e)})
score = passed / len(test_cases) if test_cases else 0
return EvaluationResult(
metric_name="execution",
score=score,
details={"passed": passed, "total": len(test_cases), "results": results}
)
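A common pattern is to layer these evaluators by cost: run the deterministic checks on every output, and reserve LLM-as-judge metrics for sampled or nightly evaluation. A short sketch combining them, again assuming an AsyncOpenAI-compatible client for the judge:
import asyncio
from openai import AsyncOpenAI  # assumption: openai>=1.x SDK installed

async def main() -> None:
    answer = "Machine learning is a branch of AI in which models learn patterns from data."
    # Cheap, deterministic checks first.
    contains = ContainsEvaluator().evaluate(answer, ["machine learning", "data"])
    print("contains:", contains.score, "missing:", contains.details["missing"])
    syntax = CodeEvaluator().evaluate_syntax("def square(x):\n    return x * x")
    print("syntax:", syntax.score)
    # More expensive LLM-as-judge metrics on top.
    judge = LLMJudgeEvaluator(AsyncOpenAI(), model="gpt-4o")
    relevance = await judge.evaluate_relevance("What is machine learning?", answer)
    print("relevance:", relevance.score, relevance.details["reason"])

if __name__ == "__main__":
    asyncio.run(main())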
Regression Testing
from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime
import json
import hashlib
@dataclass
class GoldenExample:
"""A golden example for regression testing."""
id: str
prompt: str
expected_output: str
metadata: dict = field(default_factory=dict)
created_at: datetime = field(default_factory=datetime.utcnow)
def to_dict(self) -> dict:
return {
"id": self.id,
"prompt": self.prompt,
"expected_output": self.expected_output,
"metadata": self.metadata,
"created_at": self.created_at.isoformat()
}
@classmethod
def from_dict(cls, data: dict) -> "GoldenExample":
return cls(
id=data["id"],
prompt=data["prompt"],
expected_output=data["expected_output"],
metadata=data.get("metadata", {}),
created_at=datetime.fromisoformat(data["created_at"])
)
class GoldenStore:
"""Store and manage golden examples."""
def __init__(self, storage_path: str):
self.storage_path = storage_path
self.examples: dict[str, GoldenExample] = {}
self._load()
def _load(self):
"""Load examples from storage."""
try:
with open(self.storage_path, 'r') as f:
data = json.load(f)
for item in data:
example = GoldenExample.from_dict(item)
self.examples[example.id] = example
except FileNotFoundError:
pass
def _save(self):
"""Save examples to storage."""
with open(self.storage_path, 'w') as f:
json.dump([e.to_dict() for e in self.examples.values()], f, indent=2)
def add(self, prompt: str, expected_output: str, metadata: dict = None) -> str:
"""Add a golden example."""
example_id = hashlib.md5(prompt.encode()).hexdigest()[:12]
example = GoldenExample(
id=example_id,
prompt=prompt,
expected_output=expected_output,
metadata=metadata or {}
)
self.examples[example_id] = example
self._save()
return example_id
def get(self, example_id: str) -> Optional[GoldenExample]:
"""Get a golden example by ID."""
return self.examples.get(example_id)
def list_all(self) -> list[GoldenExample]:
"""List all golden examples."""
return list(self.examples.values())
def update(self, example_id: str, expected_output: str):
"""Update expected output for an example."""
if example_id in self.examples:
self.examples[example_id].expected_output = expected_output
self._save()
@dataclass
class RegressionResult:
"""Result of a regression test."""
example_id: str
passed: bool
similarity_score: float
current_output: str
expected_output: str
diff_summary: str = ""
class RegressionTester:
"""Run regression tests against golden examples."""
def __init__(
self,
client: Any,
golden_store: GoldenStore,
        evaluator: Optional[LLMJudgeEvaluator] = None,
similarity_threshold: float = 0.8
):
self.client = client
self.golden_store = golden_store
self.evaluator = evaluator
self.similarity_threshold = similarity_threshold
async def run_single(
self,
example: GoldenExample,
model: str = "gpt-4o-mini"
) -> RegressionResult:
"""Run regression test for a single example."""
# Get current output
response = await self.client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": example.prompt}]
)
current_output = response.choices[0].message.content
# Calculate similarity
if self.evaluator:
eval_result = await self.evaluator.evaluate_correctness(
current_output,
example.expected_output
)
similarity = eval_result.score
else:
similarity = self._simple_similarity(current_output, example.expected_output)
passed = similarity >= self.similarity_threshold
return RegressionResult(
example_id=example.id,
passed=passed,
similarity_score=similarity,
current_output=current_output,
expected_output=example.expected_output,
diff_summary=self._summarize_diff(current_output, example.expected_output)
)
async def run_all(self, model: str = "gpt-4o-mini") -> dict:
"""Run all regression tests."""
examples = self.golden_store.list_all()
results = []
for example in examples:
result = await self.run_single(example, model)
results.append(result)
passed = sum(1 for r in results if r.passed)
return {
"total": len(results),
"passed": passed,
"failed": len(results) - passed,
"pass_rate": passed / len(results) if results else 0,
"results": [
{
"example_id": r.example_id,
"passed": r.passed,
"similarity": r.similarity_score,
"diff_summary": r.diff_summary
}
for r in results
]
}
def _simple_similarity(self, text1: str, text2: str) -> float:
"""Calculate simple word overlap similarity."""
words1 = set(text1.lower().split())
words2 = set(text2.lower().split())
if not words1 or not words2:
return 0.0
intersection = len(words1 & words2)
union = len(words1 | words2)
return intersection / union
def _summarize_diff(self, current: str, expected: str) -> str:
"""Summarize differences between outputs."""
current_words = set(current.lower().split())
expected_words = set(expected.lower().split())
missing = expected_words - current_words
extra = current_words - expected_words
summary_parts = []
if missing:
summary_parts.append(f"Missing: {', '.join(list(missing)[:5])}")
if extra:
summary_parts.append(f"Extra: {', '.join(list(extra)[:5])}")
return "; ".join(summary_parts) if summary_parts else "Similar"
Continuous Evaluation Pipeline
from dataclasses import dataclass, field
from typing import Any, Callable
from datetime import datetime
import asyncio
@dataclass
class EvaluationRun:
"""A single evaluation run."""
run_id: str
timestamp: datetime
model: str
metrics: dict[str, float]
details: dict = field(default_factory=dict)
class EvaluationPipeline:
"""Continuous evaluation pipeline."""
def __init__(
self,
client: Any,
model: str = "gpt-4o-mini"
):
self.client = client
self.model = model
self.evaluators: list[tuple[str, Any]] = []
self.test_cases: list[dict] = []
self.runs: list[EvaluationRun] = []
def add_evaluator(self, name: str, evaluator: Any):
"""Add an evaluator to the pipeline."""
self.evaluators.append((name, evaluator))
def add_test_case(
self,
prompt: str,
expected: str = None,
metadata: dict = None
):
"""Add a test case."""
self.test_cases.append({
"prompt": prompt,
"expected": expected,
"metadata": metadata or {}
})
async def run(self) -> EvaluationRun:
"""Run the evaluation pipeline."""
import uuid
run_id = str(uuid.uuid4())[:8]
all_scores = {name: [] for name, _ in self.evaluators}
details = []
for test_case in self.test_cases:
# Get model output
response = await self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": test_case["prompt"]}]
)
output = response.choices[0].message.content
# Run evaluators
case_scores = {}
for name, evaluator in self.evaluators:
if hasattr(evaluator, 'evaluate'):
if asyncio.iscoroutinefunction(evaluator.evaluate):
result = await evaluator.evaluate(output, test_case.get("expected", ""))
else:
result = evaluator.evaluate(output, test_case.get("expected", ""))
score = result.score if hasattr(result, 'score') else result
all_scores[name].append(score)
case_scores[name] = score
details.append({
"prompt": test_case["prompt"][:100],
"output": output[:200],
"scores": case_scores
})
# Calculate aggregate metrics
metrics = {}
for name, scores in all_scores.items():
if scores:
metrics[f"{name}_avg"] = sum(scores) / len(scores)
metrics[f"{name}_min"] = min(scores)
metrics[f"{name}_max"] = max(scores)
run = EvaluationRun(
run_id=run_id,
timestamp=datetime.utcnow(),
model=self.model,
metrics=metrics,
details={"cases": details}
)
self.runs.append(run)
return run
def compare_runs(self, run_id_1: str, run_id_2: str) -> dict:
"""Compare two evaluation runs."""
run1 = next((r for r in self.runs if r.run_id == run_id_1), None)
run2 = next((r for r in self.runs if r.run_id == run_id_2), None)
if not run1 or not run2:
return {"error": "Run not found"}
comparison = {}
all_metrics = set(run1.metrics.keys()) | set(run2.metrics.keys())
for metric in all_metrics:
val1 = run1.metrics.get(metric, 0)
val2 = run2.metrics.get(metric, 0)
comparison[metric] = {
"run1": val1,
"run2": val2,
"diff": val2 - val1,
"improved": val2 > val1
}
return comparison
def get_trend(self, metric: str, last_n: int = 10) -> list[float]:
"""Get trend for a metric over recent runs."""
recent_runs = self.runs[-last_n:]
return [r.metrics.get(metric, 0) for r in recent_runs]
class CIIntegration:
"""Integration with CI/CD systems."""
def __init__(
self,
pipeline: EvaluationPipeline,
thresholds: dict[str, float]
):
self.pipeline = pipeline
self.thresholds = thresholds
async def run_ci_check(self) -> tuple[bool, dict]:
"""Run CI check and return pass/fail status."""
run = await self.pipeline.run()
failures = []
for metric, threshold in self.thresholds.items():
actual = run.metrics.get(metric, 0)
if actual < threshold:
failures.append({
"metric": metric,
"threshold": threshold,
"actual": actual
})
passed = len(failures) == 0
return passed, {
"run_id": run.run_id,
"passed": passed,
"metrics": run.metrics,
"failures": failures
}
def generate_report(self, run: EvaluationRun) -> str:
"""Generate a markdown report for CI."""
lines = [
f"# Evaluation Report",
f"",
f"**Run ID:** {run.run_id}",
f"**Model:** {run.model}",
f"**Timestamp:** {run.timestamp.isoformat()}",
f"",
f"## Metrics",
f""
]
for metric, value in sorted(run.metrics.items()):
threshold = self.thresholds.get(metric)
status = ""
            if threshold is not None:
                status = " :white_check_mark:" if value >= threshold else " :x:"
lines.append(f"- **{metric}:** {value:.3f}{status}")
return "\n".join(lines)
Production Testing Service
from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel
from typing import Optional
app = FastAPI()
# Initialize components
golden_store = GoldenStore("/tmp/golden_examples.json")
llm_judge = None # Initialize with actual client
regression_tester = None # Initialize with actual client
eval_pipeline = None # Initialize with actual client
class PromptTestRequest(BaseModel):
prompt_template: str
test_cases: list[dict]
class GoldenExampleRequest(BaseModel):
prompt: str
expected_output: str
metadata: Optional[dict] = None
class EvaluationRequest(BaseModel):
output: str
expected: Optional[str] = None
query: Optional[str] = None
metrics: list[str] = ["relevance", "helpfulness"]
class RegressionRequest(BaseModel):
model: str = "gpt-4o-mini"
example_ids: Optional[list[str]] = None
@app.post("/v1/test/prompt")
async def test_prompt(request: PromptTestRequest):
"""Test a prompt template."""
    if not llm_judge:
        raise HTTPException(status_code=500, detail="Evaluator not configured")
    tester = PromptTester(llm_judge.client)
test_cases = [
PromptTestCase(
name=tc.get("name", f"test_{i}"),
input_variables=tc["input_variables"],
expected_contains=tc.get("expected_contains"),
expected_not_contains=tc.get("expected_not_contains"),
min_length=tc.get("min_length", 0),
max_length=tc.get("max_length", float('inf'))
)
for i, tc in enumerate(request.test_cases)
]
results = await tester.run_test_suite(request.prompt_template, test_cases)
return results
@app.post("/v1/golden/add")
async def add_golden_example(request: GoldenExampleRequest):
"""Add a golden example."""
example_id = golden_store.add(
request.prompt,
request.expected_output,
request.metadata
)
return {"example_id": example_id, "status": "added"}
@app.get("/v1/golden/list")
async def list_golden_examples():
"""List all golden examples."""
examples = golden_store.list_all()
return {
"count": len(examples),
"examples": [
{
"id": e.id,
"prompt_preview": e.prompt[:100],
"created_at": e.created_at.isoformat()
}
for e in examples
]
}
@app.post("/v1/evaluate")
async def evaluate_output(request: EvaluationRequest):
"""Evaluate an LLM output."""
if not llm_judge:
raise HTTPException(status_code=500, detail="Evaluator not configured")
results = {}
if "relevance" in request.metrics and request.query:
result = await llm_judge.evaluate_relevance(request.query, request.output)
results["relevance"] = {"score": result.score, "details": result.details}
if "correctness" in request.metrics and request.expected:
result = await llm_judge.evaluate_correctness(request.output, request.expected)
results["correctness"] = {"score": result.score, "details": result.details}
if "helpfulness" in request.metrics and request.query:
result = await llm_judge.evaluate_helpfulness(request.query, request.output)
results["helpfulness"] = {"score": result.score, "details": result.details}
return {"metrics": results}
@app.post("/v1/regression/run")
async def run_regression_tests(request: RegressionRequest):
"""Run regression tests."""
if not regression_tester:
raise HTTPException(status_code=500, detail="Regression tester not configured")
if request.example_ids:
# Run specific examples
results = []
for example_id in request.example_ids:
example = golden_store.get(example_id)
if example:
result = await regression_tester.run_single(example, request.model)
results.append({
"example_id": result.example_id,
"passed": result.passed,
"similarity": result.similarity_score
})
passed = sum(1 for r in results if r["passed"])
return {
"total": len(results),
"passed": passed,
"failed": len(results) - passed,
"results": results
}
else:
# Run all
return await regression_tester.run_all(request.model)
@app.post("/v1/pipeline/run")
async def run_evaluation_pipeline(background_tasks: BackgroundTasks):
"""Run the evaluation pipeline."""
if not eval_pipeline:
raise HTTPException(status_code=500, detail="Pipeline not configured")
run = await eval_pipeline.run()
return {
"run_id": run.run_id,
"timestamp": run.timestamp.isoformat(),
"model": run.model,
"metrics": run.metrics
}
@app.get("/v1/pipeline/runs")
async def list_pipeline_runs(limit: int = 10):
"""List recent pipeline runs."""
if not eval_pipeline:
raise HTTPException(status_code=500, detail="Pipeline not configured")
runs = eval_pipeline.runs[-limit:]
return {
"count": len(runs),
"runs": [
{
"run_id": r.run_id,
"timestamp": r.timestamp.isoformat(),
"model": r.model,
"metrics": r.metrics
}
for r in runs
]
}
@app.get("/v1/pipeline/trend/{metric}")
async def get_metric_trend(metric: str, last_n: int = 10):
"""Get trend for a metric."""
if not eval_pipeline:
raise HTTPException(status_code=500, detail="Pipeline not configured")
trend = eval_pipeline.get_trend(metric, last_n)
return {
"metric": metric,
"values": trend,
"avg": sum(trend) / len(trend) if trend else 0
}
@app.get("/health")
async def health():
return {"status": "healthy"}
References
- LangSmith Evaluation: https://docs.smith.langchain.com/evaluation
- OpenAI Evals: https://github.com/openai/evals
- DeepEval: https://docs.confident-ai.com/
- RAGAS: https://docs.ragas.io/
Conclusion
Testing LLM applications requires a different mindset than traditional software testing. Unit tests for prompts verify basic behavior—that outputs contain expected elements, meet length requirements, and pass custom validators. Evaluation metrics quantify quality across dimensions like relevance, correctness, and helpfulness, using both deterministic checks and LLM-as-judge approaches. Regression testing with golden examples catches degradation when you change prompts, models, or system configurations. Continuous evaluation pipelines run automatically, tracking metrics over time and integrating with CI/CD to gate deployments. The key insight is that LLM testing is about measuring quality distributions, not asserting exact outputs. Start with basic unit tests and contains checks, add LLM-based evaluation for subjective quality, build a golden set for regression testing, and implement continuous evaluation as your system matures. Test early, test often, and track metrics over time to catch degradation before users do.
