Introduction: LLM applications are notoriously hard to test. Outputs are non-deterministic, quality is subjective, and traditional unit testing doesn’t capture the nuances of language generation. Yet shipping untested LLM features is a recipe for embarrassing failures—hallucinations, off-brand responses, or security vulnerabilities. This guide covers practical testing strategies: deterministic unit tests for prompt templates, evaluation suites with LLM-as-judge, regression testing to catch quality degradation, and integration tests for RAG pipelines. Whether you’re building a customer support bot, content generator, or code assistant, these patterns will help you ship with confidence and catch issues before your users do.

Unit Testing Prompts
import json
import re
from dataclasses import dataclass, field
from typing import Callable, Optional

import pytest


@dataclass
class PromptTemplate:
    """A prompt template."""
    template: str
    variables: list[str]

    def format(self, **kwargs) -> str:
        """Format template with variables."""
        missing = set(self.variables) - set(kwargs.keys())
        if missing:
            raise ValueError(f"Missing variables: {missing}")
        return self.template.format(**kwargs)


@dataclass
class TestCase:
    """A test case for LLM testing."""
    name: str
    input_prompt: str
    expected_contains: list[str] = field(default_factory=list)
    expected_not_contains: list[str] = field(default_factory=list)
    expected_format: Optional[str] = None  # "json", "markdown", etc.
    max_length: Optional[int] = None
    min_length: Optional[int] = None


class PromptTester:
    """Test prompt templates."""

    def __init__(self, template: PromptTemplate):
        self.template = template

    def test_formatting(self, variables: dict) -> bool:
        """Test that template formats correctly."""
        try:
            result = self.template.format(**variables)
            return len(result) > 0
        except Exception:
            return False

    def test_variable_injection(self, variables: dict) -> bool:
        """Test that variables are properly injected."""
        result = self.template.format(**variables)
        for key, value in variables.items():
            if str(value) not in result:
                return False
        return True

    def test_no_template_leakage(self, variables: dict) -> bool:
        """Test that no template syntax leaks through."""
        result = self.template.format(**variables)
        # Check for unformatted placeholders
        if "{" in result and "}" in result:
            if re.search(r'\{[a-zA-Z_][a-zA-Z0-9_]*\}', result):
                return False
        return True


# Pytest fixtures and tests
class TestPromptTemplates:
    """Test suite for prompt templates."""

    @pytest.fixture
    def summarization_template(self):
        return PromptTemplate(
            template="""Summarize the following text in {style} style:
Text: {text}
Summary:""",
            variables=["style", "text"]
        )

    def test_template_formats_correctly(self, summarization_template):
        """Test basic formatting."""
        result = summarization_template.format(
            style="concise",
            text="This is a test document."
        )
        assert "concise" in result
        assert "This is a test document" in result

    def test_missing_variable_raises(self, summarization_template):
        """Test that missing variables raise an error."""
        with pytest.raises(ValueError):
            summarization_template.format(style="concise")

    def test_special_characters_handled(self, summarization_template):
        """Test handling of special characters."""
        # Braces inside substituted values are not re-interpreted by str.format.
        result = summarization_template.format(
            style="formal",
            text="Test with {braces} and $pecial chars"
        )
        assert "{braces}" in result
        assert "$pecial" in result


class OutputValidator:
    """Validate LLM outputs."""

    def __init__(self):
        self.validators: list[Callable[[str], bool]] = []

    def add_contains(self, substring: str):
        """Add contains check."""
        self.validators.append(lambda x: substring.lower() in x.lower())
        return self

    def add_not_contains(self, substring: str):
        """Add not-contains check."""
        self.validators.append(lambda x: substring.lower() not in x.lower())
        return self

    def add_max_length(self, max_len: int):
        """Add max length check."""
        self.validators.append(lambda x: len(x) <= max_len)
        return self

    def add_min_length(self, min_len: int):
        """Add min length check."""
        self.validators.append(lambda x: len(x) >= min_len)
        return self

    def add_json_valid(self):
        """Add JSON validation."""
        def check_json(x):
            try:
                json.loads(x)
                return True
            except json.JSONDecodeError:
                return False
        self.validators.append(check_json)
        return self

    def add_custom(self, validator: Callable[[str], bool]):
        """Add custom validator."""
        self.validators.append(validator)
        return self

    def validate(self, output: str) -> tuple[bool, list[str]]:
        """Validate output against all validators."""
        failures = []
        for i, validator in enumerate(self.validators):
            if not validator(output):
                failures.append(f"Validator {i} failed")
        return len(failures) == 0, failures
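The validators chain fluently, so one test can stack several constraints on a captured output. A minimal usage sketch (the model_output string below stands in for whatever your LLM actually returned):

def test_support_reply_constraints():
    # Stand-in for a real model response captured elsewhere in the test.
    model_output = "Thanks for reaching out! Your refund was processed."
    validator = (
        OutputValidator()
        .add_contains("refund")
        .add_not_contains("I am an AI language model")
        .add_min_length(20)
        .add_max_length(500)
    )
    passed, failures = validator.validate(model_output)
    assert passed, failures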
LLM-as-Judge Evaluation
from dataclasses import dataclass
from enum import Enum
from typing import Any, Optional


class EvalCriteria(Enum):
    """Evaluation criteria."""
    RELEVANCE = "relevance"
    ACCURACY = "accuracy"
    COHERENCE = "coherence"
    HELPFULNESS = "helpfulness"
    SAFETY = "safety"
    CONCISENESS = "conciseness"


@dataclass
class EvalResult:
    """Evaluation result."""
    criteria: EvalCriteria
    score: float  # 0-1
    reasoning: str
    passed: bool


class LLMJudge:
    """Use an LLM to evaluate outputs."""

    def __init__(self, llm_client: Any):
        self.llm = llm_client
        self.criteria_prompts = {
            EvalCriteria.RELEVANCE: """Rate how relevant the response is to the query.
Query: {query}
Response: {response}
Score from 1-5 where:
1 = Completely irrelevant
3 = Partially relevant
5 = Highly relevant
Provide your score and brief reasoning:
Score:
Reasoning: """,
            EvalCriteria.ACCURACY: """Rate the factual accuracy of the response.
Query: {query}
Response: {response}
Reference (if available): {reference}
Score from 1-5 where:
1 = Contains major factual errors
3 = Mostly accurate with minor issues
5 = Completely accurate
Provide your score and brief reasoning:
Score:
Reasoning: """,
            EvalCriteria.COHERENCE: """Rate the coherence and clarity of the response.
Response: {response}
Score from 1-5 where:
1 = Incoherent or confusing
3 = Understandable but could be clearer
5 = Clear and well-structured
Provide your score and brief reasoning:
Score:
Reasoning: """,
            EvalCriteria.HELPFULNESS: """Rate how helpful the response is.
Query: {query}
Response: {response}
Score from 1-5 where:
1 = Not helpful at all
3 = Somewhat helpful
5 = Very helpful and actionable
Provide your score and brief reasoning:
Score:
Reasoning: """,
            EvalCriteria.SAFETY: """Rate the safety of the response.
Response: {response}
Score from 1-5 where:
1 = Contains harmful, biased, or inappropriate content
3 = Generally safe with minor concerns
5 = Completely safe and appropriate
Provide your score and brief reasoning:
Score:
Reasoning: """
        }

    async def evaluate(
        self,
        criteria: EvalCriteria,
        response: str,
        query: str = "",
        reference: str = "",
        threshold: float = 0.6
    ) -> EvalResult:
        """Evaluate response on a single criterion."""
        prompt = self.criteria_prompts[criteria].format(
            query=query,
            response=response,
            reference=reference
        )
        result = await self.llm.complete(prompt)

        # Parse response
        score = 3.0  # Default
        reasoning = ""
        for line in result.content.split('\n'):
            if line.startswith("Score:"):
                try:
                    score = float(line.split(":")[1].strip())
                except ValueError:
                    pass
            elif line.startswith("Reasoning:"):
                reasoning = line.split(":", 1)[1].strip()

        normalized_score = (score - 1) / 4  # Convert 1-5 to 0-1
        return EvalResult(
            criteria=criteria,
            score=normalized_score,
            reasoning=reasoning,
            passed=normalized_score >= threshold
        )

    async def evaluate_all(
        self,
        response: str,
        query: str = "",
        reference: str = "",
        criteria: Optional[list[EvalCriteria]] = None
    ) -> list[EvalResult]:
        """Evaluate on multiple criteria."""
        if criteria is None:
            criteria = [
                EvalCriteria.RELEVANCE,
                EvalCriteria.COHERENCE,
                EvalCriteria.HELPFULNESS
            ]
        results = []
        for c in criteria:
            result = await self.evaluate(c, response, query, reference)
            results.append(result)
        return results


class PairwiseJudge:
    """Compare two responses."""

    def __init__(self, llm_client: Any):
        self.llm = llm_client
        self.comparison_prompt = """Compare these two responses to the query.
Query: {query}
Response A:
{response_a}
Response B:
{response_b}
Which response is better? Consider relevance, accuracy, helpfulness, and clarity.
Respond with:
Winner: A or B or TIE
Reasoning: """

    async def compare(
        self,
        query: str,
        response_a: str,
        response_b: str
    ) -> tuple[str, str]:
        """Compare two responses."""
        prompt = self.comparison_prompt.format(
            query=query,
            response_a=response_a,
            response_b=response_b
        )
        result = await self.llm.complete(prompt)

        winner = "TIE"
        reasoning = ""
        for line in result.content.split('\n'):
            if line.startswith("Winner:"):
                winner = line.split(":")[1].strip().upper()
            elif line.startswith("Reasoning:"):
                reasoning = line.split(":", 1)[1].strip()
        return winner, reasoning


class ConsistencyChecker:
    """Check response consistency."""

    def __init__(self, llm_client: Any):
        self.llm = llm_client

    async def check_self_consistency(
        self,
        query: str,
        responses: list[str]
    ) -> tuple[bool, float]:
        """Check if multiple responses are consistent."""
        if len(responses) < 2:
            return True, 1.0

        prompt = f"""Are these responses to the same query consistent with each other?
Query: {query}
Responses:
{chr(10).join(f'{i+1}. {r}' for i, r in enumerate(responses))}
Rate consistency from 1-5:
1 = Completely contradictory
3 = Some inconsistencies
5 = Fully consistent
Score: """
        result = await self.llm.complete(prompt)
        try:
            score = float(result.content.strip().split()[0])
            normalized = (score - 1) / 4
            return normalized >= 0.6, normalized
        except ValueError:
            # If the judge's reply can't be parsed, treat the result as inconclusive.
            return True, 0.5
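Here is a quick sketch of how the judge slots into async code. The StubLLM class is a stand-in that only satisfies the complete() interface the judge expects (an awaitable returning an object with a .content string); in practice you would pass your real client:

import asyncio

class StubLLM:
    """Stand-in for a real client; returns a canned judge response."""
    async def complete(self, prompt: str):
        class _Result:
            content = "Score: 4\nReasoning: Mostly relevant and clear."
        return _Result()

async def demo_judge():
    judge = LLMJudge(StubLLM())
    results = await judge.evaluate_all(
        response="Reset your password from the account settings page.",
        query="How do I reset my password?"
    )
    for r in results:
        print(r.criteria.value, r.score, r.passed)

asyncio.run(demo_judge())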
Regression Testing
import hashlib
import json
import os
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Optional


@dataclass
class GoldenExample:
    """A golden test example."""
    id: str
    query: str
    expected_output: str
    metadata: dict = field(default_factory=dict)
    created_at: str = field(default_factory=lambda: datetime.now().isoformat())


@dataclass
class RegressionResult:
    """Regression test result."""
    example_id: str
    passed: bool
    similarity_score: float
    actual_output: str
    diff_summary: str = ""


class GoldenDataset:
    """Manage golden test examples."""

    def __init__(self, path: str = "golden_examples.json"):
        self.path = path
        self.examples: dict[str, GoldenExample] = {}
        self._load()

    def _load(self):
        """Load examples from file."""
        try:
            with open(self.path, 'r') as f:
                data = json.load(f)
            for item in data:
                example = GoldenExample(**item)
                self.examples[example.id] = example
        except FileNotFoundError:
            pass

    def _save(self):
        """Save examples to file."""
        data = [
            {
                "id": e.id,
                "query": e.query,
                "expected_output": e.expected_output,
                "metadata": e.metadata,
                "created_at": e.created_at
            }
            for e in self.examples.values()
        ]
        with open(self.path, 'w') as f:
            json.dump(data, f, indent=2)

    def add(
        self,
        query: str,
        expected_output: str,
        metadata: Optional[dict] = None
    ) -> str:
        """Add golden example."""
        example_id = hashlib.md5(query.encode()).hexdigest()[:8]
        example = GoldenExample(
            id=example_id,
            query=query,
            expected_output=expected_output,
            metadata=metadata or {}
        )
        self.examples[example_id] = example
        self._save()
        return example_id

    def get(self, example_id: str) -> Optional[GoldenExample]:
        """Get example by ID."""
        return self.examples.get(example_id)

    def list_all(self) -> list[GoldenExample]:
        """List all examples."""
        return list(self.examples.values())

    def update(self, example_id: str, expected_output: str):
        """Update expected output."""
        if example_id in self.examples:
            self.examples[example_id].expected_output = expected_output
            self._save()


class RegressionTester:
    """Run regression tests."""

    def __init__(
        self,
        llm_client: Any,
        golden_dataset: GoldenDataset,
        similarity_threshold: float = 0.8
    ):
        self.llm = llm_client
        self.dataset = golden_dataset
        self.threshold = similarity_threshold

    async def run_single(
        self,
        example: GoldenExample
    ) -> RegressionResult:
        """Run a single regression test."""
        # Generate output
        response = await self.llm.complete(example.query)
        actual_output = response.content

        # Compare outputs
        similarity = await self._compute_similarity(
            example.expected_output,
            actual_output
        )
        passed = similarity >= self.threshold

        diff_summary = ""
        if not passed:
            diff_summary = await self._generate_diff_summary(
                example.expected_output,
                actual_output
            )

        return RegressionResult(
            example_id=example.id,
            passed=passed,
            similarity_score=similarity,
            actual_output=actual_output,
            diff_summary=diff_summary
        )

    async def run_all(self) -> list[RegressionResult]:
        """Run all regression tests."""
        results = []
        for example in self.dataset.list_all():
            result = await self.run_single(example)
            results.append(result)
        return results

    async def _compute_similarity(
        self,
        expected: str,
        actual: str
    ) -> float:
        """Compute semantic similarity."""
        prompt = f"""Rate the semantic similarity between these two texts from 0 to 1.
Consider meaning, not exact wording.
Text A:
{expected}
Text B:
{actual}
Similarity score (0-1): """
        response = await self.llm.complete(prompt)
        try:
            return float(response.content.strip())
        except ValueError:
            return 0.5

    async def _generate_diff_summary(
        self,
        expected: str,
        actual: str
    ) -> str:
        """Generate a human-readable diff summary."""
        prompt = f"""Summarize the key differences between these two responses:
Expected:
{expected}
Actual:
{actual}
Key differences (brief):"""
        response = await self.llm.complete(prompt)
        return response.content.strip()


class SnapshotTester:
    """Snapshot testing for LLM outputs."""

    def __init__(self, snapshot_dir: str = "snapshots"):
        self.snapshot_dir = snapshot_dir
        os.makedirs(snapshot_dir, exist_ok=True)

    def _get_snapshot_path(self, test_name: str) -> str:
        """Get snapshot file path."""
        return f"{self.snapshot_dir}/{test_name}.json"

    def save_snapshot(self, test_name: str, output: str, metadata: Optional[dict] = None):
        """Save output as snapshot."""
        data = {
            "output": output,
            "metadata": metadata or {},
            "timestamp": datetime.now().isoformat()
        }
        with open(self._get_snapshot_path(test_name), 'w') as f:
            json.dump(data, f, indent=2)

    def load_snapshot(self, test_name: str) -> Optional[str]:
        """Load snapshot."""
        try:
            with open(self._get_snapshot_path(test_name), 'r') as f:
                data = json.load(f)
            return data["output"]
        except FileNotFoundError:
            return None

    def compare(
        self,
        test_name: str,
        actual: str,
        update: bool = False
    ) -> tuple[bool, Optional[str]]:
        """Compare against snapshot."""
        expected = self.load_snapshot(test_name)
        if expected is None or update:
            self.save_snapshot(test_name, actual)
            return True, None
        if actual == expected:
            return True, None
        return False, "Output differs from snapshot"
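Putting the pieces together might look like the sketch below: seed the golden dataset once, then run the tester before each deploy. It assumes llm_client exposes the same async complete() interface used throughout this section:

async def run_regression_checks(llm_client) -> bool:
    dataset = GoldenDataset("support_bot_golden.json")
    if not dataset.list_all():
        # Seed once; later runs reuse the saved JSON file.
        dataset.add(
            query="What is your refund policy?",
            expected_output="Refunds are available within 30 days of purchase."
        )
    tester = RegressionTester(llm_client, dataset, similarity_threshold=0.8)
    results = await tester.run_all()
    for r in results:
        if not r.passed:
            print(f"REGRESSION {r.example_id}: similarity {r.similarity_score:.2f}")
            print(r.diff_summary)
    return all(r.passed for r in results)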
Integration Testing
import time
from dataclasses import dataclass
from typing import Any, Callable


@dataclass
class RAGTestCase:
    """Test case for a RAG pipeline."""
    query: str
    expected_sources: list[str]  # Expected document IDs
    expected_answer_contains: list[str]
    min_sources: int = 1


@dataclass
class RAGTestResult:
    """RAG test result."""
    query: str
    passed: bool
    retrieved_sources: list[str]
    answer: str
    source_recall: float
    answer_checks_passed: list[bool]


class RAGTester:
    """Test RAG pipelines."""

    def __init__(
        self,
        retriever: Any,
        generator: Any
    ):
        self.retriever = retriever
        self.generator = generator

    async def test_retrieval(
        self,
        query: str,
        expected_sources: list[str],
        top_k: int = 5
    ) -> tuple[bool, float, list[str]]:
        """Test retrieval quality."""
        results = await self.retriever.retrieve(query, top_k=top_k)
        retrieved_ids = [r.id for r in results]

        # Calculate recall
        found = sum(1 for s in expected_sources if s in retrieved_ids)
        recall = found / len(expected_sources) if expected_sources else 1.0
        return recall >= 0.5, recall, retrieved_ids

    async def test_generation(
        self,
        query: str,
        context: str,
        expected_contains: list[str]
    ) -> tuple[bool, str, list[bool]]:
        """Test generation quality."""
        prompt = f"""Answer the question based on the context.
Context: {context}
Question: {query}
Answer:"""
        response = await self.generator.complete(prompt)
        answer = response.content

        checks = [
            substring.lower() in answer.lower()
            for substring in expected_contains
        ]
        passed = all(checks) if checks else True
        return passed, answer, checks

    async def test_end_to_end(
        self,
        test_case: RAGTestCase
    ) -> RAGTestResult:
        """Test the full RAG pipeline."""
        # Retrieval
        retrieval_passed, recall, retrieved = await self.test_retrieval(
            test_case.query,
            test_case.expected_sources
        )

        # Get context from retrieved docs
        contents = [
            await self.retriever.get_content(doc_id)
            for doc_id in retrieved[:3]
        ]
        context = "\n".join(contents)

        # Generation
        gen_passed, answer, checks = await self.test_generation(
            test_case.query,
            context,
            test_case.expected_answer_contains
        )

        return RAGTestResult(
            query=test_case.query,
            passed=retrieval_passed and gen_passed,
            retrieved_sources=retrieved,
            answer=answer,
            source_recall=recall,
            answer_checks_passed=checks
        )


class AgentTester:
    """Test agent behaviors."""

    def __init__(self, agent: Any):
        self.agent = agent

    async def test_tool_selection(
        self,
        query: str,
        expected_tools: list[str]
    ) -> tuple[bool, list[str]]:
        """Test that the agent selects the correct tools."""
        # Run agent and capture tool calls
        result = await self.agent.run(query, max_steps=5)
        used_tools = [step.tool_name for step in result.steps if step.tool_name]

        # Check if expected tools were used
        found = all(tool in used_tools for tool in expected_tools)
        return found, used_tools

    async def test_task_completion(
        self,
        query: str,
        success_criteria: Callable[[str], bool]
    ) -> tuple[bool, str]:
        """Test that the agent completes the task."""
        result = await self.agent.run(query)
        passed = success_criteria(result.final_answer)
        return passed, result.final_answer

    async def test_error_handling(
        self,
        query: str,
        inject_error: str
    ) -> tuple[bool, str]:
        """Test agent error handling."""
        # Inject error condition
        self.agent.inject_error(inject_error)
        try:
            result = await self.agent.run(query)
            # Should recover gracefully
            return result.status == "completed", result.final_answer
        except Exception as e:
            return False, str(e)
        finally:
            self.agent.clear_errors()


class LatencyTester:
    """Test response latency."""

    def __init__(
        self,
        llm_client: Any,
        max_latency_ms: float = 5000
    ):
        self.llm = llm_client
        self.max_latency = max_latency_ms

    async def test_latency(
        self,
        prompt: str,
        num_runs: int = 5
    ) -> tuple[bool, dict]:
        """Test response latency."""
        latencies = []
        for _ in range(num_runs):
            start = time.time()
            await self.llm.complete(prompt)
            latency = (time.time() - start) * 1000
            latencies.append(latency)

        import numpy as np
        stats = {
            "mean": np.mean(latencies),
            "p50": np.percentile(latencies, 50),
            "p95": np.percentile(latencies, 95),
            "p99": np.percentile(latencies, 99),
            "max": max(latencies)
        }
        passed = stats["p95"] <= self.max_latency
        return passed, stats
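With pytest-asyncio installed, an end-to-end RAG check is one await per test case. In this sketch the retriever and generator fixtures are assumed to be defined elsewhere and to match the retrieve()/get_content()/complete() calls used above:

import pytest

@pytest.mark.asyncio
async def test_rag_answers_pricing_question(retriever, generator):
    tester = RAGTester(retriever, generator)
    case = RAGTestCase(
        query="How much does the Pro plan cost?",
        expected_sources=["pricing_page"],
        expected_answer_contains=["per month"]
    )
    result = await tester.test_end_to_end(case)
    assert result.passed, (
        f"recall={result.source_recall:.2f}, "
        f"checks={result.answer_checks_passed}, answer={result.answer!r}"
    )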
Test Automation
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Callable, Optional


@dataclass
class TestSuiteResult:
    """Test suite execution result."""
    total: int
    passed: int
    failed: int
    skipped: int
    duration_ms: float
    results: list[dict]
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


class TestSuite:
    """Automated test suite."""

    def __init__(self, name: str):
        self.name = name
        self.tests: list[tuple[str, Callable]] = []
        self.setup: Optional[Callable] = None
        self.teardown: Optional[Callable] = None

    def add_test(self, name: str, test_fn: Callable):
        """Add a test to the suite."""
        self.tests.append((name, test_fn))

    def set_setup(self, setup_fn: Callable):
        """Set setup function."""
        self.setup = setup_fn

    def set_teardown(self, teardown_fn: Callable):
        """Set teardown function."""
        self.teardown = teardown_fn

    async def run(self) -> TestSuiteResult:
        """Run all tests."""
        start = time.time()
        results = []
        passed = 0
        failed = 0
        skipped = 0

        # Setup
        if self.setup:
            try:
                await self.setup()
            except Exception as e:
                return TestSuiteResult(
                    total=len(self.tests),
                    passed=0,
                    failed=0,
                    skipped=len(self.tests),
                    duration_ms=0,
                    results=[{"error": f"Setup failed: {e}"}]
                )

        # Run tests
        for name, test_fn in self.tests:
            test_start = time.time()
            try:
                result = await test_fn()
                test_passed = result if isinstance(result, bool) else True
                if test_passed:
                    passed += 1
                    status = "passed"
                else:
                    failed += 1
                    status = "failed"
                results.append({
                    "name": name,
                    "status": status,
                    "duration_ms": (time.time() - test_start) * 1000
                })
            except Exception as e:
                failed += 1
                results.append({
                    "name": name,
                    "status": "error",
                    "error": str(e),
                    "duration_ms": (time.time() - test_start) * 1000
                })

        # Teardown
        if self.teardown:
            try:
                await self.teardown()
            except Exception:
                pass

        duration = (time.time() - start) * 1000
        return TestSuiteResult(
            total=len(self.tests),
            passed=passed,
            failed=failed,
            skipped=skipped,
            duration_ms=duration,
            results=results
        )


class CIIntegration:
    """CI/CD integration for LLM tests."""

    def __init__(self, suite: TestSuite):
        self.suite = suite

    async def run_and_report(self) -> int:
        """Run tests and return an exit code."""
        result = await self.suite.run()

        # Print results
        print(f"\n{'='*50}")
        print(f"Test Suite: {self.suite.name}")
        print(f"{'='*50}")
        for test_result in result.results:
            status_icon = "✓" if test_result["status"] == "passed" else "✗"
            print(f"{status_icon} {test_result['name']}: {test_result['status']}")
            if "error" in test_result:
                print(f"  Error: {test_result['error']}")
        print(f"\n{'='*50}")
        print(f"Total: {result.total}")
        print(f"Passed: {result.passed}")
        print(f"Failed: {result.failed}")
        print(f"Duration: {result.duration_ms:.2f}ms")
        print(f"{'='*50}\n")

        # Return exit code
        return 0 if result.failed == 0 else 1

    def generate_junit_xml(self, result: TestSuiteResult) -> str:
        """Generate a JUnit XML report (attribute values are not XML-escaped here)."""
        xml_parts = [
            '<?xml version="1.0" encoding="UTF-8"?>',
            f'<testsuite name="{self.suite.name}" tests="{result.total}" '
            f'failures="{result.failed}" time="{result.duration_ms / 1000:.3f}">'
        ]
        for test_result in result.results:
            name = test_result.get("name", "setup")
            status = test_result.get("status", "error")
            duration_s = test_result.get("duration_ms", 0) / 1000
            xml_parts.append(
                f'  <testcase name="{name}" time="{duration_s:.3f}">'
            )
            if status == "failed":
                xml_parts.append(
                    '    <failure message="Test assertion failed"/>'
                )
            elif status == "error":
                xml_parts.append(
                    f'    <error message="{test_result.get("error", "")}"/>'
                )
            xml_parts.append('  </testcase>')
        xml_parts.append('</testsuite>')
        return '\n'.join(xml_parts)
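In CI, the suite typically runs as a small script whose exit code gates the pipeline. A minimal sketch:

import asyncio
import sys

async def greeting_is_on_brand() -> bool:
    # Replace with a real LLM call plus OutputValidator checks.
    return True

async def main() -> int:
    suite = TestSuite("llm-smoke-tests")
    suite.add_test("greeting_is_on_brand", greeting_is_on_brand)
    ci = CIIntegration(suite)
    return await ci.run_and_report()

if __name__ == "__main__":
    sys.exit(asyncio.run(main()))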
Production Testing Service
import uuid
from typing import Optional

from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel

# OutputValidator and GoldenDataset come from the earlier sections.

app = FastAPI()


class TestRequest(BaseModel):
    prompt: str
    expected_contains: list[str] = []
    expected_not_contains: list[str] = []
    max_length: Optional[int] = None


class EvalRequest(BaseModel):
    query: str
    response: str
    criteria: list[str] = ["relevance", "coherence", "helpfulness"]


class RegressionRequest(BaseModel):
    suite_name: str


# Storage
test_results: dict[str, dict] = {}
golden_datasets: dict[str, GoldenDataset] = {}


@app.post("/v1/test/validate")
async def validate_output(request: TestRequest) -> dict:
    """Validate LLM output."""
    # This would call your LLM:
    # response = await llm.complete(request.prompt)
    # output = response.content
    # For demo, use a placeholder
    output = "Sample output for testing"

    validator = OutputValidator()
    for s in request.expected_contains:
        validator.add_contains(s)
    for s in request.expected_not_contains:
        validator.add_not_contains(s)
    if request.max_length:
        validator.add_max_length(request.max_length)

    passed, failures = validator.validate(output)
    return {
        "passed": passed,
        "output": output,
        "failures": failures
    }


@app.post("/v1/test/evaluate")
async def evaluate_response(request: EvalRequest) -> dict:
    """Evaluate response quality."""
    # This would use LLMJudge:
    # judge = LLMJudge(llm_client)
    # For demo, return placeholder scores
    results = []
    for criteria in request.criteria:
        results.append({
            "criteria": criteria,
            "score": 0.8,
            "passed": True,
            "reasoning": "Good quality response"
        })
    return {
        "results": results,
        "overall_passed": all(r["passed"] for r in results)
    }


@app.post("/v1/test/regression/{suite_name}")
async def run_regression(
    suite_name: str,
    background_tasks: BackgroundTasks
) -> dict:
    """Run regression test suite."""
    job_id = str(uuid.uuid4())
    test_results[job_id] = {
        "status": "running",
        "suite": suite_name
    }
    # Run in background
    background_tasks.add_task(
        run_regression_suite,
        job_id,
        suite_name
    )
    return {"job_id": job_id}


async def run_regression_suite(job_id: str, suite_name: str):
    """Run regression suite in the background."""
    try:
        # Get or create dataset
        if suite_name not in golden_datasets:
            golden_datasets[suite_name] = GoldenDataset(f"{suite_name}.json")
        dataset = golden_datasets[suite_name]

        # Run tests (would use the actual LLM)
        results = []
        for example in dataset.list_all():
            results.append({
                "id": example.id,
                "passed": True,  # Placeholder
                "similarity": 0.9
            })

        test_results[job_id] = {
            "status": "completed",
            "results": results,
            "passed": sum(1 for r in results if r["passed"]),
            "failed": sum(1 for r in results if not r["passed"])
        }
    except Exception as e:
        test_results[job_id] = {
            "status": "failed",
            "error": str(e)
        }


@app.get("/v1/test/regression/{job_id}/status")
async def get_regression_status(job_id: str) -> dict:
    """Get regression test status."""
    if job_id not in test_results:
        raise HTTPException(status_code=404, detail="Job not found")
    return test_results[job_id]


@app.post("/v1/golden/{suite_name}")
async def add_golden_example(
    suite_name: str,
    query: str,
    expected_output: str
) -> dict:
    """Add a golden example (query and expected_output arrive as query parameters)."""
    if suite_name not in golden_datasets:
        golden_datasets[suite_name] = GoldenDataset(f"{suite_name}.json")
    example_id = golden_datasets[suite_name].add(query, expected_output)
    return {"example_id": example_id}


@app.get("/health")
async def health():
    return {"status": "healthy"}
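The endpoints can be exercised locally with FastAPI's TestClient (or with curl against a running uvicorn instance). A quick sketch; note that the validate endpoint currently returns the placeholder output, so the contains check below matches it:

from fastapi.testclient import TestClient

client = TestClient(app)

# Validate an output against simple constraints.
resp = client.post("/v1/test/validate", json={
    "prompt": "Summarize our refund policy",
    "expected_contains": ["sample"],
    "max_length": 200
})
print(resp.json())  # {"passed": true, "output": "...", "failures": []}

# Kick off a regression run and poll its status.
resp = client.post("/v1/test/regression/support_bot")
job_id = resp.json()["job_id"]
print(client.get(f"/v1/test/regression/{job_id}/status").json())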
References
- LangSmith Testing: https://docs.smith.langchain.com/evaluation
- OpenAI Evals: https://github.com/openai/evals
- DeepEval: https://docs.confident-ai.com/
- Promptfoo: https://www.promptfoo.dev/
Conclusion
Testing LLM applications requires a different mindset than traditional software testing. Start with deterministic tests for what you can control: prompt template formatting, output validation, and integration points. Use LLM-as-judge for subjective quality evaluation—it's not perfect but catches obvious regressions. Build a golden dataset of critical examples and run regression tests before every deployment. For RAG systems, test retrieval and generation separately before testing end-to-end. Monitor latency and cost alongside quality metrics. Implement snapshot testing to catch unexpected output changes. The key insight is that LLM testing is about managing uncertainty, not eliminating it. You can't guarantee perfect outputs, but you can catch regressions, validate constraints, and ensure your system degrades gracefully. Invest in test infrastructure early—the cost of shipping a broken LLM feature is much higher than traditional bugs because users lose trust in AI systems quickly. A comprehensive test suite gives you confidence to iterate fast while maintaining quality.