Introduction: Testing LLM applications presents unique challenges compared to traditional software. Outputs are non-deterministic, quality is subjective, and the same input can produce different but equally valid responses. This guide covers practical testing strategies: unit testing with mocked LLM responses, integration testing with real API calls, evaluation frameworks for quality assessment, and regression testing to catch prompt degradation. A robust testing strategy combines deterministic tests for application logic with probabilistic evaluation for LLM output quality.

Unit Testing with Mocked LLM
import pytest
from unittest.mock import Mock
from dataclasses import dataclass

# These tests never hit the real API
pytestmark = pytest.mark.unit

# Application code to test
class ChatService:
    def __init__(self, client):
        self.client = client

    def get_response(self, user_message: str, system_prompt: str = None) -> str:
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": user_message})
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages
        )
        return response.choices[0].message.content

    def classify_intent(self, message: str) -> str:
        response = self.get_response(
            message,
            system_prompt="Classify the user intent. Respond with one word: question, request, complaint, or other."
        )
        return response.strip().lower()

# Mock response factory
@dataclass
class MockChoice:
    message: Mock

@dataclass
class MockResponse:
    choices: list[MockChoice]

def create_mock_response(content: str) -> MockResponse:
    """Create a mock OpenAI response."""
    message = Mock()
    message.content = content
    choice = MockChoice(message=message)
    return MockResponse(choices=[choice])

# Unit tests
class TestChatService:
    @pytest.fixture
    def mock_client(self):
        """Create a mock OpenAI client."""
        client = Mock()
        client.chat = Mock()
        client.chat.completions = Mock()
        return client

    @pytest.fixture
    def service(self, mock_client):
        """Create ChatService with a mock client."""
        return ChatService(mock_client)

    def test_get_response_returns_content(self, service, mock_client):
        """Test that get_response returns the message content."""
        mock_client.chat.completions.create.return_value = create_mock_response("Hello!")
        result = service.get_response("Hi there")
        assert result == "Hello!"

    def test_get_response_includes_system_prompt(self, service, mock_client):
        """Test that the system prompt is included in messages."""
        mock_client.chat.completions.create.return_value = create_mock_response("Response")
        service.get_response("User message", system_prompt="Be helpful")
        call_args = mock_client.chat.completions.create.call_args
        messages = call_args.kwargs["messages"]
        assert len(messages) == 2
        assert messages[0]["role"] == "system"
        assert messages[0]["content"] == "Be helpful"

    def test_classify_intent_question(self, service, mock_client):
        """Test intent classification for questions."""
        mock_client.chat.completions.create.return_value = create_mock_response("question")
        result = service.classify_intent("What is Python?")
        assert result == "question"

    def test_classify_intent_normalizes_response(self, service, mock_client):
        """Test that classification normalizes whitespace and case."""
        mock_client.chat.completions.create.return_value = create_mock_response(" COMPLAINT ")
        result = service.classify_intent("This is broken!")
        assert result == "complaint"

# Parameterized tests for multiple scenarios
class TestIntentClassification:
    @pytest.fixture
    def mock_client(self):
        client = Mock()
        client.chat = Mock()
        client.chat.completions = Mock()
        return client

    @pytest.mark.parametrize("llm_response,expected", [
        ("question", "question"),
        ("Question", "question"),
        (" request ", "request"),
        ("COMPLAINT", "complaint"),
        ("other", "other"),
    ])
    def test_intent_normalization(self, mock_client, llm_response, expected):
        """Test that various LLM response formats are normalized correctly."""
        mock_client.chat.completions.create.return_value = create_mock_response(llm_response)
        service = ChatService(mock_client)
        result = service.classify_intent("Test message")
        assert result == expected
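Mocks also make failure paths cheap to exercise. Below is a minimal sketch that checks errors propagate out of ChatService; a plain RuntimeError stands in for the client library's real exception types.

# Sketch: simulate an API failure with side_effect; RuntimeError is a stand-in.
def test_api_error_propagates():
    client = Mock()
    client.chat.completions.create.side_effect = RuntimeError("simulated API outage")
    service = ChatService(client)
    with pytest.raises(RuntimeError):
        service.get_response("Hi there")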
Integration Testing
import json
import os

import pytest
from openai import OpenAI

# Everything in this module is an integration test; skip it when no API key is set
pytestmark = [
    pytest.mark.integration,
    pytest.mark.skipif(
        not os.getenv("OPENAI_API_KEY"),
        reason="OPENAI_API_KEY not set"
    ),
]

class TestLLMIntegration:
    """Integration tests that call the real LLM API."""

    @pytest.fixture
    def client(self):
        return OpenAI()

    @pytest.fixture
    def service(self, client):
        # ChatService comes from the unit-testing example above
        return ChatService(client)

    def test_basic_response(self, service):
        """Test that we get a non-empty response."""
        response = service.get_response("Say 'hello' and nothing else.")
        assert response is not None
        assert len(response) > 0
        assert "hello" in response.lower()

    def test_intent_classification_question(self, service):
        """Test intent classification with the real LLM."""
        result = service.classify_intent("What time is it?")
        assert result in ["question", "request", "other"]

    def test_intent_classification_complaint(self, service):
        """Test complaint detection."""
        result = service.classify_intent(
            "This product is terrible and I want a refund!"
        )
        assert result == "complaint"

    @pytest.mark.slow
    def test_response_consistency(self, service):
        """Test that repeated calls give consistent answers."""
        prompt = "What is 2 + 2? Answer with just the number."
        responses = [
            service.get_response(prompt)
            for _ in range(3)
        ]
        # All responses should contain "4"
        assert all("4" in r for r in responses)

# Snapshot testing for prompts
class TestPromptSnapshots:
    """Test that prompts produce expected output patterns."""

    @pytest.fixture
    def client(self):
        return OpenAI()

    def test_json_output_format(self, client):
        """Test that JSON mode produces valid JSON."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": "List 3 colors as a JSON array"
            }],
            response_format={"type": "json_object"}
        )
        content = response.choices[0].message.content
        # JSON mode always returns an object, so the array arrives wrapped in a key
        data = json.loads(content)
        assert isinstance(data, dict)

    def test_structured_extraction(self, client):
        """Test structured data extraction."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": """
                Extract name and age from: "John is 30 years old"
                Return JSON with keys: name, age
                """
            }],
            response_format={"type": "json_object"}
        )
        data = json.loads(response.choices[0].message.content)
        assert "name" in data or "Name" in data
        assert "age" in data or "Age" in data
Evaluation Framework
import json
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Callable

from openai import OpenAI

@dataclass
class EvalCase:
    """A single evaluation test case."""
    input: str
    expected: Any = None  # Optional expected output
    metadata: dict = None

@dataclass
class EvalResult:
    """Result of evaluating a single case."""
    case: EvalCase
    output: str
    score: float
    passed: bool
    details: dict = None

class Evaluator(ABC):
    """Base class for evaluators."""

    @abstractmethod
    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        pass

class ExactMatchEvaluator(Evaluator):
    """Check for an exact match with the expected output."""

    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        passed = output.strip() == str(case.expected).strip()
        return EvalResult(
            case=case,
            output=output,
            score=1.0 if passed else 0.0,
            passed=passed
        )

class ContainsEvaluator(Evaluator):
    """Check whether the output contains the expected substring."""

    def __init__(self, case_sensitive: bool = False):
        self.case_sensitive = case_sensitive

    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        expected = str(case.expected)
        if self.case_sensitive:
            passed = expected in output
        else:
            passed = expected.lower() in output.lower()
        return EvalResult(
            case=case,
            output=output,
            score=1.0 if passed else 0.0,
            passed=passed
        )

class JSONValidEvaluator(Evaluator):
    """Check whether the output is valid JSON."""

    def __init__(self, schema: dict = None):
        self.schema = schema

    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        try:
            data = json.loads(output)
            # Check required keys if a schema was provided
            if self.schema:
                for key in self.schema.get("required", []):
                    if key not in data:
                        return EvalResult(
                            case=case,
                            output=output,
                            score=0.5,
                            passed=False,
                            details={"error": f"Missing required key: {key}"}
                        )
            return EvalResult(
                case=case,
                output=output,
                score=1.0,
                passed=True,
                details={"parsed": data}
            )
        except json.JSONDecodeError as e:
            return EvalResult(
                case=case,
                output=output,
                score=0.0,
                passed=False,
                details={"error": str(e)}
            )

class LLMJudgeEvaluator(Evaluator):
    """Use an LLM to judge output quality."""

    def __init__(self, client, criteria: str):
        self.client = client
        self.criteria = criteria

    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        judge_prompt = f"""
Evaluate this LLM output based on the following criteria:
{self.criteria}

Input: {case.input}
Output: {output}

Rate the output from 0 to 10 and explain your reasoning.
Respond with JSON: {{"score": <integer 0-10>, "reasoning": "<short explanation>"}}
"""
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": judge_prompt}],
            response_format={"type": "json_object"}
        )
        result = json.loads(response.choices[0].message.content)
        score = result["score"] / 10  # Normalize to 0-1
        return EvalResult(
            case=case,
            output=output,
            score=score,
            passed=score >= 0.7,
            details={"reasoning": result["reasoning"]}
        )

# Evaluation runner
class EvalRunner:
    """Run evaluations across test cases."""

    def __init__(self, llm_fn: Callable[[str], str]):
        self.llm_fn = llm_fn

    def run(
        self,
        cases: list[EvalCase],
        evaluator: Evaluator
    ) -> dict:
        """Run the evaluation on all cases."""
        results = []
        for case in cases:
            output = self.llm_fn(case.input)
            result = evaluator.evaluate(output, case)
            results.append(result)
        # Aggregate metrics
        total = len(results)
        passed = sum(1 for r in results if r.passed)
        avg_score = sum(r.score for r in results) / total if total > 0 else 0
        return {
            "total": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": passed / total if total > 0 else 0,
            "avg_score": avg_score,
            "results": results
        }

# Usage
def my_llm_function(prompt: str) -> str:
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

runner = EvalRunner(my_llm_function)

# Test cases
cases = [
    EvalCase(input="What is 2+2?", expected="4"),
    EvalCase(input="What is the capital of France?", expected="Paris"),
    EvalCase(input="Is the sky blue?", expected="yes"),
]

# Run with the contains evaluator
results = runner.run(cases, ContainsEvaluator())
print(f"Pass rate: {results['pass_rate']:.1%}")
Regression Testing
import json
from datetime import datetime
from pathlib import Path
from typing import Callable, Optional

import numpy as np
import pytest
from openai import OpenAI

# Reuses EvalCase, EvalResult, EvalRunner, ContainsEvaluator and
# my_llm_function from the evaluation framework above.

class RegressionTestSuite:
    """Track and detect regressions in LLM outputs."""

    def __init__(
        self,
        baseline_dir: str = "./baselines",
        threshold: float = 0.8
    ):
        self.baseline_dir = Path(baseline_dir)
        self.baseline_dir.mkdir(parents=True, exist_ok=True)
        self.threshold = threshold

    def _get_baseline_path(self, test_name: str) -> Path:
        """Get the path for a baseline file."""
        return self.baseline_dir / f"{test_name}.json"

    def save_baseline(
        self,
        test_name: str,
        cases: list[EvalCase],
        results: list[EvalResult]
    ):
        """Save baseline results."""
        baseline = {
            "test_name": test_name,
            "created_at": datetime.now().isoformat(),
            "cases": [
                {
                    "input": case.input,
                    "output": result.output,
                    "score": result.score
                }
                for case, result in zip(cases, results)
            ]
        }
        path = self._get_baseline_path(test_name)
        path.write_text(json.dumps(baseline, indent=2))

    def load_baseline(self, test_name: str) -> Optional[dict]:
        """Load the baseline if it exists."""
        path = self._get_baseline_path(test_name)
        if not path.exists():
            return None
        return json.loads(path.read_text())

    def compare_to_baseline(
        self,
        test_name: str,
        current_results: list[EvalResult],
        similarity_fn: Optional[Callable[[str, str], float]] = None
    ) -> dict:
        """Compare current results to the baseline."""
        baseline = self.load_baseline(test_name)
        if not baseline:
            return {
                "status": "no_baseline",
                "message": "No baseline found. Run with --save-baseline to create one."
            }
        if similarity_fn is None:
            # Default: exact match
            similarity_fn = lambda a, b: 1.0 if a.strip() == b.strip() else 0.0
        comparisons = []
        for i, (baseline_case, current) in enumerate(
            zip(baseline["cases"], current_results)
        ):
            similarity = similarity_fn(
                baseline_case["output"],
                current.output
            )
            comparisons.append({
                "case_index": i,
                "input": baseline_case["input"],
                "baseline_output": baseline_case["output"][:100],
                "current_output": current.output[:100],
                "similarity": similarity,
                "baseline_score": baseline_case["score"],
                "current_score": current.score,
                "score_diff": current.score - baseline_case["score"]
            })
        avg_similarity = sum(c["similarity"] for c in comparisons) / len(comparisons)
        regressions = [c for c in comparisons if c["score_diff"] < -0.1]
        return {
            "status": "pass" if avg_similarity >= self.threshold else "fail",
            "avg_similarity": avg_similarity,
            "threshold": self.threshold,
            "regressions": regressions,
            "comparisons": comparisons
        }

# Semantic similarity for regression testing
def semantic_similarity(text1: str, text2: str) -> float:
    """Compute semantic similarity using embeddings."""
    client = OpenAI()
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=[text1, text2]
    )
    emb1 = response.data[0].embedding
    emb2 = response.data[1].embedding
    return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))

# Usage in pytest
class TestRegression:
    @pytest.fixture
    def suite(self, tmp_path):
        return RegressionTestSuite(baseline_dir=str(tmp_path))

    def test_qa_regression(self, suite):
        """Test for regressions in Q&A quality."""
        # ContainsEvaluator needs an expected substring, so give each case one
        cases = [
            EvalCase(input="What is Python?", expected="programming"),
            EvalCase(input="Explain machine learning", expected="data"),
        ]
        # Get current results
        runner = EvalRunner(my_llm_function)
        results = runner.run(cases, ContainsEvaluator())
        # Compare to baseline
        comparison = suite.compare_to_baseline(
            "qa_test",
            results["results"],
            similarity_fn=semantic_similarity
        )
        if comparison["status"] == "no_baseline":
            suite.save_baseline("qa_test", cases, results["results"])
            pytest.skip("Baseline created")
        assert comparison["status"] == "pass", f"Regressions detected: {comparison['regressions']}"
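The no_baseline message above refers to a --save-baseline flag, which pytest does not provide by itself. A minimal conftest.py sketch for wiring it up follows; the option and fixture names are illustrative.

# conftest.py (sketch): expose a --save-baseline flag to the regression tests.
import pytest

def pytest_addoption(parser):
    parser.addoption(
        "--save-baseline",
        action="store_true",
        default=False,
        help="Overwrite stored regression baselines with current outputs",
    )

@pytest.fixture
def save_baseline(request) -> bool:
    return request.config.getoption("--save-baseline")

A regression test can then request the save_baseline fixture and call suite.save_baseline(...) whenever the flag is set, rather than only when no baseline exists yet.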
Test Fixtures and Factories
import pytest
from contextlib import contextmanager
from typing import Generator
from unittest.mock import Mock, patch

# Reuses create_mock_response and EvalCase from the earlier sections.

# Fixtures for common test scenarios
@pytest.fixture
def mock_openai_client():
    """Provide a fully mocked OpenAI client."""
    client = Mock()
    client.chat = Mock()
    client.chat.completions = Mock()
    client.embeddings = Mock()
    # Default response
    client.chat.completions.create.return_value = create_mock_response("Default response")
    return client

@pytest.fixture
def response_factory():
    """Factory for creating mock responses."""
    def factory(content: str, finish_reason: str = "stop"):
        response = Mock()
        response.choices = [Mock()]
        response.choices[0].message = Mock()
        response.choices[0].message.content = content
        response.choices[0].finish_reason = finish_reason
        response.usage = Mock()
        response.usage.prompt_tokens = 10
        response.usage.completion_tokens = 20
        response.usage.total_tokens = 30
        return response
    return factory

# Context manager for temporary API behavior
@contextmanager
def mock_llm_responses(responses: list[str]) -> Generator:
    """Mock the LLM to return specific responses in order."""
    with patch("openai.OpenAI") as mock_class:
        mock_client = Mock()
        mock_class.return_value = mock_client
        mock_responses = [create_mock_response(r) for r in responses]
        mock_client.chat.completions.create.side_effect = mock_responses
        yield mock_client

# Test data generators
class TestDataGenerator:
    """Generate test data for LLM testing."""

    @staticmethod
    def question_cases(count: int = 10) -> list[EvalCase]:
        """Generate question test cases."""
        questions = [
            ("What is Python?", "programming language"),
            ("What is 2+2?", "4"),
            ("What color is the sky?", "blue"),
            ("Who wrote Romeo and Juliet?", "Shakespeare"),
            ("What is the capital of Japan?", "Tokyo"),
        ]
        return [
            EvalCase(input=q, expected=a)
            for q, a in questions[:count]
        ]

    @staticmethod
    def classification_cases() -> list[EvalCase]:
        """Generate classification test cases."""
        return [
            EvalCase(
                input="What time does the store open?",
                expected="question",
                metadata={"category": "inquiry"}
            ),
            EvalCase(
                input="I want to return this product",
                expected="request",
                metadata={"category": "action"}
            ),
            EvalCase(
                input="This service is terrible!",
                expected="complaint",
                metadata={"category": "negative"}
            ),
        ]

# Usage
def test_with_mock_responses():
    """Test using the mock response context manager."""
    with mock_llm_responses(["Response 1", "Response 2"]):
        # patch("openai.OpenAI") only affects lookups on the openai module,
        # so resolve OpenAI inside the patched context rather than relying on
        # a name imported before the patch was applied.
        from openai import OpenAI
        client = OpenAI()
        r1 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "First"}]
        )
        r2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Second"}]
        )
        assert r1.choices[0].message.content == "Response 1"
        assert r2.choices[0].message.content == "Response 2"
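The response_factory fixture also stubs usage metadata, which is handy when testing token-accounting or cost-tracking code. A small sketch using the factory defaults above:

# Sketch: the factory's usage defaults (10/20/30 tokens) support accounting tests.
def test_token_usage_tracking(mock_openai_client, response_factory):
    mock_openai_client.chat.completions.create.return_value = response_factory("Hi!")
    response = mock_openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello"}]
    )
    assert response.usage.total_tokens == 30
    assert response.choices[0].finish_reason == "stop"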
CI/CD Integration
# pytest.ini configuration
"""
[pytest]
markers =
    unit: Unit tests (no API calls)
    integration: Integration tests (requires API key)
    slow: Slow tests
    eval: Evaluation tests
testpaths = tests
python_files = test_*.py
python_functions = test_*
"""

# conftest.py
import pytest
import os

def pytest_configure(config):
    """Configure pytest markers."""
    config.addinivalue_line("markers", "unit: Unit tests")
    config.addinivalue_line("markers", "integration: Integration tests")
    config.addinivalue_line("markers", "slow: Slow tests")
    config.addinivalue_line("markers", "eval: Evaluation tests")

def pytest_collection_modifyitems(config, items):
    """Skip integration tests if no API key."""
    if not os.getenv("OPENAI_API_KEY"):
        skip_integration = pytest.mark.skip(reason="OPENAI_API_KEY not set")
        for item in items:
            if "integration" in item.keywords:
                item.add_marker(skip_integration)

# GitHub Actions workflow
"""
# .github/workflows/test.yml
name: Tests
on: [push, pull_request]

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: pytest -m unit --tb=short

  integration-tests:
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: pytest -m integration --tb=short
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

  eval-tests:
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: pytest -m eval --tb=short
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      - uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: eval_results/
"""
# Run specific test categories
# pytest -m unit # Only unit tests
# pytest -m integration # Only integration tests
# pytest -m "not slow" # Skip slow tests
# pytest -m eval # Only evaluation tests
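The eval-tests job above uploads an eval_results/ directory, so the eval suite needs to write something there. Below is a sketch of an eval-marked test that records a summary report, reusing the runner and data generators from earlier sections; the file name and the 0.8 threshold are illustrative.

# Sketch: write an eval summary for the upload-artifact step to collect.
import json
from pathlib import Path

import pytest

@pytest.mark.eval
def test_qa_eval_report():
    results = EvalRunner(my_llm_function).run(
        TestDataGenerator.question_cases(), ContainsEvaluator()
    )
    out_dir = Path("eval_results")
    out_dir.mkdir(exist_ok=True)
    summary = {k: results[k] for k in ("total", "passed", "failed", "pass_rate", "avg_score")}
    (out_dir / "qa_eval.json").write_text(json.dumps(summary, indent=2))
    assert results["pass_rate"] >= 0.8  # illustrative quality gate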
References
- pytest Documentation: https://docs.pytest.org/
- OpenAI Evals: https://github.com/openai/evals
- DeepEval: https://docs.confident-ai.com/
- LangSmith Testing: https://docs.smith.langchain.com/
Conclusion
Testing LLM applications requires a layered approach. Unit tests with mocked responses verify application logic without API costs or latency. Integration tests with real API calls catch issues that mocks miss. Evaluation frameworks assess output quality using metrics, LLM judges, or human review. Regression testing detects when prompt changes degrade performance. For CI/CD, run unit tests on every commit, integration tests on main branch merges, and evaluation tests on a schedule. The key insight is that LLM testing is probabilistic—focus on statistical properties and quality distributions rather than exact output matching.
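As a closing illustration of asserting statistical properties rather than exact outputs, here is a sketch that samples the classifier several times and gates on its success rate; the sample size and 80% threshold are illustrative.

# Sketch: gate on a success rate across repeated runs, not a single output.
import pytest
from openai import OpenAI

@pytest.mark.eval
def test_complaint_detection_success_rate():
    service = ChatService(OpenAI())
    runs = 10
    hits = sum(
        service.classify_intent("This product broke after one day!") == "complaint"
        for _ in range(runs)
    )
    assert hits / runs >= 0.8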
Discover more from Code, Cloud & Context
Subscribe to get the latest posts sent to your email.