Testing LLM Applications: Unit Tests, Integration Tests, and Evaluation

Introduction: Testing LLM applications presents unique challenges compared to traditional software. Outputs are non-deterministic, quality is subjective, and the same input can produce different but equally valid responses. This guide covers practical testing strategies: unit testing with mocked LLM responses, integration testing with real API calls, evaluation frameworks for quality assessment, and regression testing to catch prompt degradation. A robust testing strategy combines deterministic tests for application logic with probabilistic evaluation for LLM output quality.


Unit Testing with Mocked LLM

import pytest
from unittest.mock import Mock, patch, MagicMock
from dataclasses import dataclass
from typing import Callable

# Application code to test
class ChatService:
    def __init__(self, client):
        self.client = client
    
    def get_response(self, user_message: str, system_prompt: str | None = None) -> str:
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": user_message})
        
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages
        )
        
        return response.choices[0].message.content
    
    def classify_intent(self, message: str) -> str:
        response = self.get_response(
            message,
            system_prompt="Classify the user intent. Respond with one word: question, request, complaint, or other."
        )
        return response.strip().lower()

# Mock response factory
@dataclass
class MockChoice:
    message: Mock

@dataclass
class MockResponse:
    choices: list[MockChoice]

def create_mock_response(content: str) -> MockResponse:
    """Create a mock OpenAI response."""
    message = Mock()
    message.content = content
    choice = MockChoice(message=message)
    return MockResponse(choices=[choice])

# Unit tests
class TestChatService:
    
    @pytest.fixture
    def mock_client(self):
        """Create a mock OpenAI client."""
        client = Mock()
        client.chat = Mock()
        client.chat.completions = Mock()
        return client
    
    @pytest.fixture
    def service(self, mock_client):
        """Create ChatService with mock client."""
        return ChatService(mock_client)
    
    def test_get_response_returns_content(self, service, mock_client):
        """Test that get_response returns the message content."""
        mock_client.chat.completions.create.return_value = create_mock_response("Hello!")
        
        result = service.get_response("Hi there")
        
        assert result == "Hello!"
    
    def test_get_response_includes_system_prompt(self, service, mock_client):
        """Test that system prompt is included in messages."""
        mock_client.chat.completions.create.return_value = create_mock_response("Response")
        
        service.get_response("User message", system_prompt="Be helpful")
        
        call_args = mock_client.chat.completions.create.call_args
        messages = call_args.kwargs["messages"]
        
        assert len(messages) == 2
        assert messages[0]["role"] == "system"
        assert messages[0]["content"] == "Be helpful"
    
    def test_classify_intent_question(self, service, mock_client):
        """Test intent classification for questions."""
        mock_client.chat.completions.create.return_value = create_mock_response("question")
        
        result = service.classify_intent("What is Python?")
        
        assert result == "question"
    
    def test_classify_intent_normalizes_response(self, service, mock_client):
        """Test that classification normalizes whitespace and case."""
        mock_client.chat.completions.create.return_value = create_mock_response("  COMPLAINT  ")
        
        result = service.classify_intent("This is broken!")
        
        assert result == "complaint"

# Parameterized tests for multiple scenarios
class TestIntentClassification:
    
    @pytest.fixture
    def mock_client(self):
        client = Mock()
        client.chat = Mock()
        client.chat.completions = Mock()
        return client
    
    @pytest.mark.parametrize("llm_response,expected", [
        ("question", "question"),
        ("Question", "question"),
        ("  request  ", "request"),
        ("COMPLAINT", "complaint"),
        ("other", "other"),
    ])
    def test_intent_normalization(self, mock_client, llm_response, expected):
        """Test various LLM response formats are normalized correctly."""
        mock_client.chat.completions.create.return_value = create_mock_response(llm_response)
        service = ChatService(mock_client)
        
        result = service.classify_intent("Test message")
        
        assert result == expected
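
Mocks also make failure paths cheap to exercise. A minimal sketch (the test class, its name, and the RuntimeError are illustrative assumptions), using side_effect to simulate an API error and asserting that it propagates:

class TestChatServiceErrors:

    def test_get_response_propagates_api_error(self):
        """Assumed behavior: client exceptions bubble up to the caller."""
        client = Mock()
        client.chat.completions.create.side_effect = RuntimeError("rate limited")
        service = ChatService(client)

        with pytest.raises(RuntimeError):
            service.get_response("Hi there")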

Integration Testing

import pytest
import os
from openai import OpenAI

# Skip integration tests if no API key
pytestmark = pytest.mark.skipif(
    not os.getenv("OPENAI_API_KEY"),
    reason="OPENAI_API_KEY not set"
)

class TestLLMIntegration:
    """Integration tests that call real LLM API."""
    
    @pytest.fixture
    def client(self):
        return OpenAI()
    
    @pytest.fixture
    def service(self, client):
        return ChatService(client)
    
    def test_basic_response(self, service):
        """Test that we get a non-empty response."""
        response = service.get_response("Say 'hello' and nothing else.")
        
        assert response is not None
        assert len(response) > 0
        assert "hello" in response.lower()
    
    def test_intent_classification_question(self, service):
        """Test intent classification with real LLM."""
        result = service.classify_intent("What time is it?")
        
        assert result in ["question", "request", "other"]
    
    def test_intent_classification_complaint(self, service):
        """Test complaint detection."""
        result = service.classify_intent(
            "This product is terrible and I want a refund!"
        )
        
        assert result == "complaint"
    
    @pytest.mark.slow
    def test_response_consistency(self, service):
        """Test that similar prompts get consistent responses."""
        prompt = "What is 2 + 2? Answer with just the number."
        
        responses = [
            service.get_response(prompt)
            for _ in range(3)
        ]
        
        # All responses should contain "4"
        assert all("4" in r for r in responses)

# Snapshot testing for prompts
class TestPromptSnapshots:
    """Test that prompts produce expected output patterns."""
    
    @pytest.fixture
    def client(self):
        return OpenAI()
    
    def test_json_output_format(self, client):
        """Test that JSON mode produces valid JSON."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": "List 3 colors as JSON array"
            }],
            response_format={"type": "json_object"}
        )
        
        import json
        content = response.choices[0].message.content
        
        # Should be valid JSON
        data = json.loads(content)
        assert isinstance(data, dict)
    
    def test_structured_extraction(self, client):
        """Test structured data extraction."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": """
                Extract name and age from: "John is 30 years old"
                Return JSON with keys: name, age
                """
            }],
            response_format={"type": "json_object"}
        )
        
        import json
        data = json.loads(response.choices[0].message.content)
        
        assert "name" in data or "Name" in data
        assert "age" in data or "Age" in data

Evaluation Framework

from dataclasses import dataclass
from typing import Callable, Any
from abc import ABC, abstractmethod
import json

@dataclass
class EvalCase:
    """A single evaluation test case."""
    input: str
    expected: Any = None  # Optional expected output
    metadata: dict | None = None

@dataclass
class EvalResult:
    """Result of evaluating a single case."""
    case: EvalCase
    output: str
    score: float
    passed: bool
    details: dict | None = None

class Evaluator(ABC):
    """Base class for evaluators."""
    
    @abstractmethod
    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        pass

class ExactMatchEvaluator(Evaluator):
    """Check for exact match with expected output."""
    
    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        passed = output.strip() == str(case.expected).strip()
        
        return EvalResult(
            case=case,
            output=output,
            score=1.0 if passed else 0.0,
            passed=passed
        )

class ContainsEvaluator(Evaluator):
    """Check if output contains expected substring."""
    
    def __init__(self, case_sensitive: bool = False):
        self.case_sensitive = case_sensitive
    
    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        expected = str(case.expected)
        if self.case_sensitive:
            passed = expected in output
        else:
            passed = expected.lower() in output.lower()
        
        return EvalResult(
            case=case,
            output=output,
            score=1.0 if passed else 0.0,
            passed=passed
        )

class JSONValidEvaluator(Evaluator):
    """Check if output is valid JSON."""
    
    def __init__(self, schema: dict = None):
        self.schema = schema
    
    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        try:
            data = json.loads(output)
            
            # Check schema if provided
            if self.schema:
                for key in self.schema.get("required", []):
                    if key not in data:
                        return EvalResult(
                            case=case,
                            output=output,
                            score=0.5,
                            passed=False,
                            details={"error": f"Missing required key: {key}"}
                        )
            
            return EvalResult(
                case=case,
                output=output,
                score=1.0,
                passed=True,
                details={"parsed": data}
            )
            
        except json.JSONDecodeError as e:
            return EvalResult(
                case=case,
                output=output,
                score=0.0,
                passed=False,
                details={"error": str(e)}
            )

class LLMJudgeEvaluator(Evaluator):
    """Use LLM to judge output quality."""
    
    def __init__(self, client, criteria: str):
        self.client = client
        self.criteria = criteria
    
    def evaluate(self, output: str, case: EvalCase) -> EvalResult:
        judge_prompt = f"""
Evaluate this LLM output based on the following criteria:
{self.criteria}

Input: {case.input}
Output: {output}

Rate the output from 0 to 10 and explain your reasoning.
Respond with JSON: {{"score": <0-10>, "reasoning": "<one or two sentences>"}}
"""
        
        response = self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": judge_prompt}],
            response_format={"type": "json_object"}
        )
        
        result = json.loads(response.choices[0].message.content)
        score = result["score"] / 10  # Normalize to 0-1
        
        return EvalResult(
            case=case,
            output=output,
            score=score,
            passed=score >= 0.7,
            details={"reasoning": result["reasoning"]}
        )

# Evaluation runner
class EvalRunner:
    """Run evaluations across test cases."""
    
    def __init__(self, llm_fn: Callable[[str], str]):
        self.llm_fn = llm_fn
    
    def run(
        self,
        cases: list[EvalCase],
        evaluator: Evaluator
    ) -> dict:
        """Run evaluation on all cases."""
        
        results = []
        
        for case in cases:
            output = self.llm_fn(case.input)
            result = evaluator.evaluate(output, case)
            results.append(result)
        
        # Aggregate metrics
        total = len(results)
        passed = sum(1 for r in results if r.passed)
        avg_score = sum(r.score for r in results) / total if total > 0 else 0
        
        return {
            "total": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": passed / total if total > 0 else 0,
            "avg_score": avg_score,
            "results": results
        }

# Usage
def my_llm_function(prompt: str) -> str:
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

runner = EvalRunner(my_llm_function)

# Test cases
cases = [
    EvalCase(input="What is 2+2?", expected="4"),
    EvalCase(input="What is the capital of France?", expected="Paris"),
    EvalCase(input="Is the sky blue?", expected="yes"),
]

# Run with contains evaluator
results = runner.run(cases, ContainsEvaluator())
print(f"Pass rate: {results['pass_rate']:.1%}")

Regression Testing

import hashlib
import json
from pathlib import Path
from datetime import datetime
from typing import Optional

class RegressionTestSuite:
    """Track and detect regressions in LLM outputs."""
    
    def __init__(
        self,
        baseline_dir: str = "./baselines",
        threshold: float = 0.8
    ):
        self.baseline_dir = Path(baseline_dir)
        self.baseline_dir.mkdir(parents=True, exist_ok=True)
        self.threshold = threshold
    
    def _get_baseline_path(self, test_name: str) -> Path:
        """Get path for baseline file."""
        return self.baseline_dir / f"{test_name}.json"
    
    def save_baseline(
        self,
        test_name: str,
        cases: list[EvalCase],
        results: list[EvalResult]
    ):
        """Save baseline results."""
        
        baseline = {
            "test_name": test_name,
            "created_at": datetime.now().isoformat(),
            "cases": [
                {
                    "input": case.input,
                    "output": result.output,
                    "score": result.score
                }
                for case, result in zip(cases, results)
            ]
        }
        
        path = self._get_baseline_path(test_name)
        path.write_text(json.dumps(baseline, indent=2))
    
    def load_baseline(self, test_name: str) -> Optional[dict]:
        """Load baseline if exists."""
        
        path = self._get_baseline_path(test_name)
        
        if not path.exists():
            return None
        
        return json.loads(path.read_text())
    
    def compare_to_baseline(
        self,
        test_name: str,
        current_results: list[EvalResult],
        similarity_fn: Callable[[str, str], float] | None = None
    ) -> dict:
        """Compare current results to baseline."""
        
        baseline = self.load_baseline(test_name)
        
        if not baseline:
            return {
                "status": "no_baseline",
                "message": "No baseline found. Run with --save-baseline to create one."
            }
        
        if similarity_fn is None:
            # Default: exact match
            similarity_fn = lambda a, b: 1.0 if a.strip() == b.strip() else 0.0
        
        comparisons = []
        
        for i, (baseline_case, current) in enumerate(
            zip(baseline["cases"], current_results)
        ):
            similarity = similarity_fn(
                baseline_case["output"],
                current.output
            )
            
            comparisons.append({
                "case_index": i,
                "input": baseline_case["input"],
                "baseline_output": baseline_case["output"][:100],
                "current_output": current.output[:100],
                "similarity": similarity,
                "baseline_score": baseline_case["score"],
                "current_score": current.score,
                "score_diff": current.score - baseline_case["score"]
            })
        
        avg_similarity = sum(c["similarity"] for c in comparisons) / len(comparisons)
        regressions = [c for c in comparisons if c["score_diff"] < -0.1]
        
        return {
            "status": "pass" if avg_similarity >= self.threshold else "fail",
            "avg_similarity": avg_similarity,
            "threshold": self.threshold,
            "regressions": regressions,
            "comparisons": comparisons
        }

# Semantic similarity for regression testing
def semantic_similarity(text1: str, text2: str) -> float:
    """Compute semantic similarity using embeddings."""
    
    client = OpenAI()
    
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=[text1, text2]
    )
    
    emb1 = response.data[0].embedding
    emb2 = response.data[1].embedding
    
    import numpy as np
    return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))

# Usage in pytest
class TestRegression:
    
    @pytest.fixture
    def suite(self, tmp_path):
        return RegressionTestSuite(baseline_dir=str(tmp_path))
    
    def test_qa_regression(self, suite):
        """Test for regressions in Q&A quality."""
        
        # Expected substrings (assumed) so ContainsEvaluator can score the outputs
        cases = [
            EvalCase(input="What is Python?", expected="programming"),
            EvalCase(input="Explain machine learning", expected="data"),
        ]
        
        # Get current results
        runner = EvalRunner(my_llm_function)
        results = runner.run(cases, ContainsEvaluator())
        
        # Compare to baseline
        comparison = suite.compare_to_baseline(
            "qa_test",
            results["results"],
            similarity_fn=semantic_similarity
        )
        
        if comparison["status"] == "no_baseline":
            suite.save_baseline("qa_test", cases, results["results"])
            pytest.skip("Baseline created")
        
        assert comparison["status"] == "pass", f"Regressions detected: {comparison['regressions']}"

Test Fixtures and Factories

import pytest
from typing import Generator
from contextlib import contextmanager

# Fixtures for common test scenarios
@pytest.fixture
def mock_openai_client():
    """Provide a fully mocked OpenAI client."""
    
    client = Mock()
    client.chat = Mock()
    client.chat.completions = Mock()
    client.embeddings = Mock()
    
    # Default response
    client.chat.completions.create.return_value = create_mock_response("Default response")
    
    return client

@pytest.fixture
def response_factory():
    """Factory for creating mock responses."""
    
    def factory(content: str, finish_reason: str = "stop"):
        response = Mock()
        response.choices = [Mock()]
        response.choices[0].message = Mock()
        response.choices[0].message.content = content
        response.choices[0].finish_reason = finish_reason
        response.usage = Mock()
        response.usage.prompt_tokens = 10
        response.usage.completion_tokens = 20
        response.usage.total_tokens = 30
        return response
    
    return factory

# Context manager for temporary API behavior
@contextmanager
def mock_llm_responses(responses: list[str]) -> Generator:
    """Mock LLM to return specific responses in order."""
    
    with patch("openai.OpenAI") as mock_class:
        mock_client = Mock()
        mock_class.return_value = mock_client
        
        mock_responses = [create_mock_response(r) for r in responses]
        mock_client.chat.completions.create.side_effect = mock_responses
        
        yield mock_client

# Test data generators
class TestDataGenerator:
    """Generate test data for LLM testing."""
    
    @staticmethod
    def question_cases(count: int = 10) -> list[EvalCase]:
        """Generate question test cases."""
        
        questions = [
            ("What is Python?", "programming language"),
            ("What is 2+2?", "4"),
            ("What color is the sky?", "blue"),
            ("Who wrote Romeo and Juliet?", "Shakespeare"),
            ("What is the capital of Japan?", "Tokyo"),
        ]
        
        return [
            EvalCase(input=q, expected=a)
            for q, a in questions[:count]
        ]
    
    @staticmethod
    def classification_cases() -> list[EvalCase]:
        """Generate classification test cases."""
        
        return [
            EvalCase(
                input="What time does the store open?",
                expected="question",
                metadata={"category": "inquiry"}
            ),
            EvalCase(
                input="I want to return this product",
                expected="request",
                metadata={"category": "action"}
            ),
            EvalCase(
                input="This service is terrible!",
                expected="complaint",
                metadata={"category": "negative"}
            ),
        ]

# Usage
def test_with_mock_responses():
    """Test using mock response context manager."""
    
    with mock_llm_responses(["Response 1", "Response 2"]):
        client = OpenAI()
        
        r1 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "First"}]
        )
        
        r2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": "Second"}]
        )
        
        assert r1.choices[0].message.content == "Response 1"
        assert r2.choices[0].message.content == "Response 2"

CI/CD Integration

# pytest.ini configuration
"""
[pytest]
markers =
    unit: Unit tests (no API calls)
    integration: Integration tests (requires API key)
    slow: Slow tests
    eval: Evaluation tests

testpaths = tests
python_files = test_*.py
python_functions = test_*
"""

# conftest.py
import pytest
import os

def pytest_configure(config):
    """Configure pytest markers."""
    config.addinivalue_line("markers", "unit: Unit tests")
    config.addinivalue_line("markers", "integration: Integration tests")
    config.addinivalue_line("markers", "slow: Slow tests")
    config.addinivalue_line("markers", "eval: Evaluation tests")

def pytest_collection_modifyitems(config, items):
    """Skip integration tests if no API key."""
    
    if not os.getenv("OPENAI_API_KEY"):
        skip_integration = pytest.mark.skip(reason="OPENAI_API_KEY not set")
        
        for item in items:
            if "integration" in item.keywords:
                item.add_marker(skip_integration)

# GitHub Actions workflow
"""
# .github/workflows/test.yml
name: Tests

on: [push, pull_request]

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: pytest -m unit --tb=short

  integration-tests:
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: pytest -m integration --tb=short
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

  eval-tests:
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - run: pip install -r requirements.txt
      - run: pytest -m eval --tb=short
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      - uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: eval_results/
"""

# Run specific test categories
# pytest -m unit           # Only unit tests
# pytest -m integration    # Only integration tests
# pytest -m "not slow"     # Skip slow tests
# pytest -m eval           # Only evaluation tests

Conclusion

Testing LLM applications requires a layered approach. Unit tests with mocked responses verify application logic without API costs or latency. Integration tests with real API calls catch issues that mocks miss. Evaluation frameworks assess output quality using metrics, LLM judges, or human review. Regression testing detects when prompt changes degrade performance. For CI/CD, run unit tests on every commit, integration tests on main branch merges, and evaluation tests on a schedule. The key insight is that LLM testing is probabilistic—focus on statistical properties and quality distributions rather than exact output matching.


Discover more from Code, Cloud & Context

Subscribe to get the latest posts sent to your email.

Leave a Reply

Your email address will not be published. Required fields are marked *

This site uses Akismet to reduce spam. Learn how your comment data is processed.