Introduction

LLM applications are notoriously hard to test. Outputs are non-deterministic, “correct” is often subjective, and traditional unit tests don’t apply. Yet shipping untested LLM features is risky—prompt changes can break functionality, model updates can degrade quality, and edge cases can embarrass your product. This guide covers practical testing strategies: building evaluation datasets, implementing automated scoring, regression testing for prompt changes, and continuous evaluation in production. These patterns help you ship LLM features with confidence.

Building Evaluation Datasets
from dataclasses import dataclass
from typing import Optional
import json
from pathlib import Path


@dataclass
class TestCase:
    id: str
    input: str
    expected_output: Optional[str] = None
    expected_contains: Optional[list[str]] = None
    expected_not_contains: Optional[list[str]] = None
    metadata: Optional[dict] = None

    def __post_init__(self):
        self.expected_contains = self.expected_contains or []
        self.expected_not_contains = self.expected_not_contains or []
        self.metadata = self.metadata or {}


class EvalDataset:
    """Manage evaluation test cases."""

    def __init__(self, name: str):
        self.name = name
        self.test_cases: list[TestCase] = []

    def add_case(
        self,
        input: str,
        expected_output: Optional[str] = None,
        expected_contains: Optional[list[str]] = None,
        expected_not_contains: Optional[list[str]] = None,
        **metadata
    ) -> "EvalDataset":
        """Add a test case."""
        case = TestCase(
            id=f"{self.name}_{len(self.test_cases)}",
            input=input,
            expected_output=expected_output,
            expected_contains=expected_contains,
            expected_not_contains=expected_not_contains,
            metadata=metadata
        )
        self.test_cases.append(case)
        return self

    def save(self, path: Path):
        """Save dataset to JSON."""
        data = {
            "name": self.name,
            "cases": [
                {
                    "id": c.id,
                    "input": c.input,
                    "expected_output": c.expected_output,
                    "expected_contains": c.expected_contains,
                    "expected_not_contains": c.expected_not_contains,
                    "metadata": c.metadata
                }
                for c in self.test_cases
            ]
        }
        path.parent.mkdir(parents=True, exist_ok=True)  # ensure the target directory exists
        path.write_text(json.dumps(data, indent=2))

    @classmethod
    def load(cls, path: Path) -> "EvalDataset":
        """Load dataset from JSON."""
        data = json.loads(path.read_text())
        dataset = cls(data["name"])
        for case_data in data["cases"]:
            dataset.test_cases.append(TestCase(**case_data))
        return dataset


# Create an evaluation dataset for a summarization task
summarization_eval = EvalDataset("summarization")

summarization_eval.add_case(
    input="The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.",
    expected_contains=["fox", "dog"],
    expected_not_contains=["alphabet"],  # Summary shouldn't include meta-info
    category="short_text"
)

summarization_eval.add_case(
    input="Python 3.12 was released with several new features including improved error messages...",
    expected_contains=["Python", "3.12"],
    category="technical"
)

summarization_eval.save(Path("./evals/summarization.json"))

Automated Evaluation
from dataclasses import dataclass, field
from datetime import datetime
from openai import OpenAI

client = OpenAI()


@dataclass
class EvalResult:
    test_case_id: str
    input: str
    output: str
    passed: bool
    score: float
    details: dict = field(default_factory=dict)
    latency_ms: float = 0.0


@dataclass
class EvalReport:
    dataset_name: str
    timestamp: datetime
    results: list[EvalResult]

    @property
    def pass_rate(self) -> float:
        if not self.results:
            return 0.0
        return sum(1 for r in self.results if r.passed) / len(self.results)

    @property
    def avg_score(self) -> float:
        if not self.results:
            return 0.0
        return sum(r.score for r in self.results) / len(self.results)

    @property
    def avg_latency(self) -> float:
        if not self.results:
            return 0.0
        return sum(r.latency_ms for r in self.results) / len(self.results)

    def summary(self) -> str:
        return f"""
Evaluation Report: {self.dataset_name}
Timestamp: {self.timestamp}
Total Cases: {len(self.results)}
Pass Rate: {self.pass_rate:.1%}
Average Score: {self.avg_score:.2f}
Average Latency: {self.avg_latency:.0f}ms
"""


class LLMEvaluator:
    """Evaluate LLM outputs against test cases."""

    def __init__(self, model: str = "gpt-4o"):
        self.model = model

    def evaluate_case(
        self,
        test_case: TestCase,
        prompt_template: str,
        system_prompt: str = ""
    ) -> EvalResult:
        """Evaluate a single test case."""
        import time

        # Format prompt
        prompt = prompt_template.format(input=test_case.input)

        # Run LLM
        start = time.time()
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        response = client.chat.completions.create(
            model=self.model,
            messages=messages
        )
        latency = (time.time() - start) * 1000
        output = response.choices[0].message.content

        # Evaluate output
        passed, score, details = self._score_output(test_case, output)

        return EvalResult(
            test_case_id=test_case.id,
            input=test_case.input,
            output=output,
            passed=passed,
            score=score,
            details=details,
            latency_ms=latency
        )

    def _score_output(
        self,
        test_case: TestCase,
        output: str
    ) -> tuple[bool, float, dict]:
        """Score the output against expectations."""
        details = {}
        score = 1.0

        # Check expected contains
        if test_case.expected_contains:
            missing = [
                term for term in test_case.expected_contains
                if term.lower() not in output.lower()
            ]
            details["missing_terms"] = missing
            if missing:
                score -= 0.2 * len(missing)

        # Check expected not contains
        if test_case.expected_not_contains:
            found = [
                term for term in test_case.expected_not_contains
                if term.lower() in output.lower()
            ]
            details["unwanted_terms"] = found
            if found:
                score -= 0.3 * len(found)

        # Check exact match if provided
        if test_case.expected_output:
            exact_match = output.strip() == test_case.expected_output.strip()
            details["exact_match"] = exact_match
            if not exact_match:
                score -= 0.5

        score = max(0.0, min(1.0, score))
        passed = score >= 0.7
        return passed, score, details

    def evaluate_dataset(
        self,
        dataset: EvalDataset,
        prompt_template: str,
        system_prompt: str = ""
    ) -> EvalReport:
        """Evaluate entire dataset."""
        results = []
        for case in dataset.test_cases:
            result = self.evaluate_case(case, prompt_template, system_prompt)
            results.append(result)
        return EvalReport(
            dataset_name=dataset.name,
            timestamp=datetime.now(),
            results=results
        )


# Usage
evaluator = LLMEvaluator(model="gpt-4o-mini")
report = evaluator.evaluate_dataset(
    summarization_eval,
    prompt_template="Summarize this text in one sentence:\n\n{input}",
    system_prompt="You are a concise summarizer."
)
print(report.summary())

LLM-as-Judge Evaluation
class LLMJudge:
    """Use an LLM to evaluate outputs."""

    def __init__(self, judge_model: str = "gpt-4o"):
        self.judge_model = judge_model

    def score_output(
        self,
        input_text: str,
        output: str,
        criteria: list[str],
        reference: Optional[str] = None
    ) -> dict:
        """Score output using LLM judge."""
        criteria_text = "\n".join(f"- {c}" for c in criteria)

        reference_section = ""
        if reference:
            reference_section = f"\nReference answer:\n{reference}\n"

        prompt = f"""Evaluate this LLM output based on the given criteria.

Input: {input_text}

Output to evaluate:
{output}
{reference_section}
Evaluation criteria:
{criteria_text}

For each criterion, provide a score from 1-5 and a brief explanation.
Then provide an overall score from 1-5.

Format your response as JSON:
{{
  "criteria_scores": {{
    "criterion_name": {{"score": 1-5, "explanation": "..."}},
    ...
  }},
  "overall_score": 1-5,
  "overall_explanation": "..."
}}"""

        response = client.chat.completions.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    def compare_outputs(
        self,
        input_text: str,
        output_a: str,
        output_b: str,
        criteria: list[str]
    ) -> dict:
        """Compare two outputs and determine which is better."""
        criteria_text = "\n".join(f"- {c}" for c in criteria)

        prompt = f"""Compare these two LLM outputs and determine which is better.

Input: {input_text}

Output A:
{output_a}

Output B:
{output_b}

Evaluation criteria:
{criteria_text}

Provide your analysis as JSON:
{{
  "winner": "A" or "B" or "tie",
  "confidence": 1-5,
  "reasoning": "...",
  "criteria_comparison": {{
    "criterion": {{"winner": "A/B/tie", "explanation": "..."}}
  }}
}}"""

        response = client.chat.completions.create(
            model=self.judge_model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)


# Usage
judge = LLMJudge()

# Score a single output
score = judge.score_output(
    input_text="Explain quantum computing",
    output="Quantum computing uses qubits...",
    criteria=[
        "Accuracy of information",
        "Clarity of explanation",
        "Appropriate level of detail",
        "Engaging writing style"
    ]
)
print(f"Overall score: {score['overall_score']}/5")

# Compare two model outputs
comparison = judge.compare_outputs(
    input_text="Write a product description for a coffee maker",
    output_a="This coffee maker brews great coffee...",
    output_b="Introducing the ultimate coffee experience...",
    criteria=["Persuasiveness", "Clarity", "Feature coverage"]
)
print(f"Winner: Output {comparison['winner']}")

Regression Testing
from pathlib import Path


class PromptRegressionTester:
    """Test for regressions when prompts change."""

    def __init__(self, baseline_dir: Path = Path("./baselines")):
        self.baseline_dir = baseline_dir
        self.baseline_dir.mkdir(exist_ok=True)

    def _get_baseline_path(self, test_name: str) -> Path:
        return self.baseline_dir / f"{test_name}.json"

    def save_baseline(self, test_name: str, report: EvalReport):
        """Save evaluation report as baseline."""
        data = {
            "timestamp": report.timestamp.isoformat(),
            "pass_rate": report.pass_rate,
            "avg_score": report.avg_score,
            "results": [
                {
                    "id": r.test_case_id,
                    "output": r.output,
                    "score": r.score,
                    "passed": r.passed
                }
                for r in report.results
            ]
        }
        self._get_baseline_path(test_name).write_text(json.dumps(data, indent=2))

    def compare_to_baseline(
        self,
        test_name: str,
        current_report: EvalReport,
        regression_threshold: float = 0.05
    ) -> dict:
        """Compare current results to baseline."""
        baseline_path = self._get_baseline_path(test_name)
        if not baseline_path.exists():
            return {
                "status": "no_baseline",
                "message": "No baseline found. Run with --save-baseline first."
            }

        baseline = json.loads(baseline_path.read_text())

        # Compare aggregate metrics
        pass_rate_diff = current_report.pass_rate - baseline["pass_rate"]
        score_diff = current_report.avg_score - baseline["avg_score"]

        # Check for regressions
        regressions = []
        if pass_rate_diff < -regression_threshold:
            regressions.append(
                f"Pass rate dropped: {baseline['pass_rate']:.1%} -> {current_report.pass_rate:.1%}"
            )
        if score_diff < -regression_threshold:
            regressions.append(
                f"Avg score dropped: {baseline['avg_score']:.2f} -> {current_report.avg_score:.2f}"
            )

        # Check individual cases
        baseline_results = {r["id"]: r for r in baseline["results"]}
        for result in current_report.results:
            if result.test_case_id in baseline_results:
                baseline_result = baseline_results[result.test_case_id]
                if baseline_result["passed"] and not result.passed:
                    regressions.append(f"Case {result.test_case_id} now failing")

        return {
            "status": "regression" if regressions else "pass",
            "regressions": regressions,
            "pass_rate_change": pass_rate_diff,
            "score_change": score_diff
        }


# Usage in CI/CD
def run_regression_tests():
    """Run as part of a CI pipeline."""
    evaluator = LLMEvaluator()
    tester = PromptRegressionTester()

    # Load test dataset
    dataset = EvalDataset.load(Path("./evals/summarization.json"))

    # Run evaluation
    report = evaluator.evaluate_dataset(
        dataset,
        prompt_template="Summarize: {input}",
        system_prompt="Be concise."
    )

    # Compare to baseline
    comparison = tester.compare_to_baseline("summarization", report)

    if comparison["status"] == "regression":
        print("REGRESSION DETECTED!")
        for reg in comparison["regressions"]:
            print(f"  - {reg}")
        raise SystemExit(1)  # fail the CI job
    elif comparison["status"] == "no_baseline":
        print("No baseline found. Saving current results as baseline.")
        tester.save_baseline("summarization", report)
    else:
        print(f"All tests passed. Score change: {comparison['score_change']:+.2f}")


run_regression_tests()

Production Monitoring
from collections import deque
from threading import Lock
import random


class ProductionEvaluator:
    """Continuous evaluation in production."""

    def __init__(
        self,
        sample_rate: float = 0.01,  # Evaluate 1% of requests
        window_size: int = 1000
    ):
        self.sample_rate = sample_rate
        self.window_size = window_size
        self.scores: deque = deque(maxlen=window_size)
        self.latencies: deque = deque(maxlen=window_size)
        self.lock = Lock()
        self.judge = LLMJudge()

    def should_evaluate(self) -> bool:
        """Determine if this request should be evaluated."""
        return random.random() < self.sample_rate

    def evaluate_async(
        self,
        input_text: str,
        output: str,
        latency_ms: float,
        criteria: list[str]
    ):
        """Evaluate in the background (call from an async task)."""
        # Get LLM judge score
        result = self.judge.score_output(
            input_text=input_text,
            output=output,
            criteria=criteria
        )
        with self.lock:
            self.scores.append(result["overall_score"])
            self.latencies.append(latency_ms)

    def get_metrics(self) -> dict:
        """Get current production metrics."""
        with self.lock:
            if not self.scores:
                return {"status": "no_data"}
            scores = list(self.scores)
            latencies = list(self.latencies)
        return {
            "sample_count": len(scores),
            "avg_score": sum(scores) / len(scores),
            "min_score": min(scores),
            "max_score": max(scores),
            "avg_latency_ms": sum(latencies) / len(latencies),
            "p95_latency_ms": sorted(latencies)[int(len(latencies) * 0.95)]
        }

    def check_alerts(self, min_score: float = 3.0) -> list[str]:
        """Check for alert conditions."""
        metrics = self.get_metrics()
        alerts = []
        if metrics.get("avg_score", 5) < min_score:
            alerts.append(f"Average score {metrics['avg_score']:.1f} below threshold {min_score}")
        if metrics.get("p95_latency_ms", 0) > 5000:
            alerts.append(f"P95 latency {metrics['p95_latency_ms']:.0f}ms exceeds 5s")
        return alerts


# Integration with FastAPI
from fastapi import FastAPI, BackgroundTasks

app = FastAPI()
prod_evaluator = ProductionEvaluator(sample_rate=0.02)


@app.post("/chat")
async def chat(message: str, background_tasks: BackgroundTasks):
    import time
    start = time.time()

    # Get LLM response
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": message}]
    )
    output = response.choices[0].message.content
    latency = (time.time() - start) * 1000

    # Sample for evaluation
    if prod_evaluator.should_evaluate():
        background_tasks.add_task(
            prod_evaluator.evaluate_async,
            input_text=message,
            output=output,
            latency_ms=latency,
            criteria=["Helpfulness", "Accuracy", "Clarity"]
        )

    return {"response": output}


@app.get("/metrics")
async def metrics():
    return prod_evaluator.get_metrics()

References
- OpenAI Evals: https://github.com/openai/evals
- LangSmith: https://smith.langchain.com/
- Promptfoo: https://promptfoo.dev/
- DeepEval: https://github.com/confident-ai/deepeval
Conclusion
Testing LLM applications requires different approaches than traditional software. Build evaluation datasets that capture your specific use cases—generic benchmarks won’t tell you if your product works. Use multiple evaluation methods: rule-based checks for objective criteria, LLM-as-judge for subjective quality, and human review for critical decisions. Implement regression testing to catch quality drops when prompts change. Monitor production continuously with sampling-based evaluation. The goal isn’t perfect scores—it’s confidence that your LLM features work reliably for users. Start with a small evaluation set covering critical paths, then expand as you learn what breaks. Testing LLMs is an ongoing process, not a one-time task.
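
As a rough illustration of combining those methods, the sketch below layers them: a rule-based check handles the objective criteria, the LLMJudge from earlier scores subjective quality, and anything the judge rates poorly lands in a queue for human review. The route_for_review function, the review_queue list, and the thresholds are hypothetical names and values for this example, not part of the classes above.

# Hypothetical tiered evaluation: rule-based checks first, LLM-as-judge for
# subjective quality, human review as the last resort. `route_for_review` and
# `review_queue` are illustrative, not part of the classes defined earlier.
review_queue: list[dict] = []  # in production this might be a ticketing system or database table

def route_for_review(test_case: TestCase, output: str, judge: LLMJudge) -> str:
    # 1. Rule-based check for objective criteria: required terms must appear.
    missing = [t for t in test_case.expected_contains if t.lower() not in output.lower()]
    if missing:
        return "failed_rules"  # objective failure; no judge call needed

    # 2. LLM-as-judge for subjective quality.
    verdict = judge.score_output(
        input_text=test_case.input,
        output=output,
        criteria=["Accuracy", "Clarity"]
    )
    if verdict["overall_score"] >= 4:
        return "auto_pass"

    # 3. Borderline or low-scoring outputs are queued for a human decision.
    review_queue.append({
        "case_id": test_case.id,
        "output": output,
        "judge_score": verdict["overall_score"]
    })
    return "needs_human_review"

The thresholds here are arbitrary; tune them against your own evaluation data and how much human-review capacity you actually have.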