LLM Testing Strategies: Unit Tests, Evaluation Metrics, and Regression Testing

Introduction

Testing LLM applications is fundamentally different from testing traditional software. Outputs are non-deterministic, quality is subjective, and edge cases are infinite. You can’t simply assert that output equals expected—you need to evaluate whether outputs are good enough across multiple dimensions. Yet many teams skip testing entirely or rely solely on manual spot-checking. This guide covers practical testing strategies: unit tests for prompts and chains, evaluation metrics for quality assessment, regression testing to catch degradation, and continuous evaluation pipelines that run automatically.

[Figure: LLM testing pipeline covering unit tests, evaluation metrics, and regression tests]

Unit Testing Prompts

from dataclasses import dataclass
from typing import Any, Callable, Optional
import pytest
import asyncio

@dataclass
class PromptTestCase:
    """A test case for a prompt."""
    
    name: str
    input_variables: dict
    expected_contains: Optional[list[str]] = None
    expected_not_contains: Optional[list[str]] = None
    min_length: int = 0
    max_length: float = float('inf')
    custom_validator: Optional[Callable[[str], bool]] = None

class PromptTester:
    """Test prompts for correctness."""
    
    def __init__(self, client: Any, model: str = "gpt-4o-mini"):
        self.client = client
        self.model = model
    
    async def test_prompt(
        self,
        prompt_template: str,
        test_case: PromptTestCase
    ) -> tuple[bool, str, list[str]]:
        """Test a prompt with a test case."""
        
        # Format prompt
        prompt = prompt_template.format(**test_case.input_variables)
        
        # Get response
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}]
        )
        
        output = response.choices[0].message.content
        
        # Run assertions
        failures = []
        
        # Check contains
        if test_case.expected_contains:
            for expected in test_case.expected_contains:
                if expected.lower() not in output.lower():
                    failures.append(f"Expected to contain: '{expected}'")
        
        # Check not contains
        if test_case.expected_not_contains:
            for not_expected in test_case.expected_not_contains:
                if not_expected.lower() in output.lower():
                    failures.append(f"Should not contain: '{not_expected}'")
        
        # Check length
        if len(output) < test_case.min_length:
            failures.append(f"Output too short: {len(output)} < {test_case.min_length}")
        
        if len(output) > test_case.max_length:
            failures.append(f"Output too long: {len(output)} > {test_case.max_length}")
        
        # Custom validator
        if test_case.custom_validator:
            if not test_case.custom_validator(output):
                failures.append("Custom validation failed")
        
        passed = len(failures) == 0
        return passed, output, failures
    
    async def run_test_suite(
        self,
        prompt_template: str,
        test_cases: list[PromptTestCase]
    ) -> dict:
        """Run a suite of tests."""
        
        results = []
        
        for test_case in test_cases:
            passed, output, failures = await self.test_prompt(prompt_template, test_case)
            results.append({
                "name": test_case.name,
                "passed": passed,
                "output_preview": output[:200],
                "failures": failures
            })
        
        passed_count = sum(1 for r in results if r["passed"])
        
        return {
            "total": len(results),
            "passed": passed_count,
            "failed": len(results) - passed_count,
            "results": results
        }

# Example pytest integration
class TestSummarizationPrompt:
    """Test cases for summarization prompt."""
    
    @pytest.fixture
    def prompt_template(self):
        return """Summarize the following text in 2-3 sentences:

{text}

Summary:"""
    
    @pytest.fixture
    def tester(self, client):
        # Assumes a "client" fixture (e.g., an async OpenAI client) provided in conftest.py
        return PromptTester(client)
    
    @pytest.mark.asyncio
    async def test_short_text(self, tester, prompt_template):
        test_case = PromptTestCase(
            name="short_text",
            input_variables={
                "text": "The quick brown fox jumps over the lazy dog. This is a simple sentence for testing."
            },
            min_length=20,
            max_length=500
        )
        
        passed, output, failures = await tester.test_prompt(prompt_template, test_case)
        assert passed, f"Test failed: {failures}"
    
    @pytest.mark.asyncio
    async def test_technical_text(self, tester, prompt_template):
        test_case = PromptTestCase(
            name="technical_text",
            input_variables={
                "text": "Machine learning is a subset of artificial intelligence that enables systems to learn from data. Neural networks are a key component of deep learning."
            },
            expected_contains=["machine learning", "artificial intelligence"],
            min_length=50
        )
        
        passed, output, failures = await tester.test_prompt(prompt_template, test_case)
        assert passed, f"Test failed: {failures}"
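
For ad hoc runs outside pytest, the suite can also be driven directly. A minimal sketch, assuming the openai package's AsyncOpenAI client and an OPENAI_API_KEY in the environment:

# Standalone runner sketch for PromptTester (client setup is an assumption).
import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment
    tester = PromptTester(client)
    template = "Summarize the following text in 2-3 sentences:\n\n{text}\n\nSummary:"
    cases = [
        PromptTestCase(
            name="short_text",
            input_variables={"text": "The quick brown fox jumps over the lazy dog."},
            min_length=20,
            max_length=500,
        )
    ]
    report = await tester.run_test_suite(template, cases)
    print(f"{report['passed']}/{report['total']} passed")

if __name__ == "__main__":
    asyncio.run(main())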

Evaluation Metrics

from dataclasses import dataclass
from typing import Any, Optional
import json

@dataclass
class EvaluationResult:
    """Result of an evaluation."""
    
    metric_name: str
    score: float  # 0-1
    details: Optional[dict] = None

class ExactMatchEvaluator:
    """Evaluate exact match accuracy."""
    
    def evaluate(self, output: str, expected: str) -> EvaluationResult:
        """Check if output exactly matches expected."""
        
        # Normalize whitespace
        output_normalized = " ".join(output.split())
        expected_normalized = " ".join(expected.split())
        
        score = 1.0 if output_normalized == expected_normalized else 0.0
        
        return EvaluationResult(
            metric_name="exact_match",
            score=score,
            details={"matched": score == 1.0}
        )

class ContainsEvaluator:
    """Evaluate if output contains expected elements."""
    
    def evaluate(
        self,
        output: str,
        expected_elements: list[str],
        case_sensitive: bool = False
    ) -> EvaluationResult:
        """Check if output contains all expected elements."""
        
        if not case_sensitive:
            output = output.lower()
            expected_elements = [e.lower() for e in expected_elements]
        
        found = [e for e in expected_elements if e in output]
        score = len(found) / len(expected_elements) if expected_elements else 1.0
        
        return EvaluationResult(
            metric_name="contains",
            score=score,
            details={
                "found": found,
                "missing": [e for e in expected_elements if e not in found]
            }
        )
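
The deterministic evaluators need no API calls and can run inside ordinary unit tests. A quick usage sketch (inputs are illustrative):

# Usage sketch for the deterministic evaluators above.
exact = ExactMatchEvaluator().evaluate("  Paris  ", "Paris")
contains = ContainsEvaluator().evaluate(
    "Paris is the capital of France.",
    ["paris", "france", "population"],
)
print(exact.score)                  # 1.0 (whitespace-normalized match)
print(contains.score)               # ~0.67 (2 of 3 elements found)
print(contains.details["missing"])  # ['population']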

class LLMJudgeEvaluator:
    """Use LLM as a judge for quality evaluation."""
    
    def __init__(self, client: Any, model: str = "gpt-4o"):
        self.client = client
        self.model = model
    
    async def evaluate_relevance(
        self,
        query: str,
        output: str
    ) -> EvaluationResult:
        """Evaluate relevance of output to query."""
        
        prompt = f"""Rate how relevant this response is to the query on a scale of 1-5.

Query: {query}

Response: {output[:2000]}

Provide your rating as a JSON object:
{{"score": 1-5, "reason": "brief explanation"}}"""
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        result = json.loads(response.choices[0].message.content)
        
        return EvaluationResult(
            metric_name="relevance",
            score=result["score"] / 5,  # Normalize to 0-1
            details={"raw_score": result["score"], "reason": result["reason"]}
        )
    
    async def evaluate_correctness(
        self,
        output: str,
        ground_truth: str
    ) -> EvaluationResult:
        """Evaluate factual correctness against ground truth."""
        
        prompt = f"""Compare the response to the ground truth and rate factual correctness on a scale of 1-5.

Response: {output[:1500]}

Ground Truth: {ground_truth[:1500]}

Provide your rating as a JSON object:
{{"score": 1-5, "correct_facts": ["list"], "incorrect_facts": ["list"]}}"""
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        result = json.loads(response.choices[0].message.content)
        
        return EvaluationResult(
            metric_name="correctness",
            score=result["score"] / 5,
            details={
                "correct_facts": result.get("correct_facts", []),
                "incorrect_facts": result.get("incorrect_facts", [])
            }
        )
    
    async def evaluate_helpfulness(
        self,
        query: str,
        output: str
    ) -> EvaluationResult:
        """Evaluate how helpful the response is."""
        
        prompt = f"""Rate how helpful this response is for the user's query on a scale of 1-5.
Consider: Does it answer the question? Is it actionable? Is it complete?

Query: {query}

Response: {output[:2000]}

Provide your rating as a JSON object:
{{"score": 1-5, "strengths": ["list"], "weaknesses": ["list"]}}"""
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        result = json.loads(response.choices[0].message.content)
        
        return EvaluationResult(
            metric_name="helpfulness",
            score=result["score"] / 5,
            details={
                "strengths": result.get("strengths", []),
                "weaknesses": result.get("weaknesses", [])
            }
        )
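
A short sketch of calling the judge, assuming an AsyncOpenAI client; the query and response are illustrative:

# Sketch: score one response with the LLM judge (client setup is an assumption).
import asyncio
from openai import AsyncOpenAI

async def main():
    judge = LLMJudgeEvaluator(AsyncOpenAI())
    result = await judge.evaluate_relevance(
        query="How should I retry failed API calls?",
        output="Use exponential backoff with jitter and cap the total number of retries.",
    )
    print(result.score, result.details["reason"])

asyncio.run(main())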

class CodeEvaluator:
    """Evaluate code generation quality."""
    
    def evaluate_syntax(self, code: str, language: str = "python") -> EvaluationResult:
        """Check if code has valid syntax."""
        
        if language == "python":
            try:
                import ast
                ast.parse(code)
                return EvaluationResult(
                    metric_name="syntax",
                    score=1.0,
                    details={"valid": True}
                )
            except SyntaxError as e:
                return EvaluationResult(
                    metric_name="syntax",
                    score=0.0,
                    details={"valid": False, "error": str(e)}
                )
        
        return EvaluationResult(
            metric_name="syntax",
            score=0.5,
            details={"message": f"Syntax check not implemented for {language}"}
        )
    
    def evaluate_execution(
        self,
        code: str,
        test_cases: list[tuple[dict, Any]]
    ) -> EvaluationResult:
        """Execute code and check against test cases."""
        
        passed = 0
        results = []
        
        for inputs, expected in test_cases:
            try:
                # Execute in an isolated namespace.
                # Caution: exec runs generated code in-process; sandbox untrusted code in production.
                namespace = {}
                exec(code, namespace)
                
                # Find the main function
                func_name = None
                for name, obj in namespace.items():
                    if callable(obj) and not name.startswith('_'):
                        func_name = name
                        break
                
                if func_name:
                    result = namespace[func_name](**inputs)
                    if result == expected:
                        passed += 1
                        results.append({"inputs": inputs, "passed": True})
                    else:
                        results.append({
                            "inputs": inputs,
                            "passed": False,
                            "expected": expected,
                            "got": result
                        })
                else:
                    results.append({"inputs": inputs, "passed": False, "error": "No function found"})
            
            except Exception as e:
                results.append({"inputs": inputs, "passed": False, "error": str(e)})
        
        score = passed / len(test_cases) if test_cases else 0
        
        return EvaluationResult(
            metric_name="execution",
            score=score,
            details={"passed": passed, "total": len(test_cases), "results": results}
        )
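
A usage sketch for CodeEvaluator with a hypothetical generated function:

# Sketch: evaluate a hypothetical generated function for syntax and behavior.
generated = '''
def add(a, b):
    return a + b
'''

evaluator = CodeEvaluator()
print(evaluator.evaluate_syntax(generated).score)  # 1.0 if the code parses
result = evaluator.evaluate_execution(
    generated,
    test_cases=[({"a": 1, "b": 2}, 3), ({"a": -1, "b": 1}, 0)],
)
print(result.score, result.details["passed"])      # 1.0 2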

Regression Testing

from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime
import json
import hashlib

@dataclass
class GoldenExample:
    """A golden example for regression testing."""
    
    id: str
    prompt: str
    expected_output: str
    metadata: dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.utcnow)
    
    def to_dict(self) -> dict:
        return {
            "id": self.id,
            "prompt": self.prompt,
            "expected_output": self.expected_output,
            "metadata": self.metadata,
            "created_at": self.created_at.isoformat()
        }
    
    @classmethod
    def from_dict(cls, data: dict) -> "GoldenExample":
        return cls(
            id=data["id"],
            prompt=data["prompt"],
            expected_output=data["expected_output"],
            metadata=data.get("metadata", {}),
            created_at=datetime.fromisoformat(data["created_at"])
        )

class GoldenStore:
    """Store and manage golden examples."""
    
    def __init__(self, storage_path: str):
        self.storage_path = storage_path
        self.examples: dict[str, GoldenExample] = {}
        self._load()
    
    def _load(self):
        """Load examples from storage."""
        
        try:
            with open(self.storage_path, 'r') as f:
                data = json.load(f)
                for item in data:
                    example = GoldenExample.from_dict(item)
                    self.examples[example.id] = example
        except FileNotFoundError:
            pass
    
    def _save(self):
        """Save examples to storage."""
        
        with open(self.storage_path, 'w') as f:
            json.dump([e.to_dict() for e in self.examples.values()], f, indent=2)
    
    def add(self, prompt: str, expected_output: str, metadata: dict = None) -> str:
        """Add a golden example."""
        
        example_id = hashlib.md5(prompt.encode()).hexdigest()[:12]
        
        example = GoldenExample(
            id=example_id,
            prompt=prompt,
            expected_output=expected_output,
            metadata=metadata or {}
        )
        
        self.examples[example_id] = example
        self._save()
        
        return example_id
    
    def get(self, example_id: str) -> Optional[GoldenExample]:
        """Get a golden example by ID."""
        return self.examples.get(example_id)
    
    def list_all(self) -> list[GoldenExample]:
        """List all golden examples."""
        return list(self.examples.values())
    
    def update(self, example_id: str, expected_output: str):
        """Update expected output for an example."""
        
        if example_id in self.examples:
            self.examples[example_id].expected_output = expected_output
            self._save()

@dataclass
class RegressionResult:
    """Result of a regression test."""
    
    example_id: str
    passed: bool
    similarity_score: float
    current_output: str
    expected_output: str
    diff_summary: str = ""

class RegressionTester:
    """Run regression tests against golden examples."""
    
    def __init__(
        self,
        client: Any,
        golden_store: GoldenStore,
        evaluator: LLMJudgeEvaluator = None,
        similarity_threshold: float = 0.8
    ):
        self.client = client
        self.golden_store = golden_store
        self.evaluator = evaluator
        self.similarity_threshold = similarity_threshold
    
    async def run_single(
        self,
        example: GoldenExample,
        model: str = "gpt-4o-mini"
    ) -> RegressionResult:
        """Run regression test for a single example."""
        
        # Get current output
        response = await self.client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": example.prompt}]
        )
        
        current_output = response.choices[0].message.content
        
        # Calculate similarity
        if self.evaluator:
            eval_result = await self.evaluator.evaluate_correctness(
                current_output,
                example.expected_output
            )
            similarity = eval_result.score
        else:
            similarity = self._simple_similarity(current_output, example.expected_output)
        
        passed = similarity >= self.similarity_threshold
        
        return RegressionResult(
            example_id=example.id,
            passed=passed,
            similarity_score=similarity,
            current_output=current_output,
            expected_output=example.expected_output,
            diff_summary=self._summarize_diff(current_output, example.expected_output)
        )
    
    async def run_all(self, model: str = "gpt-4o-mini") -> dict:
        """Run all regression tests."""
        
        examples = self.golden_store.list_all()
        results = []
        
        for example in examples:
            result = await self.run_single(example, model)
            results.append(result)
        
        passed = sum(1 for r in results if r.passed)
        
        return {
            "total": len(results),
            "passed": passed,
            "failed": len(results) - passed,
            "pass_rate": passed / len(results) if results else 0,
            "results": [
                {
                    "example_id": r.example_id,
                    "passed": r.passed,
                    "similarity": r.similarity_score,
                    "diff_summary": r.diff_summary
                }
                for r in results
            ]
        }
    
    def _simple_similarity(self, text1: str, text2: str) -> float:
        """Calculate simple word overlap similarity."""
        
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())
        
        if not words1 or not words2:
            return 0.0
        
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        
        return intersection / union
    
    def _summarize_diff(self, current: str, expected: str) -> str:
        """Summarize differences between outputs."""
        
        current_words = set(current.lower().split())
        expected_words = set(expected.lower().split())
        
        missing = expected_words - current_words
        extra = current_words - expected_words
        
        summary_parts = []
        if missing:
            summary_parts.append(f"Missing: {', '.join(list(missing)[:5])}")
        if extra:
            summary_parts.append(f"Extra: {', '.join(list(extra)[:5])}")
        
        return "; ".join(summary_parts) if summary_parts else "Similar"

Continuous Evaluation Pipeline

from dataclasses import dataclass, field
from typing import Any, Callable
from datetime import datetime
import asyncio

@dataclass
class EvaluationRun:
    """A single evaluation run."""
    
    run_id: str
    timestamp: datetime
    model: str
    metrics: dict[str, float]
    details: dict = field(default_factory=dict)

class EvaluationPipeline:
    """Continuous evaluation pipeline."""
    
    def __init__(
        self,
        client: Any,
        model: str = "gpt-4o-mini"
    ):
        self.client = client
        self.model = model
        self.evaluators: list[tuple[str, Any]] = []
        self.test_cases: list[dict] = []
        self.runs: list[EvaluationRun] = []
    
    def add_evaluator(self, name: str, evaluator: Any):
        """Add an evaluator to the pipeline."""
        self.evaluators.append((name, evaluator))
    
    def add_test_case(
        self,
        prompt: str,
        expected: str = None,
        metadata: dict = None
    ):
        """Add a test case."""
        
        self.test_cases.append({
            "prompt": prompt,
            "expected": expected,
            "metadata": metadata or {}
        })
    
    async def run(self) -> EvaluationRun:
        """Run the evaluation pipeline."""
        
        import uuid
        run_id = str(uuid.uuid4())[:8]
        
        all_scores = {name: [] for name, _ in self.evaluators}
        details = []
        
        for test_case in self.test_cases:
            # Get model output
            response = await self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": test_case["prompt"]}]
            )
            
            output = response.choices[0].message.content
            
            # Run evaluators
            case_scores = {}
            for name, evaluator in self.evaluators:
                if hasattr(evaluator, 'evaluate'):
                    if asyncio.iscoroutinefunction(evaluator.evaluate):
                        result = await evaluator.evaluate(output, test_case.get("expected", ""))
                    else:
                        result = evaluator.evaluate(output, test_case.get("expected", ""))
                    
                    score = result.score if hasattr(result, 'score') else result
                    all_scores[name].append(score)
                    case_scores[name] = score
            
            details.append({
                "prompt": test_case["prompt"][:100],
                "output": output[:200],
                "scores": case_scores
            })
        
        # Calculate aggregate metrics
        metrics = {}
        for name, scores in all_scores.items():
            if scores:
                metrics[f"{name}_avg"] = sum(scores) / len(scores)
                metrics[f"{name}_min"] = min(scores)
                metrics[f"{name}_max"] = max(scores)
        
        run = EvaluationRun(
            run_id=run_id,
            timestamp=datetime.utcnow(),
            model=self.model,
            metrics=metrics,
            details={"cases": details}
        )
        
        self.runs.append(run)
        return run
    
    def compare_runs(self, run_id_1: str, run_id_2: str) -> dict:
        """Compare two evaluation runs."""
        
        run1 = next((r for r in self.runs if r.run_id == run_id_1), None)
        run2 = next((r for r in self.runs if r.run_id == run_id_2), None)
        
        if not run1 or not run2:
            return {"error": "Run not found"}
        
        comparison = {}
        all_metrics = set(run1.metrics.keys()) | set(run2.metrics.keys())
        
        for metric in all_metrics:
            val1 = run1.metrics.get(metric, 0)
            val2 = run2.metrics.get(metric, 0)
            
            comparison[metric] = {
                "run1": val1,
                "run2": val2,
                "diff": val2 - val1,
                "improved": val2 > val1
            }
        
        return comparison
    
    def get_trend(self, metric: str, last_n: int = 10) -> list[float]:
        """Get trend for a metric over recent runs."""
        
        recent_runs = self.runs[-last_n:]
        return [r.metrics.get(metric, 0) for r in recent_runs]

class CIIntegration:
    """Integration with CI/CD systems."""
    
    def __init__(
        self,
        pipeline: EvaluationPipeline,
        thresholds: dict[str, float]
    ):
        self.pipeline = pipeline
        self.thresholds = thresholds
    
    async def run_ci_check(self) -> tuple[bool, dict]:
        """Run CI check and return pass/fail status."""
        
        run = await self.pipeline.run()
        
        failures = []
        for metric, threshold in self.thresholds.items():
            actual = run.metrics.get(metric, 0)
            if actual < threshold:
                failures.append({
                    "metric": metric,
                    "threshold": threshold,
                    "actual": actual
                })
        
        passed = len(failures) == 0
        
        return passed, {
            "run_id": run.run_id,
            "passed": passed,
            "metrics": run.metrics,
            "failures": failures
        }
    
    def generate_report(self, run: EvaluationRun) -> str:
        """Generate a markdown report for CI."""
        
        lines = [
            "# Evaluation Report",
            "",
            f"**Run ID:** {run.run_id}",
            f"**Model:** {run.model}",
            f"**Timestamp:** {run.timestamp.isoformat()}",
            "",
            "## Metrics",
            ""
        ]
        
        for metric, value in sorted(run.metrics.items()):
            threshold = self.thresholds.get(metric)
            status = ""
            if threshold is not None:
                status = " :white_check_mark:" if value >= threshold else " :x:"
            lines.append(f"- **{metric}:** {value:.3f}{status}")
        
        return "\n".join(lines)

Production Testing Service

from fastapi import FastAPI, HTTPException, BackgroundTasks
from pydantic import BaseModel
from typing import Optional

app = FastAPI()

# Initialize components
golden_store = GoldenStore("/tmp/golden_examples.json")
llm_judge = None  # Initialize with actual client
regression_tester = None  # Initialize with actual client
eval_pipeline = None  # Initialize with actual client

class PromptTestRequest(BaseModel):
    prompt_template: str
    test_cases: list[dict]

class GoldenExampleRequest(BaseModel):
    prompt: str
    expected_output: str
    metadata: Optional[dict] = None

class EvaluationRequest(BaseModel):
    output: str
    expected: Optional[str] = None
    query: Optional[str] = None
    metrics: list[str] = ["relevance", "helpfulness"]

class RegressionRequest(BaseModel):
    model: str = "gpt-4o-mini"
    example_ids: Optional[list[str]] = None

@app.post("/v1/test/prompt")
async def test_prompt(request: PromptTestRequest):
    """Test a prompt template."""
    
    if not llm_judge:
        raise HTTPException(status_code=500, detail="Tester not configured")
    
    tester = PromptTester(llm_judge.client)
    
    test_cases = [
        PromptTestCase(
            name=tc.get("name", f"test_{i}"),
            input_variables=tc["input_variables"],
            expected_contains=tc.get("expected_contains"),
            expected_not_contains=tc.get("expected_not_contains"),
            min_length=tc.get("min_length", 0),
            max_length=tc.get("max_length", float('inf'))
        )
        for i, tc in enumerate(request.test_cases)
    ]
    
    results = await tester.run_test_suite(request.prompt_template, test_cases)
    
    return results

@app.post("/v1/golden/add")
async def add_golden_example(request: GoldenExampleRequest):
    """Add a golden example."""
    
    example_id = golden_store.add(
        request.prompt,
        request.expected_output,
        request.metadata
    )
    
    return {"example_id": example_id, "status": "added"}

@app.get("/v1/golden/list")
async def list_golden_examples():
    """List all golden examples."""
    
    examples = golden_store.list_all()
    
    return {
        "count": len(examples),
        "examples": [
            {
                "id": e.id,
                "prompt_preview": e.prompt[:100],
                "created_at": e.created_at.isoformat()
            }
            for e in examples
        ]
    }

@app.post("/v1/evaluate")
async def evaluate_output(request: EvaluationRequest):
    """Evaluate an LLM output."""
    
    if not llm_judge:
        raise HTTPException(status_code=500, detail="Evaluator not configured")
    
    results = {}
    
    if "relevance" in request.metrics and request.query:
        result = await llm_judge.evaluate_relevance(request.query, request.output)
        results["relevance"] = {"score": result.score, "details": result.details}
    
    if "correctness" in request.metrics and request.expected:
        result = await llm_judge.evaluate_correctness(request.output, request.expected)
        results["correctness"] = {"score": result.score, "details": result.details}
    
    if "helpfulness" in request.metrics and request.query:
        result = await llm_judge.evaluate_helpfulness(request.query, request.output)
        results["helpfulness"] = {"score": result.score, "details": result.details}
    
    return {"metrics": results}

@app.post("/v1/regression/run")
async def run_regression_tests(request: RegressionRequest):
    """Run regression tests."""
    
    if not regression_tester:
        raise HTTPException(status_code=500, detail="Regression tester not configured")
    
    if request.example_ids:
        # Run specific examples
        results = []
        for example_id in request.example_ids:
            example = golden_store.get(example_id)
            if example:
                result = await regression_tester.run_single(example, request.model)
                results.append({
                    "example_id": result.example_id,
                    "passed": result.passed,
                    "similarity": result.similarity_score
                })
        
        passed = sum(1 for r in results if r["passed"])
        return {
            "total": len(results),
            "passed": passed,
            "failed": len(results) - passed,
            "results": results
        }
    else:
        # Run all
        return await regression_tester.run_all(request.model)

@app.post("/v1/pipeline/run")
async def run_evaluation_pipeline(background_tasks: BackgroundTasks):
    """Run the evaluation pipeline."""
    
    if not eval_pipeline:
        raise HTTPException(status_code=500, detail="Pipeline not configured")
    
    run = await eval_pipeline.run()
    
    return {
        "run_id": run.run_id,
        "timestamp": run.timestamp.isoformat(),
        "model": run.model,
        "metrics": run.metrics
    }

@app.get("/v1/pipeline/runs")
async def list_pipeline_runs(limit: int = 10):
    """List recent pipeline runs."""
    
    if not eval_pipeline:
        raise HTTPException(status_code=500, detail="Pipeline not configured")
    
    runs = eval_pipeline.runs[-limit:]
    
    return {
        "count": len(runs),
        "runs": [
            {
                "run_id": r.run_id,
                "timestamp": r.timestamp.isoformat(),
                "model": r.model,
                "metrics": r.metrics
            }
            for r in runs
        ]
    }

@app.get("/v1/pipeline/trend/{metric}")
async def get_metric_trend(metric: str, last_n: int = 10):
    """Get trend for a metric."""
    
    if not eval_pipeline:
        raise HTTPException(status_code=500, detail="Pipeline not configured")
    
    trend = eval_pipeline.get_trend(metric, last_n)
    
    return {
        "metric": metric,
        "values": trend,
        "avg": sum(trend) / len(trend) if trend else 0
    }

@app.get("/health")
async def health():
    return {"status": "healthy"}

Conclusion

Testing LLM applications requires a different mindset than traditional software testing. Unit tests for prompts verify basic behavior—that outputs contain expected elements, meet length requirements, and pass custom validators. Evaluation metrics quantify quality across dimensions like relevance, correctness, and helpfulness, using both deterministic checks and LLM-as-judge approaches. Regression testing with golden examples catches degradation when you change prompts, models, or system configurations. Continuous evaluation pipelines run automatically, tracking metrics over time and integrating with CI/CD to gate deployments.

The key insight is that LLM testing is about measuring quality distributions, not asserting exact outputs. Start with basic unit tests and contains checks, add LLM-based evaluation for subjective quality, build a golden set for regression testing, and implement continuous evaluation as your system matures. Test early, test often, and track metrics over time to catch degradation before users do.

