Introduction: Evaluating LLM outputs is fundamentally different from traditional ML metrics. You can’t just compute accuracy when there’s no single correct answer, and human evaluation doesn’t scale. This guide covers the full spectrum of LLM evaluation: automated metrics like BLEU, ROUGE, and BERTScore for measuring similarity; semantic metrics that capture meaning beyond surface-level matching; LLM-as-judge approaches where models evaluate other models; and frameworks for combining multiple signals into actionable quality scores. Whether you’re evaluating a chatbot, summarization system, or code generator, these techniques will help you measure what matters and catch regressions before they reach production.

Text Similarity Metrics
from dataclasses import dataclass
from typing import Any, Optional
from abc import ABC, abstractmethod
from collections import Counter
import math
import re


@dataclass
class MetricResult:
    """Evaluation metric result."""
    name: str
    score: float
    details: Optional[dict] = None


class TextMetric(ABC):
    """Abstract text similarity metric."""

    @abstractmethod
    def compute(self, prediction: str, reference: str) -> MetricResult:
        """Compute metric between prediction and reference."""
        pass


class BLEUScore(TextMetric):
    """BLEU score for n-gram overlap."""

    def __init__(self, max_n: int = 4):
        self.max_n = max_n

    def compute(self, prediction: str, reference: str) -> MetricResult:
        """Compute BLEU score."""
        pred_tokens = prediction.lower().split()
        ref_tokens = reference.lower().split()
        if not pred_tokens:
            return MetricResult(name="bleu", score=0.0)

        # Compute clipped n-gram precisions
        precisions = []
        for n in range(1, self.max_n + 1):
            pred_ngrams = self._get_ngrams(pred_tokens, n)
            ref_ngrams = self._get_ngrams(ref_tokens, n)
            if not pred_ngrams:
                precisions.append(0.0)
                continue
            # Count matches, clipped by reference counts
            matches = sum(
                min(pred_ngrams[ng], ref_ngrams.get(ng, 0))
                for ng in pred_ngrams
            )
            precision = matches / sum(pred_ngrams.values())
            precisions.append(precision)

        # Geometric mean of precisions
        if 0 in precisions:
            bleu = 0.0
        else:
            bleu = math.exp(sum(math.log(p) for p in precisions) / len(precisions))

        # Brevity penalty
        bp = min(1.0, math.exp(1 - len(ref_tokens) / max(len(pred_tokens), 1)))
        bleu *= bp

        return MetricResult(
            name="bleu",
            score=bleu,
            details={"precisions": precisions, "brevity_penalty": bp}
        )

    def _get_ngrams(self, tokens: list[str], n: int) -> Counter:
        """Extract n-grams from tokens."""
        ngrams = []
        for i in range(len(tokens) - n + 1):
            ngrams.append(tuple(tokens[i:i+n]))
        return Counter(ngrams)
class ROUGEScore(TextMetric):
    """ROUGE score for recall-oriented evaluation."""

    def __init__(self, rouge_type: str = "rouge-l"):
        self.rouge_type = rouge_type

    def compute(self, prediction: str, reference: str) -> MetricResult:
        """Compute ROUGE score."""
        pred_tokens = prediction.lower().split()
        ref_tokens = reference.lower().split()
        if self.rouge_type == "rouge-1":
            return self._rouge_n(pred_tokens, ref_tokens, 1)
        elif self.rouge_type == "rouge-2":
            return self._rouge_n(pred_tokens, ref_tokens, 2)
        else:  # rouge-l
            return self._rouge_l(pred_tokens, ref_tokens)

    def _rouge_n(
        self,
        pred_tokens: list[str],
        ref_tokens: list[str],
        n: int
    ) -> MetricResult:
        """Compute ROUGE-N."""
        pred_ngrams = self._get_ngrams(pred_tokens, n)
        ref_ngrams = self._get_ngrams(ref_tokens, n)
        if not ref_ngrams:
            return MetricResult(name=f"rouge-{n}", score=0.0)
        matches = sum(
            min(pred_ngrams.get(ng, 0), count)
            for ng, count in ref_ngrams.items()
        )
        precision = matches / max(sum(pred_ngrams.values()), 1)
        recall = matches / sum(ref_ngrams.values())
        f1 = 2 * precision * recall / max(precision + recall, 1e-8)
        return MetricResult(
            name=f"rouge-{n}",
            score=f1,
            details={"precision": precision, "recall": recall}
        )

    def _rouge_l(
        self,
        pred_tokens: list[str],
        ref_tokens: list[str]
    ) -> MetricResult:
        """Compute ROUGE-L using the longest common subsequence."""
        lcs_length = self._lcs_length(pred_tokens, ref_tokens)
        precision = lcs_length / max(len(pred_tokens), 1)
        recall = lcs_length / max(len(ref_tokens), 1)
        f1 = 2 * precision * recall / max(precision + recall, 1e-8)
        return MetricResult(
            name="rouge-l",
            score=f1,
            details={"lcs_length": lcs_length, "precision": precision, "recall": recall}
        )

    def _lcs_length(self, seq1: list, seq2: list) -> int:
        """Compute longest common subsequence length."""
        m, n = len(seq1), len(seq2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if seq1[i-1] == seq2[j-1]:
                    dp[i][j] = dp[i-1][j-1] + 1
                else:
                    dp[i][j] = max(dp[i-1][j], dp[i][j-1])
        return dp[m][n]

    def _get_ngrams(self, tokens: list[str], n: int) -> Counter:
        """Extract n-grams."""
        ngrams = []
        for i in range(len(tokens) - n + 1):
            ngrams.append(tuple(tokens[i:i+n]))
        return Counter(ngrams)
class ExactMatch(TextMetric):
    """Exact match metric."""

    def __init__(self, normalize: bool = True):
        self.normalize = normalize

    def compute(self, prediction: str, reference: str) -> MetricResult:
        """Check exact match."""
        pred = prediction
        ref = reference
        if self.normalize:
            pred = self._normalize(pred)
            ref = self._normalize(ref)
        match = pred == ref
        return MetricResult(
            name="exact_match",
            score=1.0 if match else 0.0
        )

    def _normalize(self, text: str) -> str:
        """Normalize text for comparison."""
        text = text.lower().strip()
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        return text
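
A minimal usage sketch of the metrics above; the inputs are illustrative and the printed scores will depend on the exact strings:

# Quick sanity check of the lexical metrics defined above.
prediction = "the cat sat on the mat"
reference = "the cat is on the mat"

for metric in (BLEUScore(), ROUGEScore("rouge-l"), ExactMatch()):
    result = metric.compute(prediction, reference)
    print(f"{result.name}: {result.score:.3f}")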
Semantic Similarity Metrics
from dataclasses import dataclass
from typing import Any, Optional
import numpy as np


class BERTScore(TextMetric):
    """BERTScore for semantic similarity."""

    def __init__(self, model_name: str = "microsoft/deberta-xlarge-mnli"):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None

    def _load_model(self):
        """Lazy load model."""
        if self.model is None:
            from transformers import AutoModel, AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)

    def compute(self, prediction: str, reference: str) -> MetricResult:
        """Compute BERTScore."""
        self._load_model()
        import torch

        # Tokenize
        pred_inputs = self.tokenizer(
            prediction,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        ref_inputs = self.tokenizer(
            reference,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )

        # Get embeddings
        with torch.no_grad():
            pred_outputs = self.model(**pred_inputs)
            ref_outputs = self.model(**ref_inputs)
        pred_embeds = pred_outputs.last_hidden_state[0]  # [seq_len, hidden]
        ref_embeds = ref_outputs.last_hidden_state[0]

        # Compute cosine similarity matrix
        pred_norm = pred_embeds / pred_embeds.norm(dim=-1, keepdim=True)
        ref_norm = ref_embeds / ref_embeds.norm(dim=-1, keepdim=True)
        similarity = torch.mm(pred_norm, ref_norm.t())  # [pred_len, ref_len]

        # Precision: max similarity for each prediction token
        precision = similarity.max(dim=1).values.mean().item()
        # Recall: max similarity for each reference token
        recall = similarity.max(dim=0).values.mean().item()
        # F1
        f1 = 2 * precision * recall / max(precision + recall, 1e-8)

        return MetricResult(
            name="bertscore",
            score=f1,
            details={"precision": precision, "recall": recall}
        )
class EmbeddingSimilarity(TextMetric):
    """Embedding-based similarity."""

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None

    def _load_model(self):
        """Lazy load model."""
        if self.model is None:
            from sentence_transformers import SentenceTransformer
            self.model = SentenceTransformer(self.model_name)

    def compute(self, prediction: str, reference: str) -> MetricResult:
        """Compute embedding similarity."""
        self._load_model()
        embeddings = self.model.encode([prediction, reference])
        # Cosine similarity
        similarity = np.dot(embeddings[0], embeddings[1]) / (
            np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
        )
        return MetricResult(
            name="embedding_similarity",
            score=float(similarity)
        )
class SemanticEntailment(TextMetric):
    """Check if prediction entails reference."""

    def __init__(self, model_name: str = "facebook/bart-large-mnli"):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None

    def _load_model(self):
        """Lazy load model."""
        if self.model is None:
            from transformers import AutoModelForSequenceClassification, AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name)

    def compute(self, prediction: str, reference: str) -> MetricResult:
        """Check entailment."""
        self._load_model()
        import torch

        inputs = self.tokenizer(
            prediction,
            reference,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)

        # Labels for bart-large-mnli: contradiction, neutral, entailment
        entailment_prob = probs[0][2].item()
        contradiction_prob = probs[0][0].item()
        return MetricResult(
            name="entailment",
            score=entailment_prob,
            details={
                "entailment": entailment_prob,
                "contradiction": contradiction_prob,
                "neutral": probs[0][1].item()
            }
        )
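
Usage mirrors the lexical metrics. A minimal sketch for EmbeddingSimilarity is shown below; note that sentence-transformers must be installed and the model weights are downloaded on first use:

# Minimal usage sketch; the model is fetched lazily on the first call.
sem = EmbeddingSimilarity()
result = sem.compute(
    "The weather is nice today.",
    "It is a pleasant day outside."
)
print(result.score)  # typically much higher than lexical overlap would suggest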
LLM-as-Judge Evaluation
from dataclasses import dataclass
from typing import Any, Optional
from abc import ABC, abstractmethod
from enum import Enum
import json


class EvalCriterion(Enum):
    """Evaluation criteria."""
    RELEVANCE = "relevance"
    COHERENCE = "coherence"
    FLUENCY = "fluency"
    FACTUALITY = "factuality"
    HELPFULNESS = "helpfulness"
    HARMLESSNESS = "harmlessness"


@dataclass
class JudgeResult:
    """LLM judge result."""
    criterion: EvalCriterion
    score: float  # 1-5 scale
    reasoning: str
    confidence: float = 1.0


class LLMJudge(ABC):
    """Abstract LLM judge."""

    @abstractmethod
    async def evaluate(
        self,
        prediction: str,
        reference: Optional[str] = None,
        context: Optional[str] = None,
        criterion: EvalCriterion = EvalCriterion.RELEVANCE
    ) -> JudgeResult:
        """Evaluate prediction."""
        pass
class SinglePointJudge(LLMJudge):
    """Single-point scoring judge."""

    def __init__(self, llm_client: Any):
        self.llm = llm_client
        self.prompts = {
            EvalCriterion.RELEVANCE: """Rate the relevance of the response to the query on a scale of 1-5.

Query: {context}
Response: {prediction}

Scoring:
1 - Completely irrelevant
2 - Mostly irrelevant with minor relevant points
3 - Partially relevant
4 - Mostly relevant with minor issues
5 - Highly relevant and on-topic

Provide your rating as JSON: {{"score": <1-5>, "reasoning": "<reasoning>"}}""",
            EvalCriterion.COHERENCE: """Rate the coherence of the response on a scale of 1-5.

Response: {prediction}

Scoring:
1 - Incoherent, impossible to follow
2 - Mostly incoherent with some clear parts
3 - Somewhat coherent but disorganized
4 - Mostly coherent with minor issues
5 - Highly coherent and well-organized

Provide your rating as JSON: {{"score": <1-5>, "reasoning": "<reasoning>"}}""",
            EvalCriterion.FACTUALITY: """Rate the factual accuracy of the response on a scale of 1-5.

Reference: {reference}
Response: {prediction}

Scoring:
1 - Completely inaccurate
2 - Mostly inaccurate with some correct facts
3 - Mix of accurate and inaccurate information
4 - Mostly accurate with minor errors
5 - Completely accurate

Provide your rating as JSON: {{"score": <1-5>, "reasoning": "<reasoning>"}}""",
            EvalCriterion.HELPFULNESS: """Rate how helpful the response is on a scale of 1-5.

Query: {context}
Response: {prediction}

Scoring:
1 - Not helpful at all
2 - Minimally helpful
3 - Somewhat helpful
4 - Very helpful
5 - Extremely helpful and comprehensive

Provide your rating as JSON: {{"score": <1-5>, "reasoning": "<reasoning>"}}"""
        }

    async def evaluate(
        self,
        prediction: str,
        reference: Optional[str] = None,
        context: Optional[str] = None,
        criterion: EvalCriterion = EvalCriterion.RELEVANCE
    ) -> JudgeResult:
        """Evaluate using single-point scoring."""
        prompt_template = self.prompts.get(criterion, self.prompts[EvalCriterion.RELEVANCE])
        prompt = prompt_template.format(
            prediction=prediction,
            reference=reference or "",
            context=context or ""
        )
        response = await self.llm.complete(prompt)
        try:
            result = json.loads(response.content)
            return JudgeResult(
                criterion=criterion,
                score=float(result["score"]),
                reasoning=result.get("reasoning", "")
            )
        except Exception:
            # Fall back to a neutral score when the judge output cannot be parsed
            return JudgeResult(
                criterion=criterion,
                score=3.0,
                reasoning="Failed to parse response",
                confidence=0.5
            )
class PairwiseJudge(LLMJudge):
    """Pairwise comparison judge."""

    def __init__(self, llm_client: Any):
        self.llm = llm_client

    async def compare(
        self,
        response_a: str,
        response_b: str,
        context: str,
        criterion: EvalCriterion = EvalCriterion.HELPFULNESS
    ) -> dict:
        """Compare two responses."""
        prompt = f"""Compare these two responses and determine which is better for {criterion.value}.

Query: {context}

Response A:
{response_a}

Response B:
{response_b}

Which response is better? Provide your answer as JSON:
{{"winner": "A" or "B" or "tie", "reasoning": "<reasoning>", "confidence": <0-1>}}"""
        response = await self.llm.complete(prompt)
        try:
            return json.loads(response.content)
        except Exception:
            return {"winner": "tie", "reasoning": "Parse error", "confidence": 0.5}

    async def evaluate(
        self,
        prediction: str,
        reference: Optional[str] = None,
        context: Optional[str] = None,
        criterion: EvalCriterion = EvalCriterion.RELEVANCE
    ) -> JudgeResult:
        """Evaluate by comparing the prediction (A) to the reference (B)."""
        if not reference:
            raise ValueError("Pairwise judge requires a reference")
        result = await self.compare(prediction, reference, context or "", criterion)
        # Convert the verdict to a 1-5 style score
        if result["winner"] == "A":
            score = 4.0
        elif result["winner"] == "B":
            score = 2.0
        else:
            score = 3.0
        return JudgeResult(
            criterion=criterion,
            score=score,
            reasoning=result.get("reasoning", ""),
            confidence=result.get("confidence", 0.8)
        )
class RubricJudge(LLMJudge):
    """Judge using a detailed rubric."""

    def __init__(self, llm_client: Any, rubric: Optional[dict[str, str]] = None):
        self.llm = llm_client
        self.rubric = rubric or {}

    def set_rubric(self, criterion: EvalCriterion, rubric: str):
        """Set rubric for a criterion."""
        self.rubric[criterion.value] = rubric

    async def evaluate(
        self,
        prediction: str,
        reference: Optional[str] = None,
        context: Optional[str] = None,
        criterion: EvalCriterion = EvalCriterion.RELEVANCE
    ) -> JudgeResult:
        """Evaluate using the rubric."""
        rubric = self.rubric.get(criterion.value, "Rate from 1-5 based on quality.")
        prompt = f"""Evaluate the response using this rubric:

{rubric}

Context: {context or "N/A"}
Reference: {reference or "N/A"}
Response: {prediction}

Provide your evaluation as JSON:
{{"score": <1-5>, "reasoning": "<reasoning>"}}"""
        response = await self.llm.complete(prompt)
        try:
            result = json.loads(response.content)
            return JudgeResult(
                criterion=criterion,
                score=float(result["score"]),
                reasoning=result.get("reasoning", "")
            )
        except Exception:
            return JudgeResult(
                criterion=criterion,
                score=3.0,
                reasoning="Parse error",
                confidence=0.5
            )
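
A sketch of wiring a judge to an LLM client. The concrete client is not defined in this guide; any async client exposing a `complete(prompt)` coroutine that returns an object with a `.content` string (as the judge classes above expect) will work:

import asyncio

# Hypothetical wiring: supply your own async LLM client with
# await client.complete(prompt) -> object exposing .content
async def judge_example(llm_client):
    judge = SinglePointJudge(llm_client)
    result = await judge.evaluate(
        prediction="Paris is the capital of France.",
        context="What is the capital of France?",
        criterion=EvalCriterion.RELEVANCE
    )
    print(result.score, result.reasoning)

# asyncio.run(judge_example(llm_client))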
Task-Specific Metrics
from dataclasses import dataclass
from typing import Any, Optional
import re


class SummarizationMetrics:
    """Metrics for summarization tasks."""

    def __init__(self):
        self.rouge = ROUGEScore("rouge-l")
        self.embedding = EmbeddingSimilarity()

    def compute(self, summary: str, source: str, reference: Optional[str] = None) -> dict:
        """Compute summarization metrics."""
        results = {}

        # Compression ratio
        results["compression_ratio"] = len(summary) / max(len(source), 1)

        # ROUGE against reference
        if reference:
            rouge_result = self.rouge.compute(summary, reference)
            results["rouge_l"] = rouge_result.score

        # Semantic similarity to source
        embed_result = self.embedding.compute(summary, source)
        results["source_similarity"] = embed_result.score

        # Coverage: what fraction of source sentences are represented
        source_sentences = self._split_sentences(source)
        summary_sentences = self._split_sentences(summary)
        covered = 0
        for src_sent in source_sentences:
            for sum_sent in summary_sentences:
                sim = self._sentence_similarity(src_sent, sum_sent)
                if sim > 0.5:
                    covered += 1
                    break
        results["coverage"] = covered / max(len(source_sentences), 1)

        return results

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences, dropping empty fragments."""
        return [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]

    def _sentence_similarity(self, sent1: str, sent2: str) -> float:
        """Quick Jaccard similarity over word sets."""
        words1 = set(sent1.lower().split())
        words2 = set(sent2.lower().split())
        if not words1 or not words2:
            return 0.0
        return len(words1 & words2) / len(words1 | words2)
class QAMetrics:
    """Metrics for question answering."""

    def __init__(self):
        self.exact_match = ExactMatch()

    def compute(
        self,
        prediction: str,
        reference: str,
        question: Optional[str] = None
    ) -> dict:
        """Compute QA metrics."""
        results = {}

        # Exact match
        em_result = self.exact_match.compute(prediction, reference)
        results["exact_match"] = em_result.score

        # F1 score (token overlap)
        pred_tokens = set(prediction.lower().split())
        ref_tokens = set(reference.lower().split())
        if pred_tokens and ref_tokens:
            precision = len(pred_tokens & ref_tokens) / len(pred_tokens)
            recall = len(pred_tokens & ref_tokens) / len(ref_tokens)
            f1 = 2 * precision * recall / max(precision + recall, 1e-8)
        else:
            f1 = 0.0
        results["f1"] = f1

        # Contains answer
        results["contains_answer"] = reference.lower() in prediction.lower()

        return results
class CodeMetrics:
    """Metrics for code generation."""

    def compute(
        self,
        generated_code: str,
        reference_code: Optional[str] = None,
        test_cases: Optional[list[dict]] = None
    ) -> dict:
        """Compute code metrics."""
        results = {}

        # Syntax validity
        results["syntax_valid"] = self._check_syntax(generated_code)

        # Code similarity (if reference provided)
        if reference_code:
            results["code_bleu"] = self._code_bleu(generated_code, reference_code)

        # Functional correctness (if test cases provided)
        if test_cases:
            passed, total = self._run_tests(generated_code, test_cases)
            results["pass_rate"] = passed / max(total, 1)
            results["tests_passed"] = passed
            results["tests_total"] = total

        return results

    def _check_syntax(self, code: str) -> bool:
        """Check Python syntax validity."""
        try:
            compile(code, "<string>", "exec")
            return True
        except SyntaxError:
            return False

    def _code_bleu(self, generated: str, reference: str) -> float:
        """Compute a CodeBLEU-like score."""
        # Tokenize code
        gen_tokens = self._tokenize_code(generated)
        ref_tokens = self._tokenize_code(reference)
        # N-gram overlap
        bleu = BLEUScore(max_n=4)
        result = bleu.compute(" ".join(gen_tokens), " ".join(ref_tokens))
        return result.score

    def _tokenize_code(self, code: str) -> list[str]:
        """Tokenize code into identifiers and punctuation."""
        # Simple tokenization
        return re.findall(r'\w+|[^\w\s]', code)

    def _run_tests(
        self,
        code: str,
        test_cases: list[dict]
    ) -> tuple[int, int]:
        """Run test cases against code.

        Note: exec on untrusted model output should be sandboxed
        (subprocess, container, or restricted runtime) in production.
        """
        passed = 0
        for test in test_cases:
            try:
                # Create isolated namespace
                namespace = {}
                exec(code, namespace)
                # Run test
                func_name = test.get("function", "solution")
                func = namespace.get(func_name)
                if func:
                    result = func(*test.get("inputs", []))
                    if result == test.get("expected"):
                        passed += 1
            except Exception:
                pass
        return passed, len(test_cases)
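
For example, CodeMetrics can be exercised with a toy test case in the dict format that _run_tests above expects:

# Illustrative test-case format assumed by _run_tests above.
code_metrics = CodeMetrics()
results = code_metrics.compute(
    generated_code="def solution(a, b):\n    return a + b",
    test_cases=[{"function": "solution", "inputs": [2, 3], "expected": 5}]
)
print(results)  # {'syntax_valid': True, 'pass_rate': 1.0, ...}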
Evaluation Pipeline
from dataclasses import dataclass, field
from typing import Any, Optional
import asyncio


@dataclass
class EvalSample:
    """Single evaluation sample."""
    id: str
    prediction: str
    reference: Optional[str] = None
    context: Optional[str] = None
    metadata: dict = field(default_factory=dict)


@dataclass
class EvalReport:
    """Evaluation report."""
    metrics: dict[str, float]
    samples: list[dict]
    summary: str = ""


class EvaluationPipeline:
    """Complete evaluation pipeline."""

    def __init__(
        self,
        text_metrics: Optional[list[TextMetric]] = None,
        llm_judge: Optional[LLMJudge] = None,
        task_metrics: Any = None
    ):
        self.text_metrics = text_metrics or [
            BLEUScore(),
            ROUGEScore("rouge-l"),
            ExactMatch()
        ]
        self.llm_judge = llm_judge
        self.task_metrics = task_metrics

    async def evaluate_sample(self, sample: EvalSample) -> dict:
        """Evaluate a single sample."""
        results = {"id": sample.id}

        # Text metrics (require a reference)
        for metric in self.text_metrics:
            if sample.reference:
                result = metric.compute(sample.prediction, sample.reference)
                results[result.name] = result.score

        # LLM judge
        if self.llm_judge:
            for criterion in [EvalCriterion.RELEVANCE, EvalCriterion.COHERENCE]:
                judge_result = await self.llm_judge.evaluate(
                    sample.prediction,
                    sample.reference,
                    sample.context,
                    criterion
                )
                results[f"judge_{criterion.value}"] = judge_result.score

        # Task-specific metrics; expected to expose compute(prediction, reference, context)
        if self.task_metrics:
            task_results = self.task_metrics.compute(
                sample.prediction,
                sample.reference,
                sample.context
            )
            results.update(task_results)

        return results

    async def evaluate_batch(
        self,
        samples: list[EvalSample],
        max_concurrent: int = 10
    ) -> EvalReport:
        """Evaluate a batch of samples."""
        semaphore = asyncio.Semaphore(max_concurrent)

        async def eval_with_limit(sample):
            async with semaphore:
                return await self.evaluate_sample(sample)

        tasks = [eval_with_limit(s) for s in samples]
        sample_results = await asyncio.gather(*tasks)

        # Aggregate metrics across samples
        metric_names = set()
        for result in sample_results:
            metric_names.update(k for k in result.keys() if k != "id")

        aggregated = {}
        for name in metric_names:
            values = [r.get(name) for r in sample_results if r.get(name) is not None]
            if values:
                aggregated[name] = sum(values) / len(values)

        return EvalReport(
            metrics=aggregated,
            samples=sample_results,
            summary=self._generate_summary(aggregated)
        )

    def _generate_summary(self, metrics: dict) -> str:
        """Generate a human-readable summary."""
        lines = ["Evaluation Summary:", "=" * 40]
        for name, value in sorted(metrics.items()):
            lines.append(f"{name}: {value:.4f}")
        return "\n".join(lines)
class ABTestEvaluator:
    """Compare two models via A/B testing."""

    def __init__(self, judge: PairwiseJudge):
        self.judge = judge

    async def compare_models(
        self,
        samples: list[dict],  # {"context": ..., "model_a": ..., "model_b": ...}
        criteria: Optional[list[EvalCriterion]] = None
    ) -> dict:
        """Compare two models across samples."""
        criteria = criteria or [EvalCriterion.HELPFULNESS, EvalCriterion.RELEVANCE]
        results = {
            "model_a_wins": 0,
            "model_b_wins": 0,
            "ties": 0,
            "by_criterion": {}
        }
        for criterion in criteria:
            results["by_criterion"][criterion.value] = {
                "model_a_wins": 0,
                "model_b_wins": 0,
                "ties": 0
            }

        for sample in samples:
            for criterion in criteria:
                comparison = await self.judge.compare(
                    sample["model_a"],
                    sample["model_b"],
                    sample["context"],
                    criterion
                )
                winner = comparison.get("winner", "tie")
                if winner == "A":
                    results["model_a_wins"] += 1
                    results["by_criterion"][criterion.value]["model_a_wins"] += 1
                elif winner == "B":
                    results["model_b_wins"] += 1
                    results["by_criterion"][criterion.value]["model_b_wins"] += 1
                else:
                    results["ties"] += 1
                    results["by_criterion"][criterion.value]["ties"] += 1

        # Calculate win rates
        total = results["model_a_wins"] + results["model_b_wins"] + results["ties"]
        results["model_a_win_rate"] = results["model_a_wins"] / max(total, 1)
        results["model_b_win_rate"] = results["model_b_wins"] / max(total, 1)
        return results
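
An A/B comparison over a small prompt set is a thin wrapper around the same idea; as with the judge sketch earlier, the async llm_client is assumed, not provided here:

# Hypothetical wiring: PairwiseJudge needs an async LLM client (see the judge sketch above).
async def run_ab_test(llm_client):
    evaluator = ABTestEvaluator(PairwiseJudge(llm_client))
    samples = [{
        "context": "Explain what ROUGE-L measures.",
        "model_a": "ROUGE-L scores longest-common-subsequence overlap with a reference.",
        "model_b": "It is a metric."
    }]
    results = await evaluator.compare_models(samples)
    print(results["model_a_win_rate"], results["model_b_win_rate"])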
Production Evaluation Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional

app = FastAPI()


class TextMetricRequest(BaseModel):
    prediction: str
    reference: str
    metrics: list[str] = ["bleu", "rouge-l", "exact_match"]


class JudgeRequest(BaseModel):
    prediction: str
    reference: Optional[str] = None
    context: Optional[str] = None
    criterion: str = "relevance"


class BatchEvalRequest(BaseModel):
    samples: list[dict]
    metrics: list[str] = ["bleu", "rouge-l"]
    use_judge: bool = False


# Initialize metrics
bleu = BLEUScore()
rouge = ROUGEScore("rouge-l")
exact = ExactMatch()

metric_map = {
    "bleu": bleu,
    "rouge-l": rouge,
    "rouge-1": ROUGEScore("rouge-1"),
    "rouge-2": ROUGEScore("rouge-2"),
    "exact_match": exact
}


@app.post("/v1/metrics/text")
async def compute_text_metrics(request: TextMetricRequest) -> dict:
    """Compute text similarity metrics."""
    results = {}
    for metric_name in request.metrics:
        metric = metric_map.get(metric_name)
        if metric:
            result = metric.compute(request.prediction, request.reference)
            results[metric_name] = {
                "score": result.score,
                "details": result.details
            }
    return {"metrics": results}


@app.post("/v1/metrics/semantic")
async def compute_semantic_metrics(
    prediction: str,
    reference: str
) -> dict:
    """Compute semantic similarity metrics."""
    embedding = EmbeddingSimilarity()
    result = embedding.compute(prediction, reference)
    return {
        "embedding_similarity": result.score
    }


@app.post("/v1/judge")
async def llm_judge_evaluate(request: JudgeRequest) -> dict:
    """Evaluate using an LLM judge."""
    # Note: in production, inject an LLM client, e.g.:
    # judge = SinglePointJudge(llm_client)
    # Placeholder response
    return {
        "criterion": request.criterion,
        "score": 4.0,
        "reasoning": "Evaluation placeholder - configure LLM client"
    }


@app.post("/v1/evaluate/batch")
async def batch_evaluate(request: BatchEvalRequest) -> dict:
    """Batch evaluation."""
    results = []
    for sample in request.samples:
        sample_results = {"id": sample.get("id", "unknown")}
        prediction = sample.get("prediction", "")
        reference = sample.get("reference", "")
        for metric_name in request.metrics:
            metric = metric_map.get(metric_name)
            if metric and reference:
                result = metric.compute(prediction, reference)
                sample_results[metric_name] = result.score
        results.append(sample_results)

    # Aggregate
    aggregated = {}
    for metric_name in request.metrics:
        values = [r.get(metric_name) for r in results if r.get(metric_name) is not None]
        if values:
            aggregated[metric_name] = sum(values) / len(values)

    return {
        "samples": results,
        "aggregated": aggregated
    }


@app.get("/health")
async def health():
    return {"status": "healthy"}
References
- BLEU Score: https://aclanthology.org/P02-1040/
- ROUGE: https://aclanthology.org/W04-1013/
- BERTScore: https://arxiv.org/abs/1904.09675
- LLM-as-Judge: https://arxiv.org/abs/2306.05685
- Hugging Face Evaluate: https://huggingface.co/docs/evaluate
Conclusion
LLM evaluation requires a multi-faceted approach because no single metric captures everything that matters. Start with fast, automated metrics like BLEU and ROUGE for quick iteration—they’re not perfect but they catch obvious regressions. Add semantic metrics like BERTScore and embedding similarity to capture meaning beyond surface-level word overlap. For nuanced quality assessment, use LLM-as-judge approaches with clear rubrics and multiple criteria. Pairwise comparison is particularly useful for A/B testing model changes.

Task-specific metrics matter: summarization needs coverage and compression ratio, QA needs exact match and F1, and code generation needs functional correctness via test execution.

Build evaluation into your development workflow—run automated metrics on every change, use LLM judges for periodic deep evaluation, and reserve human evaluation for high-stakes decisions. Track metrics over time to catch gradual degradation.

Remember that evaluation is not just about measuring quality—it’s about understanding failure modes and guiding improvement. The best evaluation systems don’t just score outputs; they provide actionable insights about what’s working and what needs attention.
