Introduction
Retrieval evaluation is the foundation of building effective RAG systems and search applications. Without proper metrics you’re flying blind: unable to tell whether your retrieval improvements actually help or hurt the end-user experience. This guide covers the essential metrics for evaluating retrieval systems: precision and recall at various cutoffs, Mean Reciprocal Rank (MRR), Normalized Discounted Cumulative Gain (NDCG), Mean Average Precision (MAP), and newer LLM-based evaluation approaches. Understanding these metrics helps you make informed decisions about embedding models, chunking strategies, reranking approaches, and hybrid search configurations. Whether you’re building a document search system or optimizing RAG retrieval, these evaluation patterns will help you measure what matters and improve systematically.

Core Retrieval Metrics
from dataclasses import dataclass, field
from typing import List, Set, Dict, Any
import numpy as np
@dataclass
class RetrievalResult:
"""A single retrieval result."""
doc_id: str
score: float
rank: int
content: str = ""
metadata: dict = field(default_factory=dict)
@dataclass
class EvaluationQuery:
"""A query with ground truth relevance."""
query_id: str
query_text: str
relevant_docs: set[str] # Set of relevant doc IDs
relevance_scores: dict[str, int] | None = None # Optional graded relevance
class PrecisionRecall:
"""Calculate precision and recall metrics."""
@staticmethod
def precision_at_k(
retrieved: list[str],
relevant: set[str],
k: int
) -> float:
"""Precision at rank k."""
if k <= 0:
return 0.0
retrieved_at_k = retrieved[:k]
relevant_retrieved = sum(1 for doc in retrieved_at_k if doc in relevant)
return relevant_retrieved / k
@staticmethod
def recall_at_k(
retrieved: list[str],
relevant: set[str],
k: int
) -> float:
"""Recall at rank k."""
if not relevant:
return 0.0
retrieved_at_k = set(retrieved[:k])
relevant_retrieved = len(retrieved_at_k & relevant)
return relevant_retrieved / len(relevant)
@staticmethod
def f1_at_k(
retrieved: list[str],
relevant: set[str],
k: int
) -> float:
"""F1 score at rank k."""
precision = PrecisionRecall.precision_at_k(retrieved, relevant, k)
recall = PrecisionRecall.recall_at_k(retrieved, relevant, k)
if precision + recall == 0:
return 0.0
return 2 * precision * recall / (precision + recall)
@staticmethod
def r_precision(
retrieved: list[str],
relevant: set[str]
) -> float:
"""R-Precision: precision at R, where R is number of relevant docs."""
r = len(relevant)
return PrecisionRecall.precision_at_k(retrieved, relevant, r)
class MRR:
"""Mean Reciprocal Rank calculator."""
@staticmethod
def reciprocal_rank(
retrieved: list[str],
relevant: set[str]
) -> float:
"""Reciprocal rank for a single query."""
for i, doc in enumerate(retrieved):
if doc in relevant:
return 1.0 / (i + 1)
return 0.0
@staticmethod
def mean_reciprocal_rank(
results: list[tuple[list[str], set[str]]]
) -> float:
"""MRR across multiple queries."""
if not results:
return 0.0
rr_sum = sum(
MRR.reciprocal_rank(retrieved, relevant)
for retrieved, relevant in results
)
return rr_sum / len(results)
class MAP:
"""Mean Average Precision calculator."""
@staticmethod
def average_precision(
retrieved: list[str],
relevant: set[str]
) -> float:
"""Average precision for a single query."""
if not relevant:
return 0.0
precision_sum = 0.0
relevant_count = 0
for i, doc in enumerate(retrieved):
if doc in relevant:
relevant_count += 1
precision_at_i = relevant_count / (i + 1)
precision_sum += precision_at_i
return precision_sum / len(relevant)
@staticmethod
def mean_average_precision(
results: list[tuple[list[str], set[str]]]
) -> float:
"""MAP across multiple queries."""
if not results:
return 0.0
ap_sum = sum(
MAP.average_precision(retrieved, relevant)
for retrieved, relevant in results
)
return ap_sum / len(results)
class NDCG:
"""Normalized Discounted Cumulative Gain calculator."""
@staticmethod
def dcg_at_k(
relevance_scores: list[float],
k: int
) -> float:
"""Discounted Cumulative Gain at k."""
relevance_scores = relevance_scores[:k]
dcg = 0.0
for i, rel in enumerate(relevance_scores):
# Using log2(i + 2) to handle i=0
dcg += (2 ** rel - 1) / np.log2(i + 2)
return dcg
@staticmethod
def ndcg_at_k(
retrieved: list[str],
relevance_scores: dict[str, float],
k: int
) -> float:
"""NDCG at k."""
# Get relevance scores for retrieved docs
retrieved_relevance = [
relevance_scores.get(doc, 0.0)
for doc in retrieved[:k]
]
# Calculate DCG
dcg = NDCG.dcg_at_k(retrieved_relevance, k)
# Calculate ideal DCG
ideal_relevance = sorted(relevance_scores.values(), reverse=True)[:k]
idcg = NDCG.dcg_at_k(ideal_relevance, k)
if idcg == 0:
return 0.0
return dcg / idcg
@staticmethod
def mean_ndcg_at_k(
results: list[tuple[list[str], dict[str, float]]],
k: int
) -> float:
"""Mean NDCG at k across queries."""
if not results:
return 0.0
ndcg_sum = sum(
NDCG.ndcg_at_k(retrieved, relevance, k)
for retrieved, relevance in results
)
return ndcg_sum / len(results)
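Before composing these calculators into a full evaluator, a quick sanity check on a toy ranking helps confirm the definitions behave as expected. The doc IDs and relevance grades below are invented purely for illustration:

# One query: five retrieved doc IDs, two of which are relevant (d1 and d4).
retrieved = ["d3", "d7", "d1", "d9", "d4"]
relevant = {"d1", "d4"}
graded = {"d1": 3.0, "d4": 1.0}  # graded relevance, used only by NDCG

print(PrecisionRecall.precision_at_k(retrieved, relevant, k=3))  # 0.333: one hit in the top 3
print(PrecisionRecall.recall_at_k(retrieved, relevant, k=3))     # 0.5: one of two relevant docs found
print(MRR.reciprocal_rank(retrieved, relevant))                  # 0.333: first hit at rank 3
print(MAP.average_precision(retrieved, relevant))                # (1/3 + 2/5) / 2 ≈ 0.367
print(NDCG.ndcg_at_k(retrieved, graded, k=5))                    # DCG of this ranking divided by the ideal DCG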
Comprehensive Evaluator
from dataclasses import dataclass, field
from typing import List, Dict, Any, Callable
import numpy as np
@dataclass
class EvaluationResult:
"""Complete evaluation results."""
query_id: str
precision_at_k: dict[int, float]
recall_at_k: dict[int, float]
f1_at_k: dict[int, float]
mrr: float
map_score: float
ndcg_at_k: dict[int, float]
hit_rate_at_k: dict[int, float]
@dataclass
class AggregatedResults:
"""Aggregated results across all queries."""
num_queries: int
mean_precision_at_k: dict[int, float]
mean_recall_at_k: dict[int, float]
mean_f1_at_k: dict[int, float]
mrr: float
map_score: float
mean_ndcg_at_k: dict[int, float]
mean_hit_rate_at_k: dict[int, float]
class RetrievalEvaluator:
"""Comprehensive retrieval evaluation."""
def __init__(self, k_values: list[int] | None = None):
self.k_values = k_values or [1, 3, 5, 10, 20]
def evaluate_query(
self,
query: EvaluationQuery,
retrieved: list[str]
) -> EvaluationResult:
"""Evaluate a single query."""
relevant = query.relevant_docs
relevance_scores = query.relevance_scores or {
doc: 1 for doc in relevant
}
# Calculate metrics at each k
precision_at_k = {}
recall_at_k = {}
f1_at_k = {}
ndcg_at_k = {}
hit_rate_at_k = {}
for k in self.k_values:
precision_at_k[k] = PrecisionRecall.precision_at_k(retrieved, relevant, k)
recall_at_k[k] = PrecisionRecall.recall_at_k(retrieved, relevant, k)
f1_at_k[k] = PrecisionRecall.f1_at_k(retrieved, relevant, k)
ndcg_at_k[k] = NDCG.ndcg_at_k(retrieved, relevance_scores, k)
# Hit rate: 1 if any relevant doc in top k
hit_rate_at_k[k] = 1.0 if any(
doc in relevant for doc in retrieved[:k]
) else 0.0
# Calculate MRR and MAP
mrr = MRR.reciprocal_rank(retrieved, relevant)
map_score = MAP.average_precision(retrieved, relevant)
return EvaluationResult(
query_id=query.query_id,
precision_at_k=precision_at_k,
recall_at_k=recall_at_k,
f1_at_k=f1_at_k,
mrr=mrr,
map_score=map_score,
ndcg_at_k=ndcg_at_k,
hit_rate_at_k=hit_rate_at_k
)
def evaluate_all(
self,
queries: list[EvaluationQuery],
retriever: Callable[[str], list[str]]
) -> AggregatedResults:
"""Evaluate all queries and aggregate."""
results = []
for query in queries:
retrieved = retriever(query.query_text)
result = self.evaluate_query(query, retrieved)
results.append(result)
return self._aggregate_results(results)
def _aggregate_results(
self,
results: list[EvaluationResult]
) -> AggregatedResults:
"""Aggregate results across queries."""
n = len(results)
if n == 0:
return AggregatedResults(
num_queries=0,
mean_precision_at_k={},
mean_recall_at_k={},
mean_f1_at_k={},
mrr=0.0,
map_score=0.0,
mean_ndcg_at_k={},
mean_hit_rate_at_k={}
)
# Aggregate each metric
mean_precision = {
k: np.mean([r.precision_at_k[k] for r in results])
for k in self.k_values
}
mean_recall = {
k: np.mean([r.recall_at_k[k] for r in results])
for k in self.k_values
}
mean_f1 = {
k: np.mean([r.f1_at_k[k] for r in results])
for k in self.k_values
}
mean_ndcg = {
k: np.mean([r.ndcg_at_k[k] for r in results])
for k in self.k_values
}
mean_hit_rate = {
k: np.mean([r.hit_rate_at_k[k] for r in results])
for k in self.k_values
}
mrr = np.mean([r.mrr for r in results])
map_score = np.mean([r.map_score for r in results])
return AggregatedResults(
num_queries=n,
mean_precision_at_k=mean_precision,
mean_recall_at_k=mean_recall,
mean_f1_at_k=mean_f1,
mrr=mrr,
map_score=map_score,
mean_ndcg_at_k=mean_ndcg,
mean_hit_rate_at_k=mean_hit_rate
)
def compare_retrievers(
self,
queries: list[EvaluationQuery],
retrievers: dict[str, Callable[[str], list[str]]]
) -> dict[str, AggregatedResults]:
"""Compare multiple retrievers."""
results = {}
for name, retriever in retrievers.items():
results[name] = self.evaluate_all(queries, retriever)
return results
def format_results(self, results: AggregatedResults) -> str:
"""Format results as readable string."""
lines = [
f"Evaluation Results ({results.num_queries} queries)",
"=" * 50,
f"MRR: {results.mrr:.4f}",
f"MAP: {results.map_score:.4f}",
"",
"Precision@k:",
]
for k in sorted(results.mean_precision_at_k.keys()):
lines.append(f" P@{k}: {results.mean_precision_at_k[k]:.4f}")
lines.append("\nRecall@k:")
for k in sorted(results.mean_recall_at_k.keys()):
lines.append(f" R@{k}: {results.mean_recall_at_k[k]:.4f}")
lines.append("\nNDCG@k:")
for k in sorted(results.mean_ndcg_at_k.keys()):
lines.append(f" NDCG@{k}: {results.mean_ndcg_at_k[k]:.4f}")
lines.append("\nHit Rate@k:")
for k in sorted(results.mean_hit_rate_at_k.keys()):
lines.append(f" HR@{k}: {results.mean_hit_rate_at_k[k]:.4f}")
return "\n".join(lines)
LLM-Based Evaluation
from dataclasses import dataclass
from typing import Any, Optional, List
from enum import Enum
class RelevanceLevel(Enum):
"""Relevance levels for LLM judgment."""
NOT_RELEVANT = 0
PARTIALLY_RELEVANT = 1
RELEVANT = 2
HIGHLY_RELEVANT = 3
@dataclass
class LLMJudgment:
"""LLM relevance judgment."""
query: str
document: str
relevance: RelevanceLevel
reasoning: str
confidence: float
class LLMRelevanceJudge:
"""Use LLM to judge document relevance."""
def __init__(self, llm_client: Any):
self.llm = llm_client
self.prompt_template = """You are evaluating the relevance of a document to a query.
Query: {query}
Document: {document}
Rate the relevance on this scale:
0 - NOT_RELEVANT: Document has no useful information for the query
1 - PARTIALLY_RELEVANT: Document has some related information but doesn't directly answer
2 - RELEVANT: Document contains information that helps answer the query
3 - HIGHLY_RELEVANT: Document directly and completely addresses the query
Respond in this format:
RELEVANCE: [0-3]
REASONING: [Brief explanation]
CONFIDENCE: [0.0-1.0]"""
async def judge(self, query: str, document: str) -> LLMJudgment:
"""Judge relevance of document to query."""
prompt = self.prompt_template.format(
query=query,
document=document[:2000] # Truncate long docs
)
response = await self.llm.generate(prompt)
return self._parse_response(query, document, response)
def _parse_response(
self,
query: str,
document: str,
response: str
) -> LLMJudgment:
"""Parse LLM response."""
lines = response.strip().split("\n")
relevance = RelevanceLevel.NOT_RELEVANT
reasoning = ""
confidence = 0.5
for line in lines:
if line.startswith("RELEVANCE:"):
try:
score = int(line.split(":")[1].strip())
relevance = RelevanceLevel(score)
except (ValueError, IndexError):
pass
elif line.startswith("REASONING:"):
reasoning = line.split(":", 1)[1].strip()
elif line.startswith("CONFIDENCE:"):
try:
confidence = float(line.split(":")[1].strip())
except (ValueError, IndexError):
pass
return LLMJudgment(
query=query,
document=document,
relevance=relevance,
reasoning=reasoning,
confidence=confidence
)
async def judge_batch(
self,
query: str,
documents: list[str]
) -> list[LLMJudgment]:
"""Judge multiple documents."""
import asyncio
tasks = [self.judge(query, doc) for doc in documents]
return await asyncio.gather(*tasks)
class RAGASEvaluator:
"""RAGAS-style evaluation metrics."""
def __init__(self, llm_client: Any):
self.llm = llm_client
async def context_relevance(
self,
query: str,
contexts: list[str]
) -> float:
"""Evaluate how relevant retrieved contexts are to query."""
prompt = f"""Given the query and retrieved contexts, rate how relevant the contexts are.
Query: {query}
Contexts:
{self._format_contexts(contexts)}
Rate the overall relevance from 0.0 to 1.0, where:
- 0.0: Contexts are completely irrelevant
- 0.5: Contexts are somewhat relevant
- 1.0: Contexts are highly relevant and sufficient
Respond with just a number between 0.0 and 1.0."""
response = await self.llm.generate(prompt)
try:
return float(response.strip())
except ValueError:
return 0.5
async def answer_relevance(
self,
query: str,
answer: str
) -> float:
"""Evaluate how well answer addresses the query."""
prompt = f"""Given the query and answer, rate how well the answer addresses the query.
Query: {query}
Answer: {answer}
Rate from 0.0 to 1.0, where:
- 0.0: Answer doesn't address the query at all
- 0.5: Answer partially addresses the query
- 1.0: Answer fully and directly addresses the query
Respond with just a number between 0.0 and 1.0."""
response = await self.llm.generate(prompt)
try:
return float(response.strip())
except ValueError:
return 0.5
async def faithfulness(
self,
answer: str,
contexts: list[str]
) -> float:
"""Evaluate if answer is faithful to contexts (no hallucination)."""
prompt = f"""Given the answer and source contexts, rate how faithful the answer is to the contexts.
Answer: {answer}
Source Contexts:
{self._format_contexts(contexts)}
Rate from 0.0 to 1.0, where:
- 0.0: Answer contains information not in contexts (hallucination)
- 0.5: Answer is partially supported by contexts
- 1.0: Answer is fully supported by the contexts
Respond with just a number between 0.0 and 1.0."""
response = await self.llm.generate(prompt)
try:
return float(response.strip())
except ValueError:
return 0.5
async def context_precision(
self,
query: str,
contexts: list[str],
ground_truth: str
) -> float:
"""Evaluate precision of retrieved contexts."""
prompt = f"""Given the query, retrieved contexts, and ground truth answer, evaluate context precision.
Query: {query}
Retrieved Contexts:
{self._format_contexts(contexts)}
Ground Truth Answer: {ground_truth}
Rate from 0.0 to 1.0 how many of the retrieved contexts are actually needed to answer the query.
- 0.0: None of the contexts are useful
- 0.5: About half the contexts are useful
- 1.0: All contexts are useful and relevant
Respond with just a number between 0.0 and 1.0."""
response = await self.llm.generate(prompt)
try:
return float(response.strip())
except ValueError:
return 0.5
async def context_recall(
self,
contexts: list[str],
ground_truth: str
) -> float:
"""Evaluate if contexts contain all info needed for ground truth."""
prompt = f"""Given the retrieved contexts and ground truth answer, evaluate context recall.
Retrieved Contexts:
{self._format_contexts(contexts)}
Ground Truth Answer: {ground_truth}
Rate from 0.0 to 1.0 how much of the ground truth can be derived from the contexts.
- 0.0: Contexts don't support the ground truth at all
- 0.5: Contexts partially support the ground truth
- 1.0: Contexts fully support deriving the ground truth
Respond with just a number between 0.0 and 1.0."""
response = await self.llm.generate(prompt)
try:
return float(response.strip())
except ValueError:
return 0.5
def _format_contexts(self, contexts: list[str]) -> str:
"""Format contexts for prompt."""
formatted = []
for i, ctx in enumerate(contexts, 1):
formatted.append(f"[{i}] {ctx[:500]}...")
return "\n\n".join(formatted)
Benchmark Dataset
from dataclasses import dataclass, field
from typing import List, Dict, Any
import json
import random
@dataclass
class BenchmarkQuery:
"""A benchmark query with metadata."""
query_id: str
query_text: str
relevant_docs: list[str]
relevance_grades: dict[str, int] | None = None
category: str = ""
difficulty: str = ""
@dataclass
class BenchmarkDataset:
"""A retrieval benchmark dataset."""
name: str
queries: list[BenchmarkQuery]
documents: dict[str, str]
metadata: dict = field(default_factory=dict)
class DatasetBuilder:
"""Build evaluation datasets."""
def __init__(self):
self.queries = []
self.documents = {}
def add_query(
self,
query_text: str,
relevant_docs: list[str],
relevance_grades: dict[str, int] | None = None,
category: str = "",
difficulty: str = ""
) -> str:
"""Add a query to the dataset."""
query_id = f"q_{len(self.queries)}"
self.queries.append(BenchmarkQuery(
query_id=query_id,
query_text=query_text,
relevant_docs=relevant_docs,
relevance_grades=relevance_grades,
category=category,
difficulty=difficulty
))
return query_id
def add_document(self, doc_id: str, content: str):
"""Add a document to the dataset."""
self.documents[doc_id] = content
def build(self, name: str) -> BenchmarkDataset:
"""Build the dataset."""
return BenchmarkDataset(
name=name,
queries=self.queries,
documents=self.documents
)
def save(self, path: str):
"""Save dataset to file."""
data = {
"queries": [
{
"query_id": q.query_id,
"query_text": q.query_text,
"relevant_docs": q.relevant_docs,
"relevance_grades": q.relevance_grades,
"category": q.category,
"difficulty": q.difficulty
}
for q in self.queries
],
"documents": self.documents
}
with open(path, "w") as f:
json.dump(data, f, indent=2)
@classmethod
def load(cls, path: str) -> "DatasetBuilder":
"""Load dataset from file."""
with open(path) as f:
data = json.load(f)
builder = cls()
for q in data["queries"]:
builder.queries.append(BenchmarkQuery(
query_id=q["query_id"],
query_text=q["query_text"],
relevant_docs=q["relevant_docs"],
relevance_grades=q.get("relevance_grades"),
category=q.get("category", ""),
difficulty=q.get("difficulty", "")
))
builder.documents = data["documents"]
return builder
class SyntheticDataGenerator:
"""Generate synthetic evaluation data."""
def __init__(self, llm_client: Any = None):
self.llm = llm_client
async def generate_queries(
self,
documents: list[str],
queries_per_doc: int = 2
) -> list[tuple[str, str]]:
"""Generate queries for documents."""
results = []
for doc in documents:
prompt = f"""Generate {queries_per_doc} natural questions that this document answers.
Document: {doc[:1500]}
Generate questions that:
1. Are specific and answerable from the document
2. Vary in complexity (simple fact vs. reasoning)
3. Use natural language
Format: One question per line."""
response = await self.llm.generate(prompt)
questions = [q.strip() for q in response.strip().split("\n") if q.strip()]
for q in questions[:queries_per_doc]:
results.append((q, doc))
return results
def create_negative_samples(
self,
documents: list[str],
num_negatives: int = 5
) -> dict[str, list[str]]:
"""Create negative samples for each document."""
negatives = {}
for i, doc in enumerate(documents):
# Select random documents as negatives
other_docs = [d for j, d in enumerate(documents) if j != i]
negatives[f"doc_{i}"] = random.sample(
other_docs,
min(num_negatives, len(other_docs))
)
return negatives
class StandardBenchmarks:
"""Load standard retrieval benchmarks."""
@staticmethod
def load_beir(dataset_name: str) -> BenchmarkDataset:
"""Load a BEIR benchmark dataset."""
# Placeholder - actual implementation would load from BEIR
return BenchmarkDataset(
name=f"beir/{dataset_name}",
queries=[],
documents={}
)
@staticmethod
def load_msmarco() -> BenchmarkDataset:
"""Load MS MARCO dataset."""
# Placeholder
return BenchmarkDataset(
name="msmarco",
queries=[],
documents={}
)
@staticmethod
def load_natural_questions() -> BenchmarkDataset:
"""Load Natural Questions dataset."""
# Placeholder
return BenchmarkDataset(
name="natural_questions",
queries=[],
documents={}
)
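A short sketch of assembling, saving, and reloading a small dataset with the builder above; the documents, query, and file name are illustrative:

# Build a two-document, one-query dataset and round-trip it through JSON.
builder = DatasetBuilder()
builder.add_document("doc_0", "MRR is the mean of the reciprocal rank of the first relevant result.")
builder.add_document("doc_1", "NDCG normalizes DCG by the DCG of the ideal ranking.")
builder.add_query(
    "what does mrr measure?",
    relevant_docs=["doc_0"],
    relevance_grades={"doc_0": 3},
    category="definition",
    difficulty="easy",
)

dataset = builder.build("toy_retrieval_eval")
builder.save("toy_retrieval_eval.json")
reloaded = DatasetBuilder.load("toy_retrieval_eval.json")
print(dataset.name, len(reloaded.queries), len(reloaded.documents))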
Production Evaluation Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, List, Dict
import time
app = FastAPI()
class EvalRequest(BaseModel):
queries: list[dict] # List of {query_text, relevant_docs}
retrieved: list[list[str]] # Retrieved docs for each query
k_values: Optional[list[int]] = [1, 3, 5, 10]
class EvalResponse(BaseModel):
num_queries: int
mrr: float
map_score: float
precision_at_k: dict[str, float]
recall_at_k: dict[str, float]
ndcg_at_k: dict[str, float]
hit_rate_at_k: dict[str, float]
latency_ms: float
class CompareRequest(BaseModel):
queries: list[dict]
retrievers: dict[str, list[list[str]]] # name -> retrieved for each query
k_values: Optional[list[int]] = [1, 3, 5, 10]
@app.post("/v1/evaluate")
async def evaluate(request: EvalRequest) -> EvalResponse:
"""Evaluate retrieval results."""
start = time.time()
# Build an evaluator that honors the k cutoffs requested by the caller
evaluator = RetrievalEvaluator(k_values=request.k_values)
# Build evaluation queries
eval_queries = [
EvaluationQuery(
query_id=f"q_{i}",
query_text=q["query_text"],
relevant_docs=set(q["relevant_docs"])
)
for i, q in enumerate(request.queries)
]
# Evaluate each query
results = []
for query, retrieved in zip(eval_queries, request.retrieved):
result = evaluator.evaluate_query(query, retrieved)
results.append(result)
# Aggregate
aggregated = evaluator._aggregate_results(results)
latency = (time.time() - start) * 1000
return EvalResponse(
num_queries=aggregated.num_queries,
mrr=aggregated.mrr,
map_score=aggregated.map_score,
precision_at_k={str(k): v for k, v in aggregated.mean_precision_at_k.items()},
recall_at_k={str(k): v for k, v in aggregated.mean_recall_at_k.items()},
ndcg_at_k={str(k): v for k, v in aggregated.mean_ndcg_at_k.items()},
hit_rate_at_k={str(k): v for k, v in aggregated.mean_hit_rate_at_k.items()},
latency_ms=latency
)
@app.post("/v1/compare")
async def compare(request: CompareRequest) -> dict:
"""Compare multiple retrievers."""
evaluator = RetrievalEvaluator(k_values=request.k_values)
# Build evaluation queries
eval_queries = [
EvaluationQuery(
query_id=f"q_{i}",
query_text=q["query_text"],
relevant_docs=set(q["relevant_docs"])
)
for i, q in enumerate(request.queries)
]
comparison = {}
for name, retrieved_lists in request.retrievers.items():
results = []
for query, retrieved in zip(eval_queries, retrieved_lists):
result = evaluator.evaluate_query(query, retrieved)
results.append(result)
aggregated = evaluator._aggregate_results(results)
comparison[name] = {
"mrr": aggregated.mrr,
"map": aggregated.map_score,
"precision_at_5": aggregated.mean_precision_at_k.get(5, 0),
"recall_at_10": aggregated.mean_recall_at_k.get(10, 0),
"ndcg_at_10": aggregated.mean_ndcg_at_k.get(10, 0)
}
return {"comparison": comparison}
@app.get("/v1/metrics")
async def list_metrics() -> dict:
"""List available metrics."""
return {
"metrics": [
{"name": "precision_at_k", "description": "Fraction of retrieved docs that are relevant"},
{"name": "recall_at_k", "description": "Fraction of relevant docs that are retrieved"},
{"name": "mrr", "description": "Mean Reciprocal Rank - position of first relevant doc"},
{"name": "map", "description": "Mean Average Precision - average precision across recall levels"},
{"name": "ndcg_at_k", "description": "Normalized DCG - accounts for graded relevance"},
{"name": "hit_rate_at_k", "description": "Fraction of queries with at least one relevant doc"}
]
}
@app.get("/health")
async def health():
return {"status": "healthy"}
References
- BEIR Benchmark: https://github.com/beir-cellar/beir
- RAGAS: https://github.com/explodinggradients/ragas
- MTEB Leaderboard: https://huggingface.co/spaces/mteb/leaderboard
- MS MARCO: https://microsoft.github.io/msmarco/
- trec_eval: https://github.com/usnistgov/trec_eval
Conclusion
Retrieval evaluation requires understanding what each metric measures and when to use it. Precision@k tells you how many of your top results are relevant—critical when users only look at the first few results. Recall@k measures coverage—essential when you need to find all relevant documents. MRR focuses on the position of the first relevant result, making it ideal for navigational queries where users want one good answer. MAP provides a single number summarizing precision across all recall levels, useful for comparing systems overall. NDCG handles graded relevance, recognizing that some documents are more relevant than others. For RAG systems, combine traditional metrics with LLM-based evaluation: context relevance measures if you retrieved the right information, faithfulness checks for hallucination, and answer relevance evaluates end-to-end quality. Build evaluation datasets that reflect your actual query distribution—synthetic benchmarks often miss domain-specific patterns. Run evaluations continuously as you iterate on embeddings, chunking, and reranking. The key insight is that no single metric tells the whole story; use a dashboard of metrics and understand the tradeoffs each represents. A system optimized purely for recall might retrieve too much noise, while one optimized for precision might miss important documents.
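To make the "no single metric" point concrete, here is a small illustration (invented doc IDs) using the calculators from earlier in the guide, where one ranking wins on first-hit metrics and the other on coverage:

relevant = {"d1", "d2", "d3"}
run_a = ["d1", "d9", "d8", "d7", "d6"]  # nails the first result, finds nothing else
run_b = ["d9", "d1", "d2", "d3", "d8"]  # first hit comes later, but coverage is complete

for name, run in [("A", run_a), ("B", run_b)]:
    print(
        name,
        "P@1:", PrecisionRecall.precision_at_k(run, relevant, 1),
        "RR:", MRR.reciprocal_rank(run, relevant),
        "R@5:", PrecisionRecall.recall_at_k(run, relevant, 5),
    )
# A -> P@1 1.0, RR 1.0, R@5 0.33; B -> P@1 0.0, RR 0.5, R@5 1.0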
