Introduction
Choosing the right embedding model is critical for RAG systems, semantic search, and similarity applications. The wrong choice leads to poor retrieval quality, high costs, or unacceptable latency. OpenAI’s text-embedding-3-small is cheap and fast but may miss nuanced similarities. Cohere’s embed-v3 excels at multilingual content. Open-source models like BGE and E5 offer privacy and cost control. This guide covers practical embedding model selection: understanding model characteristics, benchmarking for your specific use case, balancing quality versus cost versus latency, and building evaluation pipelines that help you make data-driven decisions.

Embedding Model Registry
from dataclasses import dataclass
from typing import Any, Optional
from enum import Enum
class EmbeddingProvider(Enum):
"""Embedding model providers."""
OPENAI = "openai"
COHERE = "cohere"
VOYAGE = "voyage"
HUGGINGFACE = "huggingface"
LOCAL = "local"
@dataclass
class EmbeddingModelSpec:
"""Specification for an embedding model."""
name: str
provider: EmbeddingProvider
dimensions: int
max_tokens: int
cost_per_1m_tokens: float
supports_batching: bool = True
supports_truncation: bool = True
multilingual: bool = False
description: str = ""
class EmbeddingModelRegistry:
"""Registry of available embedding models."""
def __init__(self):
self.models: dict[str, EmbeddingModelSpec] = {}
self._register_defaults()
def _register_defaults(self):
"""Register default models."""
# OpenAI models
self.register(EmbeddingModelSpec(
name="text-embedding-3-small",
provider=EmbeddingProvider.OPENAI,
dimensions=1536,
max_tokens=8191,
cost_per_1m_tokens=0.02,
multilingual=True,
description="Fast, cheap, good for most use cases"
))
self.register(EmbeddingModelSpec(
name="text-embedding-3-large",
provider=EmbeddingProvider.OPENAI,
dimensions=3072,
max_tokens=8191,
cost_per_1m_tokens=0.13,
multilingual=True,
description="Higher quality, better for complex retrieval"
))
self.register(EmbeddingModelSpec(
name="text-embedding-ada-002",
provider=EmbeddingProvider.OPENAI,
dimensions=1536,
max_tokens=8191,
cost_per_1m_tokens=0.10,
multilingual=False,
description="Legacy model, still widely used"
))
# Cohere models
self.register(EmbeddingModelSpec(
name="embed-english-v3.0",
provider=EmbeddingProvider.COHERE,
dimensions=1024,
max_tokens=512,
cost_per_1m_tokens=0.10,
multilingual=False,
description="Optimized for English retrieval"
))
self.register(EmbeddingModelSpec(
name="embed-multilingual-v3.0",
provider=EmbeddingProvider.COHERE,
dimensions=1024,
max_tokens=512,
cost_per_1m_tokens=0.10,
multilingual=True,
description="100+ languages, excellent cross-lingual"
))
# Voyage models
self.register(EmbeddingModelSpec(
name="voyage-large-2",
provider=EmbeddingProvider.VOYAGE,
dimensions=1536,
max_tokens=16000,
cost_per_1m_tokens=0.12,
multilingual=False,
description="Long context, high quality"
))
self.register(EmbeddingModelSpec(
name="voyage-code-2",
provider=EmbeddingProvider.VOYAGE,
dimensions=1536,
max_tokens=16000,
cost_per_1m_tokens=0.12,
multilingual=False,
description="Optimized for code retrieval"
))
# Open source models
self.register(EmbeddingModelSpec(
name="BAAI/bge-large-en-v1.5",
provider=EmbeddingProvider.HUGGINGFACE,
dimensions=1024,
max_tokens=512,
cost_per_1m_tokens=0.0, # Self-hosted
multilingual=False,
description="Top open-source English model"
))
self.register(EmbeddingModelSpec(
name="intfloat/e5-large-v2",
provider=EmbeddingProvider.HUGGINGFACE,
dimensions=1024,
max_tokens=512,
cost_per_1m_tokens=0.0,
multilingual=False,
description="Excellent retrieval performance"
))
self.register(EmbeddingModelSpec(
name="sentence-transformers/all-MiniLM-L6-v2",
provider=EmbeddingProvider.HUGGINGFACE,
dimensions=384,
max_tokens=256,
cost_per_1m_tokens=0.0,
multilingual=False,
description="Fast, lightweight, good baseline"
))
def register(self, spec: EmbeddingModelSpec) -> None:
"""Register a model."""
self.models[spec.name] = spec
def get(self, name: str) -> Optional[EmbeddingModelSpec]:
"""Get model specification."""
return self.models.get(name)
def list_by_provider(self, provider: EmbeddingProvider) -> list[EmbeddingModelSpec]:
"""List models by provider."""
return [m for m in self.models.values() if m.provider == provider]
def list_multilingual(self) -> list[EmbeddingModelSpec]:
"""List multilingual models."""
return [m for m in self.models.values() if m.multilingual]
def list_by_cost(self, max_cost: float) -> list[EmbeddingModelSpec]:
"""List models under cost threshold."""
return [m for m in self.models.values() if m.cost_per_1m_tokens <= max_cost]
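The registry makes it easy to slice the catalog by capability or budget. A quick usage sketch (the prices registered above are snapshots; verify current pricing with each provider before relying on them):
registry = EmbeddingModelRegistry()

# Multilingual options and their key specs
for spec in registry.list_multilingual():
    print(f"{spec.name}: {spec.dimensions} dims, ${spec.cost_per_1m_tokens}/1M tokens")

# Models at or below $0.05 per 1M tokens
print([m.name for m in registry.list_by_cost(max_cost=0.05)])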
Unified Embedding Client
from dataclasses import dataclass
from typing import Any, Optional
import numpy as np
@dataclass
class EmbeddingResult:
"""Result of embedding generation."""
embeddings: list[list[float]]
model: str
tokens_used: int
latency_ms: float
class UnifiedEmbeddingClient:
"""Unified client for multiple embedding providers."""
def __init__(
self,
openai_client: Any = None,
cohere_client: Any = None,
voyage_client: Any = None,
local_model: Any = None
):
self.openai = openai_client
self.cohere = cohere_client
self.voyage = voyage_client
self.local = local_model
self.registry = EmbeddingModelRegistry()
async def embed(
self,
texts: list[str],
model: str
) -> EmbeddingResult:
"""Generate embeddings using specified model."""
import time
start = time.time()
spec = self.registry.get(model)
if not spec:
raise ValueError(f"Unknown model: {model}")
if spec.provider == EmbeddingProvider.OPENAI:
result = await self._embed_openai(texts, model)
elif spec.provider == EmbeddingProvider.COHERE:
result = await self._embed_cohere(texts, model)
elif spec.provider == EmbeddingProvider.VOYAGE:
result = await self._embed_voyage(texts, model)
elif spec.provider == EmbeddingProvider.HUGGINGFACE:
result = self._embed_local(texts, model)
else:
raise ValueError(f"Unsupported provider: {spec.provider}")
latency = (time.time() - start) * 1000
result.latency_ms = latency
return result
async def _embed_openai(self, texts: list[str], model: str) -> EmbeddingResult:
"""Generate embeddings using OpenAI."""
response = await self.openai.embeddings.create(
model=model,
input=texts
)
embeddings = [d.embedding for d in response.data]
return EmbeddingResult(
embeddings=embeddings,
model=model,
tokens_used=response.usage.total_tokens,
latency_ms=0
)
async def _embed_cohere(self, texts: list[str], model: str) -> EmbeddingResult:
"""Generate embeddings using Cohere."""
response = await self.cohere.embed(
texts=texts,
model=model,
input_type="search_document"
)
return EmbeddingResult(
embeddings=response.embeddings,
model=model,
tokens_used=len(texts) * 100, # Rough estimate; prefer the billed token count from the API response when available
latency_ms=0
)
async def _embed_voyage(self, texts: list[str], model: str) -> EmbeddingResult:
"""Generate embeddings using Voyage."""
response = await self.voyage.embed(
texts=texts,
model=model
)
return EmbeddingResult(
embeddings=response.embeddings,
model=model,
tokens_used=response.total_tokens,
latency_ms=0
)
def _embed_local(self, texts: list[str], model: str) -> EmbeddingResult:
"""Generate embeddings using local model."""
embeddings = self.local.encode(texts).tolist()
return EmbeddingResult(
embeddings=embeddings,
model=model,
tokens_used=sum(len(t.split()) for t in texts),
latency_ms=0
)
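Here is a minimal sketch of wiring the unified client to OpenAI and a local sentence-transformers model. The client construction follows each SDK's usual pattern, but treat it as illustrative and adjust for the versions you actually run:
import asyncio
from openai import AsyncOpenAI
from sentence_transformers import SentenceTransformer

async def main():
    client = UnifiedEmbeddingClient(
        openai_client=AsyncOpenAI(),  # reads OPENAI_API_KEY from the environment
        local_model=SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    )
    # Same text through a hosted model and a self-hosted model
    remote = await client.embed(["What is retrieval-augmented generation?"], "text-embedding-3-small")
    local = await client.embed(["What is retrieval-augmented generation?"], "sentence-transformers/all-MiniLM-L6-v2")
    print(len(remote.embeddings[0]), remote.tokens_used, remote.latency_ms)
    print(len(local.embeddings[0]), local.latency_ms)

asyncio.run(main())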
Embedding Benchmarking
from dataclasses import dataclass
from typing import Any, Optional
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
@dataclass
class BenchmarkQuery:
"""A benchmark query with expected results."""
query: str
relevant_docs: list[str]
irrelevant_docs: list[str]
@dataclass
class BenchmarkResult:
"""Result of model benchmarking."""
model: str
precision_at_k: dict[int, float]
recall_at_k: dict[int, float]
mrr: float
ndcg: float
avg_latency_ms: float
cost_per_query: float
class EmbeddingBenchmark:
"""Benchmark embedding models for retrieval quality."""
def __init__(self, client: UnifiedEmbeddingClient):
self.client = client
async def benchmark_model(
self,
model: str,
queries: list[BenchmarkQuery],
k_values: list[int] = [1, 3, 5, 10]
) -> BenchmarkResult:
"""Benchmark a model on retrieval tasks."""
precision_scores = {k: [] for k in k_values}
recall_scores = {k: [] for k in k_values}
mrr_scores = []
ndcg_scores = []
latencies = []
for query in queries:
# Embed query
query_result = await self.client.embed([query.query], model)
query_embedding = np.array(query_result.embeddings[0])
latencies.append(query_result.latency_ms)
# Embed all documents
all_docs = query.relevant_docs + query.irrelevant_docs
doc_result = await self.client.embed(all_docs, model)
doc_embeddings = np.array(doc_result.embeddings)
latencies.append(doc_result.latency_ms)
# Calculate similarities
similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
# Rank documents
ranked_indices = np.argsort(similarities)[::-1]
# Calculate metrics
num_relevant = len(query.relevant_docs)
for k in k_values:
top_k = ranked_indices[:k]
relevant_in_top_k = sum(1 for i in top_k if i < num_relevant)
precision_scores[k].append(relevant_in_top_k / k)
recall_scores[k].append(relevant_in_top_k / num_relevant)
# MRR
for rank, idx in enumerate(ranked_indices, 1):
if idx < num_relevant:
mrr_scores.append(1.0 / rank)
break
else:
mrr_scores.append(0.0)
# NDCG
ndcg = self._calculate_ndcg(ranked_indices, num_relevant, max(k_values))
ndcg_scores.append(ndcg)
# Estimate per-query cost (whitespace word count as a rough token proxy;
# query side only, since the corpus is embedded once at index time)
spec = self.client.registry.get(model)
avg_tokens = sum(len(q.query.split()) for q in queries) / len(queries)
cost_per_query = (avg_tokens / 1_000_000) * spec.cost_per_1m_tokens
return BenchmarkResult(
model=model,
precision_at_k={k: np.mean(scores) for k, scores in precision_scores.items()},
recall_at_k={k: np.mean(scores) for k, scores in recall_scores.items()},
mrr=np.mean(mrr_scores),
ndcg=np.mean(ndcg_scores),
avg_latency_ms=np.mean(latencies),
cost_per_query=cost_per_query
)
def _calculate_ndcg(
self,
ranked_indices: np.ndarray,
num_relevant: int,
k: int
) -> float:
"""Calculate NDCG@k."""
dcg = 0.0
for i, idx in enumerate(ranked_indices[:k]):
if idx < num_relevant:
dcg += 1.0 / np.log2(i + 2)
# Ideal DCG
idcg = sum(1.0 / np.log2(i + 2) for i in range(min(k, num_relevant)))
return dcg / idcg if idcg > 0 else 0.0
async def compare_models(
self,
models: list[str],
queries: list[BenchmarkQuery]
) -> dict[str, BenchmarkResult]:
"""Compare multiple models."""
results = {}
for model in models:
results[model] = await self.benchmark_model(model, queries)
return results
def rank_models(
self,
results: dict[str, BenchmarkResult],
weights: Optional[dict[str, float]] = None
) -> list[tuple[str, float]]:
"""Rank models by weighted score."""
if weights is None:
weights = {
"mrr": 0.3,
"ndcg": 0.3,
"latency": 0.2,
"cost": 0.2
}
# Normalize metrics across models (guard against zero maxima,
# e.g. when every model scores 0 or self-hosted models cost nothing)
max_mrr = max(r.mrr for r in results.values()) or 1.0
max_ndcg = max(r.ndcg for r in results.values()) or 1.0
max_latency = max(r.avg_latency_ms for r in results.values()) or 1.0
max_cost = max(r.cost_per_query for r in results.values())
scores = []
for model, result in results.items():
score = 0.0
# Higher is better for MRR and NDCG
score += weights["mrr"] * (result.mrr / max_mrr)
score += weights["ndcg"] * (result.ndcg / max_ndcg)
# Lower is better for latency and cost
score += weights["latency"] * (1 - result.avg_latency_ms / max_latency)
score += weights["cost"] * (1 - result.cost_per_query / max_cost) if max_cost > 0 else weights["cost"]
scores.append((model, score))
return sorted(scores, key=lambda x: x[1], reverse=True)
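Putting the benchmark together looks roughly like this. The queries and documents are placeholder examples, and `client` is assumed to be a configured UnifiedEmbeddingClient like the one above:
import asyncio

queries = [
    BenchmarkQuery(
        query="How do I reset my password?",
        relevant_docs=["To reset your password, open Settings > Security and choose Reset."],
        irrelevant_docs=[
            "Our refund policy allows returns within 30 days.",
            "The API rate limit is 100 requests per minute.",
        ],
    ),
]

async def run_benchmark_demo():
    bench = EmbeddingBenchmark(client)  # UnifiedEmbeddingClient from the previous section
    results = await bench.compare_models(["text-embedding-3-small", "text-embedding-3-large"], queries)
    for model, score in bench.rank_models(results):
        r = results[model]
        print(f"{model}: score={score:.3f} MRR={r.mrr:.3f} nDCG={r.ndcg:.3f} P@5={r.precision_at_k[5]:.3f}")

asyncio.run(run_benchmark_demo())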
Task-Based Model Selection
from dataclasses import dataclass
from typing import Any, Optional
from enum import Enum
class EmbeddingTask(Enum):
"""Types of embedding tasks."""
SEMANTIC_SEARCH = "semantic_search"
DOCUMENT_RETRIEVAL = "document_retrieval"
CODE_SEARCH = "code_search"
MULTILINGUAL = "multilingual"
CLUSTERING = "clustering"
CLASSIFICATION = "classification"
@dataclass
class TaskRequirements:
"""Requirements for an embedding task."""
task: EmbeddingTask
max_latency_ms: float = 100
max_cost_per_1m: float = 0.5
min_dimensions: int = 256
requires_multilingual: bool = False
requires_long_context: bool = False
privacy_required: bool = False
class TaskBasedSelector:
"""Select embedding model based on task requirements."""
def __init__(self, registry: EmbeddingModelRegistry):
self.registry = registry
# Task to model recommendations
self.task_recommendations = {
EmbeddingTask.SEMANTIC_SEARCH: [
"text-embedding-3-small",
"BAAI/bge-large-en-v1.5",
"intfloat/e5-large-v2"
],
EmbeddingTask.DOCUMENT_RETRIEVAL: [
"text-embedding-3-large",
"voyage-large-2",
"embed-english-v3.0"
],
EmbeddingTask.CODE_SEARCH: [
"voyage-code-2",
"text-embedding-3-large"
],
EmbeddingTask.MULTILINGUAL: [
"embed-multilingual-v3.0",
"text-embedding-3-small",
"text-embedding-3-large"
],
EmbeddingTask.CLUSTERING: [
"text-embedding-3-small",
"sentence-transformers/all-MiniLM-L6-v2"
],
EmbeddingTask.CLASSIFICATION: [
"text-embedding-3-small",
"BAAI/bge-large-en-v1.5"
]
}
def select(self, requirements: TaskRequirements) -> list[EmbeddingModelSpec]:
"""Select models matching requirements."""
candidates = []
# Get task recommendations
recommended = self.task_recommendations.get(requirements.task, [])
for model_name in recommended:
spec = self.registry.get(model_name)
if not spec:
continue
# Check requirements
if spec.cost_per_1m_tokens > requirements.max_cost_per_1m:
continue
if spec.dimensions < requirements.min_dimensions:
continue
if requirements.requires_multilingual and not spec.multilingual:
continue
if requirements.requires_long_context and spec.max_tokens < 4000:
continue
if requirements.privacy_required and spec.provider != EmbeddingProvider.HUGGINGFACE:
continue
candidates.append(spec)
# Sort by cost (cheapest first)
candidates.sort(key=lambda x: x.cost_per_1m_tokens)
return candidates
def recommend(self, requirements: TaskRequirements) -> Optional[EmbeddingModelSpec]:
"""Get top recommendation."""
candidates = self.select(requirements)
return candidates[0] if candidates else None
class AdaptiveModelSelector:
"""Dynamically select model based on input characteristics."""
def __init__(
self,
registry: EmbeddingModelRegistry,
default_model: str = "text-embedding-3-small"
):
self.registry = registry
self.default_model = default_model
def select_for_text(self, text: str) -> str:
"""Select model based on text characteristics."""
# Check language
if self._is_multilingual(text):
return "embed-multilingual-v3.0"
# Check if code
if self._is_code(text):
return "voyage-code-2"
# Check length
word_count = len(text.split())
if word_count > 1000:
return "voyage-large-2" # Long context
return self.default_model
def _is_multilingual(self, text: str) -> bool:
"""Check if text contains non-ASCII characters."""
non_ascii = sum(1 for c in text if ord(c) > 127)
return non_ascii / len(text) > 0.1 if text else False
def _is_code(self, text: str) -> bool:
"""Check if text appears to be code."""
code_indicators = [
"def ", "class ", "import ", "function",
"const ", "let ", "var ", "return ",
"if (", "for (", "while (", "=>",
"{", "}", "[]", "()"
]
indicator_count = sum(1 for ind in code_indicators if ind in text)
return indicator_count >= 3
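A usage sketch for both selectors follows. The inputs are illustrative, and the routing heuristics are intentionally crude, so verify the behavior on your own traffic before relying on it:
registry = EmbeddingModelRegistry()
selector = TaskBasedSelector(registry)

requirements = TaskRequirements(
    task=EmbeddingTask.SEMANTIC_SEARCH,
    privacy_required=True,  # only the self-hosted (Hugging Face) models survive this filter
    min_dimensions=512,
)
pick = selector.recommend(requirements)
print(pick.name if pick else "no model satisfies the constraints")

adaptive = AdaptiveModelSelector(registry)
# Three code indicators ("def ", "return ", "class ") trigger the code route
print(adaptive.select_for_text("def add(a, b):\n    return a + b\n\nclass Calculator:\n    pass"))
# Mostly non-ASCII text triggers the multilingual route
print(adaptive.select_for_text("パスワードをリセットする方法を教えてください"))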
Production Embedding Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
app = FastAPI()
# Initialize components
registry = EmbeddingModelRegistry()
embedding_client = None # UnifiedEmbeddingClient wired with real provider SDK clients at startup
benchmark = None # EmbeddingBenchmark(embedding_client) once the client is configured
task_selector = TaskBasedSelector(registry)
adaptive_selector = AdaptiveModelSelector(registry)
class EmbedRequest(BaseModel):
texts: list[str]
model: Optional[str] = None
class SelectModelRequest(BaseModel):
task: str
max_latency_ms: float = 100
max_cost_per_1m: float = 0.5
requires_multilingual: bool = False
requires_long_context: bool = False
privacy_required: bool = False
class BenchmarkRequest(BaseModel):
models: list[str]
queries: list[dict]
@app.post("/v1/embeddings")
async def create_embeddings(request: EmbedRequest):
"""Generate embeddings."""
if not request.texts:
raise HTTPException(status_code=400, detail="texts must not be empty")
# Auto-select model if not specified
model = request.model
if not model:
# Use adaptive selection based on the first text
model = adaptive_selector.select_for_text(request.texts[0])
result = await embedding_client.embed(request.texts, model)
return {
"embeddings": result.embeddings,
"model": result.model,
"tokens_used": result.tokens_used,
"latency_ms": result.latency_ms
}
@app.post("/v1/models/select")
async def select_model(request: SelectModelRequest):
"""Select best model for task."""
try:
task = EmbeddingTask(request.task)
except ValueError:
raise HTTPException(status_code=400, detail=f"Unknown task: {request.task}")
requirements = TaskRequirements(
task=task,
max_latency_ms=request.max_latency_ms,
max_cost_per_1m=request.max_cost_per_1m,
requires_multilingual=request.requires_multilingual,
requires_long_context=request.requires_long_context,
privacy_required=request.privacy_required
)
candidates = task_selector.select(requirements)
return {
"recommended": candidates[0].name if candidates else None,
"candidates": [
{
"name": c.name,
"provider": c.provider.value,
"dimensions": c.dimensions,
"cost_per_1m": c.cost_per_1m_tokens,
"description": c.description
}
for c in candidates
]
}
@app.get("/v1/models")
async def list_models(
provider: Optional[str] = None,
multilingual: Optional[bool] = None,
max_cost: Optional[float] = None
):
"""List available models."""
models = list(registry.models.values())
if provider:
try:
p = EmbeddingProvider(provider)
models = [m for m in models if m.provider == p]
except ValueError:
# Unknown provider name: ignore the filter rather than failing the request
pass
if multilingual is not None:
models = [m for m in models if m.multilingual == multilingual]
if max_cost is not None:
models = [m for m in models if m.cost_per_1m_tokens <= max_cost]
return {
"models": [
{
"name": m.name,
"provider": m.provider.value,
"dimensions": m.dimensions,
"max_tokens": m.max_tokens,
"cost_per_1m": m.cost_per_1m_tokens,
"multilingual": m.multilingual,
"description": m.description
}
for m in models
]
}
@app.get("/v1/models/{model_name}")
async def get_model(model_name: str):
"""Get model details."""
spec = registry.get(model_name)
if not spec:
raise HTTPException(status_code=404, detail="Model not found")
return {
"name": spec.name,
"provider": spec.provider.value,
"dimensions": spec.dimensions,
"max_tokens": spec.max_tokens,
"cost_per_1m": spec.cost_per_1m_tokens,
"multilingual": spec.multilingual,
"description": spec.description
}
@app.post("/v1/benchmark")
async def run_benchmark(request: BenchmarkRequest):
"""Benchmark models."""
queries = [
BenchmarkQuery(
query=q["query"],
relevant_docs=q["relevant_docs"],
irrelevant_docs=q.get("irrelevant_docs", [])
)
for q in request.queries
]
results = await benchmark.compare_models(request.models, queries)
rankings = benchmark.rank_models(results)
return {
"results": {
model: {
"mrr": r.mrr,
"ndcg": r.ndcg,
"precision_at_5": r.precision_at_k.get(5, 0),
"recall_at_5": r.recall_at_k.get(5, 0),
"avg_latency_ms": r.avg_latency_ms,
"cost_per_query": r.cost_per_query
}
for model, r in results.items()
},
"rankings": [{"model": m, "score": s} for m, s in rankings]
}
@app.get("/health")
async def health():
return {"status": "healthy"}
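Once the app is running (for example via uvicorn), the endpoints can be exercised with a few HTTP calls. The host, port, and httpx dependency below are assumptions for illustration:
import httpx

base = "http://localhost:8000"

# Ask the service which model fits a multilingual retrieval task
resp = httpx.post(f"{base}/v1/models/select", json={
    "task": "multilingual",
    "requires_multilingual": True,
    "max_cost_per_1m": 0.15,
})
print(resp.json()["recommended"])

# Generate embeddings, letting the adaptive selector pick the model
resp = httpx.post(f"{base}/v1/embeddings", json={
    "texts": ["Where is my order?", "¿Dónde está mi pedido?"],
})
payload = resp.json()
print(payload["model"], len(payload["embeddings"]))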
References
- OpenAI Embeddings: https://platform.openai.com/docs/guides/embeddings
- Cohere Embed: https://docs.cohere.com/docs/embeddings
- Voyage AI: https://docs.voyageai.com/
- MTEB Leaderboard: https://huggingface.co/spaces/mteb/leaderboard
- Sentence Transformers: https://www.sbert.net/
Conclusion
Embedding model selection requires balancing quality, cost, and latency for your specific use case. Start with a model registry that captures key specifications: dimensions, token limits, pricing, and capabilities. Build a unified client that abstracts provider differences, making it easy to switch models. Benchmark models on your actual data—public benchmarks like MTEB provide guidance, but your domain-specific queries matter most. Use task-based selection to narrow candidates: semantic search, code retrieval, and multilingual applications have different optimal models. Consider adaptive selection that examines input characteristics to route to appropriate models dynamically. For production systems, text-embedding-3-small offers excellent cost-performance for most English use cases. Cohere's multilingual model excels for international applications. Voyage-code-2 is purpose-built for code search. Open-source models like BGE and E5 provide privacy and cost control for self-hosted deployments. The key is measuring what matters for your application and making data-driven decisions rather than defaulting to the most popular option.