Introduction

Not every request needs GPT-4. Simple questions can be handled by smaller, faster, cheaper models, while complex reasoning tasks benefit from more capable ones. Model routing directs each request to the most appropriate model based on task complexity, cost constraints, latency requirements, and quality needs. This approach can reduce costs by 50-80% while maintaining quality where it matters.

This guide covers practical routing strategies: classifying request complexity, implementing rule-based and ML-based routers, balancing cost and quality tradeoffs, handling fallbacks when models fail, and building adaptive systems that learn from feedback. Whether you're building a chatbot, an API service, or an enterprise application, model routing is essential for operating LLMs economically at scale.

Request Classification
from dataclasses import dataclass, field
from typing import Any, Optional
from enum import Enum
from abc import ABC, abstractmethod
class TaskComplexity(Enum):
"""Task complexity levels."""
SIMPLE = "simple" # Factual, short answers
MODERATE = "moderate" # Some reasoning required
COMPLEX = "complex" # Multi-step reasoning
EXPERT = "expert" # Specialized knowledge
class TaskType(Enum):
"""Types of tasks."""
CHAT = "chat"
QA = "qa"
SUMMARIZATION = "summarization"
CODE = "code"
REASONING = "reasoning"
CREATIVE = "creative"
EXTRACTION = "extraction"
TRANSLATION = "translation"
@dataclass
class ClassificationResult:
"""Result of request classification."""
complexity: TaskComplexity
task_type: TaskType
confidence: float
features: dict = field(default_factory=dict)
class RequestClassifier(ABC):
"""Abstract request classifier."""
@abstractmethod
async def classify(self, request: str) -> ClassificationResult:
"""Classify a request."""
pass
class RuleBasedClassifier(RequestClassifier):
"""Rule-based request classifier."""
def __init__(self):
self.complexity_patterns = {
TaskComplexity.SIMPLE: [
r"what is",
r"who is",
r"when did",
r"where is",
r"define",
r"translate",
],
TaskComplexity.MODERATE: [
r"explain",
r"describe",
r"compare",
r"summarize",
r"how to",
],
TaskComplexity.COMPLEX: [
r"analyze",
r"evaluate",
r"why does",
r"what if",
r"design",
r"implement",
],
TaskComplexity.EXPERT: [
r"prove",
r"derive",
r"optimize",
r"architect",
r"debug.*complex",
]
}
self.task_patterns = {
TaskType.CODE: [r"code", r"function", r"implement", r"debug", r"program"],
TaskType.SUMMARIZATION: [r"summarize", r"summary", r"tldr", r"brief"],
TaskType.TRANSLATION: [r"translate", r"in \w+ language"],
TaskType.REASONING: [r"why", r"because", r"reason", r"logic"],
TaskType.CREATIVE: [r"write.*story", r"poem", r"creative", r"imagine"],
TaskType.EXTRACTION: [r"extract", r"find.*in", r"list.*from"],
}
async def classify(self, request: str) -> ClassificationResult:
"""Classify using rules."""
import re
request_lower = request.lower()
# Determine complexity
complexity = TaskComplexity.SIMPLE
for level, patterns in self.complexity_patterns.items():
for pattern in patterns:
if re.search(pattern, request_lower):
complexity = level
break
# Determine task type
task_type = TaskType.CHAT
for ttype, patterns in self.task_patterns.items():
for pattern in patterns:
if re.search(pattern, request_lower):
task_type = ttype
break
# Additional heuristics
features = {
"length": len(request),
"question_marks": request.count("?"),
"code_blocks": request.count("```"),
"word_count": len(request.split())
}
# Adjust complexity based on length
if features["word_count"] > 200:
if complexity == TaskComplexity.SIMPLE:
complexity = TaskComplexity.MODERATE
return ClassificationResult(
complexity=complexity,
task_type=task_type,
confidence=0.7, # Rule-based has moderate confidence
features=features
)
class MLClassifier(RequestClassifier):
"""ML-based request classifier."""
def __init__(
self,
embedding_model: Any,
classifier_model: Any = None
):
self.embedding_model = embedding_model
self.classifier_model = classifier_model
# Example embeddings for each complexity level
self._complexity_centroids = {}
self._task_centroids = {}
async def classify(self, request: str) -> ClassificationResult:
"""Classify using embeddings."""
# Get request embedding
embedding = await self.embedding_model.embed([request])
embedding = embedding[0]
# Find nearest complexity centroid
complexity = self._find_nearest_complexity(embedding)
# Find nearest task type centroid
task_type = self._find_nearest_task(embedding)
# Use classifier if available
if self.classifier_model:
prediction = self.classifier_model.predict([embedding])[0]
complexity = TaskComplexity(prediction["complexity"])
task_type = TaskType(prediction["task_type"])
confidence = prediction["confidence"]
else:
confidence = 0.8
return ClassificationResult(
complexity=complexity,
task_type=task_type,
confidence=confidence,
features={"embedding_norm": sum(x*x for x in embedding) ** 0.5}
)
def _find_nearest_complexity(self, embedding: list[float]) -> TaskComplexity:
"""Find nearest complexity centroid."""
if not self._complexity_centroids:
return TaskComplexity.MODERATE
best_complexity = TaskComplexity.MODERATE
best_similarity = -1
for complexity, centroid in self._complexity_centroids.items():
similarity = self._cosine_similarity(embedding, centroid)
if similarity > best_similarity:
best_similarity = similarity
best_complexity = complexity
return best_complexity
def _find_nearest_task(self, embedding: list[float]) -> TaskType:
"""Find nearest task type centroid."""
if not self._task_centroids:
return TaskType.CHAT
best_task = TaskType.CHAT
best_similarity = -1
for task_type, centroid in self._task_centroids.items():
similarity = self._cosine_similarity(embedding, centroid)
if similarity > best_similarity:
best_similarity = similarity
best_task = task_type
return best_task
def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
"""Compute cosine similarity."""
import math
dot = sum(x * y for x, y in zip(a, b))
norm_a = math.sqrt(sum(x * x for x in a))
norm_b = math.sqrt(sum(x * x for x in b))
return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0
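# The centroid dictionaries above start empty. A minimal (illustrative) way to
# populate them is to average embeddings of prompts you have already labeled by
# complexity; the fit_complexity_centroids helper below is an assumption for
# demonstration, not part of the original class.
async def fit_complexity_centroids(
    clf: MLClassifier,
    labeled: list[tuple[str, TaskComplexity]]
):
    """Average embeddings per complexity level into centroids."""
    by_level: dict[TaskComplexity, list[list[float]]] = {}
    for text, level in labeled:
        emb = (await clf.embedding_model.embed([text]))[0]
        by_level.setdefault(level, []).append(emb)
    for level, embs in by_level.items():
        dim = len(embs[0])
        clf._complexity_centroids[level] = [
            sum(e[i] for e in embs) / len(embs)
            for i in range(dim)
        ]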
class LLMClassifier(RequestClassifier):
"""LLM-based request classifier."""
def __init__(self, client: Any, model: str = "gpt-4o-mini"):
self.client = client
self.model = model
async def classify(self, request: str) -> ClassificationResult:
"""Classify using LLM."""
prompt = f"""Classify this request by complexity and task type.
Request: {request[:500]}
Complexity levels:
- simple: factual questions, definitions, translations
- moderate: explanations, comparisons, summaries
- complex: analysis, multi-step reasoning, design
- expert: proofs, optimizations, specialized knowledge
Task types:
- chat: general conversation
- qa: question answering
- summarization: condensing content
- code: programming tasks
- reasoning: logical analysis
- creative: creative writing
- extraction: extracting information
- translation: language translation
Return JSON:
{{"complexity": "...", "task_type": "...", "confidence": 0.0-1.0}}"""
response = await self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
temperature=0
)
import json
import re
content = response.choices[0].message.content
json_match = re.search(r'\{[\s\S]*\}', content)
        if json_match:
            try:
                data = json.loads(json_match.group(0))
                return ClassificationResult(
                    complexity=TaskComplexity(data["complexity"]),
                    task_type=TaskType(data["task_type"]),
                    confidence=data.get("confidence", 0.9)
                )
            except (json.JSONDecodeError, KeyError, ValueError):
                pass  # Malformed JSON or unexpected labels; fall back to the default below
return ClassificationResult(
complexity=TaskComplexity.MODERATE,
task_type=TaskType.CHAT,
confidence=0.5
)
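To sanity-check a classifier before wiring it into a router, run it over a handful of prompts. A minimal sketch (the sample prompts and the asyncio driver are illustrative, not part of the classes above):

import asyncio

async def demo_classification():
    classifier = RuleBasedClassifier()
    prompts = [
        "What is the capital of France?",                    # "what is" -> simple
        "Summarize this article about model routing.",       # "summarize" -> moderate, summarization
        "Design and implement a rate limiter for our API.",  # "design"/"implement" -> complex, code
    ]
    for prompt in prompts:
        result = await classifier.classify(prompt)
        print(f"{result.complexity.value:>8} | {result.task_type.value:<13} | {prompt}")

asyncio.run(demo_classification())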
Model Selection
from dataclasses import dataclass
from typing import Any, Optional
from enum import Enum
@dataclass
class ModelConfig:
"""Configuration for a model."""
name: str
provider: str
cost_per_1k_input: float
cost_per_1k_output: float
max_tokens: int
latency_ms: float # Average latency
quality_score: float # 0-1 quality rating
    capabilities: Optional[set[str]] = None
def __post_init__(self):
self.capabilities = self.capabilities or set()
class ModelRegistry:
"""Registry of available models."""
def __init__(self):
self.models: dict[str, ModelConfig] = {}
self._setup_default_models()
def _setup_default_models(self):
"""Setup default model configurations."""
self.models = {
"gpt-4o": ModelConfig(
name="gpt-4o",
provider="openai",
cost_per_1k_input=0.0025,
cost_per_1k_output=0.01,
max_tokens=128000,
latency_ms=1500,
quality_score=0.95,
capabilities={"code", "reasoning", "creative", "vision"}
),
"gpt-4o-mini": ModelConfig(
name="gpt-4o-mini",
provider="openai",
cost_per_1k_input=0.00015,
cost_per_1k_output=0.0006,
max_tokens=128000,
latency_ms=800,
quality_score=0.85,
capabilities={"code", "reasoning", "creative"}
),
"claude-3-5-sonnet": ModelConfig(
name="claude-3-5-sonnet-20241022",
provider="anthropic",
cost_per_1k_input=0.003,
cost_per_1k_output=0.015,
max_tokens=200000,
latency_ms=1200,
quality_score=0.94,
capabilities={"code", "reasoning", "creative", "long_context"}
),
"claude-3-5-haiku": ModelConfig(
name="claude-3-5-haiku-20241022",
provider="anthropic",
cost_per_1k_input=0.0008,
cost_per_1k_output=0.004,
max_tokens=200000,
latency_ms=600,
quality_score=0.82,
capabilities={"code", "reasoning"}
),
"gemini-1.5-pro": ModelConfig(
name="gemini-1.5-pro",
provider="google",
cost_per_1k_input=0.00125,
cost_per_1k_output=0.005,
max_tokens=2000000,
latency_ms=1000,
quality_score=0.90,
capabilities={"code", "reasoning", "long_context", "vision"}
),
"gemini-1.5-flash": ModelConfig(
name="gemini-1.5-flash",
provider="google",
cost_per_1k_input=0.000075,
cost_per_1k_output=0.0003,
max_tokens=1000000,
latency_ms=400,
quality_score=0.78,
capabilities={"code", "reasoning"}
)
}
def get_model(self, name: str) -> Optional[ModelConfig]:
"""Get model by name."""
return self.models.get(name)
def get_models_by_capability(self, capability: str) -> list[ModelConfig]:
"""Get models with a specific capability."""
return [m for m in self.models.values() if capability in m.capabilities]
def get_cheapest_model(
self,
min_quality: float = 0.0,
required_capabilities: set[str] = None
) -> Optional[ModelConfig]:
"""Get cheapest model meeting requirements."""
candidates = list(self.models.values())
# Filter by quality
candidates = [m for m in candidates if m.quality_score >= min_quality]
# Filter by capabilities
if required_capabilities:
candidates = [
m for m in candidates
if required_capabilities.issubset(m.capabilities)
]
if not candidates:
return None
# Sort by cost (input + output weighted)
return min(
candidates,
key=lambda m: m.cost_per_1k_input + m.cost_per_1k_output
)
class ModelSelector:
"""Select optimal model for request."""
def __init__(self, registry: ModelRegistry):
self.registry = registry
# Complexity to quality mapping
self.complexity_quality = {
TaskComplexity.SIMPLE: 0.7,
TaskComplexity.MODERATE: 0.8,
TaskComplexity.COMPLEX: 0.9,
TaskComplexity.EXPERT: 0.95
}
# Task type to capability mapping
self.task_capabilities = {
TaskType.CODE: {"code"},
TaskType.REASONING: {"reasoning"},
TaskType.CREATIVE: {"creative"},
}
def select(
self,
classification: ClassificationResult,
constraints: dict = None
) -> ModelConfig:
"""Select model based on classification."""
constraints = constraints or {}
# Determine minimum quality
min_quality = self.complexity_quality.get(
classification.complexity,
0.8
)
# Override with constraint if specified
if "min_quality" in constraints:
min_quality = max(min_quality, constraints["min_quality"])
# Determine required capabilities
required_caps = self.task_capabilities.get(
classification.task_type,
set()
)
# Get candidates
candidates = [
m for m in self.registry.models.values()
if m.quality_score >= min_quality
and required_caps.issubset(m.capabilities)
]
if not candidates:
# Fallback to best available
return max(
self.registry.models.values(),
key=lambda m: m.quality_score
)
# Apply constraints
if "max_cost" in constraints:
candidates = [
m for m in candidates
if m.cost_per_1k_input <= constraints["max_cost"]
]
if "max_latency" in constraints:
candidates = [
m for m in candidates
if m.latency_ms <= constraints["max_latency"]
]
if "provider" in constraints:
candidates = [
m for m in candidates
if m.provider == constraints["provider"]
]
if not candidates:
return self.registry.get_cheapest_model(min_quality)
# Select cheapest among candidates
return min(
candidates,
key=lambda m: m.cost_per_1k_input + m.cost_per_1k_output
)
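Before adding a full router, it helps to see how a classification maps to a model. In the sketch below the ClassificationResult is constructed by hand purely for illustration:

registry = ModelRegistry()
selector = ModelSelector(registry)

# A hypothetical complex coding request: quality floor of 0.9 plus the "code" capability
classification = ClassificationResult(
    complexity=TaskComplexity.COMPLEX,
    task_type=TaskType.CODE,
    confidence=0.9
)

# Unconstrained: cheapest model with quality_score >= 0.9 and the "code" capability
print(selector.select(classification).name)  # gemini-1.5-pro with the default registry

# Pin to a single provider
print(selector.select(classification, constraints={"provider": "openai"}).name)  # gpt-4o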
Router Implementation
from dataclasses import dataclass
from typing import Any, Optional, Callable
from datetime import datetime
import asyncio
@dataclass
class RoutingDecision:
"""A routing decision."""
model: ModelConfig
classification: ClassificationResult
reason: str
estimated_cost: float
estimated_latency: float
@dataclass
class RoutingResult:
"""Result of routing and execution."""
decision: RoutingDecision
response: str
actual_cost: float
actual_latency_ms: float
success: bool
    error: Optional[str] = None
class ModelRouter:
"""Route requests to optimal models."""
def __init__(
self,
classifier: RequestClassifier,
selector: ModelSelector,
clients: dict[str, Any]
):
self.classifier = classifier
self.selector = selector
self.clients = clients # provider -> client
self._metrics: list[RoutingResult] = []
async def route(
self,
request: str,
constraints: dict = None
) -> RoutingDecision:
"""Route request to optimal model."""
# Classify request
classification = await self.classifier.classify(request)
# Select model
model = self.selector.select(classification, constraints)
# Estimate cost and latency
estimated_tokens = len(request.split()) * 1.3
estimated_cost = (
model.cost_per_1k_input * estimated_tokens / 1000 +
model.cost_per_1k_output * 100 / 1000 # Assume 100 output tokens
)
return RoutingDecision(
model=model,
classification=classification,
reason=f"Selected {model.name} for {classification.complexity.value} {classification.task_type.value} task",
estimated_cost=estimated_cost,
estimated_latency=model.latency_ms
)
async def route_and_execute(
self,
request: str,
constraints: dict = None,
system_prompt: str = None
) -> RoutingResult:
"""Route and execute request."""
start_time = datetime.utcnow()
# Get routing decision
decision = await self.route(request, constraints)
# Get client for provider
client = self.clients.get(decision.model.provider)
if not client:
return RoutingResult(
decision=decision,
response="",
actual_cost=0,
actual_latency_ms=0,
success=False,
error=f"No client for provider: {decision.model.provider}"
)
# Execute request
try:
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": request})
response = await client.chat.completions.create(
model=decision.model.name,
messages=messages
)
end_time = datetime.utcnow()
latency = (end_time - start_time).total_seconds() * 1000
# Calculate actual cost
usage = response.usage
actual_cost = (
decision.model.cost_per_1k_input * usage.prompt_tokens / 1000 +
decision.model.cost_per_1k_output * usage.completion_tokens / 1000
)
result = RoutingResult(
decision=decision,
response=response.choices[0].message.content,
actual_cost=actual_cost,
actual_latency_ms=latency,
success=True
)
except Exception as e:
result = RoutingResult(
decision=decision,
response="",
actual_cost=0,
actual_latency_ms=0,
success=False,
error=str(e)
)
self._metrics.append(result)
return result
def get_metrics(self) -> dict:
"""Get routing metrics."""
if not self._metrics:
return {}
successful = [m for m in self._metrics if m.success]
return {
"total_requests": len(self._metrics),
"success_rate": len(successful) / len(self._metrics),
"total_cost": sum(m.actual_cost for m in successful),
"avg_latency_ms": sum(m.actual_latency_ms for m in successful) / len(successful) if successful else 0,
"model_distribution": self._get_model_distribution()
}
def _get_model_distribution(self) -> dict[str, int]:
"""Get distribution of models used."""
distribution = {}
for m in self._metrics:
model_name = m.decision.model.name
distribution[model_name] = distribution.get(model_name, 0) + 1
return distribution
class FallbackRouter(ModelRouter):
"""Router with fallback support."""
def __init__(
self,
classifier: RequestClassifier,
selector: ModelSelector,
clients: dict[str, Any],
fallback_chain: list[str] = None
):
super().__init__(classifier, selector, clients)
self.fallback_chain = fallback_chain or ["gpt-4o-mini", "gpt-4o"]
async def route_and_execute(
self,
request: str,
constraints: dict = None,
system_prompt: str = None
) -> RoutingResult:
"""Route with fallback on failure."""
# Try primary route
result = await super().route_and_execute(
request, constraints, system_prompt
)
if result.success:
return result
# Try fallback chain
for fallback_model in self.fallback_chain:
model_config = self.selector.registry.get_model(fallback_model)
if not model_config:
continue
client = self.clients.get(model_config.provider)
if not client:
continue
            try:
                messages = []
                if system_prompt:
                    messages.append({"role": "system", "content": system_prompt})
                messages.append({"role": "user", "content": request})
                fallback_start = datetime.utcnow()
                response = await client.chat.completions.create(
                    model=model_config.name,
                    messages=messages
                )
                latency = (datetime.utcnow() - fallback_start).total_seconds() * 1000
                # Cost from actual token usage, same formula as the primary path
                usage = response.usage
                actual_cost = (
                    model_config.cost_per_1k_input * usage.prompt_tokens / 1000 +
                    model_config.cost_per_1k_output * usage.completion_tokens / 1000
                )
                return RoutingResult(
                    decision=RoutingDecision(
                        model=model_config,
                        classification=result.decision.classification,
                        reason=f"Fallback to {model_config.name}",
                        estimated_cost=0,
                        estimated_latency=model_config.latency_ms
                    ),
                    response=response.choices[0].message.content,
                    actual_cost=actual_cost,
                    actual_latency_ms=latency,
                    success=True
                )
            except Exception:
                continue
return result # Return original failure
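Wiring a router together might look like the sketch below. It assumes every provider you route to is reachable through an OpenAI-compatible async client (for example the openai SDK, possibly pointed at a proxy such as LiteLLM); the client setup and the prompt are illustrative, not prescribed by the code above.

import asyncio
from openai import AsyncOpenAI

async def main():
    # provider -> client; only OpenAI is wired here, so routing is constrained to it
    clients = {"openai": AsyncOpenAI()}

    router = FallbackRouter(
        classifier=RuleBasedClassifier(),
        selector=ModelSelector(ModelRegistry()),
        clients=clients,
        fallback_chain=["gpt-4o-mini", "gpt-4o"],
    )

    result = await router.route_and_execute(
        "Explain the difference between optimistic and pessimistic locking.",
        constraints={"provider": "openai"},
    )
    print(result.decision.model.name, result.success, f"${result.actual_cost:.5f}")
    print(result.response[:200])

asyncio.run(main())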
Adaptive Routing
from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime, timedelta
import random
@dataclass
class ModelPerformance:
"""Track model performance."""
model_name: str
total_requests: int = 0
successful_requests: int = 0
total_cost: float = 0.0
total_latency_ms: float = 0.0
quality_scores: list[float] = field(default_factory=list)
    last_updated: Optional[datetime] = None
@property
def success_rate(self) -> float:
return self.successful_requests / self.total_requests if self.total_requests > 0 else 0
@property
def avg_latency(self) -> float:
return self.total_latency_ms / self.total_requests if self.total_requests > 0 else 0
@property
def avg_quality(self) -> float:
return sum(self.quality_scores) / len(self.quality_scores) if self.quality_scores else 0
class AdaptiveRouter:
"""Adaptive router that learns from feedback."""
def __init__(
self,
classifier: RequestClassifier,
registry: ModelRegistry,
clients: dict[str, Any],
exploration_rate: float = 0.1
):
self.classifier = classifier
self.registry = registry
self.clients = clients
self.exploration_rate = exploration_rate
# Performance tracking per (complexity, task_type) -> model
self._performance: dict[tuple, dict[str, ModelPerformance]] = {}
async def route(
self,
request: str,
constraints: dict = None
) -> RoutingDecision:
"""Route using learned performance."""
classification = await self.classifier.classify(request)
key = (classification.complexity, classification.task_type)
# Exploration vs exploitation
if random.random() < self.exploration_rate:
# Explore: try random model
model = random.choice(list(self.registry.models.values()))
reason = "Exploration"
else:
# Exploit: use best known model
model = self._get_best_model(key, constraints)
reason = "Best performing"
return RoutingDecision(
model=model,
classification=classification,
reason=reason,
estimated_cost=0,
estimated_latency=model.latency_ms
)
def _get_best_model(
self,
key: tuple,
constraints: dict = None
) -> ModelConfig:
"""Get best model for key based on performance."""
perf_data = self._performance.get(key, {})
if not perf_data:
# No data, use default selection
return self.registry.get_cheapest_model(min_quality=0.8)
# Score models
scores = {}
for model_name, perf in perf_data.items():
if perf.total_requests < 5:
continue # Not enough data
# Score = quality * success_rate / cost
model = self.registry.get_model(model_name)
if not model:
continue
cost = model.cost_per_1k_input + model.cost_per_1k_output
score = (perf.avg_quality * perf.success_rate) / (cost + 0.001)
# Apply constraints
if constraints:
if "max_latency" in constraints and perf.avg_latency > constraints["max_latency"]:
continue
if "max_cost" in constraints and cost > constraints["max_cost"]:
continue
scores[model_name] = score
if scores:
best_model = max(scores, key=scores.get)
return self.registry.get_model(best_model)
return self.registry.get_cheapest_model(min_quality=0.8)
def record_feedback(
self,
decision: RoutingDecision,
success: bool,
latency_ms: float,
cost: float,
quality_score: float = None
):
"""Record feedback for learning."""
key = (decision.classification.complexity, decision.classification.task_type)
model_name = decision.model.name
if key not in self._performance:
self._performance[key] = {}
if model_name not in self._performance[key]:
self._performance[key][model_name] = ModelPerformance(model_name=model_name)
perf = self._performance[key][model_name]
perf.total_requests += 1
if success:
perf.successful_requests += 1
perf.total_cost += cost
perf.total_latency_ms += latency_ms
if quality_score is not None:
perf.quality_scores.append(quality_score)
# Keep only recent scores
perf.quality_scores = perf.quality_scores[-100:]
perf.last_updated = datetime.utcnow()
def get_performance_report(self) -> dict:
"""Get performance report."""
report = {}
for key, models in self._performance.items():
complexity, task_type = key
key_str = f"{complexity.value}_{task_type.value}"
report[key_str] = {
model_name: {
"requests": perf.total_requests,
"success_rate": perf.success_rate,
"avg_latency_ms": perf.avg_latency,
"avg_quality": perf.avg_quality,
"total_cost": perf.total_cost
}
for model_name, perf in models.items()
}
return report
class CostOptimizedRouter:
"""Router optimized for cost with quality constraints."""
def __init__(
self,
classifier: RequestClassifier,
registry: ModelRegistry,
clients: dict[str, Any],
budget_per_hour: float = 10.0
):
self.classifier = classifier
self.registry = registry
self.clients = clients
self.budget_per_hour = budget_per_hour
self._hourly_spend: dict[str, float] = {}
self._current_hour: str = ""
async def route(
self,
request: str,
min_quality: float = 0.7
) -> RoutingDecision:
"""Route with cost optimization."""
classification = await self.classifier.classify(request)
# Check budget
current_hour = datetime.utcnow().strftime("%Y-%m-%d-%H")
if current_hour != self._current_hour:
self._current_hour = current_hour
self._hourly_spend[current_hour] = 0.0
spent = self._hourly_spend.get(current_hour, 0.0)
remaining_budget = self.budget_per_hour - spent
# Adjust quality requirement based on budget
if remaining_budget < self.budget_per_hour * 0.2:
# Low budget, use cheaper models
min_quality = max(0.6, min_quality - 0.1)
# Get cheapest model meeting quality
model = self.registry.get_cheapest_model(min_quality=min_quality)
if not model:
model = list(self.registry.models.values())[0]
return RoutingDecision(
model=model,
classification=classification,
reason=f"Cost optimized (budget: ${remaining_budget:.2f} remaining)",
estimated_cost=model.cost_per_1k_input * 0.1, # Rough estimate
estimated_latency=model.latency_ms
)
def record_cost(self, cost: float):
"""Record cost for budget tracking."""
current_hour = datetime.utcnow().strftime("%Y-%m-%d-%H")
if current_hour not in self._hourly_spend:
self._hourly_spend[current_hour] = 0.0
self._hourly_spend[current_hour] += cost
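A rough sketch of the adaptive feedback loop follows. Model execution and quality scoring are stubbed out (the random score stands in for an LLM judge or user rating), but it shows how record_feedback steers later routing decisions:

import asyncio
import random

async def adaptive_loop():
    router = AdaptiveRouter(
        classifier=RuleBasedClassifier(),
        registry=ModelRegistry(),
        clients={},            # execution is stubbed in this sketch
        exploration_rate=0.1,
    )
    for _ in range(50):
        decision = await router.route("Summarize this support ticket in two sentences.")
        quality = random.uniform(0.6, 1.0)   # placeholder for a real quality signal
        router.record_feedback(
            decision=decision,
            success=True,
            latency_ms=decision.model.latency_ms,
            cost=0.0005,
            quality_score=quality,
        )
    print(router.get_performance_report())

asyncio.run(adaptive_loop())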
Production Routing Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
from enum import Enum
app = FastAPI()
# Initialize components
registry = ModelRegistry()
classifier = RuleBasedClassifier()
selector = ModelSelector(registry)
# router = ModelRouter(classifier, selector, clients)
class RouteRequest(BaseModel):
prompt: str
system_prompt: Optional[str] = None
constraints: Optional[dict] = None
execute: bool = True
class RouteResponse(BaseModel):
model: str
provider: str
complexity: str
task_type: str
reason: str
estimated_cost: float
response: Optional[str] = None
actual_cost: Optional[float] = None
latency_ms: Optional[float] = None
class FeedbackRequest(BaseModel):
request_id: str
quality_score: float
success: bool
@app.post("/v1/route")
async def route_request(request: RouteRequest) -> RouteResponse:
"""Route request to optimal model."""
# Classify
classification = await classifier.classify(request.prompt)
# Select model
model = selector.select(classification, request.constraints)
response_text = None
actual_cost = None
latency = None
if request.execute:
# Would execute request
response_text = "Response placeholder"
actual_cost = 0.001
latency = 500
return RouteResponse(
model=model.name,
provider=model.provider,
complexity=classification.complexity.value,
task_type=classification.task_type.value,
reason=f"Selected for {classification.complexity.value} {classification.task_type.value}",
estimated_cost=model.cost_per_1k_input * 0.1,
response=response_text,
actual_cost=actual_cost,
latency_ms=latency
)
@app.post("/v1/classify")
async def classify_request(prompt: str):
"""Classify request without routing."""
classification = await classifier.classify(prompt)
return {
"complexity": classification.complexity.value,
"task_type": classification.task_type.value,
"confidence": classification.confidence,
"features": classification.features
}
@app.get("/v1/models")
async def list_models():
"""List available models."""
return {
"models": [
{
"name": m.name,
"provider": m.provider,
"cost_per_1k_input": m.cost_per_1k_input,
"cost_per_1k_output": m.cost_per_1k_output,
"quality_score": m.quality_score,
"latency_ms": m.latency_ms,
"capabilities": list(m.capabilities)
}
for m in registry.models.values()
]
}
@app.get("/v1/models/{model_name}")
async def get_model(model_name: str):
"""Get model details."""
model = registry.get_model(model_name)
if not model:
raise HTTPException(404, "Model not found")
return {
"name": model.name,
"provider": model.provider,
"cost_per_1k_input": model.cost_per_1k_input,
"cost_per_1k_output": model.cost_per_1k_output,
"quality_score": model.quality_score,
"latency_ms": model.latency_ms,
"capabilities": list(model.capabilities)
}
@app.post("/v1/feedback")
async def submit_feedback(request: FeedbackRequest):
"""Submit feedback for adaptive routing."""
# Would record feedback
return {"status": "recorded"}
@app.get("/v1/metrics")
async def get_metrics():
"""Get routing metrics."""
# Would return actual metrics
return {
"total_requests": 1000,
"success_rate": 0.98,
"total_cost": 15.50,
"avg_latency_ms": 850,
"model_distribution": {
"gpt-4o-mini": 650,
"gpt-4o": 200,
"claude-3-5-haiku": 150
},
"cost_savings_percent": 65
}
@app.get("/v1/budget")
async def get_budget():
"""Get budget status."""
return {
"hourly_budget": 10.0,
"current_hour_spend": 3.50,
"remaining": 6.50,
"utilization_percent": 35
}
@app.get("/health")
async def health():
return {"status": "healthy"}
References
- Martian Router: https://withmartian.com/
- OpenRouter: https://openrouter.ai/
- RouteLLM: https://github.com/lm-sys/RouteLLM
- LiteLLM: https://github.com/BerriAI/litellm
Conclusion
Model routing is essential for operating LLMs economically at scale. Start with request classification: determine task complexity and type to inform model selection. Build a model registry with accurate cost, latency, and quality metrics for each available model. Implement selection logic that matches task requirements to model capabilities while optimizing for cost, and add fallback chains to handle model failures gracefully. For production systems, build adaptive routers that learn from feedback: track which models perform best for which task types and adjust routing accordingly. Monitor key metrics such as cost savings, quality scores, latency percentiles, and model utilization.

The key insight is that model routing is a continuous optimization problem; the best model for a task depends on current costs, model performance, and your quality requirements. A well-tuned router can reduce costs by 50-80% while maintaining quality where it matters, making the difference between an economically viable LLM application and one that burns through its budget.