Introduction

Not every query needs GPT-4. Routing simple questions to cheaper, faster models while reserving expensive models for complex tasks can cut costs by 70% or more without sacrificing quality. Smart LLM routing is the difference between a $10,000/month AI bill and a $3,000 one.

This guide covers implementing intelligent model selection: classifying query complexity, building routing logic, handling fallbacks, and optimizing for cost-quality tradeoffs. Whether you’re using a single provider with multiple tiers or orchestrating across OpenAI, Anthropic, and local models, these patterns will help you build efficient, cost-effective LLM applications.
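To make those numbers concrete, here is a back-of-envelope calculation with illustrative prices and an assumed 80/20 traffic split (both are assumptions, not measurements):

```python
# Illustrative per-1K-token prices (input + output combined) and an
# assumed 80/20 split of simple vs. complex traffic -- not measurements.
GPT_4O = 0.005 + 0.015          # $0.020 per 1K tokens
GPT_4O_MINI = 0.00015 + 0.0006  # $0.00075 per 1K tokens

all_gpt_4o = 1.0 * GPT_4O
routed = 0.8 * GPT_4O_MINI + 0.2 * GPT_4O  # 0.0046

print(f"Blended savings: {1 - routed / all_gpt_4o:.0%}")  # ~77%
```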

Basic Model Router
```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional

from openai import OpenAI

client = OpenAI()


class TaskComplexity(Enum):
    SIMPLE = "simple"    # Factual Q&A, simple formatting
    MEDIUM = "medium"    # Summarization, basic analysis
    COMPLEX = "complex"  # Reasoning, coding, creative writing
    EXPERT = "expert"    # Multi-step reasoning, specialized knowledge


@dataclass
class ModelConfig:
    name: str
    cost_per_1k_input: float
    cost_per_1k_output: float
    max_tokens: int
    strengths: list[str]


class ModelRouter:
    """Route requests to appropriate models based on complexity."""

    MODELS = {
        TaskComplexity.SIMPLE: ModelConfig(
            name="gpt-4o-mini",
            cost_per_1k_input=0.00015,
            cost_per_1k_output=0.0006,
            max_tokens=128000,
            strengths=["fast", "cheap", "simple_qa"]
        ),
        TaskComplexity.MEDIUM: ModelConfig(
            name="gpt-4o-mini",
            cost_per_1k_input=0.00015,
            cost_per_1k_output=0.0006,
            max_tokens=128000,
            strengths=["summarization", "formatting"]
        ),
        TaskComplexity.COMPLEX: ModelConfig(
            name="gpt-4o",
            cost_per_1k_input=0.005,
            cost_per_1k_output=0.015,
            max_tokens=128000,
            strengths=["reasoning", "coding", "analysis"]
        ),
        TaskComplexity.EXPERT: ModelConfig(
            name="gpt-4o",
            cost_per_1k_input=0.005,
            cost_per_1k_output=0.015,
            max_tokens=128000,
            strengths=["complex_reasoning", "specialized"]
        ),
    }

    def __init__(self):
        self.classifier_model = "gpt-4o-mini"

    def classify_complexity(self, query: str) -> TaskComplexity:
        """Classify query complexity using a small model."""
        response = client.chat.completions.create(
            model=self.classifier_model,
            messages=[
                {
                    "role": "system",
                    "content": """Classify the complexity of this query:
- SIMPLE: Factual questions, definitions, simple formatting
- MEDIUM: Summarization, basic analysis, straightforward tasks
- COMPLEX: Multi-step reasoning, coding, creative writing, detailed analysis
- EXPERT: Specialized knowledge, complex problem-solving, research-level questions

Return only: SIMPLE, MEDIUM, COMPLEX, or EXPERT"""
                },
                {"role": "user", "content": query}
            ],
            max_tokens=10,
            temperature=0
        )
        result = response.choices[0].message.content.strip().upper()
        try:
            return TaskComplexity(result.lower())
        except ValueError:
            return TaskComplexity.MEDIUM  # Default to medium

    def route(self, query: str, force_model: Optional[str] = None) -> tuple[str, ModelConfig]:
        """Route query to appropriate model."""
        if force_model:
            # Find config for the forced model; fall through to
            # classification if no tier uses it
            for config in self.MODELS.values():
                if config.name == force_model:
                    return force_model, config
        complexity = self.classify_complexity(query)
        config = self.MODELS[complexity]
        return config.name, config

    def complete(self, query: str, system_prompt: str = "", **kwargs) -> str:
        """Complete query with automatic routing."""
        model, config = self.route(query)
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": query})
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )
        return response.choices[0].message.content


# Usage
router = ModelRouter()

# Simple query -> routes to gpt-4o-mini
result = router.complete("What is the capital of France?")

# Complex query -> routes to gpt-4o
result = router.complete("Write a Python function to implement a red-black tree with all standard operations.")
```
Multi-Provider Router
```python
from abc import ABC, abstractmethod

from openai import OpenAI
import anthropic


class LLMProvider(ABC):
    """Abstract base for LLM providers."""

    @abstractmethod
    def complete(self, messages: list[dict], **kwargs) -> str:
        pass

    @abstractmethod
    def get_cost(self, input_tokens: int, output_tokens: int) -> float:
        pass


class OpenAIProvider(LLMProvider):
    def __init__(self, model: str = "gpt-4o"):
        self.client = OpenAI()
        self.model = model
        # (input, output) cost per 1K tokens
        self.costs = {
            "gpt-4o": (0.005, 0.015),
            "gpt-4o-mini": (0.00015, 0.0006),
            "gpt-4-turbo-preview": (0.01, 0.03),
        }

    def complete(self, messages: list[dict], **kwargs) -> str:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            **kwargs
        )
        return response.choices[0].message.content

    def get_cost(self, input_tokens: int, output_tokens: int) -> float:
        input_cost, output_cost = self.costs.get(self.model, (0.01, 0.03))
        return (input_tokens / 1000) * input_cost + (output_tokens / 1000) * output_cost


class AnthropicProvider(LLMProvider):
    def __init__(self, model: str = "claude-3-5-sonnet-20241022"):
        self.client = anthropic.Anthropic()
        self.model = model
        # (input, output) cost per 1K tokens
        self.costs = {
            "claude-3-5-sonnet-20241022": (0.003, 0.015),
            "claude-3-haiku-20240307": (0.00025, 0.00125),
            "claude-3-opus-20240229": (0.015, 0.075),
        }

    def complete(self, messages: list[dict], **kwargs) -> str:
        # Convert OpenAI format to Anthropic: the system prompt is a
        # top-level parameter, not a message
        system = ""
        anthropic_messages = []
        for msg in messages:
            if msg["role"] == "system":
                system = msg["content"]
            else:
                anthropic_messages.append(msg)
        response = self.client.messages.create(
            model=self.model,
            system=system,
            messages=anthropic_messages,
            max_tokens=kwargs.get("max_tokens", 1024)
        )
        return response.content[0].text

    def get_cost(self, input_tokens: int, output_tokens: int) -> float:
        input_cost, output_cost = self.costs.get(self.model, (0.003, 0.015))
        return (input_tokens / 1000) * input_cost + (output_tokens / 1000) * output_cost


class MultiProviderRouter:
    """Route across multiple LLM providers."""

    def __init__(self):
        self.providers = {
            "openai_fast": OpenAIProvider("gpt-4o-mini"),
            "openai_smart": OpenAIProvider("gpt-4o"),
            "claude_balanced": AnthropicProvider("claude-3-5-sonnet-20241022"),
            "claude_fast": AnthropicProvider("claude-3-haiku-20240307"),
        }
        self.task_routing = {
            "coding": "openai_smart",
            "reasoning": "claude_balanced",
            "simple_qa": "openai_fast",
            "creative": "claude_balanced",
            "analysis": "openai_smart",
            "summarization": "claude_fast",
        }

    def classify_task(self, query: str) -> str:
        """Classify task type."""
        # Use the fast model for classification
        provider = self.providers["openai_fast"]
        result = provider.complete([
            {
                "role": "system",
                "content": "Classify this task. Return only: coding, reasoning, simple_qa, creative, analysis, or summarization"
            },
            {"role": "user", "content": query}
        ], max_tokens=20)
        task_type = result.strip().lower()
        return task_type if task_type in self.task_routing else "simple_qa"

    def route(self, query: str) -> tuple[str, LLMProvider]:
        """Route to best provider for task."""
        task_type = self.classify_task(query)
        provider_key = self.task_routing[task_type]
        return provider_key, self.providers[provider_key]

    def complete(self, query: str, system_prompt: str = "") -> str:
        """Complete with automatic routing."""
        provider_key, provider = self.route(query)
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": query})
        return provider.complete(messages)
```
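Usage mirrors the single-provider router; which provider answers depends on how the classifier labels each query. A sketch, assuming OPENAI_API_KEY and ANTHROPIC_API_KEY are set in the environment:

```python
# Usage (assumes OPENAI_API_KEY and ANTHROPIC_API_KEY are set)
router = MultiProviderRouter()

# Likely classified as coding -> openai_smart (gpt-4o)
answer = router.complete("Write a binary search function in Python.")

# Likely classified as summarization -> claude_fast (claude-3-haiku)
summary = router.complete("Summarize the following article: <article text>")
```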
Fallback and Retry Logic
```python
import random
import time


class ResilientRouter:
    """Router with fallback and retry logic."""

    def __init__(self):
        self.providers = {
            "primary": OpenAIProvider("gpt-4o"),
            "secondary": AnthropicProvider("claude-3-5-sonnet-20241022"),
            "fallback": OpenAIProvider("gpt-4o-mini"),
        }
        self.provider_order = ["primary", "secondary", "fallback"]
        self.max_retries = 3
        self.retry_delay = 1.0

    def complete_with_fallback(
        self,
        messages: list[dict],
        timeout: float = 30.0,  # reserved; not enforced in this example
        **kwargs
    ) -> tuple[str, str]:  # Returns (response, provider_used)
        """Complete with automatic fallback on failure."""
        last_error = None
        for provider_key in self.provider_order:
            provider = self.providers[provider_key]
            for attempt in range(self.max_retries):
                try:
                    response = provider.complete(messages, **kwargs)
                    return response, provider_key
                except Exception as e:
                    last_error = e
                    if self._is_retryable(e):
                        # Linear backoff before retrying the same provider
                        time.sleep(self.retry_delay * (attempt + 1))
                        continue
                    else:
                        break  # Non-retryable: move to next provider
            # Log provider failure
            print(f"Provider {provider_key} failed: {last_error}")
        raise RuntimeError(f"All providers failed. Last error: {last_error}")

    def _is_retryable(self, error: Exception) -> bool:
        """Check if error is retryable (rate limits, timeouts, overload)."""
        retryable_messages = [
            "rate limit",
            "timeout",
            "overloaded",
            "503",
            "529",
        ]
        error_str = str(error).lower()
        return any(msg in error_str for msg in retryable_messages)


# Load balancing router
class LoadBalancingRouter:
    """Route with load balancing across providers."""

    def __init__(self):
        self.providers = [
            {"provider": OpenAIProvider("gpt-4o"), "weight": 0.5, "healthy": True},
            {"provider": AnthropicProvider("claude-3-5-sonnet-20241022"), "weight": 0.5, "healthy": True},
        ]
        self.health_check_interval = 60
        self.last_health_check = 0

    def select_provider(self) -> LLMProvider:
        """Select provider based on weights and health."""
        healthy = [p for p in self.providers if p["healthy"]]
        if not healthy:
            # All unhealthy, try first one anyway
            return self.providers[0]["provider"]
        # Weighted random selection
        total_weight = sum(p["weight"] for p in healthy)
        r = random.uniform(0, total_weight)
        cumulative = 0
        for p in healthy:
            cumulative += p["weight"]
            if r <= cumulative:
                return p["provider"]
        return healthy[-1]["provider"]

    def complete(self, messages: list[dict], **kwargs) -> str:
        """Complete with load balancing."""
        provider = self.select_provider()
        return provider.complete(messages, **kwargs)
```
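Both routers drop in wherever a provider would otherwise be called directly. A usage sketch; the provider_used value from complete_with_fallback is worth recording, since it shows how often traffic falls past the primary:

```python
# Usage
resilient = ResilientRouter()
response, provider_used = resilient.complete_with_fallback(
    [{"role": "user", "content": "What is the capital of France?"}],
    max_tokens=100
)
print(f"Answered by: {provider_used}")

balancer = LoadBalancingRouter()
response = balancer.complete(
    [{"role": "user", "content": "Explain the CAP theorem in one paragraph."}],
    max_tokens=300
)
```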
Cost-Optimized Routing
```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class RoutingDecision:
    provider: str
    model: str
    estimated_cost: float
    quality_score: float
    reasoning: str


class CostOptimizedRouter:
    """Route to minimize cost while meeting quality requirements."""

    def __init__(self, max_cost_per_request: float = 0.10):
        self.max_cost = max_cost_per_request
        self.models = [
            {
                "provider": "openai",
                "model": "gpt-4o-mini",
                "cost_per_1k": 0.00015 + 0.0006,  # input + output, combined
                "quality_score": 0.7,
                "best_for": ["simple", "formatting", "extraction"]
            },
            {
                "provider": "anthropic",
                "model": "claude-3-haiku-20240307",
                "cost_per_1k": 0.00025 + 0.00125,
                "quality_score": 0.75,
                "best_for": ["simple", "summarization"]
            },
            {
                "provider": "openai",
                "model": "gpt-4o",
                "cost_per_1k": 0.005 + 0.015,
                "quality_score": 0.95,
                "best_for": ["complex", "coding", "reasoning"]
            },
            {
                "provider": "anthropic",
                "model": "claude-3-5-sonnet-20241022",
                "cost_per_1k": 0.003 + 0.015,
                "quality_score": 0.93,
                "best_for": ["complex", "creative", "analysis"]
            },
        ]

    def estimate_tokens(self, query: str) -> int:
        """Rough estimate: ~4 characters per token, plus response headroom."""
        return len(query) // 4 + 500

    def route(
        self,
        query: str,
        min_quality: float = 0.7,
        task_type: Optional[str] = None
    ) -> RoutingDecision:
        """Find the cheapest model meeting quality requirements."""
        estimated_tokens = self.estimate_tokens(query)
        candidates = []
        for model in self.models:
            # Check quality threshold
            if model["quality_score"] < min_quality:
                continue
            # Estimate cost
            estimated_cost = (estimated_tokens / 1000) * model["cost_per_1k"]
            # Check cost limit
            if estimated_cost > self.max_cost:
                continue
            # Bonus for task match
            task_bonus = 0.1 if task_type and task_type in model["best_for"] else 0
            candidates.append({
                **model,
                "estimated_cost": estimated_cost,
                "effective_score": model["quality_score"] + task_bonus
            })
        if not candidates:
            # Nothing meets the constraints; fall back to cheapest model
            candidates = sorted(self.models, key=lambda x: x["cost_per_1k"])
            best = candidates[0]
            estimated_cost = (estimated_tokens / 1000) * best["cost_per_1k"]
        else:
            # Cheapest first; break ties by effective quality score
            candidates.sort(key=lambda x: (x["estimated_cost"], -x["effective_score"]))
            best = candidates[0]
            estimated_cost = best["estimated_cost"]
        return RoutingDecision(
            provider=best["provider"],
            model=best["model"],
            estimated_cost=estimated_cost,
            quality_score=best["quality_score"],
            reasoning=f"Selected for cost efficiency with quality >= {min_quality}"
        )


# Usage
router = CostOptimizedRouter(max_cost_per_request=0.05)

# Simple query -> routes to cheapest
decision = router.route("What is 2+2?", min_quality=0.6)
print(f"Model: {decision.model}, Est. cost: ${decision.estimated_cost:.4f}")

# Complex query with high quality requirement -> routes to premium
decision = router.route(
    "Implement a distributed consensus algorithm in Python",
    min_quality=0.9,
    task_type="coding"
)
print(f"Model: {decision.model}, Est. cost: ${decision.estimated_cost:.4f}")
```
References
- LiteLLM: https://docs.litellm.ai/
- OpenRouter: https://openrouter.ai/docs
- Martian: https://withmartian.com/
- Portkey: https://portkey.ai/docs
Conclusion
Smart LLM routing is essential for production applications. The naive approach of using GPT-4 for everything works, but it bleeds money. A well-designed router classifies queries, matches them to appropriate models, handles failures gracefully, and optimizes for your specific cost-quality tradeoffs.

Start simple with complexity-based routing to a single provider’s model tiers. Add multi-provider support for resilience and to leverage each model’s strengths. Implement fallback logic to handle outages. Finally, add cost optimization to stay within budget while meeting quality requirements.

The investment in routing infrastructure pays for itself quickly, often reducing LLM costs by 50-80% while maintaining or improving response quality. Monitor your routing decisions, track costs per route, and continuously tune your classification and selection logic based on real usage patterns.