Introduction: LLM costs can spiral out of control without proper tracking. A single runaway feature or inefficient prompt can burn through your budget in hours. Understanding where your tokens go—by user, feature, model, and time—is essential for cost optimization and capacity planning. This guide covers practical cost tracking: metering token usage at the request level, calculating costs across different models and providers, aggregating usage for reporting, setting budgets and alerts, and building dashboards that give you visibility into your LLM spend.

Token Metering
from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime
import tiktoken
@dataclass
class TokenUsage:
    """Token usage for a single request.

    The three token counts are required; everything else is optional
    metadata/attribution. Fields that default to ``None`` are annotated
    ``Optional[...]`` (the original ``str = None`` is an invalid implicit
    Optional per PEP 484).
    """
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    # Metadata
    model: Optional[str] = None
    request_id: Optional[str] = None
    timestamp: Optional[datetime] = None
    # Attribution
    user_id: Optional[str] = None
    feature: Optional[str] = None
    session_id: Optional[str] = None
class TokenCounter:
    """Count tokens for different models via tiktoken encodings."""

    # Maps model name -> tiktoken encoding name. Unknown models fall
    # back to "cl100k_base" in _get_encoder.
    ENCODINGS = {
        "gpt-4": "cl100k_base",
        "gpt-4-turbo": "cl100k_base",
        "gpt-4o": "o200k_base",
        "gpt-4o-mini": "o200k_base",
        "gpt-3.5-turbo": "cl100k_base",
        "text-embedding-3-small": "cl100k_base",
        "text-embedding-3-large": "cl100k_base",
    }

    def __init__(self):
        # Lazily-built cache of encoder objects, keyed by encoding name
        # (not model name), so models sharing an encoding share one encoder.
        self._encoders: dict[str, Any] = {}

    def _get_encoder(self, model: str) -> Any:
        """Return the (cached) tiktoken encoder for *model*."""
        name = self.ENCODINGS.get(model, "cl100k_base")
        try:
            return self._encoders[name]
        except KeyError:
            encoder = tiktoken.get_encoding(name)
            self._encoders[name] = encoder
            return encoder

    def count(self, text: str, model: str = "gpt-4o") -> int:
        """Return the number of tokens in *text* under *model*'s encoding."""
        return len(self._get_encoder(model).encode(text))

    def count_messages(
        self,
        messages: list[dict],
        model: str = "gpt-4o"
    ) -> int:
        """Return the token count of a chat message list.

        Includes the fixed per-message overhead, the extra token charged
        for a "name" field, and the reply-priming overhead.
        """
        encoder = self._get_encoder(model)
        per_message_overhead = 3
        name_overhead = 1
        reply_priming = 3

        token_total = reply_priming
        for msg in messages:
            token_total += per_message_overhead
            for field_key, field_value in msg.items():
                if isinstance(field_value, str):
                    token_total += len(encoder.encode(field_value))
                if field_key == "name":
                    token_total += name_overhead
        return token_total
class UsageMeter:
    """Meter token usage from API responses into an in-memory log.

    NOTE(review): the log is process-local and unbounded — confirm whether
    a persistent/size-capped store is needed in production.
    """

    def __init__(self):
        # Append-only log of recorded usage.
        self._usage_log: list[TokenUsage] = []

    def record(
        self,
        response: Any,
        model: str,
        user_id: Optional[str] = None,
        feature: Optional[str] = None,
        session_id: Optional[str] = None
    ) -> TokenUsage:
        """Record usage from an API response and return the new record.

        Assumes ``response.usage`` exposes ``prompt_tokens``,
        ``completion_tokens`` and ``total_tokens`` (OpenAI-style response).
        """
        import uuid
        usage = TokenUsage(
            prompt_tokens=response.usage.prompt_tokens,
            completion_tokens=response.usage.completion_tokens,
            total_tokens=response.usage.total_tokens,
            model=model,
            request_id=str(uuid.uuid4()),
            timestamp=datetime.utcnow(),
            user_id=user_id,
            feature=feature,
            session_id=session_id
        )
        self._usage_log.append(usage)
        return usage

    def get_usage(
        self,
        start_time: Optional[datetime] = None,
        end_time: Optional[datetime] = None,
        user_id: Optional[str] = None,
        feature: Optional[str] = None
    ) -> list[TokenUsage]:
        """Return records matching every supplied filter (inclusive range)."""
        results = self._usage_log
        if start_time:
            results = [u for u in results if u.timestamp >= start_time]
        if end_time:
            results = [u for u in results if u.timestamp <= end_time]
        if user_id:
            results = [u for u in results if u.user_id == user_id]
        if feature:
            results = [u for u in results if u.feature == feature]
        return results
Cost Calculation
from dataclasses import dataclass
from typing import Any, Optional
from decimal import Decimal
@dataclass
class ModelPricing:
    """Pricing for a model, in USD per 1K tokens."""
    model: str
    provider: str
    # Per 1K tokens
    input_cost: Decimal
    output_cost: Decimal
    # Discounted rate for cached prompt tokens, when the provider offers
    # one; None means no cached pricing (fixed from invalid `Decimal = None`).
    cached_input_cost: Optional[Decimal] = None
# Current pricing (as of late 2024)
# NOTE(review): hard-coded price snapshot — verify against the providers'
# pricing pages before using for billing. All rates are USD per 1K tokens.
# Keys are short aliases; some Anthropic entries map an alias key to a
# dated model id in the `model` field.
PRICING = {
    # OpenAI
    "gpt-4o": ModelPricing(
        model="gpt-4o",
        provider="openai",
        input_cost=Decimal("0.0025"),
        output_cost=Decimal("0.01"),
        cached_input_cost=Decimal("0.00125")
    ),
    "gpt-4o-mini": ModelPricing(
        model="gpt-4o-mini",
        provider="openai",
        input_cost=Decimal("0.00015"),
        output_cost=Decimal("0.0006"),
        cached_input_cost=Decimal("0.000075")
    ),
    "gpt-4-turbo": ModelPricing(
        model="gpt-4-turbo",
        provider="openai",
        input_cost=Decimal("0.01"),
        output_cost=Decimal("0.03")
    ),
    "gpt-3.5-turbo": ModelPricing(
        model="gpt-3.5-turbo",
        provider="openai",
        input_cost=Decimal("0.0005"),
        output_cost=Decimal("0.0015")
    ),
    # Anthropic
    "claude-3-5-sonnet": ModelPricing(
        model="claude-3-5-sonnet-20241022",
        provider="anthropic",
        input_cost=Decimal("0.003"),
        output_cost=Decimal("0.015"),
        cached_input_cost=Decimal("0.0003")
    ),
    "claude-3-opus": ModelPricing(
        model="claude-3-opus-20240229",
        provider="anthropic",
        input_cost=Decimal("0.015"),
        output_cost=Decimal("0.075")
    ),
    "claude-3-haiku": ModelPricing(
        model="claude-3-haiku-20240307",
        provider="anthropic",
        input_cost=Decimal("0.00025"),
        output_cost=Decimal("0.00125")
    ),
    # Embeddings (no completion output, so output_cost is zero)
    "text-embedding-3-small": ModelPricing(
        model="text-embedding-3-small",
        provider="openai",
        input_cost=Decimal("0.00002"),
        output_cost=Decimal("0")
    ),
    "text-embedding-3-large": ModelPricing(
        model="text-embedding-3-large",
        provider="openai",
        input_cost=Decimal("0.00013"),
        output_cost=Decimal("0")
    ),
}
@dataclass
class CostBreakdown:
    """Detailed cost breakdown for one request or an aggregated batch."""
    # Dollar cost attributed to prompt (input) tokens.
    input_cost: Decimal
    # Dollar cost attributed to completion (output) tokens.
    output_cost: Decimal
    # input_cost + output_cost.
    total_cost: Decimal
    input_tokens: int
    output_tokens: int
    total_tokens: int
    # Model name, or "mixed" for multi-model batch totals.
    model: str
    # Currency code for the cost fields; default pricing is quoted in USD.
    currency: str = "USD"
class CostCalculator:
    """Calculate dollar costs from token usage records."""

    def __init__(self, pricing: Optional[dict[str, ModelPricing]] = None):
        # Fall back to the module-level PRICING table.
        self.pricing = pricing or PRICING

    def calculate(self, usage: TokenUsage) -> CostBreakdown:
        """Calculate cost for a single usage record.

        Raises:
            ValueError: if no pricing entry matches ``usage.model``.
        """
        model_pricing = self._find_pricing(usage.model)
        if model_pricing is None:
            raise ValueError(f"No pricing found for model: {usage.model}")
        # Pricing is expressed per 1K tokens.
        input_cost = (
            Decimal(usage.prompt_tokens) / 1000 * model_pricing.input_cost
        )
        output_cost = (
            Decimal(usage.completion_tokens) / 1000 * model_pricing.output_cost
        )
        return CostBreakdown(
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=input_cost + output_cost,
            input_tokens=usage.prompt_tokens,
            output_tokens=usage.completion_tokens,
            total_tokens=usage.total_tokens,
            model=usage.model
        )

    def calculate_batch(
        self,
        usages: list[TokenUsage]
    ) -> CostBreakdown:
        """Calculate the combined cost of multiple usage records.

        The returned breakdown's model is "mixed" since records may span
        several models.
        """
        total_input_cost = Decimal("0")
        total_output_cost = Decimal("0")
        total_input_tokens = 0
        total_output_tokens = 0
        for usage in usages:
            breakdown = self.calculate(usage)
            total_input_cost += breakdown.input_cost
            total_output_cost += breakdown.output_cost
            total_input_tokens += breakdown.input_tokens
            total_output_tokens += breakdown.output_tokens
        return CostBreakdown(
            input_cost=total_input_cost,
            output_cost=total_output_cost,
            total_cost=total_input_cost + total_output_cost,
            input_tokens=total_input_tokens,
            output_tokens=total_output_tokens,
            total_tokens=total_input_tokens + total_output_tokens,
            model="mixed"
        )

    def _find_pricing(self, model: str) -> Optional[ModelPricing]:
        """Find pricing for a model.

        Exact key match first; otherwise substring match so dated
        variants (e.g. "gpt-4o-2024-08-06") resolve. Among multiple
        substring matches the LONGEST key wins — the original returned
        the first match in dict order, so e.g. "gpt-4o-mini-..." could
        resolve to the "gpt-4o" entry and be billed at the wrong rate.
        """
        if model in self.pricing:
            return self.pricing[model]
        matches = [
            (len(key), pricing)
            for key, pricing in self.pricing.items()
            if key in model or model in key
        ]
        if matches:
            # Most specific (longest) key is the best guess.
            return max(matches, key=lambda kv: kv[0])[1]
        return None
Usage Aggregation
from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime, timedelta
from collections import defaultdict
from decimal import Decimal
@dataclass
class UsageSummary:
    """Aggregated usage summary produced by UsageAggregator."""
    total_requests: int
    total_tokens: int
    total_cost: Decimal
    # Breakdowns: each maps a key (model name / user id / feature name /
    # "YYYY-MM-DD" day string) to a dict with "requests" (int),
    # "tokens" (int), and "cost" (Decimal).
    by_model: dict[str, dict] = field(default_factory=dict)
    by_user: dict[str, dict] = field(default_factory=dict)
    by_feature: dict[str, dict] = field(default_factory=dict)
    by_day: dict[str, dict] = field(default_factory=dict)
class UsageAggregator:
    """Aggregate usage records into per-model/user/feature/day summaries."""

    def __init__(self, calculator: Optional[CostCalculator] = None):
        self.calculator = calculator or CostCalculator()

    @staticmethod
    def _bump(bucket: dict, key: str, tokens: int, cost: Decimal) -> None:
        """Fold one request's tokens/cost into a breakdown bucket entry."""
        entry = bucket[key]
        entry["requests"] += 1
        entry["tokens"] += tokens
        entry["cost"] += cost

    def aggregate(
        self,
        usages: list[TokenUsage],
        group_by: Optional[list[str]] = None
    ) -> UsageSummary:
        """Aggregate usage records.

        Args:
            usages: records to aggregate.
            group_by: subset of {"model", "user", "feature", "day"};
                defaults to all four dimensions.

        Records missing the relevant attribute (e.g. no user_id) are
        skipped for that breakdown but still count in the totals.
        """
        group_by = group_by or ["model", "user", "feature", "day"]

        def _bucket():
            # Fresh per-dimension bucket with zeroed counters.
            return defaultdict(lambda: {"requests": 0, "tokens": 0, "cost": Decimal("0")})

        by_model = _bucket()
        by_user = _bucket()
        by_feature = _bucket()
        by_day = _bucket()
        total_cost = Decimal("0")
        for usage in usages:
            cost = self.calculator.calculate(usage)
            total_cost += cost.total_cost
            if "model" in group_by and usage.model:
                self._bump(by_model, usage.model, usage.total_tokens, cost.total_cost)
            if "user" in group_by and usage.user_id:
                self._bump(by_user, usage.user_id, usage.total_tokens, cost.total_cost)
            if "feature" in group_by and usage.feature:
                self._bump(by_feature, usage.feature, usage.total_tokens, cost.total_cost)
            if "day" in group_by and usage.timestamp:
                day = usage.timestamp.strftime("%Y-%m-%d")
                self._bump(by_day, day, usage.total_tokens, cost.total_cost)
        return UsageSummary(
            total_requests=len(usages),
            total_tokens=sum(u.total_tokens for u in usages),
            total_cost=total_cost,
            by_model=dict(by_model),
            by_user=dict(by_user),
            by_feature=dict(by_feature),
            by_day=dict(by_day)
        )
class BudgetManager:
    """Manage spending budgets and emit alerts at 80% and 100% thresholds.

    Budgets are keyed "{scope}:{period}" (e.g. "global:monthly").
    NOTE(review): spend is tracked per scope only, not per period — budgets
    for different periods on the same scope share one running total, and
    nothing resets spend at a period boundary. Confirm whether rollover
    is handled elsewhere.
    """

    def __init__(self):
        self._budgets: dict[str, Decimal] = {}
        self._alerts: list[dict] = []
        self._spent: dict[str, Decimal] = defaultdict(Decimal)
        # (budget_key, alert_type) pairs already raised. Without this,
        # every spend after a threshold re-appended an identical alert,
        # flooding the alert queue.
        self._alerted: set[tuple[str, str]] = set()

    def set_budget(
        self,
        scope: str,  # "global", "user:{id}", "feature:{name}"
        amount: Decimal,
        period: str = "monthly"  # "daily", "weekly", "monthly"
    ):
        """Set (or replace) a budget for a scope/period pair."""
        key = f"{scope}:{period}"
        self._budgets[key] = amount

    def record_spend(
        self,
        amount: Decimal,
        user_id: Optional[str] = None,
        feature: Optional[str] = None
    ):
        """Record spending against the global, user, and feature scopes."""
        self._spent["global"] += amount
        if user_id:
            self._spent[f"user:{user_id}"] += amount
        if feature:
            self._spent[f"feature:{feature}"] += amount
        # Evaluate budgets after every spend so alerts fire promptly.
        self._check_budgets()

    def _check_budgets(self):
        """Queue one alert per budget per threshold crossing (deduped)."""
        for key, budget in self._budgets.items():
            # Strip the trailing ":{period}" to recover the spend scope.
            scope = key.rsplit(":", 1)[0]
            spent = self._spent.get(scope, Decimal("0"))
            if spent >= budget:
                if (key, "budget_exceeded") not in self._alerted:
                    self._alerted.add((key, "budget_exceeded"))
                    self._alerts.append({
                        "type": "budget_exceeded",
                        "scope": scope,
                        "budget": budget,
                        "spent": spent,
                        "timestamp": datetime.utcnow()
                    })
            elif spent >= budget * Decimal("0.8"):
                if (key, "budget_warning") not in self._alerted:
                    self._alerted.add((key, "budget_warning"))
                    self._alerts.append({
                        "type": "budget_warning",
                        "scope": scope,
                        "budget": budget,
                        "spent": spent,
                        "percentage": float(spent / budget * 100),
                        "timestamp": datetime.utcnow()
                    })

    def get_alerts(self) -> list[dict]:
        """Drain and return pending alerts (each alert is delivered once)."""
        alerts = self._alerts.copy()
        self._alerts.clear()
        return alerts

    def get_remaining(self, scope: str, period: str = "monthly") -> Decimal:
        """Return remaining budget for a scope, floored at zero.

        Returns Decimal("0") when no budget is configured for the
        scope/period — callers must distinguish that case themselves.
        """
        key = f"{scope}:{period}"
        budget = self._budgets.get(key, Decimal("0"))
        spent = self._spent.get(scope, Decimal("0"))
        return max(Decimal("0"), budget - spent)
Cost-Aware Client
from dataclasses import dataclass
from typing import Any, Optional, Callable
from decimal import Decimal
class CostAwareLLMClient:
    """LLM client wrapper with usage metering, cost calculation, and
    optional budget enforcement."""

    def __init__(
        self,
        client: Any,
        meter: Optional[UsageMeter] = None,
        calculator: Optional[CostCalculator] = None,
        budget_manager: Optional[BudgetManager] = None
    ):
        self.client = client
        self.meter = meter or UsageMeter()
        self.calculator = calculator or CostCalculator()
        # Budget enforcement is opt-in: None disables pre-request checks.
        self.budget_manager = budget_manager

    async def complete(
        self,
        messages: list[dict],
        model: str = "gpt-4o-mini",
        user_id: Optional[str] = None,
        feature: Optional[str] = None,
        **kwargs
    ) -> tuple[str, CostBreakdown]:
        """Complete a chat request with cost tracking.

        Returns (completion text, cost breakdown).

        Raises:
            BudgetExceededError: if the user has a configured budget that
                is exhausted.
        """
        # Enforce only when a budget is actually configured for this user.
        # get_remaining() returns 0 both for "no budget set" and for
        # "budget exhausted", so the original unconditional check blocked
        # EVERY user who had no explicit budget.
        if self.budget_manager and user_id:
            budget_key = f"user:{user_id}:monthly"
            if budget_key in self.budget_manager._budgets:
                remaining = self.budget_manager.get_remaining(f"user:{user_id}")
                if remaining <= 0:
                    raise BudgetExceededError(f"Budget exceeded for user: {user_id}")
        # Make the provider request.
        response = await self.client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )
        # Record the provider-reported token counts.
        usage = self.meter.record(
            response=response,
            model=model,
            user_id=user_id,
            feature=feature
        )
        cost = self.calculator.calculate(usage)
        # Record spend so subsequent requests see the updated budget.
        if self.budget_manager:
            self.budget_manager.record_spend(
                amount=cost.total_cost,
                user_id=user_id,
                feature=feature
            )
        return response.choices[0].message.content, cost

    def get_usage_summary(
        self,
        start_time: Optional[datetime] = None,
        end_time: Optional[datetime] = None,
        user_id: Optional[str] = None,
        feature: Optional[str] = None
    ) -> UsageSummary:
        """Aggregate this client's recorded usage, optionally filtered."""
        usages = self.meter.get_usage(
            start_time=start_time,
            end_time=end_time,
            user_id=user_id,
            feature=feature
        )
        aggregator = UsageAggregator(self.calculator)
        return aggregator.aggregate(usages)
class BudgetExceededError(Exception):
    """Raised when a request would exceed a configured spending budget."""
class CostOptimizer:
    """Optimize costs through model selection."""

    def __init__(self, calculator: Optional[CostCalculator] = None):
        self.calculator = calculator or CostCalculator()

    def estimate_cost(
        self,
        prompt_tokens: int,
        estimated_output_tokens: int,
        model: str
    ) -> Decimal:
        """Estimate the dollar cost of a request before making it.

        Raises:
            ValueError: if the model has no pricing entry.
        """
        usage = TokenUsage(
            prompt_tokens=prompt_tokens,
            completion_tokens=estimated_output_tokens,
            total_tokens=prompt_tokens + estimated_output_tokens,
            model=model
        )
        return self.calculator.calculate(usage).total_cost

    def recommend_model(
        self,
        prompt_tokens: int,
        estimated_output_tokens: int,
        max_cost: Optional[Decimal] = None,
        min_quality: Optional[str] = None  # "low", "medium", "high"
    ) -> Optional[str]:
        """Recommend the cheapest model meeting the constraints.

        Returns None when no candidate model fits under ``max_cost``.
        """
        # Rough quality tiers; ordering inside a tier is arbitrary since
        # cost decides the final pick.
        quality_tiers = {
            "low": ["gpt-4o-mini", "claude-3-haiku", "gpt-3.5-turbo"],
            "medium": ["gpt-4o", "claude-3-5-sonnet"],
            "high": ["gpt-4-turbo", "claude-3-opus"]
        }
        candidates: list[str] = []
        if min_quality:
            # Only consider models at or above the requested tier.
            tier_order = ["low", "medium", "high"]
            for tier in tier_order[tier_order.index(min_quality):]:
                candidates.extend(quality_tiers[tier])
        else:
            for tier_models in quality_tiers.values():
                candidates.extend(tier_models)
        # Price each candidate, skipping models without pricing data.
        model_costs = []
        for model in candidates:
            try:
                cost = self.estimate_cost(
                    prompt_tokens,
                    estimated_output_tokens,
                    model
                )
            except ValueError:
                continue
            if max_cost is None or cost <= max_cost:
                model_costs.append((model, cost))
        if not model_costs:
            return None
        # Cheapest wins; min() avoids sorting the whole list.
        return min(model_costs, key=lambda mc: mc[1])[0]
Production Cost Service
from fastapi import FastAPI, HTTPException, Depends
from pydantic import BaseModel
from typing import Optional
from datetime import datetime
from decimal import Decimal
app = FastAPI()
# Initialize components
# Module-level singletons shared by every endpoint. NOTE(review): state is
# in-memory only — lost on restart and not shared across worker processes.
meter = UsageMeter()
calculator = CostCalculator()
aggregator = UsageAggregator(calculator)
budget_manager = BudgetManager()
class RecordUsageRequest(BaseModel):
    """Request body for POST /v1/usage/record."""
    prompt_tokens: int
    completion_tokens: int
    model: str
    user_id: Optional[str] = None
    feature: Optional[str] = None
class SetBudgetRequest(BaseModel):
    """Request body for POST /v1/budgets."""
    # Scope string, e.g. "global", "user:{id}", "feature:{name}".
    scope: str
    # Budget amount in dollars; converted to Decimal server-side.
    amount: float
    period: str = "monthly"
class EstimateCostRequest(BaseModel):
    """Request body for POST /v1/estimate."""
    prompt_tokens: int
    estimated_output_tokens: int
    model: str
@app.post("/v1/usage/record")
async def record_usage(request: RecordUsageRequest):
    """Record token usage and return its computed cost.

    Responds 400 (not 500) for models with no pricing entry, and only
    logs the record after pricing succeeds — the original appended the
    record before the ValueError surfaced, leaving an unpriced entry in
    the log.
    """
    import uuid
    usage = TokenUsage(
        prompt_tokens=request.prompt_tokens,
        completion_tokens=request.completion_tokens,
        total_tokens=request.prompt_tokens + request.completion_tokens,
        model=request.model,
        request_id=str(uuid.uuid4()),
        timestamp=datetime.utcnow(),
        user_id=request.user_id,
        feature=request.feature
    )
    try:
        cost = calculator.calculate(usage)
    except ValueError as exc:
        # Unknown model: client error, not a server crash.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    # NOTE(review): appends to the meter's private log; a public
    # UsageMeter method for pre-built records would be cleaner.
    meter._usage_log.append(usage)
    if budget_manager:
        budget_manager.record_spend(
            amount=cost.total_cost,
            user_id=request.user_id,
            feature=request.feature
        )
    return {
        "request_id": usage.request_id,
        "cost": float(cost.total_cost),
        "tokens": usage.total_tokens
    }
@app.get("/v1/usage/summary")
async def get_summary(
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    user_id: Optional[str] = None,
    feature: Optional[str] = None
):
    """Return an aggregated usage summary, optionally filtered.

    Dates are ISO-8601 strings; malformed dates now yield 400 instead of
    an unhandled ValueError (500).
    """
    try:
        start_time = datetime.fromisoformat(start_date) if start_date else None
        end_time = datetime.fromisoformat(end_date) if end_date else None
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=f"Invalid date: {exc}") from exc
    usages = meter.get_usage(
        start_time=start_time,
        end_time=end_time,
        user_id=user_id,
        feature=feature
    )
    summary = aggregator.aggregate(usages)

    def jsonable(bucket: dict) -> dict:
        # Decimal costs are not JSON-serializable; convert to float.
        return {k: {**v, "cost": float(v["cost"])} for k, v in bucket.items()}

    return {
        "total_requests": summary.total_requests,
        "total_tokens": summary.total_tokens,
        "total_cost": float(summary.total_cost),
        "by_model": jsonable(summary.by_model),
        "by_user": jsonable(summary.by_user),
        "by_feature": jsonable(summary.by_feature),
        "by_day": jsonable(summary.by_day)
    }
@app.post("/v1/budgets")
async def set_budget(request: SetBudgetRequest):
    """Create or replace the budget for the given scope and period."""
    # Route the float through str() so the Decimal is exact.
    amount = Decimal(str(request.amount))
    budget_manager.set_budget(
        scope=request.scope,
        amount=amount,
        period=request.period
    )
    return {"set": True}
@app.get("/v1/budgets/{scope}/remaining")
async def get_remaining(scope: str, period: str = "monthly"):
    """Report how much budget is left for a scope/period pair."""
    return {"remaining": float(budget_manager.get_remaining(scope, period))}
@app.get("/v1/alerts")
async def get_alerts():
    """Drain and return pending budget alerts."""
    return {"alerts": budget_manager.get_alerts()}
@app.post("/v1/estimate")
async def estimate_cost(request: EstimateCostRequest):
    """Estimate request cost before making it.

    Responds 400 for unknown models instead of surfacing the
    ValueError as a 500.
    """
    optimizer = CostOptimizer(calculator)
    try:
        cost = optimizer.estimate_cost(
            request.prompt_tokens,
            request.estimated_output_tokens,
            request.model
        )
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return {"estimated_cost": float(cost)}
@app.get("/health")
async def health():
    """Liveness probe."""
    return {"status": "healthy"}
References
- OpenAI Pricing: https://openai.com/pricing
- Anthropic Pricing: https://www.anthropic.com/pricing
- tiktoken Library: https://github.com/openai/tiktoken
- LLM Cost Optimization: https://platform.openai.com/docs/guides/production-best-practices
Conclusion
LLM cost tracking is essential for sustainable AI applications. Start with accurate token metering—use tiktoken for pre-request estimation and API response data for actual usage. Calculate costs using current pricing, and remember that input and output tokens often have different rates. Aggregate usage by user, feature, model, and time to understand where your budget goes. Set budgets with alerts at 80% to catch runaway costs before they become problems. Build cost awareness into your client so every request is tracked automatically. Use cost estimation to recommend cheaper models when quality requirements allow. The key insight is that LLM costs are highly variable—a single inefficient prompt or unexpected traffic spike can blow your budget. Visibility into your spending lets you optimize proactively rather than react to surprise bills.
