Categories

Archives

A sample text widget

Etiam pulvinar consectetur dolor sed malesuada. Ut convallis euismod dolor nec pretium. Nunc ut tristique massa.

Nam sodales mi vitae dolor ullamcorper et vulputate enim accumsan. Morbi orci magna, tincidunt vitae molestie nec, molestie at mi. Nulla nulla lorem, suscipit in posuere in, interdum non magna.

LLM Cost Tracking: Visibility and Control for AI Spending

Introduction: LLM costs can spiral out of control without proper tracking. A single runaway feature or inefficient prompt can burn through your budget in hours. Understanding where your tokens go—by user, feature, model, and time—is essential for cost optimization and capacity planning. This guide covers practical cost tracking: metering token usage at the request level, calculating costs across different models and providers, aggregating usage for reporting, setting budgets and alerts, and building dashboards that give you visibility into your LLM spend.

Cost Tracking
Cost Tracking: Token Metering, Cost Calculation, Usage Aggregation

Token Metering

from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime
import tiktoken

@dataclass
class TokenUsage:
    """Token usage for a single LLM request, plus attribution metadata.

    Counts normally come from the provider's API response; the optional
    fields tie the record back to a model, request, user, feature, and
    session for downstream cost reporting.
    """

    prompt_tokens: int
    completion_tokens: int
    total_tokens: int  # normally prompt_tokens + completion_tokens

    # Metadata (None when not recorded)
    model: Optional[str] = None
    request_id: Optional[str] = None
    timestamp: Optional[datetime] = None

    # Attribution (None when the caller did not supply it)
    user_id: Optional[str] = None
    feature: Optional[str] = None
    session_id: Optional[str] = None

class TokenCounter:
    """Count tokens for different models via cached tiktoken encoders."""
    
    # Maps model name -> tiktoken encoding name.
    ENCODINGS = {
        "gpt-4": "cl100k_base",
        "gpt-4-turbo": "cl100k_base",
        "gpt-4o": "o200k_base",
        "gpt-4o-mini": "o200k_base",
        "gpt-3.5-turbo": "cl100k_base",
        "text-embedding-3-small": "cl100k_base",
        "text-embedding-3-large": "cl100k_base",
    }
    
    def __init__(self):
        # Lazily populated cache: encoding name -> tiktoken encoder.
        self._encoders: dict[str, Any] = {}
    
    def _get_encoder(self, model: str) -> Any:
        """Return a cached encoder for *model*, defaulting to cl100k_base."""
        
        name = self.ENCODINGS.get(model, "cl100k_base")
        
        encoder = self._encoders.get(name)
        if encoder is None:
            encoder = tiktoken.get_encoding(name)
            self._encoders[name] = encoder
        
        return encoder
    
    def count(self, text: str, model: str = "gpt-4o") -> int:
        """Count tokens in a plain text string."""
        
        return len(self._get_encoder(model).encode(text))
    
    def count_messages(
        self,
        messages: list[dict],
        model: str = "gpt-4o"
    ) -> int:
        """Count tokens in chat messages, including chat-format overhead."""
        
        encoder = self._get_encoder(model)
        
        per_message_overhead = 3  # tokens the chat format adds per message
        name_overhead = 1         # extra token when a "name" field is present
        
        # Start from the reply-priming tokens, then add each message.
        total = 3
        
        for message in messages:
            total += per_message_overhead
            
            for key, value in message.items():
                if isinstance(value, str):
                    total += len(encoder.encode(value))
                
                if key == "name":
                    total += name_overhead
        
        return total

class UsageMeter:
    """Meter token usage from API responses and answer filtered queries.

    Records are held in an in-memory, append-only list; nothing is
    persisted across process restarts.
    """
    
    def __init__(self):
        # Append-only log of recorded usage.
        self._usage_log: list[TokenUsage] = []
    
    def record(
        self,
        response: Any,
        model: str,
        user_id: Optional[str] = None,
        feature: Optional[str] = None,
        session_id: Optional[str] = None
    ) -> TokenUsage:
        """Record usage from an API response.

        Args:
            response: Provider response exposing ``.usage.prompt_tokens``,
                ``.usage.completion_tokens`` and ``.usage.total_tokens``
                (OpenAI-style — TODO confirm for other providers).
            model: Model name the request was made with.
            user_id / feature / session_id: Optional attribution tags.

        Returns:
            The stored TokenUsage (with generated request_id and UTC timestamp).
        """
        
        import uuid
        
        usage = TokenUsage(
            prompt_tokens=response.usage.prompt_tokens,
            completion_tokens=response.usage.completion_tokens,
            total_tokens=response.usage.total_tokens,
            model=model,
            request_id=str(uuid.uuid4()),
            # NOTE(review): naive UTC; datetime.utcnow() is deprecated in 3.12+.
            timestamp=datetime.utcnow(),
            user_id=user_id,
            feature=feature,
            session_id=session_id
        )
        
        self._usage_log.append(usage)
        return usage
    
    def get_usage(
        self,
        start_time: Optional[datetime] = None,
        end_time: Optional[datetime] = None,
        user_id: Optional[str] = None,
        feature: Optional[str] = None
    ) -> list[TokenUsage]:
        """Return usage records matching every supplied filter.

        Time bounds are inclusive. Records with a None timestamp would fail
        the comparison if a time filter is given — all records created via
        record() carry a timestamp.
        """
        
        results = self._usage_log
        
        if start_time:
            results = [u for u in results if u.timestamp >= start_time]
        
        if end_time:
            results = [u for u in results if u.timestamp <= end_time]
        
        if user_id:
            results = [u for u in results if u.user_id == user_id]
        
        if feature:
            results = [u for u in results if u.feature == feature]
        
        return results

Cost Calculation

from dataclasses import dataclass
from typing import Any, Optional
from decimal import Decimal

@dataclass
class ModelPricing:
    """Per-1K-token pricing for a single model."""
    
    model: str     # canonical model identifier (may include a version suffix)
    provider: str  # e.g. "openai", "anthropic"
    
    # USD per 1K tokens
    input_cost: Decimal
    output_cost: Decimal
    
    # Discounted rate for cached/prompt-cached input tokens, when the
    # provider offers one; None otherwise.
    cached_input_cost: Optional[Decimal] = None

# Current pricing (as of late 2024), USD per 1K tokens.
# NOTE: provider rates change frequently — treat this table as a default;
# CostCalculator accepts a custom pricing dict to override it.
PRICING = {
    # OpenAI
    "gpt-4o": ModelPricing(
        model="gpt-4o",
        provider="openai",
        input_cost=Decimal("0.0025"),
        output_cost=Decimal("0.01"),
        cached_input_cost=Decimal("0.00125")
    ),
    "gpt-4o-mini": ModelPricing(
        model="gpt-4o-mini",
        provider="openai",
        input_cost=Decimal("0.00015"),
        output_cost=Decimal("0.0006"),
        cached_input_cost=Decimal("0.000075")
    ),
    "gpt-4-turbo": ModelPricing(
        model="gpt-4-turbo",
        provider="openai",
        input_cost=Decimal("0.01"),
        output_cost=Decimal("0.03")
    ),
    "gpt-3.5-turbo": ModelPricing(
        model="gpt-3.5-turbo",
        provider="openai",
        input_cost=Decimal("0.0005"),
        output_cost=Decimal("0.0015")
    ),
    
    # Anthropic — dict keys are aliases; ModelPricing.model holds the dated ID.
    "claude-3-5-sonnet": ModelPricing(
        model="claude-3-5-sonnet-20241022",
        provider="anthropic",
        input_cost=Decimal("0.003"),
        output_cost=Decimal("0.015"),
        cached_input_cost=Decimal("0.0003")
    ),
    "claude-3-opus": ModelPricing(
        model="claude-3-opus-20240229",
        provider="anthropic",
        input_cost=Decimal("0.015"),
        output_cost=Decimal("0.075")
    ),
    "claude-3-haiku": ModelPricing(
        model="claude-3-haiku-20240307",
        provider="anthropic",
        input_cost=Decimal("0.00025"),
        output_cost=Decimal("0.00125")
    ),
    
    # Embeddings — no completion tokens, so output cost is zero.
    "text-embedding-3-small": ModelPricing(
        model="text-embedding-3-small",
        provider="openai",
        input_cost=Decimal("0.00002"),
        output_cost=Decimal("0")
    ),
    "text-embedding-3-large": ModelPricing(
        model="text-embedding-3-large",
        provider="openai",
        input_cost=Decimal("0.00013"),
        output_cost=Decimal("0")
    ),
}

@dataclass
class CostBreakdown:
    """Detailed cost breakdown for one request or an aggregated batch."""
    
    # Costs in `currency`; Decimal avoids float rounding on money.
    input_cost: Decimal
    output_cost: Decimal
    total_cost: Decimal  # input_cost + output_cost
    
    # Token counts the costs were derived from.
    input_tokens: int
    output_tokens: int
    total_tokens: int
    
    model: str  # model name, or "mixed" for batch aggregates
    currency: str = "USD"

class CostCalculator:
    """Calculate costs from token usage using a per-model pricing table."""
    
    def __init__(self, pricing: Optional[dict[str, ModelPricing]] = None):
        """Use *pricing* if given, otherwise the module-level PRICING table."""
        self.pricing = pricing or PRICING
    
    def calculate(self, usage: TokenUsage) -> CostBreakdown:
        """Calculate the cost of a single usage record.

        Raises:
            ValueError: If no pricing entry matches ``usage.model``.
        """
        
        model_pricing = self._find_pricing(usage.model)
        
        if model_pricing is None:
            raise ValueError(f"No pricing found for model: {usage.model}")
        
        # Rates are per 1K tokens; Decimal keeps the arithmetic exact.
        input_cost = (
            Decimal(usage.prompt_tokens) / 1000 * model_pricing.input_cost
        )
        output_cost = (
            Decimal(usage.completion_tokens) / 1000 * model_pricing.output_cost
        )
        
        return CostBreakdown(
            input_cost=input_cost,
            output_cost=output_cost,
            total_cost=input_cost + output_cost,
            input_tokens=usage.prompt_tokens,
            output_tokens=usage.completion_tokens,
            total_tokens=usage.total_tokens,
            model=usage.model
        )
    
    def calculate_batch(
        self,
        usages: list[TokenUsage]
    ) -> CostBreakdown:
        """Calculate the combined cost of multiple usage records.

        Returns a zeroed breakdown (model="mixed") for an empty list.

        Raises:
            ValueError: If any record's model has no pricing entry.
        """
        
        total_input_cost = Decimal("0")
        total_output_cost = Decimal("0")
        total_input_tokens = 0
        total_output_tokens = 0
        
        for usage in usages:
            breakdown = self.calculate(usage)
            total_input_cost += breakdown.input_cost
            total_output_cost += breakdown.output_cost
            total_input_tokens += breakdown.input_tokens
            total_output_tokens += breakdown.output_tokens
        
        return CostBreakdown(
            input_cost=total_input_cost,
            output_cost=total_output_cost,
            total_cost=total_input_cost + total_output_cost,
            input_tokens=total_input_tokens,
            output_tokens=total_output_tokens,
            total_tokens=total_input_tokens + total_output_tokens,
            model="mixed"
        )
    
    def _find_pricing(self, model: str) -> Optional[ModelPricing]:
        """Find pricing for a model: exact key match, then substring match.

        NOTE(review): the substring fallback is dict-order dependent — e.g.
        the model "gpt-4" would match whichever "gpt-4*" entry is iterated
        first. Prefer exact keys in the pricing table where possible.
        """
        
        # Exact match wins.
        if model in self.pricing:
            return self.pricing[model]
        
        # Fuzzy fallback: either string contains the other (handles dated
        # model IDs like "claude-3-opus-20240229" vs the alias key).
        for key, pricing in self.pricing.items():
            if key in model or model in key:
                return pricing
        
        return None

Usage Aggregation

from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime, timedelta
from collections import defaultdict
from decimal import Decimal

@dataclass
class UsageSummary:
    """Aggregated usage summary with per-dimension breakdowns."""
    
    total_requests: int
    total_tokens: int
    total_cost: Decimal
    
    # Breakdowns keyed by model name / user id / feature name / "YYYY-MM-DD",
    # each value shaped {"requests": int, "tokens": int, "cost": Decimal}.
    by_model: dict[str, dict] = field(default_factory=dict)
    by_user: dict[str, dict] = field(default_factory=dict)
    by_feature: dict[str, dict] = field(default_factory=dict)
    by_day: dict[str, dict] = field(default_factory=dict)

class UsageAggregator:
    """Aggregate usage data for reporting."""
    
    def __init__(self, calculator: Optional[CostCalculator] = None):
        """Use *calculator* for costing, defaulting to a fresh CostCalculator."""
        self.calculator = calculator or CostCalculator()
    
    def aggregate(
        self,
        usages: list[TokenUsage],
        group_by: Optional[list[str]] = None
    ) -> UsageSummary:
        """Aggregate usage records into totals plus per-dimension breakdowns.

        Args:
            usages: Records to aggregate (costed via self.calculator).
            group_by: Subset of {"model", "user", "feature", "day"} to break
                down by; defaults to all four. Records missing the dimension
                (e.g. user_id is None) are skipped for that breakdown only.

        Raises:
            ValueError: Propagated from the calculator for unpriced models.
        """
        
        group_by = group_by or ["model", "user", "feature", "day"]
        
        def _new_bucket() -> dict:
            # One shared factory replaces four identical lambdas.
            return {"requests": 0, "tokens": 0, "cost": Decimal("0")}
        
        def _add(bucket: dict, usage: TokenUsage, cost: Decimal) -> None:
            # Fold one record's request count, tokens, and cost into a bucket.
            bucket["requests"] += 1
            bucket["tokens"] += usage.total_tokens
            bucket["cost"] += cost
        
        by_model: dict = defaultdict(_new_bucket)
        by_user: dict = defaultdict(_new_bucket)
        by_feature: dict = defaultdict(_new_bucket)
        by_day: dict = defaultdict(_new_bucket)
        
        total_cost = Decimal("0")
        
        for usage in usages:
            cost = self.calculator.calculate(usage).total_cost
            total_cost += cost
            
            if "model" in group_by and usage.model:
                _add(by_model[usage.model], usage, cost)
            
            if "user" in group_by and usage.user_id:
                _add(by_user[usage.user_id], usage, cost)
            
            if "feature" in group_by and usage.feature:
                _add(by_feature[usage.feature], usage, cost)
            
            if "day" in group_by and usage.timestamp:
                day = usage.timestamp.strftime("%Y-%m-%d")
                _add(by_day[day], usage, cost)
        
        return UsageSummary(
            total_requests=len(usages),
            total_tokens=sum(u.total_tokens for u in usages),
            total_cost=total_cost,
            by_model=dict(by_model),
            by_user=dict(by_user),
            by_feature=dict(by_feature),
            by_day=dict(by_day)
        )

class BudgetManager:
    """Track budgets per scope and queue alerts at 80% and 100% of spend."""
    
    def __init__(self):
        # Budget amounts keyed by "{scope}:{period}".
        self._budgets: dict[str, Decimal] = {}
        # Pending alerts; drained (read-once) by get_alerts().
        self._alerts: list[dict] = []
        # Cumulative spend keyed by scope ("global", "user:{id}", "feature:{name}").
        # NOTE(review): spend is never reset, so periodic budgets compare
        # against all-time spend — confirm whether a reset job exists.
        self._spent: dict[str, Decimal] = defaultdict(Decimal)
    
    def set_budget(
        self,
        scope: str,  # "global", "user:{id}", "feature:{name}"
        amount: Decimal,
        period: str = "monthly"  # "daily", "weekly", "monthly"
    ):
        """Set (or replace) the budget of *amount* for *scope* over *period*."""
        
        key = f"{scope}:{period}"
        self._budgets[key] = amount
    
    def record_spend(
        self,
        amount: Decimal,
        user_id: Optional[str] = None,
        feature: Optional[str] = None
    ):
        """Record spending against the global scope plus any attribution scopes."""
        
        self._spent["global"] += amount
        
        if user_id:
            self._spent[f"user:{user_id}"] += amount
        
        if feature:
            self._spent[f"feature:{feature}"] += amount
        
        # Re-evaluate budgets on every spend so alerts fire promptly.
        self._check_budgets()
    
    def _check_budgets(self):
        """Queue an alert per budget at >=100% (exceeded) or >=80% (warning).

        Runs on every spend, so the same scope keeps enqueueing alerts until
        get_alerts() drains them — consumers should deduplicate if needed.
        """
        
        for key, budget in self._budgets.items():
            # Strip the trailing ":{period}" to recover the spend scope.
            scope = key.rsplit(":", 1)[0]
            spent = self._spent.get(scope, Decimal("0"))
            
            if spent >= budget:
                self._alerts.append({
                    "type": "budget_exceeded",
                    "scope": scope,
                    "budget": budget,
                    "spent": spent,
                    # NOTE(review): naive UTC; datetime.utcnow() is deprecated
                    # in 3.12+ — consider datetime.now(timezone.utc).
                    "timestamp": datetime.utcnow()
                })
            elif spent >= budget * Decimal("0.8"):
                self._alerts.append({
                    "type": "budget_warning",
                    "scope": scope,
                    "budget": budget,
                    "spent": spent,
                    "percentage": float(spent / budget * 100),
                    "timestamp": datetime.utcnow()
                })
    
    def get_alerts(self) -> list[dict]:
        """Return all pending alerts and clear the queue (read-once)."""
        
        alerts = self._alerts.copy()
        self._alerts.clear()
        return alerts
    
    def get_remaining(self, scope: str, period: str = "monthly") -> Decimal:
        """Return remaining budget for *scope*/*period*, floored at zero.

        Returns zero when no budget was set for the scope/period pair.
        """
        
        key = f"{scope}:{period}"
        budget = self._budgets.get(key, Decimal("0"))
        spent = self._spent.get(scope, Decimal("0"))
        
        return max(Decimal("0"), budget - spent)

Cost-Aware Client

from dataclasses import dataclass
from typing import Any, Optional, Callable
from decimal import Decimal

class CostAwareLLMClient:
    """LLM client wrapper that meters usage, prices it, and enforces budgets."""
    
    def __init__(
        self,
        client: Any,
        meter: UsageMeter = None,
        calculator: CostCalculator = None,
        budget_manager: BudgetManager = None
    ):
        self.client = client
        # Fall back to fresh components when none are injected.
        self.meter = meter if meter is not None else UsageMeter()
        self.calculator = calculator if calculator is not None else CostCalculator()
        self.budget_manager = budget_manager
    
    async def complete(
        self,
        messages: list[dict],
        model: str = "gpt-4o-mini",
        user_id: str = None,
        feature: str = None,
        **kwargs
    ) -> tuple[str, CostBreakdown]:
        """Run a chat completion while tracking usage, cost, and budget spend."""
        
        # Refuse the request up front when the user's budget is exhausted.
        if self.budget_manager and user_id:
            if self.budget_manager.get_remaining(f"user:{user_id}") <= 0:
                raise BudgetExceededError(f"Budget exceeded for user: {user_id}")
        
        response = await self.client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )
        
        # Meter the usage actually reported by the API, then price it.
        usage = self.meter.record(
            response=response,
            model=model,
            user_id=user_id,
            feature=feature
        )
        cost = self.calculator.calculate(usage)
        
        if self.budget_manager:
            self.budget_manager.record_spend(
                amount=cost.total_cost,
                user_id=user_id,
                feature=feature
            )
        
        return response.choices[0].message.content, cost
    
    def get_usage_summary(
        self,
        start_time: datetime = None,
        end_time: datetime = None,
        user_id: str = None,
        feature: str = None
    ) -> UsageSummary:
        """Summarize metered usage, optionally filtered by time/user/feature."""
        
        filtered = self.meter.get_usage(
            start_time=start_time,
            end_time=end_time,
            user_id=user_id,
            feature=feature
        )
        
        return UsageAggregator(self.calculator).aggregate(filtered)

class BudgetExceededError(Exception):
    """Raised by CostAwareLLMClient when a caller's budget is exhausted."""
    pass

class CostOptimizer:
    """Estimate request costs and recommend cost-effective models."""
    
    def __init__(self, calculator: Optional[CostCalculator] = None):
        """Use *calculator* for pricing, defaulting to a fresh CostCalculator."""
        self.calculator = calculator or CostCalculator()
    
    def estimate_cost(
        self,
        prompt_tokens: int,
        estimated_output_tokens: int,
        model: str
    ) -> Decimal:
        """Estimate the cost of a request before making it.

        Raises:
            ValueError: If the calculator has no pricing for *model*.
        """
        
        usage = TokenUsage(
            prompt_tokens=prompt_tokens,
            completion_tokens=estimated_output_tokens,
            total_tokens=prompt_tokens + estimated_output_tokens,
            model=model
        )
        
        return self.calculator.calculate(usage).total_cost
    
    def recommend_model(
        self,
        prompt_tokens: int,
        estimated_output_tokens: int,
        max_cost: Optional[Decimal] = None,
        min_quality: Optional[str] = None  # "low", "medium", "high"
    ) -> Optional[str]:
        """Recommend the cheapest model meeting the constraints.

        Returns None when no candidate satisfies *max_cost* (or none has
        pricing data). Was annotated ``-> str`` but has always been able to
        return None; annotation fixed to match.

        Raises:
            ValueError: If *min_quality* is not one of "low"/"medium"/"high".
        """
        
        quality_tiers = {
            "low": ["gpt-4o-mini", "claude-3-haiku", "gpt-3.5-turbo"],
            "medium": ["gpt-4o", "claude-3-5-sonnet"],
            "high": ["gpt-4-turbo", "claude-3-opus"]
        }
        
        if min_quality:
            # Only consider models at or above the requested quality tier.
            tier_order = ["low", "medium", "high"]
            min_tier_idx = tier_order.index(min_quality)
            candidates = [
                m for tier in tier_order[min_tier_idx:] for m in quality_tiers[tier]
            ]
        else:
            candidates = [m for tier in quality_tiers.values() for m in tier]
        
        # Price each candidate; drop unpriced models and any over budget.
        model_costs = []
        
        for model in candidates:
            try:
                cost = self.estimate_cost(
                    prompt_tokens,
                    estimated_output_tokens,
                    model
                )
            except ValueError:
                continue  # no pricing entry for this model
            
            if max_cost is None or cost <= max_cost:
                model_costs.append((model, cost))
        
        if not model_costs:
            return None
        
        # Cheapest option wins.
        model_costs.sort(key=lambda mc: mc[1])
        return model_costs[0][0]

Production Cost Service

from fastapi import FastAPI, HTTPException, Depends
from pydantic import BaseModel
from typing import Optional
from datetime import datetime
from decimal import Decimal

app = FastAPI()

# Shared, in-memory service components. State is lost on restart — swap for
# persistent backends (DB-backed meter/budget store) in production.
meter = UsageMeter()
calculator = CostCalculator()
aggregator = UsageAggregator(calculator)
budget_manager = BudgetManager()

class RecordUsageRequest(BaseModel):
    """Request body for POST /v1/usage/record."""
    prompt_tokens: int
    completion_tokens: int
    model: str
    user_id: Optional[str] = None
    feature: Optional[str] = None

class SetBudgetRequest(BaseModel):
    """Request body for POST /v1/budgets."""
    scope: str  # "global", "user:{id}", or "feature:{name}"
    amount: float
    period: str = "monthly"  # "daily", "weekly", "monthly"

class EstimateCostRequest(BaseModel):
    """Request body for POST /v1/estimate."""
    prompt_tokens: int
    estimated_output_tokens: int
    model: str

@app.post("/v1/usage/record")
async def record_usage(request: RecordUsageRequest):
    """Record externally reported token usage and return its priced cost."""
    
    import uuid
    
    total = request.prompt_tokens + request.completion_tokens
    usage = TokenUsage(
        prompt_tokens=request.prompt_tokens,
        completion_tokens=request.completion_tokens,
        total_tokens=total,
        model=request.model,
        request_id=str(uuid.uuid4()),
        timestamp=datetime.utcnow(),
        user_id=request.user_id,
        feature=request.feature
    )
    
    # NOTE(review): appends straight onto the meter's private log — there is
    # no API response object to hand to meter.record() here; a public
    # "record raw usage" method on UsageMeter would be cleaner.
    meter._usage_log.append(usage)
    cost = calculator.calculate(usage)
    
    if budget_manager:
        budget_manager.record_spend(
            amount=cost.total_cost,
            user_id=request.user_id,
            feature=request.feature
        )
    
    return {
        "request_id": usage.request_id,
        "cost": float(cost.total_cost),
        "tokens": usage.total_tokens
    }

@app.get("/v1/usage/summary")
async def get_summary(
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    user_id: Optional[str] = None,
    feature: Optional[str] = None
):
    """Get an aggregated usage summary, optionally filtered.

    Query params:
        start_date / end_date: ISO-8601 datetimes (inclusive bounds).
        user_id / feature: exact-match attribution filters.
    """
    
    try:
        start_time = datetime.fromisoformat(start_date) if start_date else None
        end_time = datetime.fromisoformat(end_date) if end_date else None
    except ValueError:
        # Malformed client input should be a 400, not an unhandled 500.
        raise HTTPException(
            status_code=400,
            detail="start_date/end_date must be ISO-8601 datetimes"
        )
    
    usages = meter.get_usage(
        start_time=start_time,
        end_time=end_time,
        user_id=user_id,
        feature=feature
    )
    
    summary = aggregator.aggregate(usages)
    
    def jsonable(buckets: dict) -> dict:
        # Decimal costs are not JSON-native; convert to float for the response.
        return {k: {**v, "cost": float(v["cost"])} for k, v in buckets.items()}
    
    return {
        "total_requests": summary.total_requests,
        "total_tokens": summary.total_tokens,
        "total_cost": float(summary.total_cost),
        "by_model": jsonable(summary.by_model),
        "by_user": jsonable(summary.by_user),
        "by_feature": jsonable(summary.by_feature),
        "by_day": jsonable(summary.by_day)
    }

@app.post("/v1/budgets")
async def set_budget(request: SetBudgetRequest):
    """Create or replace the budget for a scope/period pair."""
    
    # Route the float through str() so the Decimal is exact.
    amount = Decimal(str(request.amount))
    budget_manager.set_budget(
        scope=request.scope,
        amount=amount,
        period=request.period
    )
    
    return {"set": True}

@app.get("/v1/budgets/{scope}/remaining")
async def get_remaining(scope: str, period: str = "monthly"):
    """Get remaining budget for a scope (0 when exhausted or no budget set)."""
    
    remaining = budget_manager.get_remaining(scope, period)
    
    return {"remaining": float(remaining)}

@app.get("/v1/alerts")
async def get_alerts():
    """Return pending budget alerts; reading drains the queue (read-once)."""
    
    alerts = budget_manager.get_alerts()
    
    return {"alerts": alerts}

@app.post("/v1/estimate")
async def estimate_cost(request: EstimateCostRequest):
    """Estimate the cost of a request before making it."""
    
    optimizer = CostOptimizer(calculator)
    try:
        cost = optimizer.estimate_cost(
            request.prompt_tokens,
            request.estimated_output_tokens,
            request.model
        )
    except ValueError:
        # Unknown model is a client error, not an unhandled 500.
        raise HTTPException(
            status_code=400,
            detail=f"No pricing found for model: {request.model}"
        )
    
    return {"estimated_cost": float(cost)}

@app.get("/health")
async def health():
    # Liveness probe endpoint; no dependencies checked.
    return {"status": "healthy"}

References

Conclusion

LLM cost tracking is essential for sustainable AI applications. Start with accurate token metering—use tiktoken for pre-request estimation and API response data for actual usage. Calculate costs using current pricing, and remember that input and output tokens often have different rates. Aggregate usage by user, feature, model, and time to understand where your budget goes. Set budgets with alerts at 80% to catch runaway costs before they become problems. Build cost awareness into your client so every request is tracked automatically. Use cost estimation to recommend cheaper models when quality requirements allow. The key insight is that LLM costs are highly variable—a single inefficient prompt or unexpected traffic spike can blow your budget. Visibility into your spending lets you optimize proactively rather than react to surprise bills.