Introduction: Tokens are the currency of LLM applications—every token costs money and consumes context window space. Efficient token usage directly impacts both cost and capability. This guide covers practical token optimization techniques: accurate token counting across different models, content compression strategies that preserve meaning, budget management for staying within limits, and prompt engineering patterns that minimize token waste. Whether you’re building cost-sensitive applications or pushing the limits of context windows, mastering token optimization is essential for production LLM systems.

Token Counting
from dataclasses import dataclass
from typing import Any, Optional

import tiktoken


@dataclass
class TokenCount:
    """Token count result."""
    text: str
    token_count: int
    model: str
    encoding: str


class TokenCounter:
    """Count tokens for different models."""

    # Model-to-encoding mapping
    MODEL_ENCODINGS = {
        "gpt-4": "cl100k_base",
        "gpt-4-turbo": "cl100k_base",
        "gpt-4o": "o200k_base",
        "gpt-4o-mini": "o200k_base",
        "gpt-3.5-turbo": "cl100k_base",
        "text-embedding-3-small": "cl100k_base",
        "text-embedding-3-large": "cl100k_base",
        "claude-3-opus": "cl100k_base",  # Approximation: Anthropic uses its own tokenizer
        "claude-3-sonnet": "cl100k_base",
        "claude-3-haiku": "cl100k_base",
    }

    def __init__(self, model: str = "gpt-4o"):
        self.model = model
        self.encoding_name = self.MODEL_ENCODINGS.get(model, "cl100k_base")
        self.encoding = tiktoken.get_encoding(self.encoding_name)

    def count(self, text: str) -> TokenCount:
        """Count tokens in text."""
        tokens = self.encoding.encode(text)
        return TokenCount(
            text=text,
            token_count=len(tokens),
            model=self.model,
            encoding=self.encoding_name
        )

    def count_messages(self, messages: list[dict]) -> int:
        """Count tokens in chat messages."""
        # Per-message overhead (varies by model; these values follow the
        # OpenAI cookbook heuristic)
        tokens_per_message = 4  # Approximate
        tokens_per_name = -1  # Applied when a "name" field is present
        total = 0
        for message in messages:
            total += tokens_per_message
            for key, value in message.items():
                total += len(self.encoding.encode(str(value)))
                if key == "name":
                    total += tokens_per_name
        total += 3  # Every reply is primed with a few tokens
        return total

    def truncate_to_limit(
        self,
        text: str,
        max_tokens: int
    ) -> str:
        """Truncate text to fit within token limit."""
        tokens = self.encoding.encode(text)
        if len(tokens) <= max_tokens:
            return text
        truncated_tokens = tokens[:max_tokens]
        return self.encoding.decode(truncated_tokens)

    def split_by_tokens(
        self,
        text: str,
        chunk_size: int,
        overlap: int = 0
    ) -> list[str]:
        """Split text into chunks by token count."""
        tokens = self.encoding.encode(text)
        chunks = []
        start = 0
        while start < len(tokens):
            end = min(start + chunk_size, len(tokens))
            chunks.append(self.encoding.decode(tokens[start:end]))
            if end == len(tokens):
                break  # Last chunk; stepping back by overlap would loop forever
            start = end - overlap
        return chunks
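
A quick usage sketch (the sample text and limits are arbitrary):

counter = TokenCounter("gpt-4o")
print(counter.count("Hello, world!").token_count)

long_document = "Token optimization matters in production. " * 200
short = counter.truncate_to_limit(long_document, max_tokens=500)
chunks = counter.split_by_tokens(long_document, chunk_size=512, overlap=64)
print(len(chunks), counter.count(short).token_count)
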
class MultiModelCounter:
    """Count tokens across multiple models."""

    def __init__(self, models: Optional[list[str]] = None):
        self.models = models or ["gpt-4o", "gpt-4", "gpt-3.5-turbo"]
        self.counters = {
            model: TokenCounter(model)
            for model in self.models
        }

    def count_all(self, text: str) -> dict[str, TokenCount]:
        """Count tokens for all models."""
        return {
            model: counter.count(text)
            for model, counter in self.counters.items()
        }

    def estimate_cost(
        self,
        text: str,
        is_input: bool = True
    ) -> dict[str, float]:
        """Estimate cost for each model."""
        # Prices per 1M tokens (approximate; verify against current pricing)
        PRICES = {
            "gpt-4o": {"input": 2.50, "output": 10.00},
            "gpt-4o-mini": {"input": 0.15, "output": 0.60},
            "gpt-4": {"input": 30.00, "output": 60.00},
            "gpt-4-turbo": {"input": 10.00, "output": 30.00},
            "gpt-3.5-turbo": {"input": 0.50, "output": 1.50},
        }
        costs = {}
        price_type = "input" if is_input else "output"
        for model, counter in self.counters.items():
            count = counter.count(text).token_count
            price = PRICES.get(model, {}).get(price_type, 0)
            costs[model] = (count / 1_000_000) * price
        return costs
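
Comparing counts and estimated input costs across models looks like this (the prices in the table above are illustrative):

multi = MultiModelCounter(["gpt-4o", "gpt-4o-mini"])
text = "Summarize the quarterly report in three bullet points."
for model, result in multi.count_all(text).items():
    print(model, result.token_count)
print(multi.estimate_cost(text, is_input=True))
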
Content Compression
from dataclasses import dataclass
from typing import Any, Optional
import re


@dataclass
class CompressionResult:
    """Result of content compression."""
    original: str
    compressed: str
    original_tokens: int
    compressed_tokens: int
    compression_ratio: float


class TextCompressor:
    """Compress text while preserving meaning."""

    def __init__(self, counter: Optional[TokenCounter] = None):
        self.counter = counter or TokenCounter()

    def compress(self, text: str) -> CompressionResult:
        """Apply multiple compression techniques."""
        original_tokens = self.counter.count(text).token_count
        compressed = text
        compressed = self._remove_redundant_whitespace(compressed)
        compressed = self._abbreviate_common_phrases(compressed)
        compressed = self._remove_filler_words(compressed)
        compressed = self._simplify_punctuation(compressed)
        compressed_tokens = self.counter.count(compressed).token_count
        return CompressionResult(
            original=text,
            compressed=compressed,
            original_tokens=original_tokens,
            compressed_tokens=compressed_tokens,
            compression_ratio=compressed_tokens / max(1, original_tokens)
        )

    def _remove_redundant_whitespace(self, text: str) -> str:
        """Remove extra whitespace."""
        # Multiple spaces to single
        text = re.sub(r' +', ' ', text)
        # Multiple newlines to single
        text = re.sub(r'\n+', '\n', text)
        # Trim lines
        lines = [line.strip() for line in text.split('\n')]
        return '\n'.join(lines)

    def _abbreviate_common_phrases(self, text: str) -> str:
        """Replace common phrases with shorter versions."""
        replacements = {
            "for example": "e.g.",
            "that is": "i.e.",
            "in other words": "i.e.",
            "and so on": "etc.",
            "as soon as possible": "ASAP",
            "with respect to": "re:",
            "in order to": "to",
            "due to the fact that": "because",
            "at this point in time": "now",
            "in the event that": "if",
            "prior to": "before",
            "subsequent to": "after",
            "in addition to": "plus",
            "a large number of": "many",
            "a small number of": "few",
        }
        for phrase, abbrev in replacements.items():
            text = re.sub(
                rf'\b{re.escape(phrase)}\b',
                abbrev,
                text,
                flags=re.IGNORECASE
            )
        return text

    def _remove_filler_words(self, text: str) -> str:
        """Remove unnecessary filler words."""
        fillers = [
            r'\bvery\b',
            r'\breally\b',
            r'\bactually\b',
            r'\bbasically\b',
            r'\bjust\b',
            r'\bsimply\b',
            r'\bquite\b',
            r'\brather\b',
        ]
        for filler in fillers:
            text = re.sub(filler, '', text, flags=re.IGNORECASE)
        # Clean up double spaces left behind
        text = re.sub(r' +', ' ', text)
        return text

    def _simplify_punctuation(self, text: str) -> str:
        """Simplify punctuation."""
        # Collapse repeated punctuation to a single mark
        text = re.sub(r'\.{2,}', '.', text)
        text = re.sub(r'!{2,}', '!', text)
        text = re.sub(r'\?{2,}', '?', text)
        return text
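
A usage sketch of the rule-based compressor:

compressor = TextCompressor()
result = compressor.compress(
    "In order to proceed, please review the report as soon as possible.  "
    "It is really quite important."
)
print(result.compressed)
# "to proceed, please review the report ASAP. It is important."
# (note: phrase replacements do not preserve sentence case)
print(result.compression_ratio)
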
class LLMCompressor:
    """Use an LLM to compress content."""

    def __init__(self, client: Any, model: str = "gpt-4o-mini"):
        # client is assumed to be an OpenAI-style async client
        # (e.g. openai.AsyncOpenAI)
        self.client = client
        self.model = model
        self.counter = TokenCounter(model)

    async def compress(
        self,
        text: str,
        target_ratio: float = 0.5,
        preserve_key_info: Optional[list[str]] = None
    ) -> CompressionResult:
        """Compress text using an LLM."""
        original_tokens = self.counter.count(text).token_count
        target_tokens = int(original_tokens * target_ratio)
        preserve_instruction = ""
        if preserve_key_info:
            preserve_instruction = f"\nMust preserve: {', '.join(preserve_key_info)}"
        prompt = f"""Compress this text to approximately {target_tokens} tokens.
Preserve all key information and meaning.
Remove redundancy and verbose language.{preserve_instruction}
Text to compress:
{text}
Compressed version:"""
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )
        compressed = response.choices[0].message.content
        compressed_tokens = self.counter.count(compressed).token_count
        return CompressionResult(
            original=text,
            compressed=compressed,
            original_tokens=original_tokens,
            compressed_tokens=compressed_tokens,
            compression_ratio=compressed_tokens / max(1, original_tokens)
        )
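
A usage sketch, assuming an openai.AsyncOpenAI client with an OPENAI_API_KEY in the environment:

import asyncio
from openai import AsyncOpenAI

async def main():
    text = "The quarterly report indicates that revenue grew by 12 percent..."
    llm_compressor = LLMCompressor(AsyncOpenAI())
    result = await llm_compressor.compress(
        text,
        target_ratio=0.4,
        preserve_key_info=["revenue figures"]
    )
    print(f"{result.original_tokens} -> {result.compressed_tokens} tokens")

asyncio.run(main())
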
class SummarizingCompressor:
    """Compress by summarizing sections."""

    def __init__(self, client: Any, model: str = "gpt-4o-mini"):
        self.client = client
        self.model = model
        self.counter = TokenCounter(model)

    async def compress(
        self,
        text: str,
        max_tokens: int
    ) -> CompressionResult:
        """Compress by summarizing to fit a token limit."""
        original_tokens = self.counter.count(text).token_count
        if original_tokens <= max_tokens:
            return CompressionResult(
                original=text,
                compressed=text,
                original_tokens=original_tokens,
                compressed_tokens=original_tokens,
                compression_ratio=1.0
            )
        # Split into sections and summarize each oversized one
        sections = self._split_into_sections(text)
        # Budget per section (guard against empty input)
        tokens_per_section = max_tokens // max(1, len(sections))
        compressed_sections = []
        for section in sections:
            section_tokens = self.counter.count(section).token_count
            if section_tokens <= tokens_per_section:
                compressed_sections.append(section)
            else:
                summary = await self._summarize(section, tokens_per_section)
                compressed_sections.append(summary)
        compressed = "\n\n".join(compressed_sections)
        compressed_tokens = self.counter.count(compressed).token_count
        return CompressionResult(
            original=text,
            compressed=compressed,
            original_tokens=original_tokens,
            compressed_tokens=compressed_tokens,
            compression_ratio=compressed_tokens / max(1, original_tokens)
        )

    def _split_into_sections(self, text: str) -> list[str]:
        """Split text into logical sections."""
        # Split on blank lines or Markdown-style headers
        sections = re.split(r'\n\n+|(?=^#+\s)', text, flags=re.MULTILINE)
        return [s.strip() for s in sections if s.strip()]

    async def _summarize(self, text: str, max_tokens: int) -> str:
        """Summarize text to fit a token limit."""
        prompt = f"""Summarize this text in approximately {max_tokens} tokens.
Keep the most important information.
Text:
{text}
Summary:"""
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
            max_tokens=max_tokens + 50  # Small buffer above the target
        )
        return response.choices[0].message.content
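
SummarizingCompressor only spends LLM calls on sections that exceed their share of the budget; inputs that already fit pass through untouched. Usage mirrors LLMCompressor (same AsyncOpenAI assumption):

from openai import AsyncOpenAI

async def fit_report(report: str) -> str:
    sc = SummarizingCompressor(AsyncOpenAI())
    result = await sc.compress(report, max_tokens=2000)
    return result.compressed  # unchanged when the report already fits
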
Budget Management
from dataclasses import dataclass
from typing import Any, Optional
from enum import Enum


class BudgetStrategy(Enum):
    """Budget allocation strategies."""
    FIXED = "fixed"                # Fixed allocation per component
    PROPORTIONAL = "proportional"  # Proportional to content size
    PRIORITY = "priority"          # Priority-based allocation
    DYNAMIC = "dynamic"            # Dynamic, based on importance


@dataclass
class TokenBudget:
    """Token budget allocation."""
    total: int
    system_prompt: int = 0
    context: int = 0
    user_message: int = 0
    reserved_output: int = 0
    remaining: int = 0


class BudgetManager:
    """Manage token budgets for prompts."""

    def __init__(
        self,
        model: str = "gpt-4o",
        max_context: int = 128000,
        default_output_reserve: int = 4096
    ):
        self.model = model
        self.max_context = max_context
        self.default_output_reserve = default_output_reserve
        self.counter = TokenCounter(model)

    def allocate(
        self,
        system_prompt: str,
        context_items: list[str],
        user_message: str,
        output_reserve: Optional[int] = None
    ) -> TokenBudget:
        """Allocate token budget."""
        output_reserve = output_reserve or self.default_output_reserve
        # Count fixed components
        system_tokens = self.counter.count(system_prompt).token_count
        user_tokens = self.counter.count(user_message).token_count
        # Calculate what is left over for context
        available = self.max_context - system_tokens - user_tokens - output_reserve
        # Count context items
        context_tokens = sum(
            self.counter.count(item).token_count
            for item in context_items
        )
        return TokenBudget(
            total=self.max_context,
            system_prompt=system_tokens,
            context=min(context_tokens, available),
            user_message=user_tokens,
            reserved_output=output_reserve,
            remaining=max(0, available - context_tokens)
        )

    def fit_context(
        self,
        context_items: list[str],
        available_tokens: int,
        strategy: BudgetStrategy = BudgetStrategy.PRIORITY
    ) -> list[str]:
        """Fit context items within budget."""
        if strategy == BudgetStrategy.PRIORITY:
            return self._fit_priority(context_items, available_tokens)
        elif strategy == BudgetStrategy.PROPORTIONAL:
            return self._fit_proportional(context_items, available_tokens)
        else:
            # FIXED (DYNAMIC is handled separately by DynamicBudgetManager)
            return self._fit_fixed(context_items, available_tokens)

    def _fit_priority(
        self,
        items: list[str],
        available: int
    ) -> list[str]:
        """Fit items by priority (earlier items have higher priority)."""
        result = []
        used = 0
        for item in items:
            item_tokens = self.counter.count(item).token_count
            if used + item_tokens <= available:
                result.append(item)
                used += item_tokens
            else:
                # Try to fit a truncated version of the next item
                remaining = available - used
                if remaining > 100:  # Minimum useful size
                    truncated = self.counter.truncate_to_limit(item, remaining)
                    result.append(truncated)
                break
        return result

    def _fit_proportional(
        self,
        items: list[str],
        available: int
    ) -> list[str]:
        """Fit items proportionally."""
        # Calculate total tokens
        item_tokens = [
            self.counter.count(item).token_count
            for item in items
        ]
        total_tokens = sum(item_tokens)
        if total_tokens <= available:
            return items
        # Allocate proportionally and truncate each item to its share
        result = []
        for item, tokens in zip(items, item_tokens):
            allocated = int(available * (tokens / total_tokens))
            truncated = self.counter.truncate_to_limit(item, allocated)
            result.append(truncated)
        return result

    def _fit_fixed(
        self,
        items: list[str],
        available: int
    ) -> list[str]:
        """Fit items with a fixed allocation per item."""
        per_item = available // max(1, len(items))
        return [
            self.counter.truncate_to_limit(item, per_item)
            for item in items
        ]
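
A usage sketch with made-up documents:

manager = BudgetManager(model="gpt-4o", max_context=128000)
docs = ["Q2 summary: ...", "Q3 summary: ...", "Appendix: ..."]  # placeholder context
budget = manager.allocate(
    system_prompt="You are a helpful analyst.",
    context_items=docs,
    user_message="What changed between Q2 and Q3?",
    output_reserve=2048
)
fitted = manager.fit_context(
    docs,
    budget.context + budget.remaining,
    strategy=BudgetStrategy.PRIORITY
)
print(budget.remaining, len(fitted))
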
class DynamicBudgetManager:
    """Dynamic budget management based on content importance."""

    def __init__(
        self,
        client: Any,
        model: str = "gpt-4o-mini"
    ):
        self.client = client
        self.model = model
        self.counter = TokenCounter(model)

    async def allocate_by_importance(
        self,
        items: list[str],
        available_tokens: int,
        query: str
    ) -> list[str]:
        """Allocate budget based on relevance to the query."""
        # Score items by relevance
        scores = await self._score_relevance(items, query)
        # Sort by score, highest first
        scored_items = sorted(
            zip(items, scores),
            key=lambda x: x[1],
            reverse=True
        )
        # Allocate tokens proportional to score
        total_score = sum(scores)
        result = []
        used = 0
        for item, score in scored_items:
            allocated = int(available_tokens * (score / total_score))
            item_tokens = self.counter.count(item).token_count
            if item_tokens <= allocated and used + item_tokens <= available_tokens:
                result.append(item)
                used += item_tokens
            elif allocated > 0 and used + allocated <= available_tokens:
                truncated = self.counter.truncate_to_limit(item, allocated)
                result.append(truncated)
                used += allocated
        return result

    async def _score_relevance(
        self,
        items: list[str],
        query: str
    ) -> list[float]:
        """Score items by relevance to the query."""
        # Use the LLM to score relevance; only a preview of each item is sent
        items_text = "\n".join([
            f"{i+1}. {item[:200]}..."
            for i, item in enumerate(items)
        ])
        prompt = f"""Score the relevance of each item to this query.
Return scores from 0-10 for each item, one per line.
Query: {query}
Items:
{items_text}
Scores (one per line):"""
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        # Parse scores, falling back to a neutral default on unparseable lines
        scores = []
        for line in response.choices[0].message.content.strip().split('\n'):
            try:
                score = float(line.strip().split()[0])
                scores.append(max(0.1, min(10, score)))  # Clamp to 0.1-10
            except (ValueError, IndexError):
                scores.append(5.0)  # Default score
        # Pad if the model returned too few lines
        while len(scores) < len(items):
            scores.append(5.0)
        return scores[:len(items)]
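
And a sketch of importance-based allocation (same AsyncOpenAI assumption as above):

import asyncio
from openai import AsyncOpenAI

async def main():
    dbm = DynamicBudgetManager(AsyncOpenAI())
    docs = ["Pricing table for 2024...", "Company history...", "Refund policy..."]
    fitted = await dbm.allocate_by_importance(
        docs, available_tokens=3000, query="How do refunds work?"
    )
    print(len(fitted), "items kept")

asyncio.run(main())
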
Prompt Optimization
from dataclasses import dataclass
from typing import Any, Optional
import re


@dataclass
class OptimizedPrompt:
    """An optimized prompt."""
    original: str
    optimized: str
    original_tokens: int
    optimized_tokens: int
    savings: int
    savings_percent: float


class PromptOptimizer:
    """Optimize prompts for token efficiency."""

    def __init__(self, counter: Optional[TokenCounter] = None):
        self.counter = counter or TokenCounter()

    def optimize(self, prompt: str) -> OptimizedPrompt:
        """Apply optimization techniques to a prompt."""
        original_tokens = self.counter.count(prompt).token_count
        optimized = prompt
        optimized = self._use_concise_instructions(optimized)
        optimized = self._remove_redundant_examples(optimized)
        optimized = self._compress_formatting(optimized)
        optimized_tokens = self.counter.count(optimized).token_count
        savings = original_tokens - optimized_tokens
        return OptimizedPrompt(
            original=prompt,
            optimized=optimized,
            original_tokens=original_tokens,
            optimized_tokens=optimized_tokens,
            savings=savings,
            savings_percent=(savings / max(1, original_tokens)) * 100
        )

    def _use_concise_instructions(self, prompt: str) -> str:
        """Replace verbose instructions with concise versions."""
        replacements = {
            "Please provide": "Give",
            "I would like you to": "",
            "Can you please": "",
            "It would be great if you could": "",
            "Make sure to": "",
            "Be sure to": "",
            "Please ensure that": "",
            "I want you to": "",
            "Your task is to": "",
            "You should": "",
            "Please note that": "Note:",
            "Keep in mind that": "Note:",
            "It is important to": "",
            "Remember to": "",
        }
        for verbose, concise in replacements.items():
            prompt = prompt.replace(verbose, concise)
        # Collapse runs of spaces without destroying newlines
        prompt = re.sub(r'[ \t]{2,}', ' ', prompt)
        return prompt

    def _remove_redundant_examples(self, prompt: str) -> str:
        """Remove redundant examples if there are too many."""
        # Count examples (simple heuristic)
        example_markers = ["Example:", "For example:", "e.g.", "E.g."]
        example_count = sum(prompt.count(marker) for marker in example_markers)
        # If more than 3 examples, keep only the first 3
        if example_count > 3:
            # This is a simplified implementation; in practice this would
            # need more sophisticated parsing
            pass
        return prompt

    def _compress_formatting(self, prompt: str) -> str:
        """Compress formatting elements."""
        # Remove blank lines
        prompt = '\n'.join(
            line for line in prompt.split('\n')
            if line.strip()
        )
        # Compress bullet points
        prompt = prompt.replace(' - ', '- ')
        prompt = prompt.replace(' * ', '* ')
        return prompt
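
A usage sketch of the rule-based prompt optimizer:

optimizer = PromptOptimizer()
result = optimizer.optimize(
    "Please ensure that you respond in JSON. It is important to include "
    "all fields. Please note that missing fields are errors."
)
print(result.optimized)
# roughly: "you respond in JSON. include all fields. Note: missing fields are errors."
print(result.savings_percent)
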
class TemplateOptimizer:
    """Optimize prompt templates."""

    def __init__(self, counter: Optional[TokenCounter] = None):
        self.counter = counter or TokenCounter()
        self._cache: dict[int, str] = {}

    def optimize_template(
        self,
        template: str,
        sample_values: Optional[dict[str, str]] = None
    ) -> str:
        """Optimize a prompt template."""
        # Check cache (keyed by the template's hash)
        cache_key = hash(template)
        if cache_key in self._cache:
            return self._cache[cache_key]
        optimized = template
        # Remove verbose placeholder descriptions
        optimized = self._simplify_placeholders(optimized)
        # Compress static parts
        optimized = self._compress_static_parts(optimized)
        self._cache[cache_key] = optimized
        return optimized

    def _simplify_placeholders(self, template: str) -> str:
        """Simplify placeholder descriptions."""
        # {user_input: The input provided by the user} -> {user_input}
        pattern = r'\{(\w+):\s*[^}]+\}'
        template = re.sub(pattern, r'{\1}', template)
        return template

    def _compress_static_parts(self, template: str) -> str:
        """Compress the static parts of a template."""
        optimizer = PromptOptimizer(self.counter)
        # Split into placeholders and the static text between them
        parts = re.split(r'(\{[^}]+\})', template)
        compressed_parts = []
        for part in parts:
            if part.startswith('{') and part.endswith('}'):
                compressed_parts.append(part)
            else:
                result = optimizer.optimize(part)
                compressed_parts.append(result.optimized)
        return ''.join(compressed_parts)
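
And for templates (the placeholder syntax matches _simplify_placeholders above):

template_optimizer = TemplateOptimizer()
template = "Please ensure that you answer {question: The user's question} concisely."
print(template_optimizer.optimize_template(template))
# roughly: "you answer {question} concisely."
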
Production Token Service
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# Initialize shared components
counter = TokenCounter()
compressor = TextCompressor(counter)
budget_manager = BudgetManager()
prompt_optimizer = PromptOptimizer(counter)


class CountRequest(BaseModel):
    text: str
    model: str = "gpt-4o"


class CompressRequest(BaseModel):
    text: str
    target_ratio: float = 0.5  # Reserved for LLM-based compression; unused by TextCompressor


class BudgetRequest(BaseModel):
    system_prompt: str
    context_items: list[str]
    user_message: str
    output_reserve: int = 4096


class OptimizeRequest(BaseModel):
    prompt: str


@app.post("/v1/tokens/count")
async def count_tokens(request: CountRequest):
    """Count tokens in text."""
    model_counter = TokenCounter(request.model)
    result = model_counter.count(request.text)
    return {
        "text_length": len(request.text),
        "token_count": result.token_count,
        "model": result.model,
        "encoding": result.encoding
    }


@app.post("/v1/tokens/count-multi")
async def count_tokens_multi(request: CountRequest):
    """Count tokens across multiple models."""
    multi_counter = MultiModelCounter()
    results = multi_counter.count_all(request.text)
    costs = multi_counter.estimate_cost(request.text)
    return {
        "text_length": len(request.text),
        "counts": {
            model: result.token_count
            for model, result in results.items()
        },
        "estimated_costs": costs
    }


@app.post("/v1/compress")
async def compress_text(request: CompressRequest):
    """Compress text."""
    result = compressor.compress(request.text)
    return {
        "original_tokens": result.original_tokens,
        "compressed_tokens": result.compressed_tokens,
        "compression_ratio": result.compression_ratio,
        "compressed": result.compressed
    }


@app.post("/v1/budget/allocate")
async def allocate_budget(request: BudgetRequest):
    """Allocate a token budget."""
    budget = budget_manager.allocate(
        request.system_prompt,
        request.context_items,
        request.user_message,
        request.output_reserve
    )
    return {
        "total": budget.total,
        "system_prompt": budget.system_prompt,
        "context": budget.context,
        "user_message": budget.user_message,
        "reserved_output": budget.reserved_output,
        "remaining": budget.remaining
    }


@app.post("/v1/budget/fit")
async def fit_context(request: BudgetRequest):
    """Fit context items within budget."""
    budget = budget_manager.allocate(
        request.system_prompt,
        request.context_items,
        request.user_message,
        request.output_reserve
    )
    fitted = budget_manager.fit_context(
        request.context_items,
        budget.remaining + budget.context
    )
    return {
        "original_items": len(request.context_items),
        "fitted_items": len(fitted),
        "fitted_context": fitted
    }


@app.post("/v1/optimize")
async def optimize_prompt(request: OptimizeRequest):
    """Optimize a prompt."""
    result = prompt_optimizer.optimize(request.prompt)
    return {
        "original_tokens": result.original_tokens,
        "optimized_tokens": result.optimized_tokens,
        "savings": result.savings,
        "savings_percent": result.savings_percent,
        "optimized": result.optimized
    }


@app.get("/health")
async def health():
    return {"status": "healthy"}
References
- tiktoken Library: https://github.com/openai/tiktoken
- OpenAI Tokenizer: https://platform.openai.com/tokenizer
- Token Counting Guide: https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
- Prompt Compression Research: https://arxiv.org/abs/2310.06839
Conclusion
Token optimization is about getting maximum value from every token. Start with accurate counting—use tiktoken for OpenAI models and understand that different models have different tokenizers. Apply text compression techniques to reduce token usage without losing meaning: remove redundant whitespace, abbreviate common phrases, and eliminate filler words. For larger reductions, use LLM-based compression that can intelligently summarize while preserving key information. Implement budget management to allocate tokens across prompt components—system prompts, context, user messages, and output reserves. Use priority-based or importance-based allocation to ensure the most relevant content gets included. Optimize your prompt templates to eliminate verbose instructions and redundant examples. The key insight is that token efficiency compounds—small savings per request add up to significant cost reductions and enable fitting more context within limits. Build token optimization into your LLM infrastructure from the start and you'll have more headroom for both cost and capability.
