Introduction: Prompt engineering is both art and science—small changes in wording can dramatically affect LLM output quality. Systematic prompt optimization goes beyond trial and error to find prompts that consistently perform well. This guide covers proven optimization techniques: few-shot learning with carefully selected examples, chain-of-thought prompting for complex reasoning, structured output formatting, prompt compression for efficiency, and automated prompt optimization using DSPy and similar frameworks. These techniques help you extract maximum performance from any LLM while reducing costs and improving reliability.

Few-Shot Learning
from openai import OpenAI
from dataclasses import dataclass
from typing import Optional
import json

client = OpenAI()

@dataclass
class Example:
    input: str
    output: str
    explanation: Optional[str] = None

class FewShotPrompt:
    """Build few-shot prompts with examples."""

    def __init__(
        self,
        task_description: str,
        examples: Optional[list[Example]] = None
    ):
        self.task_description = task_description
        self.examples = examples or []

    def add_example(self, example: Example):
        """Add an example to the prompt."""
        self.examples.append(example)

    def build(self, query: str) -> str:
        """Build the complete prompt."""
        parts = [f"Task: {self.task_description}\n"]
        if self.examples:
            parts.append("Examples:\n")
            for i, ex in enumerate(self.examples, 1):
                parts.append(f"Example {i}:")
                parts.append(f"Input: {ex.input}")
                if ex.explanation:
                    parts.append(f"Reasoning: {ex.explanation}")
                parts.append(f"Output: {ex.output}\n")
        parts.append("Now solve this:")
        parts.append(f"Input: {query}")
        parts.append("Output:")
        return "\n".join(parts)

    def run(self, query: str, model: str = "gpt-4o-mini") -> str:
        """Run the few-shot prompt."""
        prompt = self.build(query)
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

# Example: Sentiment classification
sentiment_prompt = FewShotPrompt(
    task_description="Classify the sentiment of the text as positive, negative, or neutral."
)
sentiment_prompt.add_example(Example(
    input="This product exceeded my expectations!",
    output="positive"
))
sentiment_prompt.add_example(Example(
    input="The service was terrible and the food was cold.",
    output="negative"
))
sentiment_prompt.add_example(Example(
    input="The meeting is scheduled for 3pm.",
    output="neutral"
))

result = sentiment_prompt.run("I love how easy this software is to use!")
print(result)  # positive
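An alternative presentation worth trying: instead of packing all examples into a single prompt string, send each one as a prior user/assistant exchange in the messages list. This is a minimal sketch reusing the FewShotPrompt data above; the helper name run_as_turns is mine, not part of the class.

def run_as_turns(prompt: FewShotPrompt, query: str, model: str = "gpt-4o-mini") -> str:
    """Present each example as a prior conversation turn rather than one long prompt."""
    messages = [{"role": "system", "content": prompt.task_description}]
    for ex in prompt.examples:
        # Each example becomes a user question followed by the expected assistant answer
        messages.append({"role": "user", "content": ex.input})
        messages.append({"role": "assistant", "content": ex.output})
    messages.append({"role": "user", "content": query})
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

print(run_as_turns(sentiment_prompt, "I love how easy this software is to use!"))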
Chain-of-Thought Prompting
class ChainOfThoughtPrompt:
    """Prompts that encourage step-by-step reasoning."""

    def __init__(self, task_description: str):
        self.task_description = task_description

    def zero_shot_cot(self, query: str) -> str:
        """Zero-shot chain-of-thought with 'Let's think step by step'."""
        prompt = f"""{self.task_description}
Question: {query}
Let's think step by step:"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

    def few_shot_cot(self, query: str, examples: list[Example]) -> str:
        """Few-shot chain-of-thought with reasoning examples."""
        parts = [f"Task: {self.task_description}\n"]
        for i, ex in enumerate(examples, 1):
            parts.append(f"Example {i}:")
            parts.append(f"Question: {ex.input}")
            parts.append(f"Reasoning: {ex.explanation}")
            parts.append(f"Answer: {ex.output}\n")
        parts.append(f"Question: {query}")
        parts.append("Reasoning:")
        prompt = "\n".join(parts)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

    def self_consistency(self, query: str, n_samples: int = 5) -> str:
        """Generate multiple reasoning paths and take majority vote."""
        prompt = f"""{self.task_description}
Question: {query}
Think through this step by step and provide your final answer."""
        responses = []
        for _ in range(n_samples):
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7  # Higher temperature for diversity
            )
            responses.append(response.choices[0].message.content)
        # Extract final answers and vote
        # In practice, you'd parse the answers more carefully
        return self._majority_vote(responses)

    def _majority_vote(self, responses: list[str]) -> str:
        """Simple majority voting on responses."""
        # Ask an LLM to extract and compare answers
        prompt = f"""Given these {len(responses)} responses to the same question,
determine the most common final answer.
Responses:
{chr(10).join([f'{i+1}. {r[-200:]}' for i, r in enumerate(responses)])}
What is the consensus answer?"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

# Usage
cot = ChainOfThoughtPrompt("Solve the following math word problem.")

# Zero-shot CoT
result = cot.zero_shot_cot(
    "If a train travels at 60 mph for 2.5 hours, then at 80 mph for 1.5 hours, what is the total distance?"
)
print(result)

# Few-shot CoT with examples
examples = [
    Example(
        input="A store sells apples for $2 each. If you buy 5 apples and pay with a $20 bill, how much change do you get?",
        explanation="Cost of apples = 5 × $2 = $10. Change = $20 - $10 = $10.",
        output="$10"
    )
]
result = cot.few_shot_cot(
    "A rectangle has a length of 12 cm and a width of 8 cm. What is its perimeter?",
    examples
)
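The _majority_vote helper above asks another model to find the consensus, which costs an extra call. When answers follow a predictable format, a cheaper alternative is to extract them with a regex and take a literal majority vote. A minimal sketch, assuming each response ends with a line like "Answer: <value>" (a formatting convention you would need to request in the prompt):

import re
from collections import Counter

def extract_answer(text: str) -> Optional[str]:
    """Pull the final answer from a response that ends with 'Answer: ...'."""
    match = re.search(r"(?:final answer|answer)\s*[:=]\s*(.+)", text, re.IGNORECASE)
    return match.group(1).strip() if match else None

def literal_majority_vote(responses: list[str]) -> Optional[str]:
    """Count extracted answers and return the most common one."""
    answers = [a for a in (extract_answer(r) for r in responses) if a]
    if not answers:
        return None
    return Counter(a.lower() for a in answers).most_common(1)[0][0]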
Structured Output Formatting
from pydantic import BaseModel
from typing import Optional

class StructuredPrompt:
    """Prompts that enforce structured output."""

    def json_output(
        self,
        query: str,
        schema: dict,
        task_description: str = ""
    ) -> dict:
        """Get JSON output matching a schema."""
        prompt = f"""{task_description}
{query}
Respond with valid JSON matching this schema:
{json.dumps(schema, indent=2)}"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    def pydantic_output(
        self,
        query: str,
        model_class: type[BaseModel],
        task_description: str = ""
    ) -> BaseModel:
        """Get output as a Pydantic model."""
        schema = model_class.model_json_schema()
        result = self.json_output(query, schema, task_description)
        return model_class(**result)

    def xml_output(
        self,
        query: str,
        tags: list[str],
        task_description: str = ""
    ) -> dict:
        """Get output in XML format for easy parsing."""
        tags_example = "\n".join([f"<{tag}>value</{tag}>" for tag in tags])
        prompt = f"""{task_description}
{query}
Respond using these XML tags:
{tags_example}"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )
        # Parse XML tags
        import re
        result = {}
        for tag in tags:
            match = re.search(f"<{tag}>(.*?)</{tag}>", response.choices[0].message.content, re.DOTALL)
            if match:
                result[tag] = match.group(1).strip()
        return result

# Usage
class ProductReview(BaseModel):
    sentiment: str
    key_points: list[str]
    rating: int
    summary: str

structured = StructuredPrompt()
review = structured.pydantic_output(
    query="Analyze this review: 'Great laptop! Fast performance, excellent battery life, but the keyboard could be better. Overall very satisfied with my purchase.'",
    model_class=ProductReview,
    task_description="Extract structured information from product reviews."
)
print(f"Sentiment: {review.sentiment}")
print(f"Rating: {review.rating}/5")
print(f"Key points: {review.key_points}")
Prompt Compression
class PromptCompressor:
    """Compress prompts to reduce tokens while maintaining quality."""

    def compress_examples(
        self,
        examples: list[Example],
        max_examples: int = 3
    ) -> list[Example]:
        """Select the most diverse/representative examples."""
        if len(examples) <= max_examples:
            return examples
        # Use embeddings to find diverse examples
        texts = [f"{ex.input} {ex.output}" for ex in examples]
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=texts
        )
        embeddings = [e.embedding for e in response.data]
        # Greedy farthest-point selection for diversity
        import numpy as np
        selected_indices = [0]  # Start with the first example
        while len(selected_indices) < max_examples:
            max_min_dist = -1
            best_idx = -1
            for i in range(len(examples)):
                if i in selected_indices:
                    continue
                # Minimum cosine distance to the already-selected examples
                # (OpenAI embeddings are unit-normalized, so 1 - dot product = cosine distance)
                min_dist = min(
                    1 - np.dot(embeddings[i], embeddings[j])
                    for j in selected_indices
                )
                if min_dist > max_min_dist:
                    max_min_dist = min_dist
                    best_idx = i
            selected_indices.append(best_idx)
        return [examples[i] for i in selected_indices]

    def compress_context(self, context: str, max_tokens: int = 500) -> str:
        """Compress context while preserving key information."""
        prompt = f"""Compress this text to under {max_tokens} tokens while preserving all key information.
Remove redundancy but keep important details.
Text to compress:
{context}
Compressed version:"""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens
        )
        return response.choices[0].message.content

    def remove_redundancy(self, prompt: str) -> str:
        """Remove redundant phrases and instructions."""
        redundant_phrases = [
            "Please note that",
            "It's important to",
            "Make sure to",
            "Remember that",
            "Keep in mind that",
            "As mentioned earlier",
        ]
        result = prompt
        for phrase in redundant_phrases:
            result = result.replace(phrase, "")
        # Remove extra whitespace
        import re
        result = re.sub(r'\n\s*\n', '\n\n', result)
        result = re.sub(r' +', ' ', result)
        return result.strip()

# Usage
compressor = PromptCompressor()

# Compress examples
all_examples = [Example(f"input {i}", f"output {i}") for i in range(10)]
selected = compressor.compress_examples(all_examples, max_examples=3)
print(f"Selected {len(selected)} diverse examples")
Automated Prompt Optimization
from dataclasses import dataclass
from typing import Callable
import random

@dataclass
class PromptCandidate:
    prompt_template: str
    score: float = 0.0
    evaluations: int = 0

class PromptOptimizer:
    """Automatically optimize prompts using evaluation."""

    def __init__(
        self,
        eval_fn: Callable[[str, str], float],
        test_cases: list[dict]
    ):
        self.eval_fn = eval_fn  # (output, expected) -> score
        self.test_cases = test_cases
        self.candidates: list[PromptCandidate] = []

    def evaluate_prompt(self, prompt_template: str) -> float:
        """Evaluate a prompt on test cases."""
        scores = []
        for case in self.test_cases:
            # Fill in the template
            prompt = prompt_template.format(input=case["input"])
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}]
            )
            output = response.choices[0].message.content
            score = self.eval_fn(output, case["expected"])
            scores.append(score)
        return sum(scores) / len(scores)

    def generate_variations(self, base_prompt: str, n: int = 5) -> list[str]:
        """Generate prompt variations using an LLM."""
        prompt = f"""Generate {n} variations of this prompt that might perform better.
Keep the same intent but try different:
- Wording and phrasing
- Structure and formatting
- Level of detail
- Instruction style
Original prompt:
{base_prompt}
Return JSON: {{"variations": ["prompt1", "prompt2", ...]}}"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        data = json.loads(response.choices[0].message.content)
        return data.get("variations", [])

    def optimize(
        self,
        initial_prompt: str,
        iterations: int = 3,
        candidates_per_iteration: int = 5
    ) -> str:
        """Optimize the prompt through iterative improvement."""
        # Evaluate the initial prompt
        initial_score = self.evaluate_prompt(initial_prompt)
        best_prompt = initial_prompt
        best_score = initial_score
        print(f"Initial score: {initial_score:.3f}")
        for iteration in range(iterations):
            # Generate variations of the best prompt so far
            variations = self.generate_variations(best_prompt, candidates_per_iteration)
            for var in variations:
                score = self.evaluate_prompt(var)
                if score > best_score:
                    best_score = score
                    best_prompt = var
                    print(f"Iteration {iteration + 1}: New best score {score:.3f}")
        print(f"Final score: {best_score:.3f} (improved by {best_score - initial_score:.3f})")
        return best_prompt

# Usage
def exact_match_score(output: str, expected: str) -> float:
    return 1.0 if expected.lower() in output.lower() else 0.0

test_cases = [
    {"input": "What is 2 + 2?", "expected": "4"},
    {"input": "What is the capital of France?", "expected": "Paris"},
    {"input": "What color is the sky?", "expected": "blue"},
]

optimizer = PromptOptimizer(exact_match_score, test_cases)
initial_prompt = "Answer this question: {input}"
optimized = optimizer.optimize(initial_prompt, iterations=2)
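The introduction mentions DSPy, which automates the same loop: you declare a signature and a metric, and an optimizer such as BootstrapFewShot searches over demonstrations for you. A minimal sketch against the DSPy 2.x-style API; details vary across versions, so treat it as illustrative rather than definitive.

# Sketch: the same optimization task expressed in DSPy (2.x-style API; illustrative)
import dspy

dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))

class QA(dspy.Signature):
    """Answer the question concisely."""
    question = dspy.InputField()
    answer = dspy.OutputField()

program = dspy.ChainOfThought(QA)

# Reuse the test_cases above as a tiny training set
trainset = [
    dspy.Example(question=c["input"], answer=c["expected"]).with_inputs("question")
    for c in test_cases
]

def metric(example, prediction, trace=None):
    return example.answer.lower() in prediction.answer.lower()

# BootstrapFewShot searches for demonstrations that maximize the metric
compiled = dspy.BootstrapFewShot(metric=metric).compile(program, trainset=trainset)
print(compiled(question="What is the capital of France?").answer)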
Production Prompt Management
from fastapi import FastAPI
from pydantic import BaseModel
from typing import Optional
from datetime import datetime
import hashlib

app = FastAPI()

# Prompt registry (in-memory)
prompts_db: dict[str, dict] = {}

class PromptVersion(BaseModel):
    template: str
    version: int
    created_at: str
    metrics: dict = {}
    is_active: bool = False

class PromptConfig(BaseModel):
    name: str
    description: str
    versions: list[PromptVersion] = []
    active_version: int = 0

@app.post("/prompts/{name}")
async def create_prompt(name: str, template: str, description: str = ""):
    """Create a new prompt."""
    if name in prompts_db:
        return {"error": "Prompt already exists"}
    version = PromptVersion(
        template=template,
        version=1,
        created_at=datetime.now().isoformat(),
        is_active=True
    )
    prompts_db[name] = {
        "name": name,
        "description": description,
        "versions": [version.model_dump()],
        "active_version": 1
    }
    return {"created": name, "version": 1}

@app.post("/prompts/{name}/versions")
async def add_version(name: str, template: str):
    """Add a new version of a prompt."""
    if name not in prompts_db:
        return {"error": "Prompt not found"}
    prompt = prompts_db[name]
    new_version = len(prompt["versions"]) + 1
    version = PromptVersion(
        template=template,
        version=new_version,
        created_at=datetime.now().isoformat(),
        is_active=False
    )
    prompt["versions"].append(version.model_dump())
    return {"added_version": new_version}

@app.post("/prompts/{name}/activate/{version}")
async def activate_version(name: str, version: int):
    """Activate a specific version."""
    if name not in prompts_db:
        return {"error": "Prompt not found"}
    prompt = prompts_db[name]
    for v in prompt["versions"]:
        v["is_active"] = (v["version"] == version)
    prompt["active_version"] = version
    return {"activated": version}

@app.get("/prompts/{name}")
async def get_prompt(name: str, version: Optional[int] = None):
    """Get a prompt template."""
    if name not in prompts_db:
        return {"error": "Prompt not found"}
    prompt = prompts_db[name]
    if version is not None:
        for v in prompt["versions"]:
            if v["version"] == version:
                return v
        return {"error": "Version not found"}
    # Return the active version
    for v in prompt["versions"]:
        if v["is_active"]:
            return v
    return prompt["versions"][-1]

@app.post("/prompts/{name}/metrics")
async def record_metrics(name: str, version: int, metrics: dict):
    """Record performance metrics for a prompt version."""
    if name not in prompts_db:
        return {"error": "Prompt not found"}
    prompt = prompts_db[name]
    for v in prompt["versions"]:
        if v["version"] == version:
            v["metrics"] = metrics
            return {"recorded": True}
    return {"error": "Version not found"}
References
- Chain-of-Thought Paper: https://arxiv.org/abs/2201.11903
- DSPy Framework: https://github.com/stanfordnlp/dspy
- OpenAI Prompt Engineering: https://platform.openai.com/docs/guides/prompt-engineering
- Self-Consistency Paper: https://arxiv.org/abs/2203.11171
Conclusion
Systematic prompt optimization delivers consistent improvements over ad-hoc prompt engineering. Use few-shot examples to demonstrate the desired behavior—select diverse examples that cover edge cases. Apply chain-of-thought prompting for tasks requiring reasoning, either zero-shot with “Let’s think step by step” or few-shot with reasoning examples. Enforce structured output with JSON schemas or XML tags for reliable parsing. Compress prompts to reduce costs while maintaining quality. Consider automated optimization for high-volume applications where small improvements compound. Version and track your prompts like code—measure performance and iterate based on data.