Introduction
Prompt engineering is the art and science of communicating effectively with large language models. Unlike traditional programming, where you write explicit instructions, prompt engineering requires understanding how models interpret language, what context they need, and how to structure requests for optimal results. This guide covers the fundamental techniques that separate amateur prompts from production-quality ones: structuring prompts with clear roles and instructions, providing effective few-shot examples, using chain-of-thought reasoning for complex tasks, specifying output formats, and iteratively refining prompts based on evaluation. Whether you’re building chatbots, code assistants, or content generation systems, these techniques will help you get consistent, high-quality outputs from any LLM.

Prompt Structure and Components
from dataclasses import dataclass, field
from typing import Any, Optional
from abc import ABC, abstractmethod
from enum import Enum
class PromptRole(Enum):
"""Roles for prompt components."""
SYSTEM = "system"
USER = "user"
ASSISTANT = "assistant"
@dataclass
class PromptComponent:
"""A component of a structured prompt."""
role: PromptRole
content: str
    name: Optional[str] = None
@dataclass
class StructuredPrompt:
"""A well-structured prompt."""
system: str = ""
context: str = ""
instruction: str = ""
examples: list[dict] = field(default_factory=list)
output_format: str = ""
constraints: list[str] = field(default_factory=list)
def build(self) -> list[dict]:
"""Build prompt messages."""
messages = []
# System message
system_parts = []
if self.system:
system_parts.append(self.system)
if self.constraints:
system_parts.append("Constraints:\n" + "\n".join(f"- {c}" for c in self.constraints))
if system_parts:
messages.append({
"role": "system",
"content": "\n\n".join(system_parts)
})
# Few-shot examples
for example in self.examples:
messages.append({"role": "user", "content": example["input"]})
messages.append({"role": "assistant", "content": example["output"]})
# User message with context and instruction
user_parts = []
if self.context:
user_parts.append(f"Context:\n{self.context}")
if self.instruction:
user_parts.append(f"Task:\n{self.instruction}")
if self.output_format:
user_parts.append(f"Output Format:\n{self.output_format}")
if user_parts:
messages.append({
"role": "user",
"content": "\n\n".join(user_parts)
})
return messages
class PromptBuilder:
"""Fluent builder for prompts."""
def __init__(self):
self._prompt = StructuredPrompt()
def system(self, content: str) -> 'PromptBuilder':
"""Set system prompt."""
self._prompt.system = content
return self
def context(self, content: str) -> 'PromptBuilder':
"""Add context."""
self._prompt.context = content
return self
def instruction(self, content: str) -> 'PromptBuilder':
"""Set instruction."""
self._prompt.instruction = content
return self
def example(self, input_text: str, output_text: str) -> 'PromptBuilder':
"""Add few-shot example."""
self._prompt.examples.append({
"input": input_text,
"output": output_text
})
return self
def output_format(self, format_spec: str) -> 'PromptBuilder':
"""Specify output format."""
self._prompt.output_format = format_spec
return self
def constraint(self, constraint: str) -> 'PromptBuilder':
"""Add constraint."""
self._prompt.constraints.append(constraint)
return self
def build(self) -> StructuredPrompt:
"""Build the prompt."""
return self._prompt
class RoleBasedPrompt:
"""Prompt with explicit role definition."""
def __init__(
self,
role: str,
expertise: list[str] = None,
personality: str = None,
communication_style: str = None
):
self.role = role
self.expertise = expertise or []
self.personality = personality
self.communication_style = communication_style
def build_system_prompt(self) -> str:
"""Build system prompt from role definition."""
parts = [f"You are {self.role}."]
if self.expertise:
parts.append(f"Your areas of expertise include: {', '.join(self.expertise)}.")
if self.personality:
parts.append(f"Your personality: {self.personality}.")
if self.communication_style:
parts.append(f"Communication style: {self.communication_style}.")
return " ".join(parts)
# Example usage
def create_code_review_prompt(code: str, language: str) -> StructuredPrompt:
"""Create a code review prompt."""
return (
PromptBuilder()
.system("You are an expert code reviewer with 20 years of experience.")
.context(f"Language: {language}")
.instruction(f"Review the following code for bugs, security issues, and improvements:\n\n```{language}\n{code}\n```")
.constraint("Focus on critical issues first")
.constraint("Provide specific line numbers")
.constraint("Suggest concrete fixes, not just problems")
.output_format("""Provide your review in this format:
## Critical Issues
- Issue 1: [description] (line X)
Fix: [suggested fix]
## Improvements
- Improvement 1: [description]
## Summary
[Overall assessment]""")
.build()
)
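To see the pieces working together, here is a short usage sketch (the code snippet passed in is a made-up example) that builds a review prompt and prints the chat messages it produces:
review_prompt = create_code_review_prompt(
    code="def add(a, b):\n    return a - b  # subtle bug",
    language="python"
)
for message in review_prompt.build():
    print(f"--- {message['role']} ---")
    print(message["content"])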
Few-Shot Learning
from dataclasses import dataclass
from typing import Any, Optional
import random
@dataclass
class Example:
"""A few-shot example."""
input: str
output: str
    metadata: Optional[dict] = None
def to_messages(self) -> list[dict]:
"""Convert to message format."""
return [
{"role": "user", "content": self.input},
{"role": "assistant", "content": self.output}
]
class ExampleStore:
"""Store and retrieve few-shot examples."""
def __init__(self):
self.examples: list[Example] = []
self.by_category: dict[str, list[Example]] = {}
def add(self, example: Example, category: str = "default"):
"""Add example to store."""
self.examples.append(example)
if category not in self.by_category:
self.by_category[category] = []
self.by_category[category].append(example)
def get_random(self, n: int = 3, category: str = None) -> list[Example]:
"""Get random examples."""
pool = self.by_category.get(category, self.examples) if category else self.examples
return random.sample(pool, min(n, len(pool)))
def get_diverse(self, n: int = 3) -> list[Example]:
"""Get diverse examples from different categories."""
result = []
        categories = list(self.by_category.keys())
        if not categories:
            # No categories yet: fall back to uncategorised random sampling
            return self.get_random(n)
        for i in range(n):
cat = categories[i % len(categories)]
examples = self.by_category[cat]
if examples:
result.append(random.choice(examples))
return result
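# Hedged sketch (toy examples, not a real dataset): fill the store and draw a
# category-balanced sample for few-shot prompting.
store = ExampleStore()
store.add(Example(input="Translate: bonjour", output="hello"), category="translation")
store.add(Example(input="Classify: great movie", output="positive"), category="classification")
store.add(Example(input="Classify: waste of money", output="negative"), category="classification")
for ex in store.get_diverse(n=2):
    print(ex.input, "->", ex.output)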
class SemanticExampleSelector:
"""Select examples based on semantic similarity."""
def __init__(self, embedding_model: Any, examples: list[Example]):
self.embedder = embedding_model
self.examples = examples
self.embeddings = None
self._build_index()
def _build_index(self):
"""Build embedding index."""
import numpy as np
embeddings = []
for example in self.examples:
emb = self.embedder.embed(example.input)
embeddings.append(emb)
self.embeddings = np.array(embeddings)
def select(self, query: str, k: int = 3) -> list[Example]:
"""Select most similar examples."""
import numpy as np
query_embedding = self.embedder.embed(query)
# Calculate similarities
similarities = np.dot(self.embeddings, query_embedding)
# Get top-k indices
top_indices = np.argsort(similarities)[-k:][::-1]
return [self.examples[i] for i in top_indices]
class DynamicFewShotPrompt:
"""Prompt with dynamically selected examples."""
def __init__(
self,
base_prompt: str,
example_selector: SemanticExampleSelector,
num_examples: int = 3
):
self.base_prompt = base_prompt
self.selector = example_selector
self.num_examples = num_examples
def build(self, query: str) -> list[dict]:
"""Build prompt with selected examples."""
messages = [{"role": "system", "content": self.base_prompt}]
# Select relevant examples
examples = self.selector.select(query, k=self.num_examples)
# Add examples
for example in examples:
messages.extend(example.to_messages())
# Add query
messages.append({"role": "user", "content": query})
return messages
class ExampleFormatter:
"""Format examples for different tasks."""
@staticmethod
def classification(input_text: str, label: str) -> Example:
"""Format classification example."""
return Example(
input=f"Classify: {input_text}",
output=f"Classification: {label}"
)
@staticmethod
def extraction(text: str, entities: dict) -> Example:
"""Format extraction example."""
import json
return Example(
input=f"Extract entities from: {text}",
output=json.dumps(entities, indent=2)
)
@staticmethod
def transformation(input_text: str, output_text: str, task: str) -> Example:
"""Format transformation example."""
return Example(
input=f"{task}:\n{input_text}",
output=output_text
)
@staticmethod
def qa(question: str, answer: str, context: str = None) -> Example:
"""Format Q&A example."""
if context:
input_text = f"Context: {context}\n\nQuestion: {question}"
else:
input_text = f"Question: {question}"
return Example(
input=input_text,
output=f"Answer: {answer}"
)
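As an end-to-end sketch, the snippet below wires formatted examples into the semantic selector and builds a dynamic few-shot prompt. The ToyEmbedder is a hypothetical stand-in so the example runs without a real embedding model; in practice you would pass your actual embedder.
import numpy as np

class ToyEmbedder:
    """Hypothetical stand-in for a real embedding model (deterministic pseudo-random vectors)."""
    def embed(self, text: str) -> np.ndarray:
        rng = np.random.default_rng(sum(text.encode()) % (2 ** 32))
        vec = rng.normal(size=32)
        return vec / np.linalg.norm(vec)

examples = [
    ExampleFormatter.classification("The movie was fantastic", "positive"),
    ExampleFormatter.classification("Terrible service, never again", "negative"),
    ExampleFormatter.classification("It arrived on time", "neutral"),
]
selector = SemanticExampleSelector(ToyEmbedder(), examples)
few_shot = DynamicFewShotPrompt(
    base_prompt="You are a sentiment classifier. Answer with a single label.",
    example_selector=selector,
    num_examples=2
)
for message in few_shot.build("Classify: The food was cold and bland"):
    print(message["role"], "->", message["content"])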
Chain-of-Thought Prompting
from dataclasses import dataclass
from typing import Any, Optional
@dataclass
class ThoughtStep:
"""A step in chain-of-thought reasoning."""
step_number: int
thought: str
    action: Optional[str] = None
    observation: Optional[str] = None
class ChainOfThoughtPrompt:
"""Prompt that encourages step-by-step reasoning."""
def __init__(self, task: str, cot_trigger: str = None):
self.task = task
self.cot_trigger = cot_trigger or "Let's think step by step."
def build(self) -> str:
"""Build CoT prompt."""
return f"{self.task}\n\n{self.cot_trigger}"
def build_with_format(self) -> str:
"""Build CoT prompt with explicit format."""
return f"""{self.task}
{self.cot_trigger}
Format your response as:
Step 1: [First reasoning step]
Step 2: [Second reasoning step]
...
Final Answer: [Your conclusion]"""
class ZeroShotCoT:
"""Zero-shot chain-of-thought prompting."""
def __init__(self, llm_client: Any):
self.llm = llm_client
async def reason(self, question: str) -> tuple[str, list[str]]:
"""Apply zero-shot CoT."""
# First pass: generate reasoning
cot_prompt = f"{question}\n\nLet's think step by step."
reasoning_response = await self.llm.complete(cot_prompt)
reasoning = reasoning_response.content
# Second pass: extract answer
answer_prompt = f"""{question}
{reasoning}
Therefore, the answer is:"""
answer_response = await self.llm.complete(answer_prompt)
# Parse steps
steps = self._parse_steps(reasoning)
return answer_response.content, steps
def _parse_steps(self, reasoning: str) -> list[str]:
"""Parse reasoning into steps."""
import re
# Try to find numbered steps
step_pattern = r'(?:Step \d+:|^\d+\.|\*)\s*(.+?)(?=(?:Step \d+:|^\d+\.|\*|$))'
matches = re.findall(step_pattern, reasoning, re.MULTILINE | re.DOTALL)
if matches:
return [m.strip() for m in matches if m.strip()]
# Fall back to sentences
sentences = reasoning.split('. ')
return [s.strip() + '.' for s in sentences if s.strip()]
class FewShotCoT:
"""Few-shot chain-of-thought with examples."""
def __init__(self, llm_client: Any, examples: list[dict]):
self.llm = llm_client
self.examples = examples # Each has 'question', 'reasoning', 'answer'
async def reason(self, question: str) -> tuple[str, str]:
"""Apply few-shot CoT."""
# Build prompt with examples
prompt_parts = []
for ex in self.examples:
prompt_parts.append(f"Question: {ex['question']}")
prompt_parts.append(f"Reasoning: {ex['reasoning']}")
prompt_parts.append(f"Answer: {ex['answer']}")
prompt_parts.append("")
prompt_parts.append(f"Question: {question}")
prompt_parts.append("Reasoning:")
prompt = "\n".join(prompt_parts)
response = await self.llm.complete(prompt)
# Parse response
content = response.content
if "Answer:" in content:
parts = content.split("Answer:")
reasoning = parts[0].strip()
answer = parts[1].strip()
else:
reasoning = content
answer = content.split('\n')[-1]
return answer, reasoning
class SelfConsistency:
"""Self-consistency with multiple reasoning paths."""
def __init__(self, llm_client: Any, num_paths: int = 5):
self.llm = llm_client
self.num_paths = num_paths
async def reason(self, question: str) -> tuple[str, list[str], float]:
"""Generate multiple paths and vote."""
import asyncio
from collections import Counter
# Generate multiple reasoning paths
cot_prompt = f"{question}\n\nLet's think step by step."
tasks = [
self.llm.complete(cot_prompt, temperature=0.7)
for _ in range(self.num_paths)
]
responses = await asyncio.gather(*tasks)
# Extract answers from each path
answers = []
reasonings = []
for response in responses:
answer = self._extract_answer(response.content)
answers.append(answer)
reasonings.append(response.content)
# Vote for most common answer
counter = Counter(answers)
most_common = counter.most_common(1)[0]
confidence = most_common[1] / len(answers)
return most_common[0], reasonings, confidence
def _extract_answer(self, reasoning: str) -> str:
"""Extract final answer from reasoning."""
import re
# Look for explicit answer markers
patterns = [
r'(?:the answer is|therefore|thus|so)[:\s]+(.+?)(?:\.|$)',
r'(?:final answer)[:\s]+(.+?)(?:\.|$)',
r'= (.+?)$'
]
for pattern in patterns:
match = re.search(pattern, reasoning, re.IGNORECASE)
if match:
return match.group(1).strip()
# Fall back to last line
lines = reasoning.strip().split('\n')
return lines[-1].strip()
class TreeOfThought:
"""Tree of thought reasoning."""
def __init__(
self,
llm_client: Any,
branching_factor: int = 3,
max_depth: int = 3
):
self.llm = llm_client
self.branching_factor = branching_factor
self.max_depth = max_depth
async def reason(self, problem: str) -> tuple[str, dict]:
"""Explore reasoning tree."""
tree = {"problem": problem, "children": []}
# Generate initial thoughts
initial_thoughts = await self._generate_thoughts(problem, [])
for thought in initial_thoughts[:self.branching_factor]:
node = await self._explore_branch(problem, [thought], depth=1)
tree["children"].append(node)
# Find best path
best_path, best_score = self._find_best_path(tree)
return best_path[-1] if best_path else "", tree
async def _generate_thoughts(
self,
problem: str,
path: list[str]
) -> list[str]:
"""Generate next thoughts."""
context = "\n".join(path) if path else "Start"
prompt = f"""Problem: {problem}
Current reasoning path:
{context}
Generate {self.branching_factor} different next steps in the reasoning.
Format: One thought per line."""
response = await self.llm.complete(prompt)
thoughts = response.content.strip().split('\n')
return [t.strip() for t in thoughts if t.strip()]
async def _explore_branch(
self,
problem: str,
path: list[str],
depth: int
) -> dict:
"""Explore a branch of the tree."""
node = {
"thought": path[-1],
"score": await self._evaluate_thought(problem, path),
"children": []
}
if depth < self.max_depth:
next_thoughts = await self._generate_thoughts(problem, path)
for thought in next_thoughts[:self.branching_factor]:
child = await self._explore_branch(
problem,
path + [thought],
depth + 1
)
node["children"].append(child)
return node
async def _evaluate_thought(
self,
problem: str,
path: list[str]
) -> float:
"""Evaluate quality of reasoning path."""
prompt = f"""Problem: {problem}
Reasoning path:
{chr(10).join(path)}
Rate this reasoning path from 0 to 1 based on:
- Logical coherence
- Progress toward solution
- Correctness
Output only a number between 0 and 1."""
response = await self.llm.complete(prompt)
try:
return float(response.content.strip())
except ValueError:
return 0.5
def _find_best_path(self, tree: dict) -> tuple[list[str], float]:
"""Find highest-scoring path in tree."""
def dfs(node, path):
current_path = path + [node.get("thought", "")]
if not node.get("children"):
return current_path, node.get("score", 0)
best_child_path = []
best_score = -1
for child in node["children"]:
child_path, score = dfs(child, current_path)
if score > best_score:
best_score = score
best_child_path = child_path
return best_child_path, best_score
return dfs(tree, [])
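The interplay between sampling and voting is easiest to see with a stub client. The StubLLM below is an assumption: it returns canned reasoning traces through the same complete(prompt, **kwargs) -> .content interface the classes above expect, so the sketch runs without calling a real model.
import asyncio
import random
from types import SimpleNamespace

class StubLLM:
    """Returns canned reasoning traces to simulate sampled chains of thought."""
    _traces = [
        "Step 1: Start with 12 apples.\nStep 2: Eating 5 leaves 7.\nTherefore, the answer is 7.",
        "Step 1: 12 - 5 = 7.\nSo the answer is 7.",
        "Step 1: 12 - 5 = 8.\nTherefore, the answer is 8.",  # deliberately inconsistent path
    ]
    async def complete(self, prompt: str, **kwargs):
        return SimpleNamespace(content=random.choice(self._traces))

async def demo():
    sc = SelfConsistency(StubLLM(), num_paths=5)
    answer, reasonings, confidence = await sc.reason(
        "If you have 12 apples and eat 5, how many remain?"
    )
    print(f"voted answer: {answer!r}, confidence: {confidence:.0%}, paths: {len(reasonings)}")

asyncio.run(demo())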
Output Format Specification
from dataclasses import dataclass
from typing import Any, Optional, Type
from abc import ABC, abstractmethod
import json
class OutputFormat(ABC):
"""Abstract output format specification."""
@abstractmethod
def get_instruction(self) -> str:
"""Get format instruction for prompt."""
pass
@abstractmethod
def parse(self, output: str) -> Any:
"""Parse output according to format."""
pass
@abstractmethod
def validate(self, parsed: Any) -> bool:
"""Validate parsed output."""
pass
class JSONFormat(OutputFormat):
"""JSON output format."""
def __init__(self, schema: dict = None, example: dict = None):
self.schema = schema
self.example = example
def get_instruction(self) -> str:
"""Get JSON format instruction."""
instruction = "Output your response as valid JSON."
if self.schema:
instruction += f"\n\nSchema:\n```json\n{json.dumps(self.schema, indent=2)}\n```"
if self.example:
instruction += f"\n\nExample:\n```json\n{json.dumps(self.example, indent=2)}\n```"
return instruction
def parse(self, output: str) -> Any:
"""Parse JSON output."""
import re
# Try to extract JSON from markdown code blocks
json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', output)
if json_match:
json_str = json_match.group(1)
else:
json_str = output
return json.loads(json_str)
def validate(self, parsed: Any) -> bool:
"""Validate against schema."""
if not self.schema:
return True
# Simple validation - check required fields
if "required" in self.schema:
for field in self.schema["required"]:
if field not in parsed:
return False
return True
class MarkdownFormat(OutputFormat):
"""Markdown output format."""
def __init__(self, sections: list[str] = None):
self.sections = sections or []
def get_instruction(self) -> str:
"""Get markdown format instruction."""
instruction = "Format your response in Markdown."
if self.sections:
instruction += "\n\nInclude these sections:\n"
for section in self.sections:
instruction += f"## {section}\n"
return instruction
def parse(self, output: str) -> dict:
"""Parse markdown into sections."""
import re
sections = {}
current_section = "intro"
current_content = []
for line in output.split('\n'):
header_match = re.match(r'^##\s+(.+)$', line)
if header_match:
# Save previous section
if current_content:
sections[current_section] = '\n'.join(current_content).strip()
current_section = header_match.group(1)
current_content = []
else:
current_content.append(line)
# Save last section
if current_content:
sections[current_section] = '\n'.join(current_content).strip()
return sections
def validate(self, parsed: dict) -> bool:
"""Validate sections exist."""
for section in self.sections:
if section not in parsed:
return False
return True
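# Hedged sketch: split a markdown-formatted reply (made-up text) into named sections.
md_format = MarkdownFormat(sections=["Critical Issues", "Summary"])
parsed_md = md_format.parse("## Critical Issues\n- None found\n\n## Summary\nLooks good overall.")
print(parsed_md["Summary"])           # "Looks good overall."
print(md_format.validate(parsed_md))  # True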
class StructuredFormat(OutputFormat):
"""Pydantic-based structured output."""
def __init__(self, model_class: Type):
self.model_class = model_class
def get_instruction(self) -> str:
"""Get structured format instruction."""
schema = self.model_class.model_json_schema()
return f"""Output your response as JSON matching this schema:
```json
{json.dumps(schema, indent=2)}
```
Ensure all required fields are present and types are correct."""
def parse(self, output: str) -> Any:
"""Parse and validate with Pydantic."""
import re
# Extract JSON
json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', output)
if json_match:
json_str = json_match.group(1)
else:
json_str = output
data = json.loads(json_str)
return self.model_class.model_validate(data)
def validate(self, parsed: Any) -> bool:
"""Already validated by Pydantic."""
return True
class XMLFormat(OutputFormat):
"""XML output format."""
def __init__(self, root_element: str, elements: list[str] = None):
self.root = root_element
self.elements = elements or []
def get_instruction(self) -> str:
"""Get XML format instruction."""
instruction = f"Output your response as XML with root element <{self.root}>."
if self.elements:
instruction += "\n\nInclude these elements:\n"
for elem in self.elements:
instruction += f"<{elem}>...{elem}>\n"
return instruction
def parse(self, output: str) -> dict:
"""Parse XML output."""
import xml.etree.ElementTree as ET
import re
# Extract XML from code blocks if present
xml_match = re.search(r'```(?:xml)?\s*([\s\S]*?)\s*```', output)
if xml_match:
xml_str = xml_match.group(1)
else:
xml_str = output
root = ET.fromstring(xml_str)
result = {}
for child in root:
result[child.tag] = child.text
return result
def validate(self, parsed: dict) -> bool:
"""Validate elements exist."""
for elem in self.elements:
if elem not in parsed:
return False
return True
class OutputParser:
"""Parse and validate LLM outputs."""
def __init__(self, format_spec: OutputFormat, retry_llm: Any = None):
self.format = format_spec
self.retry_llm = retry_llm
async def parse(self, output: str, max_retries: int = 2) -> Any:
"""Parse output with retries."""
for attempt in range(max_retries + 1):
try:
parsed = self.format.parse(output)
if self.format.validate(parsed):
return parsed
else:
raise ValueError("Validation failed")
except Exception as e:
if attempt < max_retries and self.retry_llm:
output = await self._fix_output(output, str(e))
else:
raise
return None
async def _fix_output(self, output: str, error: str) -> str:
"""Ask LLM to fix malformed output."""
prompt = f"""The following output has a formatting error:
{output}
Error: {error}
{self.format.get_instruction()}
Please fix the output to match the required format."""
response = await self.retry_llm.complete(prompt)
return response.content
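The format and parser classes compose as in the sketch below. The raw_output string is a made-up model reply, and no retry client is configured, so a malformed reply would simply raise:
import asyncio

sentiment_format = JSONFormat(
    schema={"required": ["sentiment", "confidence"]},
    example={"sentiment": "positive", "confidence": 0.9}
)
print(sentiment_format.get_instruction())  # instruction text to append to the prompt

raw_output = """Here is the analysis:
```json
{"sentiment": "negative", "confidence": 0.82}
```"""

sentiment_parser = OutputParser(sentiment_format)  # no retry_llm: parse errors propagate
parsed = asyncio.run(sentiment_parser.parse(raw_output))
print(parsed["sentiment"], parsed["confidence"])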
Prompt Templates and Management
from dataclasses import dataclass, field
from typing import Any, Optional
from string import Template
import re
@dataclass
class PromptTemplate:
"""Reusable prompt template."""
name: str
template: str
variables: list[str] = field(default_factory=list)
defaults: dict = field(default_factory=dict)
version: str = "1.0"
def __post_init__(self):
# Auto-detect variables if not provided
if not self.variables:
self.variables = self._extract_variables()
def _extract_variables(self) -> list[str]:
"""Extract variable names from template."""
pattern = r'\{(\w+)\}'
return list(set(re.findall(pattern, self.template)))
def format(self, **kwargs) -> str:
"""Format template with variables."""
# Merge defaults with provided values
values = {**self.defaults, **kwargs}
# Check for missing variables
missing = set(self.variables) - set(values.keys())
if missing:
raise ValueError(f"Missing variables: {missing}")
return self.template.format(**values)
def partial(self, **kwargs) -> 'PromptTemplate':
"""Create partial template with some variables filled."""
new_defaults = {**self.defaults, **kwargs}
new_template = self.template
for key, value in kwargs.items():
new_template = new_template.replace(f"{{{key}}}", str(value))
return PromptTemplate(
name=f"{self.name}_partial",
template=new_template,
defaults=new_defaults,
version=self.version
)
class PromptLibrary:
"""Library of prompt templates."""
def __init__(self):
self.templates: dict[str, PromptTemplate] = {}
self.versions: dict[str, list[PromptTemplate]] = {}
def add(self, template: PromptTemplate):
"""Add template to library."""
self.templates[template.name] = template
if template.name not in self.versions:
self.versions[template.name] = []
self.versions[template.name].append(template)
def get(self, name: str, version: str = None) -> Optional[PromptTemplate]:
"""Get template by name and optional version."""
if version:
for t in self.versions.get(name, []):
if t.version == version:
return t
return None
return self.templates.get(name)
def list_templates(self) -> list[str]:
"""List all template names."""
return list(self.templates.keys())
def export(self) -> dict:
"""Export library as dict."""
return {
name: {
"template": t.template,
"variables": t.variables,
"defaults": t.defaults,
"version": t.version
}
for name, t in self.templates.items()
}
def import_from(self, data: dict):
"""Import templates from dict."""
for name, config in data.items():
template = PromptTemplate(
name=name,
template=config["template"],
variables=config.get("variables", []),
defaults=config.get("defaults", {}),
version=config.get("version", "1.0")
)
self.add(template)
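# Hedged sketch: round-trip a small library through its dict form (e.g. for JSON storage).
lib = PromptLibrary()
lib.add(PromptTemplate(
    name="greet",
    template="Say hello to {name} in {language}.",
    defaults={"language": "English"}
))
snapshot = lib.export()

restored = PromptLibrary()
restored.import_from(snapshot)
print(restored.get("greet").format(name="Ada"))  # "Say hello to Ada in English."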
class PromptChain:
"""Chain multiple prompts together."""
def __init__(self, llm_client: Any):
self.llm = llm_client
self.steps: list[tuple[PromptTemplate, str]] = []
def add_step(
self,
template: PromptTemplate,
output_key: str
) -> 'PromptChain':
"""Add step to chain."""
self.steps.append((template, output_key))
return self
async def run(self, initial_vars: dict) -> dict:
"""Run the chain."""
context = dict(initial_vars)
for template, output_key in self.steps:
# Format prompt with current context
prompt = template.format(**context)
# Get response
response = await self.llm.complete(prompt)
# Add to context
context[output_key] = response.content
return context
class ConditionalPrompt:
"""Prompt with conditional sections."""
def __init__(self, base_template: str):
self.base = base_template
self.conditions: list[tuple[callable, str]] = []
def when(self, condition: callable, section: str) -> 'ConditionalPrompt':
"""Add conditional section."""
self.conditions.append((condition, section))
return self
def build(self, context: dict) -> str:
"""Build prompt with applicable sections."""
sections = [self.base]
for condition, section in self.conditions:
if condition(context):
sections.append(section)
return "\n\n".join(sections)
# Example templates
COMMON_TEMPLATES = PromptLibrary()
COMMON_TEMPLATES.add(PromptTemplate(
name="summarize",
template="""Summarize the following text in {length} sentences:
{text}
Summary:""",
defaults={"length": "3"}
))
COMMON_TEMPLATES.add(PromptTemplate(
name="translate",
template="""Translate the following text from {source_lang} to {target_lang}:
{text}
Translation:""",
defaults={"source_lang": "English"}
))
COMMON_TEMPLATES.add(PromptTemplate(
name="extract_entities",
template="""Extract all {entity_types} from the following text:
{text}
Output as JSON array.""",
defaults={"entity_types": "named entities (people, places, organizations)"}
))
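A couple of lines are enough to exercise the library; the text values below are arbitrary placeholders:
summary_prompt = COMMON_TEMPLATES.get("summarize").format(
    text="Prompt engineering is the practice of designing inputs that guide LLM behavior.",
    length="1"
)
print(summary_prompt)

# Pin the target language once, then supply only the text per call.
translate_to_french = COMMON_TEMPLATES.get("translate").partial(target_lang="French")
print(translate_to_french.format(text="Good morning"))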
Prompt Optimization and Evaluation
from dataclasses import dataclass, field
from typing import Any, Optional
import asyncio
@dataclass
class EvaluationResult:
"""Result of prompt evaluation."""
prompt_version: str
accuracy: float
latency_ms: float
cost: float
samples_evaluated: int
errors: list[str] = field(default_factory=list)
@dataclass
class TestCase:
"""Test case for prompt evaluation."""
input: dict
expected_output: Any
    metadata: Optional[dict] = None
class PromptEvaluator:
"""Evaluate prompt performance."""
def __init__(
self,
llm_client: Any,
test_cases: list[TestCase],
output_parser: Any = None
):
self.llm = llm_client
self.test_cases = test_cases
self.parser = output_parser
async def evaluate(
self,
template: Any,
sample_size: int = None
) -> EvaluationResult:
"""Evaluate prompt on test cases."""
import time
cases = self.test_cases
if sample_size:
import random
cases = random.sample(cases, min(sample_size, len(cases)))
correct = 0
total_latency = 0
errors = []
for case in cases:
try:
# Format prompt
prompt = template.format(**case.input)
# Time the call
start = time.time()
response = await self.llm.complete(prompt)
latency = (time.time() - start) * 1000
total_latency += latency
# Parse output
if self.parser:
output = await self.parser.parse(response.content)
else:
output = response.content
# Check correctness
if self._check_correct(output, case.expected_output):
correct += 1
except Exception as e:
errors.append(f"{case.input}: {str(e)}")
return EvaluationResult(
prompt_version=template.version if hasattr(template, 'version') else "unknown",
accuracy=correct / len(cases) if cases else 0,
latency_ms=total_latency / len(cases) if cases else 0,
cost=0, # Would calculate from token usage
samples_evaluated=len(cases),
errors=errors
)
def _check_correct(self, output: Any, expected: Any) -> bool:
"""Check if output matches expected."""
if isinstance(expected, str):
return expected.lower() in str(output).lower()
elif isinstance(expected, dict):
if not isinstance(output, dict):
return False
for key, value in expected.items():
if key not in output or output[key] != value:
return False
return True
else:
return output == expected
class ABTestRunner:
"""Run A/B tests on prompts."""
def __init__(
self,
llm_client: Any,
evaluator: PromptEvaluator
):
self.llm = llm_client
self.evaluator = evaluator
async def compare(
self,
prompt_a: Any,
prompt_b: Any,
sample_size: int = 100
) -> dict:
"""Compare two prompts."""
# Evaluate both
result_a, result_b = await asyncio.gather(
self.evaluator.evaluate(prompt_a, sample_size),
self.evaluator.evaluate(prompt_b, sample_size)
)
# Calculate significance
significance = self._calculate_significance(
result_a.accuracy,
result_b.accuracy,
sample_size
)
return {
"prompt_a": {
"version": result_a.prompt_version,
"accuracy": result_a.accuracy,
"latency_ms": result_a.latency_ms
},
"prompt_b": {
"version": result_b.prompt_version,
"accuracy": result_b.accuracy,
"latency_ms": result_b.latency_ms
},
"winner": "a" if result_a.accuracy > result_b.accuracy else "b",
"accuracy_diff": abs(result_a.accuracy - result_b.accuracy),
"statistically_significant": significance < 0.05
}
def _calculate_significance(
self,
acc_a: float,
acc_b: float,
n: int
) -> float:
"""Calculate statistical significance (simplified)."""
import math
# Simple z-test approximation
p = (acc_a + acc_b) / 2
se = math.sqrt(2 * p * (1 - p) / n) if p > 0 and p < 1 else 1
z = abs(acc_a - acc_b) / se if se > 0 else 0
# Approximate p-value
return 2 * (1 - min(0.9999, 0.5 + 0.5 * math.erf(z / math.sqrt(2))))
class PromptOptimizer:
"""Optimize prompts through iteration."""
def __init__(
self,
llm_client: Any,
evaluator: PromptEvaluator,
meta_llm: Any = None
):
self.llm = llm_client
self.evaluator = evaluator
self.meta_llm = meta_llm or llm_client
async def optimize(
self,
initial_prompt: Any,
iterations: int = 5,
target_accuracy: float = 0.95
) -> tuple[Any, list[EvaluationResult]]:
"""Iteratively optimize prompt."""
current_prompt = initial_prompt
history = []
for i in range(iterations):
# Evaluate current prompt
result = await self.evaluator.evaluate(current_prompt)
history.append(result)
if result.accuracy >= target_accuracy:
break
# Generate improved prompt
current_prompt = await self._improve_prompt(
current_prompt,
result
)
return current_prompt, history
async def _improve_prompt(
self,
prompt: Any,
result: EvaluationResult
) -> Any:
"""Use meta-LLM to improve prompt."""
improvement_prompt = f"""You are a prompt engineering expert. Improve this prompt based on evaluation results.
Current prompt:
{prompt.template if hasattr(prompt, 'template') else str(prompt)}
Evaluation results:
- Accuracy: {result.accuracy:.2%}
- Common errors: {result.errors[:3]}
Suggest an improved version of the prompt that addresses the errors while maintaining the same variables.
Output only the improved prompt template."""
response = await self.meta_llm.complete(improvement_prompt)
# Create new template with improved text
if hasattr(prompt, 'template'):
from copy import deepcopy
new_prompt = deepcopy(prompt)
new_prompt.template = response.content
new_prompt.version = f"{prompt.version}.1"
return new_prompt
return response.content
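The evaluation loop is easiest to see with a stub client. EchoLLM below is a hypothetical placeholder that always answers 'positive', which is enough to exercise the accuracy and latency bookkeeping:
import asyncio
from types import SimpleNamespace

class EchoLLM:
    """Hypothetical stub client: always answers 'positive'."""
    async def complete(self, prompt: str, **kwargs):
        return SimpleNamespace(content="positive")

classify_template = PromptTemplate(
    name="classify",
    template="Classify the following into {categories}:\n\n{text}\n\nClassification:"
)
test_cases = [
    TestCase(input={"text": "I loved it", "categories": "positive, negative"},
             expected_output="positive"),
    TestCase(input={"text": "Awful experience", "categories": "positive, negative"},
             expected_output="negative"),
]
evaluator = PromptEvaluator(EchoLLM(), test_cases)
result = asyncio.run(evaluator.evaluate(classify_template))
print(f"accuracy={result.accuracy:.0%}  latency={result.latency_ms:.1f}ms  errors={len(result.errors)}")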
Production Prompt Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, Any
import asyncio
app = FastAPI()
class PromptRequest(BaseModel):
template_name: str
variables: dict
output_format: Optional[str] = None
class TemplateCreate(BaseModel):
name: str
template: str
variables: list[str] = []
defaults: dict = {}
# Initialize library
library = PromptLibrary()
# Add default templates
library.add(PromptTemplate(
name="summarize",
template="Summarize in {length} sentences:\n\n{text}",
defaults={"length": "3"}
))
library.add(PromptTemplate(
name="classify",
template="Classify the following into {categories}:\n\n{text}\n\nClassification:",
defaults={"categories": "positive, negative, neutral"}
))
# Mock LLM
class MockLLM:
async def complete(self, prompt: str):
class Response:
content = f"Response to: {prompt[:100]}"
await asyncio.sleep(0.1)
return Response()
llm = MockLLM()
@app.post("/v1/prompts/execute")
async def execute_prompt(request: PromptRequest) -> dict:
"""Execute a prompt template."""
template = library.get(request.template_name)
if not template:
raise HTTPException(status_code=404, detail="Template not found")
try:
prompt = template.format(**request.variables)
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
response = await llm.complete(prompt)
return {
"template": request.template_name,
"prompt": prompt,
"response": response.content
}
@app.post("/v1/templates")
async def create_template(request: TemplateCreate) -> dict:
"""Create a new template."""
template = PromptTemplate(
name=request.name,
template=request.template,
variables=request.variables,
defaults=request.defaults
)
library.add(template)
return {
"name": template.name,
"variables": template.variables,
"version": template.version
}
@app.get("/v1/templates")
async def list_templates() -> list[str]:
"""List all templates."""
return library.list_templates()
@app.get("/v1/templates/{name}")
async def get_template(name: str) -> dict:
"""Get template details."""
template = library.get(name)
if not template:
raise HTTPException(status_code=404, detail="Template not found")
return {
"name": template.name,
"template": template.template,
"variables": template.variables,
"defaults": template.defaults,
"version": template.version
}
@app.get("/health")
async def health():
return {"status": "healthy"}
References
- OpenAI Prompt Engineering Guide: https://platform.openai.com/docs/guides/prompt-engineering
- Anthropic Prompt Design: https://docs.anthropic.com/claude/docs/prompt-design
- Chain-of-Thought Paper: https://arxiv.org/abs/2201.11903
- Tree of Thoughts: https://arxiv.org/abs/2305.10601
Conclusion
Prompt engineering is a skill that improves with practice and systematic experimentation. Start with clear structure: define the role, provide context, give explicit instructions, and specify the output format. Use few-shot examples when you need consistent formatting or when the task is ambiguous—three well-chosen examples often outperform lengthy instructions. Apply chain-of-thought prompting for reasoning tasks; even a simple “Let’s think step by step” can dramatically improve accuracy on complex problems. Build a library of tested templates for common tasks and version them so you can track what works. Evaluate prompts systematically with test cases that cover edge cases and failure modes. Remember that different models respond differently to the same prompt—what works for GPT-4 may not work for Claude or open-source models. The most important insight is that prompt engineering is iterative: start simple, measure results, identify failures, and refine. The best prompts are often surprisingly simple but precisely crafted to guide the model toward the desired behavior.
