
Prompt Optimization Strategies: From Structure to Automatic Refinement

Introduction: Prompt optimization is the systematic process of improving prompts to achieve better LLM outputs—higher accuracy, more consistent formatting, reduced latency, and lower costs. Unlike ad-hoc prompt engineering, optimization treats prompts as artifacts that can be measured, tested, and iteratively improved. This guide covers the techniques that make prompts more effective: structural patterns that improve clarity, few-shot example selection strategies, automatic prompt optimization with DSPy and similar frameworks, A/B testing methodologies, and production prompt management. Whether you’re optimizing for quality, speed, or cost, these patterns will help you build prompts that perform reliably at scale.

Figure: Prompt optimization workflow (Analyze, Refine, Test, Deploy)
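
Before diving into specific patterns, it helps to see the measure-test-iterate loop in its simplest form. The helpers llm_generate and score_output below are hypothetical stand-ins for your model call and quality metric; the point is that every prompt revision is scored against the same fixed test set.

def evaluate_prompt(prompt_template: str, test_cases: list[dict],
                    llm_generate, score_output) -> float:
    """Score a prompt template against a fixed test set so revisions are comparable."""
    scores = []
    for case in test_cases:
        output = llm_generate(prompt_template.format(**case["inputs"]))
        scores.append(score_output(output, case["expected"]))
    return sum(scores) / len(scores)

# Keep a revision only if it measurably beats the current baseline:
# baseline = evaluate_prompt(prompt_v1, test_cases, llm_generate, score_output)
# candidate = evaluate_prompt(prompt_v2, test_cases, llm_generate, score_output)
# promote prompt_v2 only when candidate > baseline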

Prompt Structure Patterns

from dataclasses import dataclass
from typing import Optional
from enum import Enum

class PromptSection(Enum):
    """Sections of a structured prompt."""
    
    SYSTEM = "system"
    CONTEXT = "context"
    TASK = "task"
    EXAMPLES = "examples"
    CONSTRAINTS = "constraints"
    OUTPUT_FORMAT = "output_format"

@dataclass
class PromptComponent:
    """A component of a prompt."""
    
    section: PromptSection
    content: str
    priority: int = 0  # Higher = more important

class StructuredPrompt:
    """Build structured prompts."""
    
    def __init__(self):
        self.components: list[PromptComponent] = []
    
    def add_system(self, content: str, priority: int = 100) -> 'StructuredPrompt':
        """Add system instruction."""
        
        self.components.append(PromptComponent(
            section=PromptSection.SYSTEM,
            content=content,
            priority=priority
        ))
        return self
    
    def add_context(self, content: str, priority: int = 80) -> 'StructuredPrompt':
        """Add context information."""
        
        self.components.append(PromptComponent(
            section=PromptSection.CONTEXT,
            content=content,
            priority=priority
        ))
        return self
    
    def add_task(self, content: str, priority: int = 90) -> 'StructuredPrompt':
        """Add task description."""
        
        self.components.append(PromptComponent(
            section=PromptSection.TASK,
            content=content,
            priority=priority
        ))
        return self
    
    def add_examples(self, examples: list[dict], priority: int = 70) -> 'StructuredPrompt':
        """Add few-shot examples."""
        
        formatted = "\n\n".join(
            f"Input: {ex['input']}\nOutput: {ex['output']}"
            for ex in examples
        )
        
        self.components.append(PromptComponent(
            section=PromptSection.EXAMPLES,
            content=f"Examples:\n{formatted}",
            priority=priority
        ))
        return self
    
    def add_constraints(self, constraints: list[str], priority: int = 85) -> 'StructuredPrompt':
        """Add constraints."""
        
        formatted = "\n".join(f"- {c}" for c in constraints)
        
        self.components.append(PromptComponent(
            section=PromptSection.CONSTRAINTS,
            content=f"Constraints:\n{formatted}",
            priority=priority
        ))
        return self
    
    def add_output_format(self, format_spec: str, priority: int = 95) -> 'StructuredPrompt':
        """Add output format specification."""
        
        self.components.append(PromptComponent(
            section=PromptSection.OUTPUT_FORMAT,
            content=f"Output format:\n{format_spec}",
            priority=priority
        ))
        return self
    
    def build(self, max_tokens: Optional[int] = None) -> str:
        """Build the prompt."""
        
        # Sort by priority
        sorted_components = sorted(
            self.components,
            key=lambda c: c.priority,
            reverse=True
        )
        
        # Build prompt
        sections = []
        current_tokens = 0
        
        for component in sorted_components:
            component_tokens = len(component.content.split()) * 1.3  # Rough estimate
            
            if max_tokens and current_tokens + component_tokens > max_tokens:
                continue
            
            sections.append(component.content)
            current_tokens += component_tokens
        
        return "\n\n".join(sections)

class PromptTemplate:
    """Template-based prompt construction."""
    
    def __init__(self, template: str):
        self.template = template
        self.variables: set[str] = self._extract_variables()
    
    def _extract_variables(self) -> set[str]:
        """Extract template variables."""
        
        import re
        return set(re.findall(r'\{(\w+)\}', self.template))
    
    def format(self, **kwargs) -> str:
        """Format template with variables."""
        
        missing = self.variables - set(kwargs.keys())
        if missing:
            raise ValueError(f"Missing variables: {missing}")
        
        return self.template.format(**kwargs)
    
    def partial(self, **kwargs) -> 'PromptTemplate':
        """Partially fill template."""
        
        new_template = self.template
        for key, value in kwargs.items():
            new_template = new_template.replace(f"{{{key}}}", str(value))
        
        return PromptTemplate(new_template)

class ChainOfThoughtPrompt:
    """Build chain-of-thought prompts."""
    
    def __init__(self, task: str):
        self.task = task
        self.reasoning_steps: list[str] = []
    
    def add_step(self, step: str) -> 'ChainOfThoughtPrompt':
        """Add reasoning step."""
        
        self.reasoning_steps.append(step)
        return self
    
    def build(self) -> str:
        """Build CoT prompt."""
        
        steps = "\n".join(
            f"{i+1}. {step}"
            for i, step in enumerate(self.reasoning_steps)
        )
        
        return f"""{self.task}

Let's think step by step:
{steps}

Now, apply this reasoning to solve the problem."""

class TreeOfThoughtPrompt:
    """Build tree-of-thought prompts."""
    
    def __init__(self, task: str, num_branches: int = 3):
        self.task = task
        self.num_branches = num_branches
    
    def build_exploration_prompt(self) -> str:
        """Build prompt for exploring options."""
        
        return f"""{self.task}

Generate {self.num_branches} different approaches to solve this problem.
For each approach:
1. Describe the approach
2. List potential advantages
3. List potential challenges

Approaches:"""
    
    def build_evaluation_prompt(self, approaches: list[str]) -> str:
        """Build prompt for evaluating approaches."""
        
        formatted = "\n\n".join(
            f"Approach {i+1}: {a}"
            for i, a in enumerate(approaches)
        )
        
        return f"""Evaluate these approaches for solving: {self.task}

{formatted}

For each approach, rate on a scale of 1-10:
- Feasibility
- Effectiveness
- Efficiency

Then recommend the best approach and explain why."""
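
A short usage sketch for the builders above; the task, constraints, and examples are illustrative:

prompt = (
    StructuredPrompt()
    .add_system("You are a precise data-extraction assistant.")
    .add_task("Extract the company name and founding year from the text.")
    .add_constraints(["Return null for missing fields", "Never guess values"])
    .add_output_format('{"company": str, "founded": int | null}')
    .add_examples([
        {"input": "Acme Corp, est. 1999", "output": '{"company": "Acme Corp", "founded": 1999}'}
    ])
    .build(max_tokens=800)  # lower-priority sections are dropped first when the budget is tight
)

cot_prompt = (
    ChainOfThoughtPrompt("Classify the support ticket by urgency.")
    .add_step("Identify the reported impact")
    .add_step("Check for outages or hard deadlines")
    .add_step("Map the findings to low / medium / high urgency")
    .build()
)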

Few-Shot Example Selection

from dataclasses import dataclass, field
from typing import Any, Optional
from abc import ABC, abstractmethod
import numpy as np

@dataclass
class Example:
    """A few-shot example."""
    
    input: str
    output: str
    embedding: Optional[np.ndarray] = None
    metadata: dict = field(default_factory=dict)

class ExampleSelector(ABC):
    """Abstract example selector."""
    
    @abstractmethod
    def select(self, query: str, k: int = 3) -> list[Example]:
        """Select examples for query."""
        pass

class RandomSelector(ExampleSelector):
    """Random example selection."""
    
    def __init__(self, examples: list[Example], seed: int = 42):
        self.examples = examples
        self.rng = np.random.RandomState(seed)
    
    def select(self, query: str, k: int = 3) -> list[Example]:
        """Select random examples."""
        
        indices = self.rng.choice(len(self.examples), size=min(k, len(self.examples)), replace=False)
        return [self.examples[i] for i in indices]

class SemanticSelector(ExampleSelector):
    """Semantic similarity-based selection."""
    
    def __init__(self, examples: list[Example], embedding_model: Any):
        self.examples = examples
        self.embedder = embedding_model
        
        # Pre-compute embeddings
        for ex in self.examples:
            if ex.embedding is None:
                ex.embedding = self.embedder.embed(ex.input).vector
    
    def select(self, query: str, k: int = 3) -> list[Example]:
        """Select most similar examples."""
        
        query_embedding = self.embedder.embed(query).vector
        
        # Calculate similarities
        similarities = []
        for ex in self.examples:
            sim = np.dot(query_embedding, ex.embedding) / (
                np.linalg.norm(query_embedding) * np.linalg.norm(ex.embedding)
            )
            similarities.append(sim)
        
        # Get top-k
        top_indices = np.argsort(similarities)[-k:][::-1]
        return [self.examples[i] for i in top_indices]

class DiversitySelector(ExampleSelector):
    """Select diverse examples."""
    
    def __init__(self, examples: list[Example], embedding_model: Any):
        self.examples = examples
        self.embedder = embedding_model
        
        for ex in self.examples:
            if ex.embedding is None:
                ex.embedding = self.embedder.embed(ex.input).vector
    
    def select(self, query: str, k: int = 3) -> list[Example]:
        """Select diverse examples using MMR."""
        
        query_embedding = self.embedder.embed(query).vector
        
        # Calculate query similarities
        query_sims = []
        for ex in self.examples:
            sim = np.dot(query_embedding, ex.embedding) / (
                np.linalg.norm(query_embedding) * np.linalg.norm(ex.embedding)
            )
            query_sims.append(sim)
        
        # MMR selection
        selected = []
        remaining = list(range(len(self.examples)))
        
        for _ in range(k):
            if not remaining:
                break
            
            best_idx = None
            best_score = -float('inf')
            
            for idx in remaining:
                # Relevance to query
                relevance = query_sims[idx]
                
                # Max similarity to already selected
                if selected:
                    max_sim = max(
                        np.dot(self.examples[idx].embedding, self.examples[s].embedding) / (
                            np.linalg.norm(self.examples[idx].embedding) * 
                            np.linalg.norm(self.examples[s].embedding)
                        )
                        for s in selected
                    )
                else:
                    max_sim = 0
                
                # MMR score
                lambda_param = 0.5
                score = lambda_param * relevance - (1 - lambda_param) * max_sim
                
                if score > best_score:
                    best_score = score
                    best_idx = idx
            
            selected.append(best_idx)
            remaining.remove(best_idx)
        
        return [self.examples[i] for i in selected]

class CoverageSelector(ExampleSelector):
    """Select examples that cover different aspects."""
    
    def __init__(self, examples: list[Example], categories: dict[str, list[int]]):
        self.examples = examples
        self.categories = categories  # category -> example indices
    
    def select(self, query: str, k: int = 3) -> list[Example]:
        """Select examples covering different categories."""
        
        selected = []
        categories_used = set()
        
        # First, select one from each category
        for category, indices in self.categories.items():
            if len(selected) >= k:
                break
            
            if category not in categories_used and indices:
                selected.append(self.examples[indices[0]])
                categories_used.add(category)
        
        # Fill remaining slots with unused examples, comparing by identity
        # (dataclass equality on NumPy embeddings is unreliable)
        selected_ids = {id(ex) for ex in selected}
        remaining = [
            ex for ex in self.examples
            if id(ex) not in selected_ids
        ]
        
        while len(selected) < k and remaining:
            selected.append(remaining.pop(0))
        
        return selected

class AdaptiveSelector(ExampleSelector):
    """Adapt selection based on task performance."""
    
    def __init__(self, examples: list[Example]):
        self.examples = examples
        self.performance: dict[int, list[float]] = {i: [] for i in range(len(examples))}
    
    def select(self, query: str, k: int = 3) -> list[Example]:
        """Select based on historical performance."""
        
        # Calculate average performance
        avg_performance = {}
        for idx, scores in self.performance.items():
            if scores:
                avg_performance[idx] = sum(scores) / len(scores)
            else:
                avg_performance[idx] = 0.5  # Default
        
        # Sort by performance
        sorted_indices = sorted(avg_performance.keys(), key=lambda i: avg_performance[i], reverse=True)
        
        return [self.examples[i] for i in sorted_indices[:k]]
    
    def record_performance(self, example_indices: list[int], score: float):
        """Record performance for examples."""
        
        for idx in example_indices:
            self.performance[idx].append(score)
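
A usage sketch tying a selector into prompt construction. The embedder object is assumed to expose the same embed(text).vector interface the selectors above rely on:

examples = [
    Example(input="The food arrived cold and late", output="negative"),
    Example(input="Fast shipping and great quality", output="positive"),
    Example(input="It works, nothing special", output="neutral"),
]

selector = SemanticSelector(examples, embedding_model=embedder)
chosen = selector.select("Delivery took two weeks and support never replied", k=2)

prompt = (
    StructuredPrompt()
    .add_task("Classify the sentiment of the customer review.")
    .add_examples([{"input": ex.input, "output": ex.output} for ex in chosen])
    .build()
)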

Automatic Prompt Optimization

from dataclasses import dataclass, field
from typing import Any, Callable, Optional
from abc import ABC, abstractmethod
import numpy as np

@dataclass
class OptimizationResult:
    """Result of prompt optimization."""
    
    original_prompt: str
    optimized_prompt: str
    original_score: float
    optimized_score: float
    iterations: int
    history: list[dict] = field(default_factory=list)

class PromptOptimizer(ABC):
    """Abstract prompt optimizer."""
    
    @abstractmethod
    async def optimize(
        self,
        prompt: str,
        eval_fn: Callable,
        max_iterations: int = 10
    ) -> OptimizationResult:
        """Optimize prompt."""
        pass

class GradientFreeOptimizer(PromptOptimizer):
    """Optimize prompts without gradients."""
    
    def __init__(self, llm_client: Any):
        self.llm = llm_client
    
    async def optimize(
        self,
        prompt: str,
        eval_fn: Callable,
        max_iterations: int = 10
    ) -> OptimizationResult:
        """Optimize using LLM-based refinement."""
        
        current_prompt = prompt
        current_score = await eval_fn(current_prompt)
        
        history = [{"prompt": current_prompt, "score": current_score}]
        
        for i in range(max_iterations):
            # Generate variations
            variations = await self._generate_variations(current_prompt)
            
            # Evaluate variations
            best_variation = current_prompt
            best_score = current_score
            
            for variation in variations:
                score = await eval_fn(variation)
                
                if score > best_score:
                    best_variation = variation
                    best_score = score
            
            # Update if improved
            if best_score > current_score:
                current_prompt = best_variation
                current_score = best_score
                history.append({"prompt": current_prompt, "score": current_score})
            else:
                # No improvement, try different strategy
                current_prompt = await self._refine_prompt(current_prompt, history)
                current_score = await eval_fn(current_prompt)
                history.append({"prompt": current_prompt, "score": current_score})
        
        return OptimizationResult(
            original_prompt=prompt,
            optimized_prompt=current_prompt,
            original_score=history[0]["score"],
            optimized_score=current_score,
            iterations=len(history) - 1,
            history=history
        )
    
    async def _generate_variations(self, prompt: str) -> list[str]:
        """Generate prompt variations."""
        
        meta_prompt = f"""Generate 3 variations of this prompt that might perform better.
Keep the core task the same but try different:
- Wording and phrasing
- Structure and organization
- Level of detail
- Tone and style

Original prompt:
{prompt}

Variations (separated by ---):"""
        
        response = await self.llm.complete(meta_prompt)
        
        variations = response.content.split('---')
        return [v.strip() for v in variations if v.strip()]
    
    async def _refine_prompt(self, prompt: str, history: list[dict]) -> str:
        """Refine prompt based on history."""
        
        history_text = "\n".join(
            f"Score {h['score']:.2f}: {h['prompt'][:100]}..."
            for h in history[-3:]
        )
        
        meta_prompt = f"""Analyze these prompt attempts and their scores, then create an improved version.

History:
{history_text}

Current prompt:
{prompt}

Create an improved prompt that addresses weaknesses in previous attempts:"""
        
        response = await self.llm.complete(meta_prompt)
        return response.content.strip()

class DSPyOptimizer(PromptOptimizer):
    """DSPy-style prompt optimization."""
    
    def __init__(self, llm_client: Any):
        self.llm = llm_client
    
    async def optimize(
        self,
        prompt: str,
        eval_fn: Callable,
        max_iterations: int = 10,
        train_examples: Optional[list[dict]] = None
    ) -> OptimizationResult:
        """Optimize using DSPy-style compilation."""
        
        # Extract signature from prompt
        signature = self._extract_signature(prompt)
        
        # Bootstrap examples
        if train_examples:
            bootstrapped = await self._bootstrap_examples(signature, train_examples)
        else:
            bootstrapped = []
        
        # Optimize instruction
        best_instruction = await self._optimize_instruction(
            signature,
            bootstrapped,
            eval_fn,
            max_iterations
        )
        
        # Build optimized prompt
        optimized = self._build_prompt(signature, best_instruction, bootstrapped)
        
        original_score = await eval_fn(prompt)
        optimized_score = await eval_fn(optimized)
        
        return OptimizationResult(
            original_prompt=prompt,
            optimized_prompt=optimized,
            original_score=original_score,
            optimized_score=optimized_score,
            iterations=max_iterations,
            history=[]
        )
    
    def _extract_signature(self, prompt: str) -> dict:
        """Extract input/output signature from prompt."""
        
        # Simplified extraction
        return {
            "inputs": ["input"],
            "outputs": ["output"],
            "task": prompt
        }
    
    async def _bootstrap_examples(
        self,
        signature: dict,
        train_examples: list[dict]
    ) -> list[dict]:
        """Bootstrap high-quality examples."""
        
        bootstrapped = []
        
        for example in train_examples[:10]:
            # Generate reasoning trace
            prompt = f"""Task: {signature['task']}

Input: {example['input']}

Think step by step to produce the output:"""
            
            response = await self.llm.complete(prompt)
            
            bootstrapped.append({
                "input": example["input"],
                "reasoning": response.content,
                "output": example.get("output", "")
            })
        
        return bootstrapped
    
    async def _optimize_instruction(
        self,
        signature: dict,
        examples: list[dict],
        eval_fn: Callable,
        max_iterations: int
    ) -> str:
        """Optimize the instruction."""
        
        current_instruction = signature["task"]
        # Score the starting instruction so candidates must beat the baseline
        best_score = await eval_fn(self._build_prompt(signature, current_instruction, examples))
        
        for _ in range(max_iterations):
            # Generate instruction candidates
            candidates = await self._generate_instructions(current_instruction, examples)
            
            # Evaluate each
            for candidate in candidates:
                test_prompt = self._build_prompt(signature, candidate, examples)
                score = await eval_fn(test_prompt)
                
                if score > best_score:
                    best_score = score
                    current_instruction = candidate
        
        return current_instruction
    
    async def _generate_instructions(
        self,
        current: str,
        examples: list[dict]
    ) -> list[str]:
        """Generate instruction candidates."""
        
        prompt = f"""Generate 3 improved versions of this instruction.

Current instruction: {current}

Example inputs/outputs:
{examples[:2]}

Improved instructions (one per line):"""
        
        response = await self.llm.complete(prompt)
        return [line.strip() for line in response.content.split('\n') if line.strip()]
    
    def _build_prompt(
        self,
        signature: dict,
        instruction: str,
        examples: list[dict]
    ) -> str:
        """Build prompt from components."""
        
        examples_text = "\n\n".join(
            f"Input: {ex['input']}\nOutput: {ex['output']}"
            for ex in examples[:3]
        )
        
        return f"""{instruction}

Examples:
{examples_text}

Now process the following:
Input: {{input}}
Output:"""

class EvolutionaryOptimizer(PromptOptimizer):
    """Evolutionary prompt optimization."""
    
    def __init__(self, llm_client: Any, population_size: int = 10):
        self.llm = llm_client
        self.population_size = population_size
    
    async def optimize(
        self,
        prompt: str,
        eval_fn: Callable,
        max_iterations: int = 10
    ) -> OptimizationResult:
        """Optimize using evolutionary algorithm."""
        
        # Initialize population
        population = [prompt]
        for _ in range(self.population_size - 1):
            mutated = await self._mutate(prompt)
            population.append(mutated)
        
        history = []
        
        for generation in range(max_iterations):
            # Evaluate fitness
            fitness = []
            for individual in population:
                score = await eval_fn(individual)
                fitness.append((individual, score))
            
            # Sort by fitness
            fitness.sort(key=lambda x: x[1], reverse=True)
            
            # Record best
            history.append({
                "generation": generation,
                "best_score": fitness[0][1],
                "best_prompt": fitness[0][0]
            })
            
            # Selection (top half)
            survivors = [f[0] for f in fitness[:self.population_size // 2]]
            
            # Reproduction
            new_population = survivors.copy()
            
            while len(new_population) < self.population_size:
                # Crossover
                parent1, parent2 = np.random.choice(survivors, 2, replace=False)
                child = await self._crossover(parent1, parent2)
                
                # Mutation
                if np.random.random() < 0.3:
                    child = await self._mutate(child)
                
                new_population.append(child)
            
            population = new_population
        
        # Return best
        best = max(history, key=lambda h: h["best_score"])
        
        return OptimizationResult(
            original_prompt=prompt,
            optimized_prompt=best["best_prompt"],
            original_score=history[0]["best_score"],
            optimized_score=best["best_score"],
            iterations=max_iterations,
            history=history
        )
    
    async def _mutate(self, prompt: str) -> str:
        """Mutate a prompt."""
        
        mutation_prompt = f"""Slightly modify this prompt while keeping its core meaning.
Make one small change (word choice, structure, or detail level).

Original: {prompt}

Modified:"""
        
        response = await self.llm.complete(mutation_prompt)
        return response.content.strip()
    
    async def _crossover(self, parent1: str, parent2: str) -> str:
        """Crossover two prompts."""
        
        crossover_prompt = f"""Combine the best elements of these two prompts into one.

Prompt 1: {parent1}

Prompt 2: {parent2}

Combined prompt:"""
        
        response = await self.llm.complete(crossover_prompt)
        return response.content.strip()
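
A usage sketch for running one of the optimizers above. It assumes an async llm_client exposing complete(prompt) whose response has a .content attribute (the same interface the optimizers expect), and uses exact-match accuracy over a small dev set as the evaluation function:

import asyncio

dev_set = [
    {"input": "2 + 2", "output": "4"},
    {"input": "3 * 5", "output": "15"},
]

async def eval_fn(prompt: str) -> float:
    correct = 0
    for case in dev_set:
        response = await llm_client.complete(prompt.replace("{input}", case["input"]))
        correct += int(case["output"] in response.content)
    return correct / len(dev_set)

async def main():
    optimizer = GradientFreeOptimizer(llm_client)
    result = await optimizer.optimize(
        "Solve the arithmetic problem: {input}",
        eval_fn,
        max_iterations=5
    )
    print(f"score: {result.original_score:.2f} -> {result.optimized_score:.2f}")
    print(result.optimized_prompt)

# asyncio.run(main())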

A/B Testing Framework

from dataclasses import dataclass, field
from typing import Optional
from datetime import datetime
import numpy as np
from scipy import stats

@dataclass
class Variant:
    """A prompt variant."""
    
    id: str
    prompt: str
    weight: float = 1.0

@dataclass
class ExperimentResult:
    """Result of an A/B test."""
    
    variant_id: str
    input: str
    output: str
    metrics: dict
    timestamp: datetime = field(default_factory=datetime.now)

class ABTest:
    """A/B test for prompts."""
    
    def __init__(
        self,
        name: str,
        variants: list[Variant],
        metrics: list[str]
    ):
        self.name = name
        self.variants = {v.id: v for v in variants}
        self.metrics = metrics
        self.results: list[ExperimentResult] = []
        
        # Normalize weights
        total_weight = sum(v.weight for v in variants)
        self.weights = {v.id: v.weight / total_weight for v in variants}
    
    def select_variant(self) -> Variant:
        """Select variant based on weights."""
        
        ids = list(self.weights.keys())
        weights = [self.weights[id] for id in ids]
        
        selected_id = np.random.choice(ids, p=weights)
        return self.variants[selected_id]
    
    def record_result(
        self,
        variant_id: str,
        input: str,
        output: str,
        metrics: dict
    ):
        """Record experiment result."""
        
        self.results.append(ExperimentResult(
            variant_id=variant_id,
            input=input,
            output=output,
            metrics=metrics
        ))
    
    def analyze(self) -> dict:
        """Analyze experiment results."""
        
        analysis = {}
        
        for metric in self.metrics:
            metric_analysis = {}
            
            for variant_id in self.variants:
                values = [
                    r.metrics.get(metric, 0)
                    for r in self.results
                    if r.variant_id == variant_id
                ]
                
                if values:
                    metric_analysis[variant_id] = {
                        "mean": np.mean(values),
                        "std": np.std(values),
                        "count": len(values)
                    }
            
            # Statistical significance
            if len(metric_analysis) == 2:
                ids = list(metric_analysis.keys())
                values1 = [r.metrics.get(metric, 0) for r in self.results if r.variant_id == ids[0]]
                values2 = [r.metrics.get(metric, 0) for r in self.results if r.variant_id == ids[1]]
                
                if len(values1) > 1 and len(values2) > 1:
                    t_stat, p_value = stats.ttest_ind(values1, values2)
                    metric_analysis["significance"] = {
                        "t_statistic": t_stat,
                        "p_value": p_value,
                        "significant": p_value < 0.05
                    }
            
            analysis[metric] = metric_analysis
        
        return analysis
    
    def get_winner(self, metric: str) -> Optional[str]:
        """Get winning variant for metric."""
        
        analysis = self.analyze()
        
        if metric not in analysis:
            return None
        
        metric_data = analysis[metric]
        
        # Check significance
        if "significance" in metric_data and not metric_data["significance"]["significant"]:
            return None  # No significant winner
        
        # Find best
        best_id = None
        best_mean = -float('inf')
        
        for variant_id, data in metric_data.items():
            if variant_id == "significance":
                continue
            
            if data["mean"] > best_mean:
                best_mean = data["mean"]
                best_id = variant_id
        
        return best_id

class MultiArmedBandit:
    """Multi-armed bandit for prompt selection."""
    
    def __init__(self, variants: list[Variant], epsilon: float = 0.1):
        self.variants = {v.id: v for v in variants}
        self.epsilon = epsilon
        self.rewards: dict[str, list[float]] = {v.id: [] for v in variants}
    
    def select_variant(self) -> Variant:
        """Select using epsilon-greedy."""
        
        if np.random.random() < self.epsilon:
            # Explore
            return np.random.choice(list(self.variants.values()))
        
        # Exploit
        best_id = None
        best_mean = -float('inf')
        
        for variant_id, rewards in self.rewards.items():
            mean = np.mean(rewards) if rewards else 0
            if mean > best_mean:
                best_mean = mean
                best_id = variant_id
        
        return self.variants[best_id]
    
    def record_reward(self, variant_id: str, reward: float):
        """Record reward for variant."""
        
        self.rewards[variant_id].append(reward)

class ThompsonSampling:
    """Thompson sampling for prompt selection."""
    
    def __init__(self, variants: list[Variant]):
        self.variants = {v.id: v for v in variants}
        # Beta distribution parameters
        self.alpha: dict[str, float] = {v.id: 1.0 for v in variants}
        self.beta: dict[str, float] = {v.id: 1.0 for v in variants}
    
    def select_variant(self) -> Variant:
        """Select using Thompson sampling."""
        
        samples = {}
        
        for variant_id in self.variants:
            samples[variant_id] = np.random.beta(
                self.alpha[variant_id],
                self.beta[variant_id]
            )
        
        best_id = max(samples, key=samples.get)
        return self.variants[best_id]
    
    def record_result(self, variant_id: str, success: bool):
        """Record result (success/failure)."""
        
        if success:
            self.alpha[variant_id] += 1
        else:
            self.beta[variant_id] += 1
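
A usage sketch for routing live traffic between two prompt variants. run_prompt and score_response are hypothetical helpers for the model call and the quality metric:

variants = [
    Variant(id="v1", prompt="Summarize the article in three bullet points."),
    Variant(id="v2", prompt="Write a concise three-bullet executive summary of the article."),
]

ab_test = ABTest(name="summary-style", variants=variants, metrics=["quality"])

for article in incoming_articles:
    variant = ab_test.select_variant()
    output = run_prompt(variant.prompt, article)
    ab_test.record_result(
        variant_id=variant.id,
        input=article,
        output=output,
        metrics={"quality": score_response(output)}
    )

print(ab_test.analyze()["quality"])
print("winner:", ab_test.get_winner("quality"))

# Thompson sampling shifts traffic toward the better variant as evidence accumulates:
ts = ThompsonSampling(variants)
chosen = ts.select_variant()
ts.record_result(chosen.id, success=True)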

Production Prompt Management

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
from datetime import datetime

app = FastAPI()

class PromptVersion:
    """A versioned prompt."""
    
    def __init__(
        self,
        prompt_id: str,
        version: int,
        content: str,
        metadata: Optional[dict] = None
    ):
        self.prompt_id = prompt_id
        self.version = version
        self.content = content
        self.metadata = metadata or {}
        self.created_at = datetime.now()
        self.is_active = False

class PromptRegistry:
    """Registry for managing prompts."""
    
    def __init__(self):
        self.prompts: dict[str, dict[int, PromptVersion]] = {}
        self.active_versions: dict[str, int] = {}
    
    def register(
        self,
        prompt_id: str,
        content: str,
        metadata: Optional[dict] = None
    ) -> PromptVersion:
        """Register new prompt version."""
        
        if prompt_id not in self.prompts:
            self.prompts[prompt_id] = {}
        
        version = len(self.prompts[prompt_id]) + 1
        
        prompt_version = PromptVersion(
            prompt_id=prompt_id,
            version=version,
            content=content,
            metadata=metadata
        )
        
        self.prompts[prompt_id][version] = prompt_version
        
        # Auto-activate if first version
        if version == 1:
            self.activate(prompt_id, version)
        
        return prompt_version
    
    def activate(self, prompt_id: str, version: int):
        """Activate a prompt version."""
        
        if prompt_id not in self.prompts:
            raise ValueError(f"Unknown prompt: {prompt_id}")
        
        if version not in self.prompts[prompt_id]:
            raise ValueError(f"Unknown version: {version}")
        
        # Deactivate current
        if prompt_id in self.active_versions:
            current = self.active_versions[prompt_id]
            self.prompts[prompt_id][current].is_active = False
        
        # Activate new
        self.prompts[prompt_id][version].is_active = True
        self.active_versions[prompt_id] = version
    
    def get_active(self, prompt_id: str) -> Optional[PromptVersion]:
        """Get active version of prompt."""
        
        if prompt_id not in self.active_versions:
            return None
        
        version = self.active_versions[prompt_id]
        return self.prompts[prompt_id][version]
    
    def get_version(self, prompt_id: str, version: int) -> Optional[PromptVersion]:
        """Get specific version."""
        
        if prompt_id not in self.prompts:
            return None
        
        return self.prompts[prompt_id].get(version)
    
    def list_versions(self, prompt_id: str) -> list[PromptVersion]:
        """List all versions of a prompt."""
        
        if prompt_id not in self.prompts:
            return []
        
        return list(self.prompts[prompt_id].values())

# Global registry
registry = PromptRegistry()

class RegisterRequest(BaseModel):
    prompt_id: str
    content: str
    metadata: Optional[dict] = None

class ActivateRequest(BaseModel):
    prompt_id: str
    version: int

class PromptResponse(BaseModel):
    prompt_id: str
    version: int
    content: str
    is_active: bool
    created_at: str

@app.post("/v1/prompts")
async def register_prompt(request: RegisterRequest) -> PromptResponse:
    """Register new prompt version."""
    
    version = registry.register(
        prompt_id=request.prompt_id,
        content=request.content,
        metadata=request.metadata
    )
    
    return PromptResponse(
        prompt_id=version.prompt_id,
        version=version.version,
        content=version.content,
        is_active=version.is_active,
        created_at=version.created_at.isoformat()
    )

@app.post("/v1/prompts/activate")
async def activate_prompt(request: ActivateRequest) -> dict:
    """Activate prompt version."""
    
    try:
        registry.activate(request.prompt_id, request.version)
        return {"status": "activated"}
    except ValueError as e:
        raise HTTPException(status_code=404, detail=str(e))

@app.get("/v1/prompts/{prompt_id}")
async def get_prompt(prompt_id: str) -> PromptResponse:
    """Get active prompt."""
    
    version = registry.get_active(prompt_id)
    
    if not version:
        raise HTTPException(status_code=404, detail="Prompt not found")
    
    return PromptResponse(
        prompt_id=version.prompt_id,
        version=version.version,
        content=version.content,
        is_active=version.is_active,
        created_at=version.created_at.isoformat()
    )

@app.get("/v1/prompts/{prompt_id}/versions")
async def list_versions(prompt_id: str) -> list[PromptResponse]:
    """List all versions."""
    
    versions = registry.list_versions(prompt_id)
    
    return [
        PromptResponse(
            prompt_id=v.prompt_id,
            version=v.version,
            content=v.content,
            is_active=v.is_active,
            created_at=v.created_at.isoformat()
        )
        for v in versions
    ]

@app.get("/health")
async def health():
    return {"status": "healthy"}
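
A client-side sketch for the registry API above, assuming the service is running locally on port 8000 (for example via uvicorn):

import requests

BASE_URL = "http://localhost:8000"

# Register a new version, then activate it once it has been validated
resp = requests.post(f"{BASE_URL}/v1/prompts", json={
    "prompt_id": "ticket-triage",
    "content": "Classify the support ticket by urgency: {ticket}",
    "metadata": {"owner": "platform-team"}
})
new_version = resp.json()["version"]

requests.post(f"{BASE_URL}/v1/prompts/activate", json={
    "prompt_id": "ticket-triage",
    "version": new_version
})

# Fetch the active prompt at request time so activations and rollbacks take effect immediately
active = requests.get(f"{BASE_URL}/v1/prompts/ticket-triage").json()
print(active["version"], active["content"])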

Conclusion

Prompt optimization transforms prompt engineering from art to science. Start with structured prompts—clear sections for system instructions, context, task, examples, constraints, and output format make prompts easier to understand and modify. Few-shot example selection matters more than example count; semantic similarity and diversity-based selection outperform random selection. Automatic optimization with frameworks like DSPy can discover prompts that outperform hand-crafted ones, especially when you have evaluation data. A/B testing is essential for production; use multi-armed bandits or Thompson sampling to balance exploration and exploitation. Version your prompts like code—track changes, maintain rollback capability, and associate prompts with their performance metrics. Monitor prompt performance continuously; model updates and data drift can degrade prompt effectiveness over time. The key insight is that prompts are not static artifacts—they should evolve based on measured performance, and the infrastructure for testing and deploying prompt changes is as important as the prompts themselves.
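
As a closing illustration of the monitor-and-roll-back loop described above, here is a minimal sketch that ties the PromptRegistry to an online quality signal; measure_quality is a hypothetical stand-in for your own evaluation.

def check_and_rollback(registry: PromptRegistry, prompt_id: str,
                       measure_quality, threshold: float = 0.8) -> str:
    """Roll back to the previous version if the active prompt's quality drops."""
    active = registry.get_active(prompt_id)
    score = measure_quality(active.content)
    if score < threshold and active.version > 1:
        registry.activate(prompt_id, active.version - 1)
        return f"rolled back {prompt_id} to v{active.version - 1} (score {score:.2f})"
    return f"{prompt_id} v{active.version} healthy (score {score:.2f})"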


