Introduction: Prompts are code. They define your application’s behavior and should be managed with the same rigor as source code. Yet many teams treat prompts as ad-hoc strings scattered throughout the codebase, making it impossible to track changes, compare versions, or systematically improve performance. This guide covers practical prompt management: version control for prompts, A/B testing frameworks for comparing variants, statistical analysis for making data-driven decisions, and deployment strategies for rolling out prompt changes safely. Whether you’re optimizing a single prompt or managing hundreds across your organization, systematic prompt versioning turns prompt engineering from guesswork into an engineering discipline.

Prompt Version Control
```python
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import hashlib
import json


@dataclass
class PromptVersion:
    """A versioned prompt."""
    id: str
    name: str
    version: int
    content: str
    variables: list[str]
    metadata: dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.utcnow)
    created_by: Optional[str] = None
    parent_version: Optional[int] = None

    @property
    def content_hash(self) -> str:
        """Hash of prompt content."""
        return hashlib.sha256(self.content.encode()).hexdigest()[:12]


@dataclass
class PromptDiff:
    """Difference between prompt versions."""
    old_version: int
    new_version: int
    old_content: str
    new_content: str
    changes: list[dict]


class PromptRegistry:
    """Registry for versioned prompts."""

    def __init__(self):
        self._prompts: dict[str, list[PromptVersion]] = {}
        self._active_versions: dict[str, int] = {}

    def register(
        self,
        name: str,
        content: str,
        variables: Optional[list[str]] = None,
        metadata: Optional[dict] = None,
        created_by: Optional[str] = None
    ) -> PromptVersion:
        """Register a new prompt version."""
        # Get next version number
        versions = self._prompts.get(name, [])
        version_num = len(versions) + 1
        parent = versions[-1].version if versions else None

        # Create version
        prompt = PromptVersion(
            id=f"{name}_v{version_num}",
            name=name,
            version=version_num,
            content=content,
            variables=variables or self._extract_variables(content),
            metadata=metadata or {},
            created_by=created_by,
            parent_version=parent
        )

        # Store
        if name not in self._prompts:
            self._prompts[name] = []
        self._prompts[name].append(prompt)

        # Set as active if first version
        if version_num == 1:
            self._active_versions[name] = version_num

        return prompt

    def get(
        self,
        name: str,
        version: Optional[int] = None
    ) -> Optional[PromptVersion]:
        """Get a prompt by name and version."""
        versions = self._prompts.get(name, [])
        if not versions:
            return None
        if version is None:
            version = self._active_versions.get(name, len(versions))
        for v in versions:
            if v.version == version:
                return v
        return None

    def get_active(self, name: str) -> Optional[PromptVersion]:
        """Get the active version of a prompt."""
        version = self._active_versions.get(name)
        return self.get(name, version)

    def set_active(self, name: str, version: int):
        """Set the active version of a prompt."""
        if name not in self._prompts:
            raise ValueError(f"Prompt not found: {name}")
        versions = [v.version for v in self._prompts[name]]
        if version not in versions:
            raise ValueError(f"Version not found: {version}")
        self._active_versions[name] = version

    def list_versions(self, name: str) -> list[PromptVersion]:
        """List all versions of a prompt."""
        return self._prompts.get(name, [])

    def diff(
        self,
        name: str,
        old_version: int,
        new_version: int
    ) -> PromptDiff:
        """Get diff between versions."""
        old = self.get(name, old_version)
        new = self.get(name, new_version)
        if not old or not new:
            raise ValueError("Version not found")

        # Simple line-based diff
        old_lines = old.content.split('\n')
        new_lines = new.content.split('\n')
        changes = []
        for i, (o, n) in enumerate(zip(old_lines, new_lines)):
            if o != n:
                changes.append({
                    "line": i + 1,
                    "old": o,
                    "new": n
                })

        return PromptDiff(
            old_version=old_version,
            new_version=new_version,
            old_content=old.content,
            new_content=new.content,
            changes=changes
        )

    def _extract_variables(self, content: str) -> list[str]:
        """Extract variable names like {variable} from the prompt."""
        import re
        return list(set(re.findall(r'\{(\w+)\}', content)))


class GitPromptStore:
    """Store prompts as files in a Git repository for version control."""

    def __init__(self, repo_path: str):
        self.repo_path = repo_path
        self.prompts_dir = f"{repo_path}/prompts"

    def save(self, prompt: PromptVersion):
        """Save prompt content and metadata to the repository."""
        import os

        # Create directory
        prompt_dir = f"{self.prompts_dir}/{prompt.name}"
        os.makedirs(prompt_dir, exist_ok=True)

        # Save prompt content
        content_file = f"{prompt_dir}/v{prompt.version}.txt"
        with open(content_file, 'w') as f:
            f.write(prompt.content)

        # Save metadata
        meta_file = f"{prompt_dir}/v{prompt.version}.json"
        with open(meta_file, 'w') as f:
            json.dump({
                "id": prompt.id,
                "name": prompt.name,
                "version": prompt.version,
                "variables": prompt.variables,
                "metadata": prompt.metadata,
                "created_at": prompt.created_at.isoformat(),
                "created_by": prompt.created_by,
                "content_hash": prompt.content_hash
            }, f, indent=2)

    def load(self, name: str, version: int) -> Optional[PromptVersion]:
        """Load a prompt version from the repository."""
        import os

        content_file = f"{self.prompts_dir}/{name}/v{version}.txt"
        meta_file = f"{self.prompts_dir}/{name}/v{version}.json"
        if not os.path.exists(content_file):
            return None

        with open(content_file, 'r') as f:
            content = f.read()
        with open(meta_file, 'r') as f:
            meta = json.load(f)

        return PromptVersion(
            id=meta["id"],
            name=meta["name"],
            version=meta["version"],
            content=content,
            variables=meta["variables"],
            metadata=meta.get("metadata", {}),
            created_at=datetime.fromisoformat(meta["created_at"]),
            created_by=meta.get("created_by")
        )
```
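To make the registry concrete, here is a minimal usage sketch; the prompt name, content, and author are made up for illustration, and the `set_active` call at the end is how a reviewed version would be promoted.

```python
# Hypothetical usage of PromptRegistry (names and prompt text are illustrative).
registry = PromptRegistry()

v1 = registry.register(
    name="support_reply",
    content="You are a helpful support agent. Answer the question: {question}",
    created_by="alice"
)
v2 = registry.register(
    name="support_reply",
    content="You are a concise support agent. Answer the question: {question}\nCite sources.",
    created_by="alice"
)

print(v1.version, v2.version)                         # 1 2
print(v2.variables)                                   # ['question']
print(registry.get_active("support_reply").version)   # 1 (first version stays active)

diff = registry.diff("support_reply", 1, 2)
print(diff.changes)                                   # changed lines, paired by line number

registry.set_active("support_reply", 2)               # promote v2 once it has been reviewed
```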
A/B Testing Framework
```python
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import hashlib
import random


@dataclass
class Variant:
    """A variant in an A/B test."""
    name: str
    prompt_name: str
    prompt_version: int
    weight: float = 1.0


@dataclass
class Assignment:
    """User assignment to a variant."""
    experiment_id: str
    variant_name: str
    user_id: str
    assigned_at: datetime = field(default_factory=datetime.utcnow)


@dataclass
class Experiment:
    """An A/B test experiment."""
    id: str
    name: str
    description: str
    variants: list[Variant]
    start_time: Optional[datetime] = None
    end_time: Optional[datetime] = None
    status: str = "draft"  # draft, running, paused, completed

    @property
    def is_active(self) -> bool:
        if self.status != "running":
            return False
        now = datetime.utcnow()
        if self.start_time and now < self.start_time:
            return False
        if self.end_time and now > self.end_time:
            return False
        return True


class ABTestManager:
    """Manage A/B tests for prompts."""

    def __init__(self, registry: PromptRegistry):
        self.registry = registry
        self._experiments: dict[str, Experiment] = {}
        self._assignments: dict[str, dict[str, Assignment]] = {}

    def create_experiment(
        self,
        name: str,
        description: str,
        variants: list[dict]
    ) -> Experiment:
        """Create a new experiment."""
        exp_id = f"exp_{hashlib.md5(name.encode()).hexdigest()[:8]}"
        variant_objects = [
            Variant(
                name=v["name"],
                prompt_name=v["prompt_name"],
                prompt_version=v["prompt_version"],
                weight=v.get("weight", 1.0)
            )
            for v in variants
        ]
        experiment = Experiment(
            id=exp_id,
            name=name,
            description=description,
            variants=variant_objects
        )
        self._experiments[exp_id] = experiment
        self._assignments[exp_id] = {}
        return experiment

    def start_experiment(self, experiment_id: str):
        """Start an experiment."""
        exp = self._experiments.get(experiment_id)
        if not exp:
            raise ValueError(f"Experiment not found: {experiment_id}")
        exp.status = "running"
        exp.start_time = datetime.utcnow()

    def stop_experiment(self, experiment_id: str):
        """Stop an experiment."""
        exp = self._experiments.get(experiment_id)
        if not exp:
            raise ValueError(f"Experiment not found: {experiment_id}")
        exp.status = "completed"
        exp.end_time = datetime.utcnow()

    def get_variant(
        self,
        experiment_id: str,
        user_id: str
    ) -> Optional[Variant]:
        """Get the variant assigned to a user."""
        exp = self._experiments.get(experiment_id)
        if not exp or not exp.is_active:
            return None

        # Check existing assignment
        assignments = self._assignments.get(experiment_id, {})
        if user_id in assignments:
            variant_name = assignments[user_id].variant_name
            for v in exp.variants:
                if v.name == variant_name:
                    return v

        # Assign new variant
        variant = self._assign_variant(exp, user_id)

        # Store assignment
        self._assignments[experiment_id][user_id] = Assignment(
            experiment_id=experiment_id,
            variant_name=variant.name,
            user_id=user_id
        )
        return variant

    def _assign_variant(
        self,
        experiment: Experiment,
        user_id: str
    ) -> Variant:
        """Assign a variant to a user."""
        # Deterministic assignment based on user_id
        hash_input = f"{experiment.id}:{user_id}"
        hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)

        # Weighted selection driven by the hashed value
        total_weight = sum(v.weight for v in experiment.variants)
        threshold = (hash_value % 1000) / 1000 * total_weight
        cumulative = 0
        for variant in experiment.variants:
            cumulative += variant.weight
            if threshold < cumulative:
                return variant
        return experiment.variants[-1]

    def get_prompt_for_user(
        self,
        experiment_id: str,
        user_id: str
    ) -> Optional[PromptVersion]:
        """Get the prompt for a user in an experiment."""
        variant = self.get_variant(experiment_id, user_id)
        if not variant:
            return None
        return self.registry.get(variant.prompt_name, variant.prompt_version)


class MultiArmedBandit:
    """Multi-armed bandit for adaptive prompt selection."""

    def __init__(self, variants: list[str], epsilon: float = 0.1):
        self.variants = variants
        self.epsilon = epsilon
        self._rewards: dict[str, list[float]] = {v: [] for v in variants}

    def select(self) -> str:
        """Select a variant using epsilon-greedy."""
        if random.random() < self.epsilon:
            # Explore: random selection
            return random.choice(self.variants)
        # Exploit: select the best-performing variant so far
        means = {
            v: sum(r) / len(r) if r else 0
            for v, r in self._rewards.items()
        }
        return max(means, key=means.get)

    def record_reward(self, variant: str, reward: float):
        """Record reward for a variant."""
        if variant in self._rewards:
            self._rewards[variant].append(reward)

    def get_stats(self) -> dict[str, dict]:
        """Get statistics for all variants."""
        return {
            v: {
                "count": len(r),
                "mean": sum(r) / len(r) if r else 0,
                "total": sum(r)
            }
            for v, r in self._rewards.items()
        }
```
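A short sketch of how these pieces fit together, assuming the `support_reply` versions from the previous sketch are already registered; the experiment, variant names, and user ID are illustrative.

```python
# Illustrative wiring of ABTestManager on top of the registry above.
ab = ABTestManager(registry)

exp = ab.create_experiment(
    name="support_reply_tone",
    description="Helpful vs. concise system prompt",
    variants=[
        {"name": "control",   "prompt_name": "support_reply", "prompt_version": 1},
        {"name": "treatment", "prompt_name": "support_reply", "prompt_version": 2},
    ]
)
ab.start_experiment(exp.id)

# The same user always gets the same variant (assignment is hashed, then cached).
prompt = ab.get_prompt_for_user(exp.id, user_id="user-123")
print(prompt.version if prompt else "no active experiment")

# Epsilon-greedy bandit as an adaptive alternative to a fixed split.
bandit = MultiArmedBandit(["control", "treatment"], epsilon=0.1)
arm = bandit.select()
bandit.record_reward(arm, reward=0.8)   # e.g. a quality score for this response
print(bandit.get_stats())
```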
Performance Analysis
```python
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import math


@dataclass
class MetricSample:
    """A metric sample."""
    variant: str
    value: float
    timestamp: datetime = field(default_factory=datetime.utcnow)
    metadata: dict = field(default_factory=dict)


@dataclass
class VariantStats:
    """Statistics for a variant."""
    variant: str
    count: int
    mean: float
    std: float
    min_value: float
    max_value: float
    confidence_interval: tuple[float, float]


@dataclass
class ExperimentResults:
    """Results of an experiment."""
    experiment_id: str
    variants: dict[str, VariantStats]
    winner: Optional[str] = None
    confidence: Optional[float] = None
    is_significant: bool = False


class ExperimentAnalyzer:
    """Analyze A/B test results."""

    def __init__(self, confidence_level: float = 0.95):
        self.confidence_level = confidence_level
        self._samples: dict[str, dict[str, list[MetricSample]]] = {}

    def record_sample(
        self,
        experiment_id: str,
        variant: str,
        value: float,
        metadata: Optional[dict] = None
    ):
        """Record a metric sample."""
        if experiment_id not in self._samples:
            self._samples[experiment_id] = {}
        if variant not in self._samples[experiment_id]:
            self._samples[experiment_id][variant] = []
        self._samples[experiment_id][variant].append(
            MetricSample(
                variant=variant,
                value=value,
                metadata=metadata or {}
            )
        )

    def analyze(self, experiment_id: str) -> ExperimentResults:
        """Analyze experiment results."""
        samples = self._samples.get(experiment_id, {})
        if not samples:
            return ExperimentResults(
                experiment_id=experiment_id,
                variants={}
            )

        # Calculate stats for each variant
        variant_stats = {}
        for variant, variant_samples in samples.items():
            values = [s.value for s in variant_samples]
            stats = self._calculate_stats(variant, values)
            variant_stats[variant] = stats

        # Determine winner
        winner, confidence, is_significant = self._determine_winner(variant_stats)

        return ExperimentResults(
            experiment_id=experiment_id,
            variants=variant_stats,
            winner=winner,
            confidence=confidence,
            is_significant=is_significant
        )

    def _calculate_stats(
        self,
        variant: str,
        values: list[float]
    ) -> VariantStats:
        """Calculate statistics for a variant."""
        n = len(values)
        if n == 0:
            return VariantStats(
                variant=variant,
                count=0,
                mean=0,
                std=0,
                min_value=0,
                max_value=0,
                confidence_interval=(0, 0)
            )

        mean = sum(values) / n
        if n > 1:
            variance = sum((x - mean) ** 2 for x in values) / (n - 1)
            std = math.sqrt(variance)
        else:
            std = 0

        # 95% confidence interval for the mean
        z = 1.96
        margin = z * std / math.sqrt(n)
        ci = (mean - margin, mean + margin)

        return VariantStats(
            variant=variant,
            count=n,
            mean=mean,
            std=std,
            min_value=min(values),
            max_value=max(values),
            confidence_interval=ci
        )

    def _determine_winner(
        self,
        stats: dict[str, VariantStats]
    ) -> tuple[Optional[str], float, bool]:
        """Determine the winning variant."""
        if len(stats) < 2:
            return None, 0, False

        # Find variant with highest mean
        sorted_variants = sorted(
            stats.items(),
            key=lambda x: x[1].mean,
            reverse=True
        )
        best = sorted_variants[0]
        second = sorted_variants[1]

        # Check whether the difference is significant using a
        # two-sample test; valid approximation for large samples.
        n1, n2 = best[1].count, second[1].count
        m1, m2 = best[1].mean, second[1].mean
        s1, s2 = best[1].std, second[1].std

        if n1 < 30 or n2 < 30:
            return best[0], 0, False

        # Standard error of the difference in means
        se = math.sqrt(s1**2 / n1 + s2**2 / n2)
        if se == 0:
            return best[0], 1.0, True

        # Test statistic; for large samples it is approximately N(0, 1)
        t = (m1 - m2) / se
        p_value = 2 * (1 - self._normal_cdf(abs(t)))
        confidence = 1 - p_value
        is_significant = p_value < (1 - self.confidence_level)
        return best[0], confidence, is_significant

    def _normal_cdf(self, x: float) -> float:
        """Standard normal CDF."""
        return 0.5 * (1 + math.erf(x / math.sqrt(2)))


class MetricCollector:
    """Collect metrics for prompt evaluation."""

    def __init__(self, analyzer: ExperimentAnalyzer):
        self.analyzer = analyzer

    def record_quality(
        self,
        experiment_id: str,
        variant: str,
        score: float
    ):
        """Record quality score (0-1)."""
        self.analyzer.record_sample(
            experiment_id,
            variant,
            score,
            {"metric": "quality"}
        )

    def record_latency(
        self,
        experiment_id: str,
        variant: str,
        latency_ms: float
    ):
        """Record latency in milliseconds."""
        # Invert so higher is better
        score = 1 / (1 + latency_ms / 1000)
        self.analyzer.record_sample(
            experiment_id,
            variant,
            score,
            {"metric": "latency", "raw_latency_ms": latency_ms}
        )

    def record_cost(
        self,
        experiment_id: str,
        variant: str,
        cost_usd: float
    ):
        """Record cost in USD."""
        # Invert so higher is better (lower cost)
        score = 1 / (1 + cost_usd)
        self.analyzer.record_sample(
            experiment_id,
            variant,
            score,
            {"metric": "cost", "raw_cost_usd": cost_usd}
        )

    def record_user_feedback(
        self,
        experiment_id: str,
        variant: str,
        rating: int,
        max_rating: int = 5
    ):
        """Record user feedback rating."""
        score = rating / max_rating
        self.analyzer.record_sample(
            experiment_id,
            variant,
            score,
            {"metric": "user_feedback", "rating": rating}
        )
```
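A quick sketch of feeding samples into the analyzer and reading out the results; the experiment ID and scores are fabricated purely to show the flow, and note that the significance check above requires at least 30 samples per variant.

```python
import random  # only used here to fabricate example scores

analyzer = ExperimentAnalyzer(confidence_level=0.95)
collector = MetricCollector(analyzer)

# Fabricated quality scores: the treatment variant is drawn slightly higher.
for _ in range(100):
    collector.record_quality("exp_demo", "control",   random.uniform(0.5, 0.8))
    collector.record_quality("exp_demo", "treatment", random.uniform(0.6, 0.9))

results = analyzer.analyze("exp_demo")
print(results.winner, results.is_significant, round(results.confidence or 0, 3))
for name, stats in results.variants.items():
    print(name, stats.count, round(stats.mean, 3), stats.confidence_interval)
```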
Deployment Strategies
```python
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from typing import Optional
import hashlib


class RolloutStrategy(Enum):
    """Rollout strategies."""
    IMMEDIATE = "immediate"
    GRADUAL = "gradual"
    CANARY = "canary"
    BLUE_GREEN = "blue_green"


@dataclass
class RolloutConfig:
    """Configuration for a rollout."""
    strategy: RolloutStrategy
    target_percentage: float = 100
    increment: float = 10
    interval_minutes: int = 60
    rollback_threshold: float = 0.1  # Error-rate threshold


@dataclass
class RolloutState:
    """State of a rollout."""
    prompt_name: str
    old_version: int
    new_version: int
    current_percentage: float
    status: str  # pending, in_progress, completed, rolled_back
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None


class RolloutManager:
    """Manage prompt rollouts."""

    def __init__(
        self,
        registry: PromptRegistry,
        analyzer: ExperimentAnalyzer
    ):
        self.registry = registry
        self.analyzer = analyzer
        self._rollouts: dict[str, RolloutState] = {}
        self._traffic_split: dict[str, dict[int, float]] = {}

    def start_rollout(
        self,
        prompt_name: str,
        new_version: int,
        config: RolloutConfig
    ) -> RolloutState:
        """Start a rollout."""
        current = self.registry.get_active(prompt_name)
        if not current:
            raise ValueError(f"Prompt not found: {prompt_name}")

        state = RolloutState(
            prompt_name=prompt_name,
            old_version=current.version,
            new_version=new_version,
            current_percentage=0,
            status="pending",
            started_at=datetime.utcnow()
        )
        self._rollouts[prompt_name] = state

        if config.strategy == RolloutStrategy.IMMEDIATE:
            self._immediate_rollout(state)
        elif config.strategy == RolloutStrategy.CANARY:
            self._canary_rollout(state, config)
        elif config.strategy == RolloutStrategy.GRADUAL:
            self._gradual_rollout(state, config)

        return state

    def _immediate_rollout(self, state: RolloutState):
        """Immediate full rollout."""
        state.current_percentage = 100
        state.status = "completed"
        state.completed_at = datetime.utcnow()
        self.registry.set_active(state.prompt_name, state.new_version)
        self._traffic_split[state.prompt_name] = {
            state.new_version: 100
        }

    def _canary_rollout(
        self,
        state: RolloutState,
        config: RolloutConfig
    ):
        """Canary rollout (small percentage first)."""
        state.current_percentage = 5  # Start with 5%
        state.status = "in_progress"
        self._traffic_split[state.prompt_name] = {
            state.old_version: 95,
            state.new_version: 5
        }

    def _gradual_rollout(
        self,
        state: RolloutState,
        config: RolloutConfig
    ):
        """Gradual rollout with increments."""
        state.current_percentage = config.increment
        state.status = "in_progress"
        self._traffic_split[state.prompt_name] = {
            state.old_version: 100 - config.increment,
            state.new_version: config.increment
        }

    def advance_rollout(
        self,
        prompt_name: str,
        increment: float = 10
    ):
        """Advance a rollout by an increment."""
        state = self._rollouts.get(prompt_name)
        if not state or state.status != "in_progress":
            return

        new_percentage = min(100, state.current_percentage + increment)
        state.current_percentage = new_percentage
        self._traffic_split[prompt_name] = {
            state.old_version: 100 - new_percentage,
            state.new_version: new_percentage
        }

        if new_percentage >= 100:
            state.status = "completed"
            state.completed_at = datetime.utcnow()
            self.registry.set_active(prompt_name, state.new_version)

    def rollback(self, prompt_name: str):
        """Roll back a rollout to the old version."""
        state = self._rollouts.get(prompt_name)
        if not state:
            return
        state.status = "rolled_back"
        state.current_percentage = 0
        state.completed_at = datetime.utcnow()
        self._traffic_split[prompt_name] = {
            state.old_version: 100
        }

    def get_version_for_request(
        self,
        prompt_name: str,
        request_id: str
    ) -> int:
        """Get the prompt version to use for a request."""
        split = self._traffic_split.get(prompt_name)
        if not split:
            active = self.registry.get_active(prompt_name)
            return active.version if active else 1

        # Deterministic selection based on request_id
        hash_value = int(hashlib.md5(request_id.encode()).hexdigest(), 16)
        threshold = hash_value % 100
        cumulative = 0
        for version, percentage in sorted(split.items()):
            cumulative += percentage
            if threshold < cumulative:
                return version
        return list(split.keys())[-1]

    def check_health(self, prompt_name: str) -> bool:
        """Check whether a rollout is healthy."""
        state = self._rollouts.get(prompt_name)
        if not state or state.status != "in_progress":
            return True
        # Check error rates from the analyzer
        # (this would integrate with real metrics in production)
        return True
```
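A sketch of a gradual rollout with the manager above, reusing the hypothetical `registry` and `analyzer` from the earlier sketches and registering a new illustrative version to roll out.

```python
rollouts = RolloutManager(registry, analyzer)

# Register a hypothetical new version to roll out.
v3 = registry.register(
    name="support_reply",
    content="You are a friendly support agent. Answer: {question}\nCite sources."
)

state = rollouts.start_rollout(
    prompt_name="support_reply",
    new_version=v3.version,
    config=RolloutConfig(strategy=RolloutStrategy.GRADUAL, increment=10)
)
print(state.status, state.current_percentage)   # in_progress 10

# Each request is routed deterministically by its ID, so retries see the same version.
print(rollouts.get_version_for_request("support_reply", "req-42"))

# Advance in steps while metrics stay healthy, or roll back if they do not.
if rollouts.check_health("support_reply"):
    rollouts.advance_rollout("support_reply", increment=20)
else:
    rollouts.rollback("support_reply")
```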
Production Prompt Service
```python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional

app = FastAPI()

# Initialize components
registry = PromptRegistry()
ab_manager = ABTestManager(registry)
analyzer = ExperimentAnalyzer()
rollout_manager = RolloutManager(registry, analyzer)


class RegisterPromptRequest(BaseModel):
    name: str
    content: str
    variables: Optional[list[str]] = None
    metadata: Optional[dict] = None


class CreateExperimentRequest(BaseModel):
    name: str
    description: str
    variants: list[dict]


class RecordMetricRequest(BaseModel):
    experiment_id: str
    variant: str
    metric: str
    value: float


class StartRolloutRequest(BaseModel):
    prompt_name: str
    new_version: int
    strategy: str = "gradual"
    increment: float = 10


@app.post("/v1/prompts")
async def register_prompt(request: RegisterPromptRequest):
    """Register a new prompt version."""
    prompt = registry.register(
        name=request.name,
        content=request.content,
        variables=request.variables,
        metadata=request.metadata
    )
    return {
        "id": prompt.id,
        "name": prompt.name,
        "version": prompt.version,
        "content_hash": prompt.content_hash
    }


@app.get("/v1/prompts/{name}")
async def get_prompt(name: str, version: Optional[int] = None):
    """Get a prompt."""
    prompt = registry.get(name, version)
    if not prompt:
        raise HTTPException(404, "Prompt not found")
    return {
        "id": prompt.id,
        "name": prompt.name,
        "version": prompt.version,
        "content": prompt.content,
        "variables": prompt.variables
    }


@app.get("/v1/prompts/{name}/versions")
async def list_versions(name: str):
    """List all versions of a prompt."""
    versions = registry.list_versions(name)
    return {
        "name": name,
        "versions": [
            {
                "version": v.version,
                "content_hash": v.content_hash,
                "created_at": v.created_at.isoformat()
            }
            for v in versions
        ]
    }


@app.post("/v1/experiments")
async def create_experiment(request: CreateExperimentRequest):
    """Create an A/B test experiment."""
    experiment = ab_manager.create_experiment(
        name=request.name,
        description=request.description,
        variants=request.variants
    )
    return {
        "id": experiment.id,
        "name": experiment.name,
        "status": experiment.status
    }


@app.post("/v1/experiments/{experiment_id}/start")
async def start_experiment(experiment_id: str):
    """Start an experiment."""
    ab_manager.start_experiment(experiment_id)
    return {"status": "started"}


@app.get("/v1/experiments/{experiment_id}/variant")
async def get_variant(experiment_id: str, user_id: str):
    """Get variant for a user."""
    prompt = ab_manager.get_prompt_for_user(experiment_id, user_id)
    if not prompt:
        raise HTTPException(404, "No active experiment or variant")
    return {
        "prompt_name": prompt.name,
        "prompt_version": prompt.version,
        "content": prompt.content
    }


@app.post("/v1/metrics")
async def record_metric(request: RecordMetricRequest):
    """Record a metric for analysis."""
    analyzer.record_sample(
        request.experiment_id,
        request.variant,
        request.value,
        {"metric": request.metric}
    )
    return {"status": "recorded"}


@app.get("/v1/experiments/{experiment_id}/results")
async def get_results(experiment_id: str):
    """Get experiment results."""
    results = analyzer.analyze(experiment_id)
    return {
        "experiment_id": results.experiment_id,
        "winner": results.winner,
        "confidence": results.confidence,
        "is_significant": results.is_significant,
        "variants": {
            name: {
                "count": stats.count,
                "mean": stats.mean,
                "std": stats.std,
                "confidence_interval": stats.confidence_interval
            }
            for name, stats in results.variants.items()
        }
    }


@app.post("/v1/rollouts")
async def start_rollout(request: StartRolloutRequest):
    """Start a prompt rollout."""
    config = RolloutConfig(
        strategy=RolloutStrategy(request.strategy),
        increment=request.increment
    )
    state = rollout_manager.start_rollout(
        request.prompt_name,
        request.new_version,
        config
    )
    return {
        "prompt_name": state.prompt_name,
        "old_version": state.old_version,
        "new_version": state.new_version,
        "current_percentage": state.current_percentage,
        "status": state.status
    }


@app.get("/health")
async def health():
    return {"status": "healthy"}
```
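Assuming the service above is running locally (for example via `uvicorn prompt_service:app`, where the module name is hypothetical), a client interaction might look like the following sketch using httpx; the payloads and IDs are illustrative.

```python
import httpx

BASE = "http://localhost:8000"

# Register a prompt version.
r = httpx.post(f"{BASE}/v1/prompts", json={
    "name": "support_reply",
    "content": "You are a helpful support agent. Answer: {question}"
})
print(r.json())   # e.g. {"id": "support_reply_v1", "name": "support_reply", "version": 1, ...}

# Create and start an experiment, then fetch the variant-specific prompt for a user.
exp = httpx.post(f"{BASE}/v1/experiments", json={
    "name": "support_reply_tone",
    "description": "Helpful vs. concise",
    "variants": [
        {"name": "control", "prompt_name": "support_reply", "prompt_version": 1}
    ]
}).json()
httpx.post(f"{BASE}/v1/experiments/{exp['id']}/start")
variant = httpx.get(f"{BASE}/v1/experiments/{exp['id']}/variant",
                    params={"user_id": "user-123"}).json()

# Report a quality metric for the served variant.
httpx.post(f"{BASE}/v1/metrics", json={
    "experiment_id": exp["id"],
    "variant": "control",
    "metric": "quality",
    "value": 0.9
})
```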
Conclusion
Prompt versioning transforms prompt engineering from ad-hoc experimentation into systematic improvement. Start with a version control system—every prompt change should be tracked with metadata about who changed it and why. Implement A/B testing to compare prompt variants with statistical rigor—don’t rely on gut feelings when you can measure. Use proper statistical analysis to determine winners with confidence, accounting for sample size and variance. Deploy prompt changes safely with gradual rollouts—start with a small percentage of traffic and increase as you verify performance. Consider multi-armed bandits for continuous optimization that automatically shifts traffic to better-performing variants. The key insight is that prompts are a critical part of your application’s behavior and deserve the same engineering discipline as code. Build prompt management infrastructure early and you’ll iterate faster, catch regressions sooner, and make data-driven decisions about prompt improvements.