Introduction

Prompts are code. They determine how your LLM application behaves, and, like code, they need version control, testing, and deployment pipelines. Yet many teams treat prompts as afterthoughts: hardcoded strings scattered across the codebase, changed ad hoc without tracking. This leads to regressions, inconsistent behavior, and difficulty understanding why outputs changed. This guide covers practical prompt management: versioning strategies, template systems, A/B testing frameworks, and deployment patterns that bring software engineering rigor to prompt development.

Prompt Templates
from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime
import re
import hashlib
@dataclass
class PromptTemplate:
"""A versioned prompt template."""
name: str
template: str
version: str
description: str = ""
variables: list[str] = field(default_factory=list)
metadata: dict = field(default_factory=dict)
created_at: datetime = field(default_factory=datetime.utcnow)
def __post_init__(self):
# Auto-detect variables if not provided
if not self.variables:
self.variables = self._extract_variables()
def _extract_variables(self) -> list[str]:
"""Extract variable names from template."""
# Match single-brace {variable} placeholders
pattern = r'\{(\w+)\}'
matches = re.findall(pattern, self.template)
return list(set(matches))
def render(self, **kwargs) -> str:
"""Render template with variables."""
# Validate all required variables are provided
missing = set(self.variables) - set(kwargs.keys())
if missing:
raise ValueError(f"Missing variables: {missing}")
result = self.template
for key, value in kwargs.items():
result = result.replace(f"{{{key}}}", str(value))
return result
@property
def hash(self) -> str:
"""Content hash for change detection."""
return hashlib.sha256(self.template.encode()).hexdigest()[:12]
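A quick usage sketch (the prompt name, version, and template text here are illustrative):

summarize = PromptTemplate(
    name="summarize",
    template="Summarize the following {document_type} in at most {max_words} words:\n\n{content}",
    version="1.0.0",
)

print(summarize.variables)  # ['document_type', 'max_words', 'content'] (order not guaranteed)
print(summarize.hash)       # 12-character content hash; changes whenever the template text changes

rendered = summarize.render(
    document_type="meeting transcript",
    max_words=150,
    content="Alice: Let's ship the prompt registry this sprint...",
)

# Missing variables fail fast, before anything reaches the model.
try:
    summarize.render(content="some text")
except ValueError as e:
    print(e)  # Missing variables: {...}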
class PromptBuilder:
"""Build prompts with components."""
def __init__(self):
self.system: str = ""
self.context: list[str] = []
self.examples: list[tuple[str, str]] = []
self.instructions: list[str] = []
self.output_format: str = ""
def set_system(self, system: str) -> 'PromptBuilder':
"""Set system message."""
self.system = system
return self
def add_context(self, context: str) -> 'PromptBuilder':
"""Add context section."""
self.context.append(context)
return self
def add_example(self, user: str, assistant: str) -> 'PromptBuilder':
"""Add few-shot example."""
self.examples.append((user, assistant))
return self
def add_instruction(self, instruction: str) -> 'PromptBuilder':
"""Add instruction."""
self.instructions.append(instruction)
return self
def set_output_format(self, format_spec: str) -> 'PromptBuilder':
"""Set expected output format."""
self.output_format = format_spec
return self
def build(self) -> str:
"""Build the complete prompt."""
parts = []
if self.system:
parts.append(self.system)
if self.context:
parts.append("\n".join(self.context))
if self.instructions:
parts.append("Instructions:\n" + "\n".join(f"- {i}" for i in self.instructions))
if self.examples:
examples_text = "Examples:\n"
for user, assistant in self.examples:
examples_text += f"\nUser: {user}\nAssistant: {assistant}\n"
parts.append(examples_text)
if self.output_format:
parts.append(f"Output format: {self.output_format}")
return "\n\n".join(parts)
def to_messages(self, user_input: str) -> list[dict]:
"""Convert to chat messages format."""
messages = []
if self.system:
messages.append({"role": "system", "content": self.system})
# Add few-shot examples as conversation
for user, assistant in self.examples:
messages.append({"role": "user", "content": user})
messages.append({"role": "assistant", "content": assistant})
# Build user message with context and instructions
user_parts = []
if self.context:
user_parts.append("Context:\n" + "\n".join(self.context))
if self.instructions:
user_parts.append("Instructions:\n" + "\n".join(f"- {i}" for i in self.instructions))
user_parts.append(user_input)
if self.output_format:
user_parts.append(f"Output format: {self.output_format}")
messages.append({"role": "user", "content": "\n\n".join(user_parts)})
return messages
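PromptBuilder assembles the same components into either a single prompt string or a chat-message list. For example (the strings below are placeholders):

builder = (
    PromptBuilder()
    .set_system("You are a support assistant for an e-commerce store.")
    .add_context("The customer is on the Pro plan.")
    .add_instruction("Answer in two sentences or fewer.")
    .add_instruction("Never promise refunds; direct billing questions to support.")
    .add_example("Where is my order?", "You can track it under Account > Orders. It usually ships within 2 days.")
    .set_output_format("Plain text, no markdown.")
)

# Single-string prompt, e.g. for completion-style APIs.
prompt_text = builder.build()

# Chat-message form, with the few-shot example expanded into user/assistant turns.
messages = builder.to_messages("How do I change my shipping address?")
# -> [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}, {"role": "user", ...}]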
Version Storage
from dataclasses import dataclass
from typing import Optional
from datetime import datetime
import json
import os
@dataclass
class PromptVersion:
"""A specific version of a prompt."""
prompt_id: str
version: str
template: str
variables: list[str]
created_at: datetime
created_by: str
commit_message: str = ""
is_active: bool = False
class FilePromptStore:
"""Store prompts in filesystem (Git-friendly)."""
def __init__(self, base_path: str):
self.base_path = base_path
os.makedirs(base_path, exist_ok=True)
def save(self, template: PromptTemplate, commit_message: str = ""):
"""Save a prompt template."""
prompt_dir = os.path.join(self.base_path, template.name)
os.makedirs(prompt_dir, exist_ok=True)
# Save template file
template_path = os.path.join(prompt_dir, f"v{template.version}.txt")
with open(template_path, 'w') as f:
f.write(template.template)
# Save metadata
metadata = {
"name": template.name,
"version": template.version,
"description": template.description,
"variables": template.variables,
"hash": template.hash,
"created_at": template.created_at.isoformat(),
"commit_message": commit_message
}
metadata_path = os.path.join(prompt_dir, f"v{template.version}.json")
with open(metadata_path, 'w') as f:
json.dump(metadata, f, indent=2)
# Update latest pointer
latest_path = os.path.join(prompt_dir, "latest.txt")
with open(latest_path, 'w') as f:
f.write(template.version)
def load(self, name: str, version: str = None) -> Optional[PromptTemplate]:
"""Load a prompt template."""
prompt_dir = os.path.join(self.base_path, name)
if not os.path.exists(prompt_dir):
return None
# Get version
if version is None:
latest_path = os.path.join(prompt_dir, "latest.txt")
if os.path.exists(latest_path):
with open(latest_path, 'r') as f:
version = f.read().strip()
else:
return None
# Load template
template_path = os.path.join(prompt_dir, f"v{version}.txt")
if not os.path.exists(template_path):
return None
with open(template_path, 'r') as f:
template_text = f.read()
# Load metadata
metadata_path = os.path.join(prompt_dir, f"v{version}.json")
metadata = {}
if os.path.exists(metadata_path):
with open(metadata_path, 'r') as f:
metadata = json.load(f)
return PromptTemplate(
name=name,
template=template_text,
version=version,
description=metadata.get("description", ""),
variables=metadata.get("variables", [])
)
def list_versions(self, name: str) -> list[str]:
"""List all versions of a prompt."""
prompt_dir = os.path.join(self.base_path, name)
if not os.path.exists(prompt_dir):
return []
versions = []
for filename in os.listdir(prompt_dir):
if filename.startswith("v") and filename.endswith(".txt"):
version = filename[1:-4] # Remove 'v' prefix and '.txt' suffix
versions.append(version)
return sorted(versions)
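Because every version is a plain .txt file with a .json sidecar, the store diffs and reviews cleanly in Git. A minimal sketch, using the PromptTemplate class from earlier and an illustrative ./prompts directory:

store = FilePromptStore("./prompts")

store.save(
    PromptTemplate(
        name="summarize",
        template="Summarize the following text:\n\n{content}",
        version="1.0.0",
    ),
    commit_message="Initial summarization prompt",
)

latest = store.load("summarize")           # follows the latest.txt pointer
pinned = store.load("summarize", "1.0.0")  # or pin an explicit version
print(store.list_versions("summarize"))    # ['1.0.0']

Note that list_versions sorts version strings lexically, so use a zero-padded or otherwise consistently ordered versioning scheme if you depend on that ordering.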
class DatabasePromptStore:
"""Store prompts in database."""
def __init__(self, connection_string: str):
self.connection_string = connection_string
# In production, use SQLAlchemy or similar
self.prompts: dict[str, dict[str, PromptVersion]] = {}
def save(
self,
template: PromptTemplate,
created_by: str,
commit_message: str = ""
) -> PromptVersion:
"""Save a prompt version."""
version = PromptVersion(
prompt_id=template.name,
version=template.version,
template=template.template,
variables=template.variables,
created_at=datetime.utcnow(),
created_by=created_by,
commit_message=commit_message
)
if template.name not in self.prompts:
self.prompts[template.name] = {}
self.prompts[template.name][template.version] = version
return version
def load(self, name: str, version: str = None) -> Optional[PromptVersion]:
"""Load a prompt version."""
if name not in self.prompts:
return None
versions = self.prompts[name]
if version:
return versions.get(version)
# Return latest (highest version number)
if versions:
latest = max(versions.keys())
return versions[latest]
return None
def set_active(self, name: str, version: str):
"""Set the active version for a prompt."""
if name not in self.prompts:
raise ValueError(f"Prompt not found: {name}")
# Deactivate all versions
for v in self.prompts[name].values():
v.is_active = False
# Activate specified version
if version in self.prompts[name]:
self.prompts[name][version].is_active = True
def get_active(self, name: str) -> Optional[PromptVersion]:
"""Get the active version of a prompt."""
if name not in self.prompts:
return None
for version in self.prompts[name].values():
if version.is_active:
return version
return None
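The database store above keeps everything in memory for illustration; in production you would back the same interface with a real table. Usage, with illustrative versions of a summarize prompt:

v1 = PromptTemplate(name="summarize", template="Summarize:\n\n{content}", version="1.0.0")
v2 = PromptTemplate(name="summarize", template="Summarize in three bullet points:\n\n{content}", version="1.1.0")

db_store = DatabasePromptStore("postgresql://localhost/prompts")  # connection string unused by the in-memory stand-in
db_store.save(v1, created_by="alice", commit_message="Initial version")
db_store.save(v2, created_by="bob", commit_message="Bullet-point format")

db_store.set_active("summarize", "1.0.0")        # pin the reviewed version explicitly
print(db_store.get_active("summarize").version)  # 1.0.0
print(db_store.load("summarize").version)        # 1.1.0 -- load() without a version returns the lexically highest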
A/B Testing
from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime
import random
import hashlib
@dataclass
class Variant:
"""A variant in an A/B test."""
name: str
prompt_name: str
prompt_version: str
weight: float = 1.0
@dataclass
class ABTest:
"""An A/B test configuration."""
test_id: str
name: str
variants: list[Variant]
start_date: datetime
end_date: Optional[datetime] = None
is_active: bool = True
@dataclass
class TestResult:
"""Result of a single test interaction."""
test_id: str
variant_name: str
user_id: str
timestamp: datetime
metrics: dict = field(default_factory=dict)
class ABTestManager:
"""Manage prompt A/B tests."""
def __init__(self, prompt_store):
self.prompt_store = prompt_store
self.tests: dict[str, ABTest] = {}
self.results: list[TestResult] = []
def create_test(
self,
test_id: str,
name: str,
variants: list[Variant]
) -> ABTest:
"""Create a new A/B test."""
# Normalize weights
total_weight = sum(v.weight for v in variants)
for v in variants:
v.weight = v.weight / total_weight
test = ABTest(
test_id=test_id,
name=name,
variants=variants,
start_date=datetime.utcnow()
)
self.tests[test_id] = test
return test
def get_variant(self, test_id: str, user_id: str) -> Optional[Variant]:
"""Get variant for a user (deterministic assignment)."""
test = self.tests.get(test_id)
if not test or not test.is_active:
return None
# Deterministic assignment based on user_id
hash_input = f"{test_id}:{user_id}"
hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
bucket = (hash_value % 1000) / 1000 # 0.0 to 0.999
cumulative = 0.0
for variant in test.variants:
cumulative += variant.weight
if bucket < cumulative:
return variant
return test.variants[-1] # Fallback
def get_prompt(self, test_id: str, user_id: str) -> Optional[PromptTemplate]:
"""Get the prompt for a user in a test."""
variant = self.get_variant(test_id, user_id)
if not variant:
return None
return self.prompt_store.load(variant.prompt_name, variant.prompt_version)
def record_result(
self,
test_id: str,
user_id: str,
metrics: dict
):
"""Record test result."""
variant = self.get_variant(test_id, user_id)
if not variant:
return
result = TestResult(
test_id=test_id,
variant_name=variant.name,
user_id=user_id,
timestamp=datetime.utcnow(),
metrics=metrics
)
self.results.append(result)
def get_statistics(self, test_id: str) -> dict:
"""Get test statistics."""
test = self.tests.get(test_id)
if not test:
return {}
# Group results by variant
variant_results: dict[str, list[TestResult]] = {}
for result in self.results:
if result.test_id == test_id:
if result.variant_name not in variant_results:
variant_results[result.variant_name] = []
variant_results[result.variant_name].append(result)
stats = {}
for variant_name, results in variant_results.items():
# Calculate metrics
metric_sums: dict[str, float] = {}
for result in results:
for metric, value in result.metrics.items():
if metric not in metric_sums:
metric_sums[metric] = 0
metric_sums[metric] += value
stats[variant_name] = {
"count": len(results),
"metrics": {
metric: total / len(results)
for metric, total in metric_sums.items()
}
}
return stats
def end_test(self, test_id: str, winner: str = None):
"""End a test and optionally declare winner."""
test = self.tests.get(test_id)
if test:
test.is_active = False
test.end_date = datetime.utcnow()
return self.get_statistics(test_id)
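Putting the pieces together, a test runs end to end like this (the test ID, metric names, and prompt versions are illustrative, and both versions are assumed to exist in the prompt store from the earlier sketches). The statistics are simple per-variant means; significance testing is left to your analysis layer:

manager = ABTestManager(prompt_store=store)

test = manager.create_test(
    test_id="summarize-format-001",
    name="Paragraph vs. bullet-point summaries",
    variants=[
        Variant(name="control", prompt_name="summarize", prompt_version="1.0.0", weight=1.0),
        Variant(name="bullets", prompt_name="summarize", prompt_version="1.1.0", weight=1.0),
    ],
)

# Assignment is deterministic: the same user always lands in the same variant.
variant = manager.get_variant(test.test_id, user_id="user-123")
prompt = manager.get_prompt(test.test_id, user_id="user-123")

# After the LLM call, record whichever metrics matter to you.
manager.record_result(test.test_id, user_id="user-123", metrics={"thumbs_up": 1, "latency_ms": 840})

print(manager.get_statistics(test.test_id))  # per-variant counts and mean metric values
print(manager.end_test(test.test_id))        # deactivates the test and returns final statistics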
Deployment Pipeline
from dataclasses import dataclass
from typing import Optional
from datetime import datetime
from enum import Enum
class DeploymentStage(Enum):
DEVELOPMENT = "development"
STAGING = "staging"
PRODUCTION = "production"
@dataclass
class Deployment:
"""A prompt deployment."""
prompt_name: str
version: str
stage: DeploymentStage
deployed_at: datetime
deployed_by: str
rollback_version: Optional[str] = None
class PromptDeployer:
"""Deploy prompts through stages."""
def __init__(self, prompt_store):
self.prompt_store = prompt_store
self.deployments: dict[str, dict[DeploymentStage, Deployment]] = {}
def deploy(
self,
prompt_name: str,
version: str,
stage: DeploymentStage,
deployed_by: str
) -> Deployment:
"""Deploy a prompt version to a stage."""
# Verify prompt exists
prompt = self.prompt_store.load(prompt_name, version)
if not prompt:
raise ValueError(f"Prompt not found: {prompt_name} v{version}")
# Get current deployment for rollback
current = self.get_deployment(prompt_name, stage)
rollback_version = current.version if current else None
deployment = Deployment(
prompt_name=prompt_name,
version=version,
stage=stage,
deployed_at=datetime.utcnow(),
deployed_by=deployed_by,
rollback_version=rollback_version
)
if prompt_name not in self.deployments:
self.deployments[prompt_name] = {}
self.deployments[prompt_name][stage] = deployment
return deployment
def get_deployment(
self,
prompt_name: str,
stage: DeploymentStage
) -> Optional[Deployment]:
"""Get current deployment for a stage."""
if prompt_name not in self.deployments:
return None
return self.deployments[prompt_name].get(stage)
def get_prompt(
self,
prompt_name: str,
stage: DeploymentStage
) -> Optional[PromptTemplate]:
"""Get the deployed prompt for a stage."""
deployment = self.get_deployment(prompt_name, stage)
if not deployment:
return None
return self.prompt_store.load(prompt_name, deployment.version)
def rollback(
self,
prompt_name: str,
stage: DeploymentStage,
rolled_back_by: str
) -> Optional[Deployment]:
"""Rollback to previous version."""
current = self.get_deployment(prompt_name, stage)
if not current or not current.rollback_version:
return None
return self.deploy(
prompt_name=prompt_name,
version=current.rollback_version,
stage=stage,
deployed_by=rolled_back_by
)
def promote(
self,
prompt_name: str,
from_stage: DeploymentStage,
to_stage: DeploymentStage,
promoted_by: str
) -> Deployment:
"""Promote deployment from one stage to another."""
current = self.get_deployment(prompt_name, from_stage)
if not current:
raise ValueError(f"No deployment in {from_stage.value}")
return self.deploy(
prompt_name=prompt_name,
version=current.version,
stage=to_stage,
deployed_by=promoted_by
)
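A typical flow stages a version, promotes the same artifact to production, and keeps the previous version around for rollback. A sketch, assuming versions 1.0.0 and 1.1.0 of the summarize prompt are already saved in the store:

deployer = PromptDeployer(prompt_store=store)

deployer.deploy("summarize", "1.0.0", DeploymentStage.PRODUCTION, deployed_by="alice")  # currently live version
deployer.deploy("summarize", "1.1.0", DeploymentStage.STAGING, deployed_by="alice")

# Promote the exact version that was validated in staging.
deployer.promote("summarize", DeploymentStage.STAGING, DeploymentStage.PRODUCTION, promoted_by="alice")

# If 1.1.0 misbehaves, production rolls back to the previously deployed 1.0.0.
deployer.rollback("summarize", DeploymentStage.PRODUCTION, rolled_back_by="oncall")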
class PromptRegistry:
"""Central registry for prompt access."""
def __init__(
self,
prompt_store,
deployer: PromptDeployer,
default_stage: DeploymentStage = DeploymentStage.PRODUCTION
):
self.prompt_store = prompt_store
self.deployer = deployer
self.default_stage = default_stage
self.cache: dict[str, PromptTemplate] = {}
def get(
self,
prompt_name: str,
stage: DeploymentStage = None
) -> Optional[PromptTemplate]:
"""Get a prompt from the registry."""
stage = stage or self.default_stage
cache_key = f"{prompt_name}:{stage.value}"
# Check cache
if cache_key in self.cache:
return self.cache[cache_key]
# Get from deployer
prompt = self.deployer.get_prompt(prompt_name, stage)
if prompt:
self.cache[cache_key] = prompt
return prompt
def invalidate_cache(self, prompt_name: str = None):
"""Invalidate cache entries."""
if prompt_name:
keys_to_remove = [k for k in self.cache if k.startswith(f"{prompt_name}:")]
for key in keys_to_remove:
del self.cache[key]
else:
self.cache.clear()
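Application code then asks the registry for a prompt instead of hardcoding strings. In the sketch below, call_llm is a hypothetical stub standing in for whatever model client you use:

registry = PromptRegistry(prompt_store=store, deployer=deployer)

def call_llm(prompt: str) -> str:
    ...  # placeholder for your model client (OpenAI, Anthropic, etc.)

def summarize_document(text: str) -> str:
    template = registry.get("summarize")  # production version by default, cached after the first lookup
    if template is None:
        raise RuntimeError("summarize prompt is not deployed to production")
    return call_llm(template.render(content=text))

# After deploying a new version, drop the cached copy so the next call picks it up.
registry.invalidate_cache("summarize")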
Production Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
app = FastAPI()
# Initialize components
prompt_store = FilePromptStore("./prompts")
deployer = PromptDeployer(prompt_store)
registry = PromptRegistry(prompt_store, deployer)
ab_manager = ABTestManager(prompt_store)
class CreatePromptRequest(BaseModel):
name: str
template: str
version: str
description: str = ""
commit_message: str = ""
class DeployRequest(BaseModel):
prompt_name: str
version: str
stage: str
deployed_by: str
class RenderRequest(BaseModel):
prompt_name: str
variables: dict
stage: str = "production"
class ABTestRequest(BaseModel):
test_id: str
name: str
variants: list[dict]
class RecordResultRequest(BaseModel):
test_id: str
user_id: str
metrics: dict
@app.post("/v1/prompts")
async def create_prompt(request: CreatePromptRequest):
"""Create a new prompt version."""
template = PromptTemplate(
name=request.name,
template=request.template,
version=request.version,
description=request.description
)
prompt_store.save(template, request.commit_message)
return {
"name": template.name,
"version": template.version,
"hash": template.hash,
"variables": template.variables
}
@app.get("/v1/prompts/{name}")
async def get_prompt(name: str, version: Optional[str] = None):
"""Get a prompt template."""
template = prompt_store.load(name, version)
if not template:
raise HTTPException(404, "Prompt not found")
return {
"name": template.name,
"version": template.version,
"template": template.template,
"variables": template.variables,
"description": template.description
}
@app.get("/v1/prompts/{name}/versions")
async def list_versions(name: str):
"""List all versions of a prompt."""
versions = prompt_store.list_versions(name)
return {"name": name, "versions": versions}
@app.post("/v1/prompts/deploy")
async def deploy_prompt(request: DeployRequest):
"""Deploy a prompt to a stage."""
try:
stage = DeploymentStage(request.stage)
except ValueError:
raise HTTPException(400, f"Invalid stage: {request.stage}")
try:
deployment = deployer.deploy(
prompt_name=request.prompt_name,
version=request.version,
stage=stage,
deployed_by=request.deployed_by
)
# Invalidate cache
registry.invalidate_cache(request.prompt_name)
return {
"prompt_name": deployment.prompt_name,
"version": deployment.version,
"stage": deployment.stage.value,
"deployed_at": deployment.deployed_at.isoformat()
}
except ValueError as e:
raise HTTPException(400, str(e))
@app.post("/v1/prompts/render")
async def render_prompt(request: RenderRequest):
"""Render a prompt with variables."""
try:
stage = DeploymentStage(request.stage)
except ValueError:
raise HTTPException(400, f"Invalid stage: {request.stage}")
template = registry.get(request.prompt_name, stage)
if not template:
raise HTTPException(404, "Prompt not found or not deployed")
try:
rendered = template.render(**request.variables)
return {"rendered": rendered}
except ValueError as e:
raise HTTPException(400, str(e))
@app.post("/v1/ab-tests")
async def create_ab_test(request: ABTestRequest):
"""Create an A/B test."""
variants = [
Variant(
name=v["name"],
prompt_name=v["prompt_name"],
prompt_version=v["prompt_version"],
weight=v.get("weight", 1.0)
)
for v in request.variants
]
test = ab_manager.create_test(
test_id=request.test_id,
name=request.name,
variants=variants
)
return {
"test_id": test.test_id,
"name": test.name,
"variants": [v.name for v in test.variants]
}
@app.get("/v1/ab-tests/{test_id}/variant")
async def get_variant(test_id: str, user_id: str):
"""Get variant assignment for a user."""
variant = ab_manager.get_variant(test_id, user_id)
if not variant:
raise HTTPException(404, "Test not found or inactive")
return {
"test_id": test_id,
"variant": variant.name,
"prompt_name": variant.prompt_name,
"prompt_version": variant.prompt_version
}
@app.post("/v1/ab-tests/{test_id}/results")
async def record_result(test_id: str, request: RecordResultRequest):
"""Record A/B test result."""
ab_manager.record_result(
test_id=test_id,
user_id=request.user_id,
metrics=request.metrics
)
return {"status": "recorded"}
@app.get("/v1/ab-tests/{test_id}/stats")
async def get_test_stats(test_id: str):
"""Get A/B test statistics."""
stats = ab_manager.get_statistics(test_id)
return {"test_id": test_id, "statistics": stats}
@app.get("/health")
async def health():
return {"status": "healthy"}
References
- LangSmith Prompt Hub: https://docs.smith.langchain.com/prompt_engineering
- Humanloop Prompt Management: https://humanloop.com/docs/prompts
- PromptLayer: https://promptlayer.com/
- Weights & Biases Prompts: https://docs.wandb.ai/guides/prompts
Conclusion
Treating prompts as code transforms how you develop LLM applications. Version control enables tracking changes, understanding regressions, and rolling back when needed. Template systems with variable extraction make prompts reusable and testable. A/B testing frameworks let you measure the impact of prompt changes with statistical rigor—deterministic user assignment ensures consistent experiences while enabling fair comparisons. Deployment pipelines with staging environments catch issues before they reach production. Build a central registry that applications query for prompts rather than hardcoding strings. Cache aggressively but invalidate on deployments. The investment in prompt infrastructure pays off as your application grows—you’ll iterate faster, catch regressions earlier, and have confidence that prompt changes actually improve outcomes.