Prompt Versioning and Management: Bringing Software Engineering Rigor to LLM Development

Introduction

Prompts are code. They determine how your LLM application behaves, and like code, they need version control, testing, and deployment pipelines. Yet many teams treat prompts as afterthoughts: hardcoded strings scattered across the codebase, changed ad hoc without tracking. This leads to regressions, inconsistent behavior, and difficulty understanding why outputs changed. This guide covers practical prompt management: versioning strategies, template systems, A/B testing frameworks, and deployment patterns that bring software engineering rigor to prompt development.

Prompt Versioning: Storage, Testing, and Deployment

Prompt Templates

from dataclasses import dataclass, field
from datetime import datetime
import re
import hashlib

@dataclass
class PromptTemplate:
    """A versioned prompt template."""
    
    name: str
    template: str
    version: str
    description: str = ""
    variables: list[str] = field(default_factory=list)
    metadata: dict = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.utcnow)
    
    def __post_init__(self):
        # Auto-detect variables if not provided
        if not self.variables:
            self.variables = self._extract_variables()
    
    def _extract_variables(self) -> list[str]:
        """Extract variable names from template."""
        
        # Match {variable} placeholders (note: this also matches inside {{escaped}} braces)
        pattern = r'\{(\w+)\}'
        matches = re.findall(pattern, self.template)
        return list(set(matches))
    
    def render(self, **kwargs) -> str:
        """Render template with variables."""
        
        # Validate all required variables are provided
        missing = set(self.variables) - set(kwargs.keys())
        if missing:
            raise ValueError(f"Missing variables: {missing}")
        
        result = self.template
        for key, value in kwargs.items():
            result = result.replace(f"{{{key}}}", str(value))
        
        return result
    
    @property
    def hash(self) -> str:
        """Content hash for change detection."""
        return hashlib.sha256(self.template.encode()).hexdigest()[:12]

class PromptBuilder:
    """Build prompts with components."""
    
    def __init__(self):
        self.system: str = ""
        self.context: list[str] = []
        self.examples: list[tuple[str, str]] = []
        self.instructions: list[str] = []
        self.output_format: str = ""
    
    def set_system(self, system: str) -> 'PromptBuilder':
        """Set system message."""
        self.system = system
        return self
    
    def add_context(self, context: str) -> 'PromptBuilder':
        """Add context section."""
        self.context.append(context)
        return self
    
    def add_example(self, user: str, assistant: str) -> 'PromptBuilder':
        """Add few-shot example."""
        self.examples.append((user, assistant))
        return self
    
    def add_instruction(self, instruction: str) -> 'PromptBuilder':
        """Add instruction."""
        self.instructions.append(instruction)
        return self
    
    def set_output_format(self, format_spec: str) -> 'PromptBuilder':
        """Set expected output format."""
        self.output_format = format_spec
        return self
    
    def build(self) -> str:
        """Build the complete prompt."""
        
        parts = []
        
        if self.system:
            parts.append(self.system)
        
        if self.context:
            parts.append("\n".join(self.context))
        
        if self.instructions:
            parts.append("Instructions:\n" + "\n".join(f"- {i}" for i in self.instructions))
        
        if self.examples:
            examples_text = "Examples:\n"
            for user, assistant in self.examples:
                examples_text += f"\nUser: {user}\nAssistant: {assistant}\n"
            parts.append(examples_text)
        
        if self.output_format:
            parts.append(f"Output format: {self.output_format}")
        
        return "\n\n".join(parts)
    
    def to_messages(self, user_input: str) -> list[dict]:
        """Convert to chat messages format."""
        
        messages = []
        
        if self.system:
            messages.append({"role": "system", "content": self.system})
        
        # Add few-shot examples as conversation
        for user, assistant in self.examples:
            messages.append({"role": "user", "content": user})
            messages.append({"role": "assistant", "content": assistant})
        
        # Build user message with context and instructions
        user_parts = []
        
        if self.context:
            user_parts.append("Context:\n" + "\n".join(self.context))
        
        if self.instructions:
            user_parts.append("Instructions:\n" + "\n".join(f"- {i}" for i in self.instructions))
        
        user_parts.append(user_input)
        
        if self.output_format:
            user_parts.append(f"Output format: {self.output_format}")
        
        messages.append({"role": "user", "content": "\n\n".join(user_parts)})
        
        return messages
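
To make the pieces concrete, here is a small usage sketch; the prompt name, template text, and example content are illustrative rather than part of any library.

# Illustrative usage of the classes above; names and text are made up.
summarize = PromptTemplate(
    name="summarize",
    template="Summarize the following text in {num_sentences} sentences:\n\n{text}",
    version="1.0",
    description="Short summarization prompt",
)

print(summarize.variables)  # ['num_sentences', 'text'] (order may vary)
print(summarize.hash)       # 12-character content hash for change detection

rendered = summarize.render(num_sentences=2, text="Prompts are code...")

# The builder composes the same ingredients into chat messages.
messages = (
    PromptBuilder()
    .set_system("You are a concise technical summarizer.")
    .add_instruction("Keep the summary under 50 words.")
    .add_example("Summarize: The sky is blue today.", "Blue sky today.")
    .to_messages(rendered)
)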

Version Storage

from dataclasses import dataclass
from typing import Optional
from datetime import datetime
import json
import os

@dataclass
class PromptVersion:
    """A specific version of a prompt."""
    
    prompt_id: str
    version: str
    template: str
    variables: list[str]
    created_at: datetime
    created_by: str
    commit_message: str = ""
    is_active: bool = False

class FilePromptStore:
    """Store prompts in filesystem (Git-friendly)."""
    
    def __init__(self, base_path: str):
        self.base_path = base_path
        os.makedirs(base_path, exist_ok=True)
    
    def save(self, template: PromptTemplate, commit_message: str = ""):
        """Save a prompt template."""
        
        prompt_dir = os.path.join(self.base_path, template.name)
        os.makedirs(prompt_dir, exist_ok=True)
        
        # Save template file
        template_path = os.path.join(prompt_dir, f"v{template.version}.txt")
        with open(template_path, 'w') as f:
            f.write(template.template)
        
        # Save metadata
        metadata = {
            "name": template.name,
            "version": template.version,
            "description": template.description,
            "variables": template.variables,
            "hash": template.hash,
            "created_at": template.created_at.isoformat(),
            "commit_message": commit_message
        }
        
        metadata_path = os.path.join(prompt_dir, f"v{template.version}.json")
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)
        
        # Update latest pointer
        latest_path = os.path.join(prompt_dir, "latest.txt")
        with open(latest_path, 'w') as f:
            f.write(template.version)
    
    def load(self, name: str, version: Optional[str] = None) -> Optional[PromptTemplate]:
        """Load a prompt template."""
        
        prompt_dir = os.path.join(self.base_path, name)
        
        if not os.path.exists(prompt_dir):
            return None
        
        # Get version
        if version is None:
            latest_path = os.path.join(prompt_dir, "latest.txt")
            if os.path.exists(latest_path):
                with open(latest_path, 'r') as f:
                    version = f.read().strip()
            else:
                return None
        
        # Load template
        template_path = os.path.join(prompt_dir, f"v{version}.txt")
        if not os.path.exists(template_path):
            return None
        
        with open(template_path, 'r') as f:
            template_text = f.read()
        
        # Load metadata
        metadata_path = os.path.join(prompt_dir, f"v{version}.json")
        metadata = {}
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)
        
        return PromptTemplate(
            name=name,
            template=template_text,
            version=version,
            description=metadata.get("description", ""),
            variables=metadata.get("variables", [])
        )
    
    def list_versions(self, name: str) -> list[str]:
        """List all versions of a prompt."""
        
        prompt_dir = os.path.join(self.base_path, name)
        
        if not os.path.exists(prompt_dir):
            return []
        
        versions = []
        for filename in os.listdir(prompt_dir):
            if filename.startswith("v") and filename.endswith(".txt"):
                version = filename[1:-4]  # Remove 'v' prefix and '.txt' suffix
                versions.append(version)
        
        # Note: lexicographic order; parse versions into tuples if you need
        # numeric ordering (e.g. "10" should sort after "9").
        return sorted(versions)

class DatabasePromptStore:
    """Store prompts in database."""
    
    def __init__(self, connection_string: str):
        self.connection_string = connection_string
        # In production, use SQLAlchemy or similar
        self.prompts: dict[str, dict[str, PromptVersion]] = {}
    
    def save(
        self,
        template: PromptTemplate,
        created_by: str,
        commit_message: str = ""
    ) -> PromptVersion:
        """Save a prompt version."""
        
        version = PromptVersion(
            prompt_id=template.name,
            version=template.version,
            template=template.template,
            variables=template.variables,
            created_at=datetime.utcnow(),
            created_by=created_by,
            commit_message=commit_message
        )
        
        if template.name not in self.prompts:
            self.prompts[template.name] = {}
        
        self.prompts[template.name][template.version] = version
        
        return version
    
    def load(self, name: str, version: Optional[str] = None) -> Optional[PromptVersion]:
        """Load a prompt version."""
        
        if name not in self.prompts:
            return None
        
        versions = self.prompts[name]
        
        if version:
            return versions.get(version)
        
        # Return latest (lexicographically greatest version string)
        if versions:
            latest = max(versions.keys())
            return versions[latest]
        
        return None
    
    def set_active(self, name: str, version: str):
        """Set the active version for a prompt."""
        
        if name not in self.prompts:
            raise ValueError(f"Prompt not found: {name}")
        
        # Deactivate all versions
        for v in self.prompts[name].values():
            v.is_active = False
        
        # Activate specified version
        if version in self.prompts[name]:
            self.prompts[name][version].is_active = True
    
    def get_active(self, name: str) -> Optional[PromptVersion]:
        """Get the active version of a prompt."""
        
        if name not in self.prompts:
            return None
        
        for version in self.prompts[name].values():
            if version.is_active:
                return version
        
        return None
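
A minimal sketch of the file-backed store in action, assuming a local ./prompts directory and the illustrative "summarize" prompt from the earlier sketch:

# Save two versions, then load by latest pointer or by explicit version.
store = FilePromptStore("./prompts")

store.save(PromptTemplate(
    name="summarize",
    template="Summarize in {num_sentences} sentences:\n\n{text}",
    version="1.0",
), commit_message="Initial version")

store.save(PromptTemplate(
    name="summarize",
    template="Summarize in at most {num_sentences} sentences. Be factual.\n\n{text}",
    version="1.1",
), commit_message="Tighten the instructions")

print(store.list_versions("summarize"))         # ['1.0', '1.1']
latest = store.load("summarize")                # follows latest.txt -> v1.1
pinned = store.load("summarize", version="1.0")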

A/B Testing

from dataclasses import dataclass, field
from typing import Any, Optional
from datetime import datetime
import random
import hashlib

@dataclass
class Variant:
    """A variant in an A/B test."""
    
    name: str
    prompt_name: str
    prompt_version: str
    weight: float = 1.0
    
@dataclass
class ABTest:
    """An A/B test configuration."""
    
    test_id: str
    name: str
    variants: list[Variant]
    start_date: datetime
    end_date: Optional[datetime] = None
    is_active: bool = True
    winner: Optional[str] = None

@dataclass
class TestResult:
    """Result of a single test interaction."""
    
    test_id: str
    variant_name: str
    user_id: str
    timestamp: datetime
    metrics: dict = field(default_factory=dict)

class ABTestManager:
    """Manage prompt A/B tests."""
    
    def __init__(self, prompt_store):
        self.prompt_store = prompt_store
        self.tests: dict[str, ABTest] = {}
        self.results: list[TestResult] = []
    
    def create_test(
        self,
        test_id: str,
        name: str,
        variants: list[Variant]
    ) -> ABTest:
        """Create a new A/B test."""
        
        # Normalize weights
        total_weight = sum(v.weight for v in variants)
        for v in variants:
            v.weight = v.weight / total_weight
        
        test = ABTest(
            test_id=test_id,
            name=name,
            variants=variants,
            start_date=datetime.utcnow()
        )
        
        self.tests[test_id] = test
        return test
    
    def get_variant(self, test_id: str, user_id: str) -> Optional[Variant]:
        """Get variant for a user (deterministic assignment)."""
        
        test = self.tests.get(test_id)
        if not test or not test.is_active:
            return None
        
        # Deterministic assignment based on user_id
        hash_input = f"{test_id}:{user_id}"
        hash_value = int(hashlib.md5(hash_input.encode()).hexdigest(), 16)
        bucket = (hash_value % 1000) / 1000  # 0.0 to 0.999
        
        cumulative = 0.0
        for variant in test.variants:
            cumulative += variant.weight
            if bucket < cumulative:
                return variant
        
        return test.variants[-1]  # Fallback
    
    def get_prompt(self, test_id: str, user_id: str) -> Optional[PromptTemplate]:
        """Get the prompt for a user in a test."""
        
        variant = self.get_variant(test_id, user_id)
        if not variant:
            return None
        
        return self.prompt_store.load(variant.prompt_name, variant.prompt_version)
    
    def record_result(
        self,
        test_id: str,
        user_id: str,
        metrics: dict
    ):
        """Record test result."""
        
        variant = self.get_variant(test_id, user_id)
        if not variant:
            return
        
        result = TestResult(
            test_id=test_id,
            variant_name=variant.name,
            user_id=user_id,
            timestamp=datetime.utcnow(),
            metrics=metrics
        )
        
        self.results.append(result)
    
    def get_statistics(self, test_id: str) -> dict:
        """Get test statistics."""
        
        test = self.tests.get(test_id)
        if not test:
            return {}
        
        # Group results by variant
        variant_results: dict[str, list[TestResult]] = {}
        for result in self.results:
            if result.test_id == test_id:
                if result.variant_name not in variant_results:
                    variant_results[result.variant_name] = []
                variant_results[result.variant_name].append(result)
        
        stats = {}
        for variant_name, results in variant_results.items():
            # Calculate metrics
            metric_sums: dict[str, float] = {}
            for result in results:
                for metric, value in result.metrics.items():
                    if metric not in metric_sums:
                        metric_sums[metric] = 0
                    metric_sums[metric] += value
            
            stats[variant_name] = {
                "count": len(results),
                "metrics": {
                    metric: total / len(results)
                    for metric, total in metric_sums.items()
                }
            }
        
        return stats
    
    def end_test(self, test_id: str, winner: Optional[str] = None):
        """End a test and optionally record the winning variant."""
        
        test = self.tests.get(test_id)
        if test:
            test.is_active = False
            test.end_date = datetime.utcnow()
            test.winner = winner
        
        return self.get_statistics(test_id)
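
Here is a sketch of a two-variant test wired to the file store from the storage sketch above; the test id, variant names, and metric keys are placeholders.

# Create a test comparing two saved prompt versions.
ab = ABTestManager(prompt_store=store)

ab.create_test(
    test_id="summarize-tone",
    name="Concise vs. stricter summaries",
    variants=[
        Variant(name="control", prompt_name="summarize", prompt_version="1.0", weight=1.0),
        Variant(name="candidate", prompt_name="summarize", prompt_version="1.1", weight=1.0),
    ],
)

# Assignment is deterministic per user, so repeated calls return the same variant.
variant = ab.get_variant("summarize-tone", user_id="user-42")
prompt = ab.get_prompt("summarize-tone", user_id="user-42")

# After the LLM call, record whatever metrics you track (values are made up).
ab.record_result("summarize-tone", user_id="user-42",
                 metrics={"thumbs_up": 1, "latency_ms": 830})

print(ab.get_statistics("summarize-tone"))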

Deployment Pipeline

from dataclasses import dataclass
from typing import Optional
from datetime import datetime
from enum import Enum

class DeploymentStage(Enum):
    DEVELOPMENT = "development"
    STAGING = "staging"
    PRODUCTION = "production"

@dataclass
class Deployment:
    """A prompt deployment."""
    
    prompt_name: str
    version: str
    stage: DeploymentStage
    deployed_at: datetime
    deployed_by: str
    rollback_version: Optional[str] = None

class PromptDeployer:
    """Deploy prompts through stages."""
    
    def __init__(self, prompt_store):
        self.prompt_store = prompt_store
        self.deployments: dict[str, dict[DeploymentStage, Deployment]] = {}
    
    def deploy(
        self,
        prompt_name: str,
        version: str,
        stage: DeploymentStage,
        deployed_by: str
    ) -> Deployment:
        """Deploy a prompt version to a stage."""
        
        # Verify prompt exists
        prompt = self.prompt_store.load(prompt_name, version)
        if not prompt:
            raise ValueError(f"Prompt not found: {prompt_name} v{version}")
        
        # Get current deployment for rollback
        current = self.get_deployment(prompt_name, stage)
        rollback_version = current.version if current else None
        
        deployment = Deployment(
            prompt_name=prompt_name,
            version=version,
            stage=stage,
            deployed_at=datetime.utcnow(),
            deployed_by=deployed_by,
            rollback_version=rollback_version
        )
        
        if prompt_name not in self.deployments:
            self.deployments[prompt_name] = {}
        
        self.deployments[prompt_name][stage] = deployment
        
        return deployment
    
    def get_deployment(
        self,
        prompt_name: str,
        stage: DeploymentStage
    ) -> Optional[Deployment]:
        """Get current deployment for a stage."""
        
        if prompt_name not in self.deployments:
            return None
        
        return self.deployments[prompt_name].get(stage)
    
    def get_prompt(
        self,
        prompt_name: str,
        stage: DeploymentStage
    ) -> Optional[PromptTemplate]:
        """Get the deployed prompt for a stage."""
        
        deployment = self.get_deployment(prompt_name, stage)
        if not deployment:
            return None
        
        return self.prompt_store.load(prompt_name, deployment.version)
    
    def rollback(
        self,
        prompt_name: str,
        stage: DeploymentStage,
        rolled_back_by: str
    ) -> Optional[Deployment]:
        """Rollback to previous version."""
        
        current = self.get_deployment(prompt_name, stage)
        if not current or not current.rollback_version:
            return None
        
        return self.deploy(
            prompt_name=prompt_name,
            version=current.rollback_version,
            stage=stage,
            deployed_by=rolled_back_by
        )
    
    def promote(
        self,
        prompt_name: str,
        from_stage: DeploymentStage,
        to_stage: DeploymentStage,
        promoted_by: str
    ) -> Deployment:
        """Promote deployment from one stage to another."""
        
        current = self.get_deployment(prompt_name, from_stage)
        if not current:
            raise ValueError(f"No deployment in {from_stage.value}")
        
        return self.deploy(
            prompt_name=prompt_name,
            version=current.version,
            stage=to_stage,
            deployed_by=promoted_by
        )

class PromptRegistry:
    """Central registry for prompt access."""
    
    def __init__(
        self,
        prompt_store,
        deployer: PromptDeployer,
        default_stage: DeploymentStage = DeploymentStage.PRODUCTION
    ):
        self.prompt_store = prompt_store
        self.deployer = deployer
        self.default_stage = default_stage
        self.cache: dict[str, PromptTemplate] = {}
    
    def get(
        self,
        prompt_name: str,
        stage: Optional[DeploymentStage] = None
    ) -> Optional[PromptTemplate]:
        """Get a prompt from the registry."""
        
        stage = stage or self.default_stage
        cache_key = f"{prompt_name}:{stage.value}"
        
        # Check cache
        if cache_key in self.cache:
            return self.cache[cache_key]
        
        # Get from deployer
        prompt = self.deployer.get_prompt(prompt_name, stage)
        
        if prompt:
            self.cache[cache_key] = prompt
        
        return prompt
    
    def invalidate_cache(self, prompt_name: str = None):
        """Invalidate cache entries."""
        
        if prompt_name:
            keys_to_remove = [k for k in self.cache if k.startswith(f"{prompt_name}:")]
            for key in keys_to_remove:
                del self.cache[key]
        else:
            self.cache.clear()
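
A sketch of the deploy, promote, and rollback flow, reusing the store and the illustrative "summarize" prompt from the earlier sketches; versions and user names are placeholders.

# Deploy v1.0 to production, stage v1.1, promote it, then roll back.
deployer = PromptDeployer(prompt_store=store)
registry = PromptRegistry(store, deployer, default_stage=DeploymentStage.PRODUCTION)

deployer.deploy("summarize", "1.0", DeploymentStage.PRODUCTION, deployed_by="alice")
deployer.deploy("summarize", "1.1", DeploymentStage.STAGING, deployed_by="alice")
deployer.promote("summarize", DeploymentStage.STAGING,
                 DeploymentStage.PRODUCTION, promoted_by="alice")

prod_prompt = registry.get("summarize")  # resolves to v1.1 and is cached

# If v1.1 regresses, roll production back to the previous version and drop the
# stale cache entry so callers pick up the rollback.
deployer.rollback("summarize", DeploymentStage.PRODUCTION, rolled_back_by="alice")
registry.invalidate_cache("summarize")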

Production Service

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional

app = FastAPI()

# Initialize components
prompt_store = FilePromptStore("./prompts")
deployer = PromptDeployer(prompt_store)
registry = PromptRegistry(prompt_store, deployer)
ab_manager = ABTestManager(prompt_store)

class CreatePromptRequest(BaseModel):
    name: str
    template: str
    version: str
    description: str = ""
    commit_message: str = ""

class DeployRequest(BaseModel):
    prompt_name: str
    version: str
    stage: str
    deployed_by: str

class RenderRequest(BaseModel):
    prompt_name: str
    variables: dict
    stage: str = "production"

class ABTestRequest(BaseModel):
    test_id: str
    name: str
    variants: list[dict]

class RecordResultRequest(BaseModel):
    test_id: str
    user_id: str
    metrics: dict

@app.post("/v1/prompts")
async def create_prompt(request: CreatePromptRequest):
    """Create a new prompt version."""
    
    template = PromptTemplate(
        name=request.name,
        template=request.template,
        version=request.version,
        description=request.description
    )
    
    prompt_store.save(template, request.commit_message)
    
    return {
        "name": template.name,
        "version": template.version,
        "hash": template.hash,
        "variables": template.variables
    }

@app.get("/v1/prompts/{name}")
async def get_prompt(name: str, version: Optional[str] = None):
    """Get a prompt template."""
    
    template = prompt_store.load(name, version)
    
    if not template:
        raise HTTPException(404, "Prompt not found")
    
    return {
        "name": template.name,
        "version": template.version,
        "template": template.template,
        "variables": template.variables,
        "description": template.description
    }

@app.get("/v1/prompts/{name}/versions")
async def list_versions(name: str):
    """List all versions of a prompt."""
    
    versions = prompt_store.list_versions(name)
    
    return {"name": name, "versions": versions}

@app.post("/v1/prompts/deploy")
async def deploy_prompt(request: DeployRequest):
    """Deploy a prompt to a stage."""
    
    try:
        stage = DeploymentStage(request.stage)
    except ValueError:
        raise HTTPException(400, f"Invalid stage: {request.stage}")
    
    try:
        deployment = deployer.deploy(
            prompt_name=request.prompt_name,
            version=request.version,
            stage=stage,
            deployed_by=request.deployed_by
        )
        
        # Invalidate cache
        registry.invalidate_cache(request.prompt_name)
        
        return {
            "prompt_name": deployment.prompt_name,
            "version": deployment.version,
            "stage": deployment.stage.value,
            "deployed_at": deployment.deployed_at.isoformat()
        }
    
    except ValueError as e:
        raise HTTPException(400, str(e))

@app.post("/v1/prompts/render")
async def render_prompt(request: RenderRequest):
    """Render a prompt with variables."""
    
    try:
        stage = DeploymentStage(request.stage)
    except ValueError:
        raise HTTPException(400, f"Invalid stage: {request.stage}")
    
    template = registry.get(request.prompt_name, stage)
    
    if not template:
        raise HTTPException(404, "Prompt not found or not deployed")
    
    try:
        rendered = template.render(**request.variables)
        return {"rendered": rendered}
    
    except ValueError as e:
        raise HTTPException(400, str(e))

@app.post("/v1/ab-tests")
async def create_ab_test(request: ABTestRequest):
    """Create an A/B test."""
    
    variants = [
        Variant(
            name=v["name"],
            prompt_name=v["prompt_name"],
            prompt_version=v["prompt_version"],
            weight=v.get("weight", 1.0)
        )
        for v in request.variants
    ]
    
    test = ab_manager.create_test(
        test_id=request.test_id,
        name=request.name,
        variants=variants
    )
    
    return {
        "test_id": test.test_id,
        "name": test.name,
        "variants": [v.name for v in test.variants]
    }

@app.get("/v1/ab-tests/{test_id}/variant")
async def get_variant(test_id: str, user_id: str):
    """Get variant assignment for a user."""
    
    variant = ab_manager.get_variant(test_id, user_id)
    
    if not variant:
        raise HTTPException(404, "Test not found or inactive")
    
    return {
        "test_id": test_id,
        "variant": variant.name,
        "prompt_name": variant.prompt_name,
        "prompt_version": variant.prompt_version
    }

@app.post("/v1/ab-tests/{test_id}/results")
async def record_result(test_id: str, request: RecordResultRequest):
    """Record A/B test result."""
    
    ab_manager.record_result(
        test_id=test_id,
        user_id=request.user_id,
        metrics=request.metrics
    )
    
    return {"status": "recorded"}

@app.get("/v1/ab-tests/{test_id}/stats")
async def get_test_stats(test_id: str):
    """Get A/B test statistics."""
    
    stats = ab_manager.get_statistics(test_id)
    
    return {"test_id": test_id, "statistics": stats}

@app.get("/health")
async def health():
    return {"status": "healthy"}

Conclusion

Treating prompts as code transforms how you develop LLM applications. Version control enables tracking changes, understanding regressions, and rolling back when needed. Template systems with variable extraction make prompts reusable and testable. A/B testing frameworks let you measure the impact of prompt changes with statistical rigor—deterministic user assignment ensures consistent experiences while enabling fair comparisons. Deployment pipelines with staging environments catch issues before they reach production. Build a central registry that applications query for prompts rather than hardcoding strings. Cache aggressively but invalidate on deployments. The investment in prompt infrastructure pays off as your application grows—you’ll iterate faster, catch regressions earlier, and have confidence that prompt changes actually improve outcomes.

