Prompt Debugging: Analyze Failure, Isolate Issue, Apply Fix
Introduction
Prompt debugging is an essential skill for building reliable LLM applications. When prompts fail—producing incorrect outputs, hallucinations, or inconsistent results—systematic debugging techniques help identify and fix the root cause. Unlike traditional software debugging, where you can step through code, prompt debugging requires understanding how language models interpret instructions and where they commonly fail. This guide covers practical techniques for diagnosing prompt issues: failure pattern analysis, prompt decomposition, ablation testing, output analysis, and iterative refinement. Whether you’re dealing with format violations, reasoning errors, or edge case failures, these patterns will help you build more robust prompts that work reliably in production.
Failure Pattern Analysis
from dataclasses import dataclass, field
from typing import Any, Optional, List
from enum import Enum
from datetime import datetime
import re
class FailureType(Enum):
"""Types of prompt failures."""
FORMAT_VIOLATION = "format_violation"
HALLUCINATION = "hallucination"
INCOMPLETE_RESPONSE = "incomplete_response"
WRONG_ANSWER = "wrong_answer"
REFUSAL = "refusal"
REPETITION = "repetition"
OFF_TOPIC = "off_topic"
INCONSISTENT = "inconsistent"
@dataclass
class FailureCase:
"""A recorded failure case."""
prompt: str
expected_output: str
actual_output: str
failure_type: FailureType
timestamp: datetime = field(default_factory=datetime.now)
metadata: dict = field(default_factory=dict)
class FailureAnalyzer:
"""Analyze prompt failures."""
def __init__(self):
self.failure_patterns = {
FailureType.FORMAT_VIOLATION: self._check_format_violation,
FailureType.HALLUCINATION: self._check_hallucination,
FailureType.INCOMPLETE_RESPONSE: self._check_incomplete,
FailureType.REFUSAL: self._check_refusal,
FailureType.REPETITION: self._check_repetition,
FailureType.OFF_TOPIC: self._check_off_topic
}
def analyze(
self,
prompt: str,
output: str,
        expected: Optional[str] = None,
        context: Optional[dict] = None
) -> list[FailureType]:
"""Analyze output for failures."""
failures = []
for failure_type, checker in self.failure_patterns.items():
if checker(prompt, output, expected, context):
failures.append(failure_type)
return failures
def _check_format_violation(
self,
prompt: str,
output: str,
expected: str,
context: dict
) -> bool:
"""Check for format violations."""
# Check for JSON format requests
if "json" in prompt.lower():
try:
import json
json.loads(output)
return False
except json.JSONDecodeError:
return True
# Check for list format
if "list" in prompt.lower() and "bullet" in prompt.lower():
if not re.search(r"^[\-\*\d]", output, re.MULTILINE):
return True
# Check for specific format markers
format_markers = context.get("format_markers", []) if context else []
for marker in format_markers:
if marker not in output:
return True
return False
def _check_hallucination(
self,
prompt: str,
output: str,
expected: str,
context: dict
) -> bool:
"""Check for potential hallucinations."""
# Check for made-up citations
fake_citation_patterns = [
r"\(\d{4}\)", # Year citations
r"according to .+ study",
r"research shows",
r"studies indicate"
]
if context and context.get("no_citations"):
for pattern in fake_citation_patterns:
if re.search(pattern, output, re.IGNORECASE):
return True
        # Facts that should appear in the output (from context) are a weaker signal:
        # a missing required fact may point at hallucination or simply at an
        # incomplete answer, so this is left as a no-op hook rather than flagged.
        if context and "required_facts" in context:
            for fact in context["required_facts"]:
                if fact.lower() not in output.lower():
                    pass
return False
def _check_incomplete(
self,
prompt: str,
output: str,
expected: str,
context: dict
) -> bool:
"""Check for incomplete responses."""
# Check for truncation indicators
truncation_indicators = [
output.endswith("..."),
output.endswith("etc"),
len(output) < 50 and "detailed" in prompt.lower(),
output.count("\n") < 3 and "steps" in prompt.lower()
]
return any(truncation_indicators)
def _check_refusal(
self,
prompt: str,
output: str,
expected: str,
context: dict
) -> bool:
"""Check for refusals."""
refusal_phrases = [
"i cannot",
"i'm unable to",
"i can't help",
"i apologize, but",
"as an ai",
"i don't have access",
"i'm not able to"
]
output_lower = output.lower()
return any(phrase in output_lower for phrase in refusal_phrases)
def _check_repetition(
self,
prompt: str,
output: str,
expected: str,
context: dict
) -> bool:
"""Check for repetitive output."""
# Check for repeated sentences
sentences = output.split(".")
if len(sentences) > 3:
unique_sentences = set(s.strip().lower() for s in sentences if s.strip())
if len(unique_sentences) < len(sentences) * 0.7:
return True
# Check for repeated phrases
words = output.split()
if len(words) > 20:
# Look for repeated n-grams
ngrams = [" ".join(words[i:i+5]) for i in range(len(words)-4)]
if len(set(ngrams)) < len(ngrams) * 0.8:
return True
return False
def _check_off_topic(
self,
prompt: str,
output: str,
expected: str,
context: dict
) -> bool:
"""Check if response is off-topic."""
# Extract key terms from prompt
prompt_words = set(prompt.lower().split())
output_words = set(output.lower().split())
# Remove common words
common_words = {"the", "a", "an", "is", "are", "was", "were", "be", "been",
"have", "has", "had", "do", "does", "did", "will", "would",
"could", "should", "may", "might", "must", "shall", "can",
"to", "of", "in", "for", "on", "with", "at", "by", "from",
"as", "into", "through", "during", "before", "after", "above",
"below", "between", "under", "again", "further", "then", "once"}
prompt_keywords = prompt_words - common_words
output_keywords = output_words - common_words
# Check overlap
if prompt_keywords:
overlap = len(prompt_keywords & output_keywords) / len(prompt_keywords)
if overlap < 0.1:
return True
return False
class FailureLogger:
"""Log and track failures."""
def __init__(self):
self.failures: list[FailureCase] = []
def log(self, failure: FailureCase):
"""Log a failure case."""
self.failures.append(failure)
def get_by_type(self, failure_type: FailureType) -> list[FailureCase]:
"""Get failures by type."""
return [f for f in self.failures if f.failure_type == failure_type]
def get_statistics(self) -> dict:
"""Get failure statistics."""
stats = {}
for failure_type in FailureType:
count = len(self.get_by_type(failure_type))
stats[failure_type.value] = count
stats["total"] = len(self.failures)
return stats
def get_common_patterns(self) -> list[dict]:
"""Identify common failure patterns."""
patterns = []
# Group by failure type
for failure_type in FailureType:
cases = self.get_by_type(failure_type)
if len(cases) >= 3:
# Look for common prompt patterns
prompts = [c.prompt for c in cases]
common_words = self._find_common_words(prompts)
patterns.append({
"type": failure_type.value,
"count": len(cases),
"common_words": common_words[:10]
})
return sorted(patterns, key=lambda x: x["count"], reverse=True)
def _find_common_words(self, texts: list[str]) -> list[str]:
"""Find common words across texts."""
from collections import Counter
all_words = []
for text in texts:
words = text.lower().split()
all_words.extend(words)
counter = Counter(all_words)
# Filter common stopwords
stopwords = {"the", "a", "an", "is", "are", "to", "of", "in", "for", "on"}
return [
word for word, count in counter.most_common(20)
if word not in stopwords and count >= len(texts) * 0.5
]
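The analyzer and logger above can be exercised together in a small triage loop. A minimal sketch, assuming the illustrative prompt, output, and no_citations context flag below (none of which come from the classes themselves):
# Minimal usage sketch: classify one failing output and record it for triage.
analyzer = FailureAnalyzer()
failure_logger = FailureLogger()

prompt = "Summarize the report below as a JSON object with keys 'title' and 'risks'."
output = "I'm unable to help with that request."

# context is optional; here it tells the hallucination check that citations
# are not expected for this task (an illustrative flag, not a requirement).
detected = analyzer.analyze(prompt, output, expected=None, context={"no_citations": True})

for failure_type in detected:
    failure_logger.log(FailureCase(
        prompt=prompt,
        expected_output="",          # no gold answer in this example
        actual_output=output,
        failure_type=failure_type
    ))

print(failure_logger.get_statistics())       # per-type counts plus "total"
print(failure_logger.get_common_patterns())  # populated once a type has >= 3 cases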
Prompt Decomposition
from dataclasses import dataclass
from typing import Any, Optional, List
import re
@dataclass
class PromptComponent:
"""A component of a prompt."""
name: str
content: str
component_type: str # system, context, instruction, example, constraint
is_essential: bool = True
class PromptDecomposer:
"""Decompose prompts into components for debugging."""
def __init__(self):
self.component_patterns = {
"system": [
r"you are",
r"act as",
r"your role is",
r"as a"
],
"context": [
r"given the following",
r"based on",
r"context:",
r"information:"
],
"instruction": [
r"please",
r"write",
r"generate",
r"create",
r"explain",
r"analyze"
],
"example": [
r"for example",
r"example:",
r"e\.g\.",
r"such as"
],
"constraint": [
r"must",
r"should",
r"do not",
r"don't",
r"avoid",
r"only"
]
}
def decompose(self, prompt: str) -> list[PromptComponent]:
"""Decompose prompt into components."""
components = []
# Split by common delimiters
sections = self._split_sections(prompt)
for i, section in enumerate(sections):
component_type = self._identify_type(section)
components.append(PromptComponent(
name=f"section_{i}",
content=section,
component_type=component_type,
is_essential=self._is_essential(section, component_type)
))
return components
def _split_sections(self, prompt: str) -> list[str]:
"""Split prompt into sections."""
        # Split on blank lines or on "Header:" style markers at the start of a line
        sections = re.split(r'\n\n+|\n(?=[A-Z][a-z]+:)', prompt)
# Filter empty sections
return [s.strip() for s in sections if s.strip()]
def _identify_type(self, section: str) -> str:
"""Identify component type."""
section_lower = section.lower()
for comp_type, patterns in self.component_patterns.items():
for pattern in patterns:
if re.search(pattern, section_lower):
return comp_type
return "unknown"
def _is_essential(self, section: str, component_type: str) -> bool:
"""Determine if component is essential."""
# Instructions are always essential
if component_type == "instruction":
return True
# Short sections are often essential
if len(section) < 100:
return True
# Examples are usually not essential for basic functionality
if component_type == "example":
return False
return True
class AblationTester:
"""Test prompts by removing components."""
def __init__(self, llm_client: Any):
self.llm = llm_client
self.decomposer = PromptDecomposer()
async def test_ablation(
self,
prompt: str,
test_input: str,
expected_behavior: str
) -> dict:
"""Test prompt with component ablation."""
components = self.decomposer.decompose(prompt)
results = {}
# Test full prompt
full_output = await self.llm.generate(prompt + "\n" + test_input)
results["full"] = {
"output": full_output,
"components": [c.name for c in components]
}
# Test with each component removed
for i, component in enumerate(components):
if not component.is_essential:
continue
# Create prompt without this component
remaining = [c for j, c in enumerate(components) if j != i]
ablated_prompt = "\n\n".join(c.content for c in remaining)
try:
output = await self.llm.generate(ablated_prompt + "\n" + test_input)
results[f"without_{component.name}"] = {
"output": output,
"removed": component.content[:100],
"component_type": component.component_type
}
except Exception as e:
results[f"without_{component.name}"] = {
"error": str(e)
}
return results
def analyze_ablation_results(self, results: dict) -> dict:
"""Analyze which components are critical."""
full_output = results.get("full", {}).get("output", "")
critical_components = []
optional_components = []
for key, value in results.items():
if key == "full":
continue
if "error" in value:
critical_components.append(key)
elif "output" in value:
# Compare output quality
similarity = self._compute_similarity(full_output, value["output"])
if similarity < 0.5:
critical_components.append(key)
else:
optional_components.append(key)
return {
"critical": critical_components,
"optional": optional_components
}
def _compute_similarity(self, text1: str, text2: str) -> float:
"""Compute text similarity."""
words1 = set(text1.lower().split())
words2 = set(text2.lower().split())
if not words1 or not words2:
return 0.0
intersection = len(words1 & words2)
union = len(words1 | words2)
return intersection / union if union > 0 else 0.0
class MinimalPromptFinder:
"""Find minimal prompt that achieves desired behavior."""
def __init__(self, llm_client: Any):
self.llm = llm_client
self.decomposer = PromptDecomposer()
async def find_minimal(
self,
prompt: str,
test_cases: list[tuple[str, str]], # (input, expected_output)
quality_threshold: float = 0.8
) -> str:
"""Find minimal prompt that passes test cases."""
components = self.decomposer.decompose(prompt)
# Start with all components
current_components = components.copy()
# Try removing each non-essential component
for component in components:
if component.is_essential:
continue
# Test without this component
test_components = [c for c in current_components if c != component]
test_prompt = "\n\n".join(c.content for c in test_components)
# Run test cases
pass_rate = await self._test_prompt(test_prompt, test_cases)
if pass_rate >= quality_threshold:
# Can remove this component
current_components = test_components
return "\n\n".join(c.content for c in current_components)
async def _test_prompt(
self,
prompt: str,
test_cases: list[tuple[str, str]]
) -> float:
"""Test prompt against test cases."""
passed = 0
for input_text, expected in test_cases:
output = await self.llm.generate(prompt + "\n" + input_text)
# Simple similarity check
similarity = self._compute_similarity(output, expected)
if similarity > 0.5:
passed += 1
return passed / len(test_cases) if test_cases else 0.0
def _compute_similarity(self, text1: str, text2: str) -> float:
"""Compute text similarity."""
words1 = set(text1.lower().split())
words2 = set(text2.lower().split())
if not words1 or not words2:
return 0.0
intersection = len(words1 & words2)
union = len(words1 | words2)
return intersection / union if union > 0 else 0.0
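Before spending model calls on ablation, it is worth checking how the decomposer actually splits a prompt. A short sketch with a made-up multi-part prompt; AblationTester and MinimalPromptFinder additionally need the async llm_client.generate() call shown above, so they appear only in comments here:
# Inspect how a multi-part prompt decomposes before running ablation tests.
decomposer = PromptDecomposer()

prompt = """You are a support assistant for a billing system.

Context: the customer message appears after the instructions.

Please write a short, polite reply. Do not promise refunds.

For example, acknowledge the issue before offering next steps."""

for component in decomposer.decompose(prompt):
    print(f"{component.component_type:<12} essential={component.is_essential}  {component.content[:50]}")

# The ablation helpers are async because they call llm_client.generate(), e.g.:
#   import asyncio
#   tester = AblationTester(llm_client)
#   results = asyncio.run(tester.test_ablation(prompt, "Customer: I was double charged.", "polite reply"))
#   print(tester.analyze_ablation_results(results))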
Output Analysis
from dataclasses import dataclass
from typing import Any, Optional, List
import re
@dataclass
class OutputAnalysis:
"""Analysis of LLM output."""
output: str
token_count: int
sentence_count: int
has_structure: bool
confidence_indicators: list[str]
uncertainty_indicators: list[str]
issues: list[str]
class OutputAnalyzer:
"""Analyze LLM outputs for debugging."""
def __init__(self):
self.confidence_phrases = [
"definitely", "certainly", "clearly", "obviously",
"without doubt", "absolutely", "surely"
]
self.uncertainty_phrases = [
"might", "maybe", "perhaps", "possibly", "could be",
"i think", "i believe", "it seems", "appears to",
"not sure", "uncertain"
]
self.hedging_phrases = [
"generally", "usually", "often", "sometimes",
"in most cases", "typically"
]
def analyze(self, output: str) -> OutputAnalysis:
"""Analyze output."""
# Basic metrics
token_count = len(output.split())
sentences = re.split(r'[.!?]+', output)
sentence_count = len([s for s in sentences if s.strip()])
# Check for structure
has_structure = self._check_structure(output)
# Find confidence/uncertainty indicators
confidence = self._find_phrases(output, self.confidence_phrases)
uncertainty = self._find_phrases(output, self.uncertainty_phrases)
# Identify issues
issues = self._identify_issues(output)
return OutputAnalysis(
output=output,
token_count=token_count,
sentence_count=sentence_count,
has_structure=has_structure,
confidence_indicators=confidence,
uncertainty_indicators=uncertainty,
issues=issues
)
def _check_structure(self, output: str) -> bool:
"""Check if output has structure."""
# Check for lists
if re.search(r'^[\-\*\d]', output, re.MULTILINE):
return True
# Check for headers
if re.search(r'^#+\s', output, re.MULTILINE):
return True
# Check for paragraphs
if output.count('\n\n') >= 2:
return True
return False
def _find_phrases(self, output: str, phrases: list[str]) -> list[str]:
"""Find phrases in output."""
found = []
output_lower = output.lower()
for phrase in phrases:
if phrase in output_lower:
found.append(phrase)
return found
def _identify_issues(self, output: str) -> list[str]:
"""Identify potential issues in output."""
issues = []
# Check for very short output
if len(output.split()) < 10:
issues.append("very_short_response")
# Check for repetition
words = output.split()
if len(words) > 20:
unique_ratio = len(set(words)) / len(words)
if unique_ratio < 0.5:
issues.append("high_repetition")
        # Check for incomplete sentences (guard against whitespace-only output)
        stripped = output.rstrip()
        if stripped and stripped[-1] not in '.!?':
            issues.append("incomplete_sentence")
# Check for excessive hedging
hedging_count = sum(
1 for phrase in self.hedging_phrases
if phrase in output.lower()
)
if hedging_count >= 3:
issues.append("excessive_hedging")
# Check for contradictions
if self._has_contradictions(output):
issues.append("potential_contradiction")
return issues
    def _has_contradictions(self, output: str) -> bool:
        """Check for potential contradictions, e.g. 'is X' alongside 'is not X'."""
        contradiction_pairs = [
            (r"is (\w+)", "is not {}"),
            (r"can (\w+)", "cannot {}"),
            (r"will (\w+)", "will not {}"),
            (r"should (\w+)", "should not {}")
        ]
        for positive, negative_template in contradiction_pairs:
            # Find each positive statement, then look for its explicit negation.
            for match in re.finditer(positive, output, re.IGNORECASE):
                negative = negative_template.format(re.escape(match.group(1)))
                if re.search(negative, output, re.IGNORECASE):
                    return True
        return False
class DiffAnalyzer:
"""Analyze differences between outputs."""
def compare(self, output1: str, output2: str) -> dict:
"""Compare two outputs."""
# Tokenize
words1 = output1.split()
words2 = output2.split()
# Find common and different words
set1 = set(words1)
set2 = set(words2)
common = set1 & set2
only_in_1 = set1 - set2
only_in_2 = set2 - set1
# Calculate similarity
jaccard = len(common) / len(set1 | set2) if set1 | set2 else 0
# Find structural differences
struct_diff = self._compare_structure(output1, output2)
return {
"similarity": jaccard,
"common_words": len(common),
"unique_to_first": list(only_in_1)[:20],
"unique_to_second": list(only_in_2)[:20],
"length_diff": len(words2) - len(words1),
"structural_differences": struct_diff
}
def _compare_structure(self, output1: str, output2: str) -> list[str]:
"""Compare structural elements."""
differences = []
# Compare list presence
has_list1 = bool(re.search(r'^[\-\*\d]', output1, re.MULTILINE))
has_list2 = bool(re.search(r'^[\-\*\d]', output2, re.MULTILINE))
if has_list1 != has_list2:
differences.append("list_presence")
# Compare paragraph count
para1 = output1.count('\n\n')
para2 = output2.count('\n\n')
if abs(para1 - para2) > 2:
differences.append("paragraph_count")
# Compare sentence count
sent1 = len(re.split(r'[.!?]+', output1))
sent2 = len(re.split(r'[.!?]+', output2))
if abs(sent1 - sent2) > 3:
differences.append("sentence_count")
return differences
class ConsistencyChecker:
"""Check output consistency across runs."""
def __init__(self, llm_client: Any):
self.llm = llm_client
async def check_consistency(
self,
prompt: str,
num_runs: int = 5
) -> dict:
"""Check consistency of outputs."""
outputs = []
for _ in range(num_runs):
output = await self.llm.generate(prompt)
outputs.append(output)
# Analyze consistency
diff_analyzer = DiffAnalyzer()
similarities = []
for i in range(len(outputs)):
for j in range(i + 1, len(outputs)):
comparison = diff_analyzer.compare(outputs[i], outputs[j])
similarities.append(comparison["similarity"])
avg_similarity = sum(similarities) / len(similarities) if similarities else 0
return {
"num_runs": num_runs,
"outputs": outputs,
"avg_similarity": avg_similarity,
"min_similarity": min(similarities) if similarities else 0,
"max_similarity": max(similarities) if similarities else 0,
"is_consistent": avg_similarity > 0.7
}
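OutputAnalyzer and DiffAnalyzer operate on plain strings, so they can be tried without a model in the loop; ConsistencyChecker needs the async client, so it is only sketched in a comment. The sample outputs below are made up for illustration:
# Analyze one output and diff two candidate outputs (sample strings, no LLM needed).
out_analyzer = OutputAnalyzer()
analysis = out_analyzer.analyze(
    "It might work in most cases. Perhaps restart the service first"
)
print(analysis.uncertainty_indicators)  # hedging words found, e.g. "might", "perhaps"
print(analysis.issues)                  # e.g. "incomplete_sentence" (no terminal punctuation)

diff = DiffAnalyzer().compare(
    "Restart the service, then clear the cache.",
    "- Restart the service\n- Clear the cache\n- Verify the logs"
)
print(diff["similarity"], diff["structural_differences"])

# Consistency across runs requires an llm_client with an async generate() method:
#   report = asyncio.run(ConsistencyChecker(llm_client).check_consistency(prompt, num_runs=3))
#   print(report["avg_similarity"], report["is_consistent"])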
Iterative Refinement
from dataclasses import dataclass, field
from typing import Any, Optional, List, Callable
from datetime import datetime
@dataclass
class PromptVersion:
"""A version of a prompt."""
version: int
prompt: str
changes: str
test_results: dict = field(default_factory=dict)
created_at: datetime = field(default_factory=datetime.now)
class PromptRefiner:
"""Iteratively refine prompts."""
def __init__(self, llm_client: Any):
self.llm = llm_client
self.versions: list[PromptVersion] = []
def add_version(self, prompt: str, changes: str = ""):
"""Add a new prompt version."""
version = PromptVersion(
version=len(self.versions) + 1,
prompt=prompt,
changes=changes
)
self.versions.append(version)
return version
async def test_version(
self,
version: PromptVersion,
test_cases: list[tuple[str, Callable]]
) -> dict:
"""Test a prompt version."""
results = {
"passed": 0,
"failed": 0,
"details": []
}
for input_text, validator in test_cases:
output = await self.llm.generate(version.prompt + "\n" + input_text)
try:
is_valid = validator(output)
if is_valid:
results["passed"] += 1
else:
results["failed"] += 1
results["details"].append({
"input": input_text[:100],
"output": output[:200],
"passed": is_valid
})
except Exception as e:
results["failed"] += 1
results["details"].append({
"input": input_text[:100],
"error": str(e),
"passed": False
})
version.test_results = results
return results
async def auto_refine(
self,
initial_prompt: str,
test_cases: list[tuple[str, Callable]],
max_iterations: int = 5
) -> PromptVersion:
"""Automatically refine prompt."""
current_prompt = initial_prompt
self.add_version(current_prompt, "Initial version")
for i in range(max_iterations):
# Test current version
results = await self.test_version(self.versions[-1], test_cases)
if results["failed"] == 0:
# All tests pass
break
# Analyze failures
failures = [
d for d in results["details"]
if not d.get("passed")
]
# Generate refinement
refined_prompt = await self._generate_refinement(
current_prompt,
failures
)
if refined_prompt != current_prompt:
current_prompt = refined_prompt
self.add_version(
current_prompt,
f"Iteration {i+1}: Fixed {len(failures)} failures"
)
return self.versions[-1]
async def _generate_refinement(
self,
prompt: str,
failures: list[dict]
) -> str:
"""Generate refined prompt based on failures."""
failure_summary = "\n".join([
f"- Input: {f.get('input', 'N/A')}\n Output: {f.get('output', f.get('error', 'N/A'))}"
for f in failures[:3]
])
refinement_prompt = f"""Analyze this prompt and its failures, then provide an improved version.
Original Prompt:
{prompt}
Failures:
{failure_summary}
Provide an improved prompt that addresses these failures. Only output the improved prompt, nothing else."""
return await self.llm.generate(refinement_prompt)
def get_best_version(self) -> Optional[PromptVersion]:
"""Get the best performing version."""
best = None
best_score = -1
for version in self.versions:
if version.test_results:
passed = version.test_results.get("passed", 0)
total = passed + version.test_results.get("failed", 0)
if total > 0:
score = passed / total
if score > best_score:
best_score = score
best = version
return best
class PromptDebugger:
"""Interactive prompt debugger."""
def __init__(self, llm_client: Any):
self.llm = llm_client
self.analyzer = FailureAnalyzer()
self.output_analyzer = OutputAnalyzer()
self.refiner = PromptRefiner(llm_client)
async def debug(
self,
prompt: str,
input_text: str,
expected: str = None
) -> dict:
"""Debug a prompt."""
# Generate output
output = await self.llm.generate(prompt + "\n" + input_text)
# Analyze failures
failures = self.analyzer.analyze(prompt, output, expected)
# Analyze output
output_analysis = self.output_analyzer.analyze(output)
# Generate suggestions
suggestions = self._generate_suggestions(failures, output_analysis)
return {
"output": output,
"failures": [f.value for f in failures],
"output_analysis": {
"token_count": output_analysis.token_count,
"has_structure": output_analysis.has_structure,
"issues": output_analysis.issues,
"uncertainty": output_analysis.uncertainty_indicators
},
"suggestions": suggestions
}
def _generate_suggestions(
self,
failures: list[FailureType],
analysis: OutputAnalysis
) -> list[str]:
"""Generate debugging suggestions."""
suggestions = []
if FailureType.FORMAT_VIOLATION in failures:
suggestions.append(
"Add explicit format instructions with examples"
)
suggestions.append(
"Use format markers like 'Output format: JSON'"
)
if FailureType.INCOMPLETE_RESPONSE in failures:
suggestions.append(
"Increase max_tokens parameter"
)
suggestions.append(
"Add 'Be thorough and complete' instruction"
)
if FailureType.REFUSAL in failures:
suggestions.append(
"Rephrase request to avoid triggering safety filters"
)
suggestions.append(
"Add context explaining legitimate use case"
)
if FailureType.REPETITION in failures:
suggestions.append(
"Lower temperature parameter"
)
suggestions.append(
"Add 'Avoid repetition' instruction"
)
if analysis.uncertainty_indicators:
suggestions.append(
"Add 'Be confident and direct' instruction"
)
if "excessive_hedging" in analysis.issues:
suggestions.append(
"Add 'Provide definitive answers' instruction"
)
return suggestions
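PromptRefiner and PromptDebugger both expect an async llm_client, so the quickest way to exercise them offline is a stub client (the FastAPI section below uses the same trick with MockLLM). The StubLLM class, prompts, and validator here are illustrative assumptions rather than part of the original code:
import asyncio

class StubLLM:
    """Illustrative stand-in for a real client; always answers in JSON."""
    async def generate(self, prompt: str) -> str:
        return '{"sentiment": "positive"}'

async def main():
    debugger = PromptDebugger(StubLLM())
    report = await debugger.debug(
        prompt="Classify the sentiment of the text as JSON with a 'sentiment' key.",
        input_text="Text: I love this product."
    )
    print(report["failures"], report["suggestions"])

    # Version-by-version refinement against simple validator callables.
    refiner = PromptRefiner(StubLLM())
    version = await refiner.auto_refine(
        initial_prompt="Classify the sentiment of the text as JSON.",
        test_cases=[("Text: great service", lambda out: "sentiment" in out)],
        max_iterations=2
    )
    print(version.version, version.test_results)

asyncio.run(main())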
Production Debugging Service
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, List
import time
app = FastAPI()
class DebugRequest(BaseModel):
prompt: str
input_text: str
expected_output: Optional[str] = None
class DebugResponse(BaseModel):
output: str
failures: list[str]
issues: list[str]
suggestions: list[str]
latency_ms: float
class AnalyzeRequest(BaseModel):
output: str
class RefineRequest(BaseModel):
prompt: str
test_cases: list[dict] # [{input, expected}]
max_iterations: Optional[int] = 3
# Mock LLM for demo
class MockLLM:
async def generate(self, prompt):
return f"Mock response for: {prompt[:50]}..."
llm = MockLLM()
analyzer = FailureAnalyzer()
output_analyzer = OutputAnalyzer()
@app.post("/v1/debug")
async def debug_prompt(request: DebugRequest) -> DebugResponse:
"""Debug a prompt."""
start = time.time()
# Generate output
full_prompt = request.prompt + "\n" + request.input_text
output = await llm.generate(full_prompt)
# Analyze
failures = analyzer.analyze(
request.prompt,
output,
request.expected_output
)
analysis = output_analyzer.analyze(output)
# Generate suggestions
suggestions = []
if failures:
suggestions.append("Review failure types and adjust prompt accordingly")
if analysis.issues:
suggestions.extend([f"Address issue: {issue}" for issue in analysis.issues])
latency = (time.time() - start) * 1000
return DebugResponse(
output=output,
failures=[f.value for f in failures],
issues=analysis.issues,
suggestions=suggestions,
latency_ms=latency
)
@app.post("/v1/analyze")
async def analyze_output(request: AnalyzeRequest) -> dict:
"""Analyze an output."""
analysis = output_analyzer.analyze(request.output)
return {
"token_count": analysis.token_count,
"sentence_count": analysis.sentence_count,
"has_structure": analysis.has_structure,
"confidence_indicators": analysis.confidence_indicators,
"uncertainty_indicators": analysis.uncertainty_indicators,
"issues": analysis.issues
}
@app.post("/v1/compare")
async def compare_outputs(outputs: list[str]) -> dict:
"""Compare multiple outputs."""
if len(outputs) < 2:
raise HTTPException(status_code=400, detail="Need at least 2 outputs")
diff_analyzer = DiffAnalyzer()
comparisons = []
for i in range(len(outputs) - 1):
comparison = diff_analyzer.compare(outputs[i], outputs[i+1])
comparisons.append(comparison)
return {"comparisons": comparisons}
@app.get("/v1/failure-types")
async def list_failure_types() -> dict:
"""List failure types."""
return {
"types": [
{"name": ft.value, "description": ft.name.replace("_", " ").title()}
for ft in FailureType
]
}
@app.get("/health")
async def health():
return {"status": "healthy"}
Conclusion
Prompt debugging requires a systematic approach that combines failure analysis, component isolation, and iterative refinement. Start by categorizing failures—format violations, hallucinations, refusals, and off-topic responses each have different root causes and fixes. Decompose complex prompts into components and use ablation testing to identify which parts are critical and which are optional. Analyze outputs for uncertainty indicators, repetition, and structural issues that signal problems. When refining prompts, make one change at a time and test against a consistent set of cases to measure improvement. Consistency checking across multiple runs reveals whether issues are deterministic or stochastic—high variance suggests the prompt is ambiguous. Build a library of failure cases for regression testing as you iterate. The key insight is that prompt debugging is empirical: you cannot reason your way to a perfect prompt; you must test and iterate. Track your prompt versions and their test results to understand what works for your specific use case. Production systems should log failures for continuous improvement, creating a feedback loop that makes prompts more robust over time.