LLM Output Parsing: Extracting Structured Data from Language Model Responses

Introduction

LLMs generate text, but applications need structured data. Parsing LLM outputs reliably is one of the most common challenges in production systems. A model might return JSON wrapped in extra prose, miss required fields, use unexpected formats, or hallucinate invalid values. This guide covers practical parsing strategies: using structured output modes, building robust parsers with fallbacks, validating against schemas, and handling the inevitable parsing failures gracefully. Master these patterns and your LLM applications will be significantly more reliable.

Output Parsing: Format Detection, Extraction, Validation

Structured Output Modes
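
The most reliable way to get structured data out of a model is to ask the API for it directly. The client below wraps three approaches: plain JSON mode, JSON mode with a Pydantic schema embedded in the prompt, and function calling with a forced tool choice.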

from typing import Any, Optional, Type, TypeVar
from pydantic import BaseModel
import json

T = TypeVar('T', bound=BaseModel)

class StructuredOutputClient:
    """Client for structured LLM outputs."""
    
    def __init__(self, client: Any, model: str = "gpt-4o"):
        self.client = client
        self.model = model
    
    async def generate_json(
        self,
        prompt: str,
        schema: Optional[dict] = None
    ) -> dict:
        """Generate JSON output using the API's JSON mode."""
        
        # JSON mode only guarantees syntactically valid JSON, not any
        # particular shape, so embed the schema in the prompt when given.
        # (OpenAI's JSON mode also requires the word "JSON" to appear
        # somewhere in the messages.)
        if schema:
            prompt = f"{prompt}\n\nRespond with JSON matching this schema:\n{json.dumps(schema, indent=2)}"
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        content = response.choices[0].message.content
        return json.loads(content)
    
    async def generate_typed(
        self,
        prompt: str,
        response_type: Type[T]
    ) -> T:
        """Generate output matching a Pydantic model."""
        
        # Build schema description
        schema = response_type.model_json_schema()
        schema_str = json.dumps(schema, indent=2)
        
        full_prompt = f"""{prompt}

Respond with JSON matching this schema:
{schema_str}"""
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": full_prompt}],
            response_format={"type": "json_object"}
        )
        
        content = response.choices[0].message.content
        data = json.loads(content)
        
        return response_type.model_validate(data)
    
    async def generate_with_tools(
        self,
        prompt: str,
        response_type: Type[T]
    ) -> T:
        """Generate structured output using function calling."""
        
        schema = response_type.model_json_schema()
        
        # Remove unsupported fields
        if "title" in schema:
            del schema["title"]
        
        tool = {
            "type": "function",
            "function": {
                "name": "respond",
                "description": "Respond with structured data",
                "parameters": schema
            }
        }
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            tools=[tool],
            tool_choice={"type": "function", "function": {"name": "respond"}}
        )
        
        tool_call = response.choices[0].message.tool_calls[0]
        data = json.loads(tool_call.function.arguments)
        
        return response_type.model_validate(data)

# Example usage
class ProductReview(BaseModel):
    sentiment: str  # positive, negative, neutral
    score: float  # 0.0 to 1.0
    summary: str
    key_points: list[str]

async def analyze_review(client: StructuredOutputClient, review_text: str) -> ProductReview:
    """Analyze a product review."""
    
    prompt = f"""Analyze this product review:

{review_text}

Extract the sentiment, score (0-1), a brief summary, and key points."""
    
    return await client.generate_typed(prompt, ProductReview)
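
To run this end to end you need an async chat client. A minimal sketch, assuming the official openai package (AsyncOpenAI) and an OPENAI_API_KEY in the environment; the review text is invented:

import asyncio
from openai import AsyncOpenAI

async def main():
    client = StructuredOutputClient(AsyncOpenAI(), model="gpt-4o")
    review = await analyze_review(
        client, "Great battery life, but the screen scratches far too easily."
    )
    print(review.sentiment, review.score, review.key_points)

asyncio.run(main())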

Robust Parsing
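
Even with structured output modes, responses sometimes arrive wrapped in prose or markdown. The parser below falls back through progressively looser strategies: direct parsing, code-block extraction, brace and bracket matching, and finally a pass that repairs common JSON mistakes.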

from dataclasses import dataclass
from typing import Any, Optional, Type, TypeVar
from pydantic import BaseModel, ValidationError
import re
import json

T = TypeVar('T', bound=BaseModel)

@dataclass
class ParseResult:
    """Result of parsing attempt."""
    
    success: bool
    data: Any = None
    error: Optional[str] = None
    method: str = ""

class RobustParser:
    """Parse LLM outputs with multiple strategies."""
    
    def parse_json(self, text: str) -> ParseResult:
        """Parse JSON from text with multiple strategies."""
        
        # Strategy 1: Direct parse
        try:
            data = json.loads(text)
            return ParseResult(success=True, data=data, method="direct")
        except json.JSONDecodeError:
            pass
        
        # Strategy 2: Extract JSON from markdown code block
        json_match = re.search(r'```(?:json)?\s*([\s\S]*?)```', text)
        if json_match:
            try:
                data = json.loads(json_match.group(1))
                return ParseResult(success=True, data=data, method="code_block")
            except json.JSONDecodeError:
                pass
        
        # Strategy 3: Find JSON object in text
        brace_match = re.search(r'\{[\s\S]*\}', text)
        if brace_match:
            try:
                data = json.loads(brace_match.group())
                return ParseResult(success=True, data=data, method="brace_extract")
            except json.JSONDecodeError:
                pass
        
        # Strategy 4: Find JSON array in text
        bracket_match = re.search(r'\[[\s\S]*\]', text)
        if bracket_match:
            try:
                data = json.loads(bracket_match.group())
                return ParseResult(success=True, data=data, method="bracket_extract")
            except json.JSONDecodeError:
                pass
        
        # Strategy 5: Fix common JSON errors
        fixed = self._fix_common_errors(text)
        if fixed != text:
            try:
                data = json.loads(fixed)
                return ParseResult(success=True, data=data, method="error_fix")
            except json.JSONDecodeError:
                pass
        
        return ParseResult(success=False, error="Could not parse JSON")
    
    def _fix_common_errors(self, text: str) -> str:
        """Fix common JSON formatting errors."""
        
        # Remove trailing commas
        text = re.sub(r',\s*}', '}', text)
        text = re.sub(r',\s*]', ']', text)
        
        # Convert single-quoted keys to double-quoted (single-quoted values
        # are left alone; fixing those safely needs a real tokenizer)
        text = re.sub(r"'([^']*)'\s*:", r'"\1":', text)
        
        # Fix unquoted keys
        text = re.sub(r'(\{|,)\s*(\w+)\s*:', r'\1"\2":', text)
        
        # Replace Python-style literals with JSON equivalents; word
        # boundaries avoid mangling words like "Truest", though literals
        # inside string values can still be altered by this heuristic
        text = re.sub(r'\bTrue\b', 'true', text)
        text = re.sub(r'\bFalse\b', 'false', text)
        text = re.sub(r'\bNone\b', 'null', text)
        
        return text
    
    def parse_list(self, text: str) -> ParseResult:
        """Parse a list from text."""
        
        # Try JSON array first
        result = self.parse_json(text)
        if result.success and isinstance(result.data, list):
            return result
        
        # Parse bullet points
        items = []
        for line in text.split('\n'):
            line = line.strip()
            
            # Match various bullet formats
            match = re.match(r'^[-*•]\s*(.+)$', line)
            if match:
                items.append(match.group(1))
                continue
            
            # Match numbered lists
            match = re.match(r'^\d+[.)]\s*(.+)$', line)
            if match:
                items.append(match.group(1))
        
        if items:
            return ParseResult(success=True, data=items, method="bullet_parse")
        
        return ParseResult(success=False, error="Could not parse list")
    
    def parse_key_value(self, text: str) -> ParseResult:
        """Parse key-value pairs from text."""
        
        # Try JSON first
        result = self.parse_json(text)
        if result.success and isinstance(result.data, dict):
            return result
        
        # Parse key: value format
        data = {}
        for line in text.split('\n'):
            line = line.strip()
            
            # Match "key: value" format
            match = re.match(r'^([^:]+):\s*(.+)$', line)
            if match:
                key = match.group(1).strip().lower().replace(' ', '_')
                value = match.group(2).strip()
                data[key] = value
        
        if data:
            return ParseResult(success=True, data=data, method="key_value_parse")
        
        return ParseResult(success=False, error="Could not parse key-value pairs")

class TypedParser:
    """Parse into typed Pydantic models."""
    
    def __init__(self):
        self.robust_parser = RobustParser()
    
    def parse(
        self,
        text: str,
        model_type: Type[T]
    ) -> tuple[Optional[T], Optional[str]]:
        """Parse text into a Pydantic model."""
        
        # First, extract JSON
        result = self.robust_parser.parse_json(text)
        
        if not result.success:
            return None, result.error
        
        # Validate against model
        try:
            instance = model_type.model_validate(result.data)
            return instance, None
        except ValidationError as e:
            return None, str(e)
    
    def parse_with_coercion(
        self,
        text: str,
        model_type: Type[T]
    ) -> tuple[Optional[T], Optional[str]]:
        """Parse with type coercion for common issues."""
        
        result = self.robust_parser.parse_json(text)
        
        if not result.success:
            return None, result.error
        
        data = result.data
        
        # Get model fields
        schema = model_type.model_json_schema()
        properties = schema.get("properties", {})
        
        # Coerce types
        for field, field_schema in properties.items():
            if field not in data:
                continue
            
            value = data[field]
            field_type = field_schema.get("type")
            
            # String to number
            if field_type in ["integer", "number"] and isinstance(value, str):
                try:
                    data[field] = float(value) if field_type == "number" else int(value)
                except ValueError:
                    pass
            
            # String to boolean
            elif field_type == "boolean" and isinstance(value, str):
                data[field] = value.lower() in ["true", "yes", "1"]
            
            # Single item to array
            elif field_type == "array" and not isinstance(value, list):
                data[field] = [value]
        
        try:
            instance = model_type.model_validate(data)
            return instance, None
        except ValidationError as e:
            return None, str(e)
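
Coercion in action, reusing the ProductReview model from earlier (the response text is invented):

typed_parser = TypedParser()

messy = '{"sentiment": "positive", "score": "0.85", "summary": "Solid value for the price.", "key_points": "cheap"}'
review, err = typed_parser.parse_with_coercion(messy, ProductReview)
print(review.score, review.key_points)  # 0.85 ['cheap']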

Schema Validation
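
Parsing yields a dict; validation confirms it has the right shape. Pydantic models with field constraints catch type mismatches and out-of-range values early, and an LLM-backed repairer can feed validation errors back to the model for correction.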

from typing import Any, Optional
from pydantic import BaseModel, Field, ValidationError, field_validator
from enum import Enum
import json

class Sentiment(str, Enum):
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"

class AnalysisResult(BaseModel):
    """Validated analysis result."""
    
    sentiment: Sentiment
    confidence: float = Field(ge=0.0, le=1.0)
    summary: str = Field(min_length=10, max_length=500)
    topics: list[str] = Field(min_length=1, max_length=10)
    
    @field_validator('topics')
    @classmethod
    def validate_topics(cls, v):
        # Strip whitespace, drop empties, and dedupe while preserving order
        return list(dict.fromkeys(t.strip() for t in v if t.strip()))

class SchemaValidator:
    """Validate parsed data against schemas."""
    
    def __init__(self):
        self.validators: dict[str, type[BaseModel]] = {}
    
    def register(self, name: str, model: type[BaseModel]):
        """Register a validation schema."""
        self.validators[name] = model
    
    def validate(
        self,
        data: dict,
        schema_name: str
    ) -> tuple[Optional[BaseModel], list[str]]:
        """Validate data against a registered schema."""
        
        if schema_name not in self.validators:
            return None, [f"Unknown schema: {schema_name}"]
        
        model = self.validators[schema_name]
        
        try:
            instance = model.model_validate(data)
            return instance, []
        except ValidationError as e:
            errors = [f"{err['loc']}: {err['msg']}" for err in e.errors()]
            return None, errors
    
    def validate_partial(
        self,
        data: dict,
        schema_name: str
    ) -> tuple[dict, list[str]]:
        """Validate and return valid fields only."""
        
        if schema_name not in self.validators:
            return {}, [f"Unknown schema: {schema_name}"]
        
        model = self.validators[schema_name]
        schema = model.model_json_schema()
        properties = schema.get("properties", {})
        
        valid_data = {}
        errors = []
        
        for field, field_schema in properties.items():
            if field not in data:
                continue
            
            value = data[field]
            
            # Basic type validation
            field_type = field_schema.get("type")
            
            if field_type == "string" and isinstance(value, str):
                # Check constraints
                min_len = field_schema.get("minLength", 0)
                max_len = field_schema.get("maxLength", float('inf'))
                
                if min_len <= len(value) <= max_len:
                    valid_data[field] = value
                else:
                    errors.append(f"{field}: length out of range")
            
            elif field_type == "number" and isinstance(value, (int, float)):
                minimum = field_schema.get("minimum", float('-inf'))
                maximum = field_schema.get("maximum", float('inf'))
                
                if minimum <= value <= maximum:
                    valid_data[field] = value
                else:
                    errors.append(f"{field}: value out of range")
            
            elif field_type == "array" and isinstance(value, list):
                valid_data[field] = value
            
            elif field_type == "boolean" and isinstance(value, bool):
                valid_data[field] = value
            
            else:
                errors.append(f"{field}: type mismatch")
        
        return valid_data, errors
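
Registering a schema and exercising the validator (the payload is invented):

validator = SchemaValidator()
validator.register("analysis", AnalysisResult)

instance, errors = validator.validate(
    {
        "sentiment": "positive",
        "confidence": 0.92,
        "summary": "Customers praise the battery but dislike the screen.",
        "topics": ["battery", "screen"],
    },
    "analysis",
)
print(instance.sentiment, errors)  # Sentiment.POSITIVE []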

class OutputRepairer:
    """Repair invalid outputs using LLM."""
    
    def __init__(self, client: Any, model: str = "gpt-4o-mini"):
        self.client = client
        self.model = model
    
    async def repair(
        self,
        original_output: str,
        errors: list[str],
        schema: dict
    ) -> Optional[dict]:
        """Attempt to repair invalid output."""
        
        prompt = f"""The following output has validation errors. Please fix it.

Original output:
{original_output}

Errors:
{chr(10).join(f'- {e}' for e in errors)}

Required schema:
{json.dumps(schema, indent=2)}

Return only the corrected JSON, no explanation."""
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        
        try:
            return json.loads(response.choices[0].message.content)
        except json.JSONDecodeError:
            return None

Parsing Pipeline
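
The pipeline ties the pieces together: generate, parse with coercion, attempt a repair on failure, and retry with error feedback up to a configured limit. A streaming parser rounds out the section for outputs that arrive incrementally.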

from dataclasses import dataclass
from typing import Any, Optional, Type, TypeVar
from pydantic import BaseModel
import re
import json

T = TypeVar('T', bound=BaseModel)

@dataclass
class PipelineResult:
    """Result from parsing pipeline."""
    
    success: bool
    data: Any = None
    errors: Optional[list[str]] = None
    attempts: int = 0
    final_method: str = ""

class ParsingPipeline:
    """Multi-stage parsing pipeline with retries."""
    
    def __init__(
        self,
        client: Any,
        model: str = "gpt-4o",
        max_retries: int = 2
    ):
        self.client = client
        self.model = model
        self.max_retries = max_retries
        self.robust_parser = RobustParser()
        self.typed_parser = TypedParser()
        self.repairer = OutputRepairer(client, model)
    
    async def parse(
        self,
        prompt: str,
        response_type: Type[T],
        context: str = ""
    ) -> PipelineResult:
        """Run full parsing pipeline."""
        
        errors = []
        
        for attempt in range(self.max_retries + 1):
            # Generate output
            if attempt == 0:
                output = await self._generate(prompt, response_type)
            else:
                # Retry with error feedback
                output = await self._generate_with_feedback(
                    prompt, response_type, errors
                )
            
            # Parse
            instance, parse_error = self.typed_parser.parse_with_coercion(
                output, response_type
            )
            
            if instance:
                return PipelineResult(
                    success=True,
                    data=instance,
                    attempts=attempt + 1,
                    final_method="typed_parse"
                )
            
            errors.append(parse_error)
            
            # Try repair
            if attempt < self.max_retries:
                schema = response_type.model_json_schema()
                repaired = await self.repairer.repair(output, [parse_error], schema)
                
                if repaired:
                    instance, repair_error = self.typed_parser.parse(
                        json.dumps(repaired), response_type
                    )
                    
                    if instance:
                        return PipelineResult(
                            success=True,
                            data=instance,
                            attempts=attempt + 1,
                            final_method="repair"
                        )
                    
                    errors.append(repair_error)
        
        return PipelineResult(
            success=False,
            errors=errors,
            attempts=self.max_retries + 1
        )
    
    async def _generate(
        self,
        prompt: str,
        response_type: Type[T]
    ) -> str:
        """Generate initial output."""
        
        schema = response_type.model_json_schema()
        
        full_prompt = f"""{prompt}

Respond with JSON matching this schema:
{json.dumps(schema, indent=2)}"""
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": full_prompt}],
            response_format={"type": "json_object"}
        )
        
        return response.choices[0].message.content
    
    async def _generate_with_feedback(
        self,
        prompt: str,
        response_type: Type[T],
        previous_errors: list[str]
    ) -> str:
        """Generate with error feedback."""
        
        schema = response_type.model_json_schema()
        
        full_prompt = f"""{prompt}

Previous attempts had these errors:
{chr(10).join(f'- {e}' for e in previous_errors)}

Please respond with valid JSON matching this schema:
{json.dumps(schema, indent=2)}

Be careful to:
- Use correct types (strings, numbers, booleans)
- Include all required fields
- Follow any constraints (min/max values, lengths)"""
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": full_prompt}],
            response_format={"type": "json_object"}
        )
        
        return response.choices[0].message.content
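
Running the pipeline end to end. A sketch under the same assumptions as before (the openai package, a key in the environment, and the ProductReview model defined above):

import asyncio
from openai import AsyncOpenAI

async def main():
    pipeline = ParsingPipeline(AsyncOpenAI(), model="gpt-4o", max_retries=2)
    result = await pipeline.parse(
        "Analyze this review: the keyboard is mushy but the trackpad is excellent.",
        ProductReview
    )
    if result.success:
        print(result.data, f"(attempts: {result.attempts}, via {result.final_method})")
    else:
        print("Failed:", result.errors)

asyncio.run(main())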

class StreamingParser:
    """Parse streaming outputs incrementally."""
    
    def __init__(self):
        self.buffer = ""
        self.partial_data = {}
    
    def feed(self, chunk: str) -> Optional[dict]:
        """Feed a chunk and try to extract complete fields."""
        
        self.buffer += chunk
        
        # Try to parse complete JSON
        try:
            data = json.loads(self.buffer)
            return data
        except json.JSONDecodeError:
            pass
        
        # Try to extract complete fields
        self._extract_partial_fields()
        
        return None
    
    def _extract_partial_fields(self):
        """Extract fields that are complete."""
        
        # Look for complete "key": "value" pairs (flat string fields only;
        # nested objects and non-string values are not extracted)
        pattern = r'"(\w+)":\s*"([^"]*)"(?=\s*,|\s*})'
        
        for match in re.finditer(pattern, self.buffer):
            key = match.group(1)
            value = match.group(2)
            self.partial_data[key] = value
    
    def get_partial(self) -> dict:
        """Get partially parsed data."""
        return self.partial_data.copy()
    
    def reset(self):
        """Reset parser state."""
        self.buffer = ""
        self.partial_data = {}
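
Feeding simulated stream chunks shows partial fields becoming available before the full JSON arrives:

stream_parser = StreamingParser()
chunks = ['{"sentim', 'ent": "positive", "summ', 'ary": "Great value."}']

for chunk in chunks:
    complete = stream_parser.feed(chunk)
    print(stream_parser.get_partial())  # {}, then {'sentiment': 'positive'}, ...

print(complete)  # {'sentiment': 'positive', 'summary': 'Great value.'}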

Production Parsing Service
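
Finally, the parsers and validators can be exposed as a small FastAPI service, with endpoints for raw JSON parsing, list extraction, schema validation, and end-to-end typed generation.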

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Initialize components
robust_parser = RobustParser()
schema_validator = SchemaValidator()
parsing_pipeline = None  # wire up at startup, e.g. ParsingPipeline(AsyncOpenAI())

# Register schemas
schema_validator.register("analysis", AnalysisResult)

class ParseJSONRequest(BaseModel):
    text: str

class ParseTypedRequest(BaseModel):
    text: str
    schema_name: str

class GenerateTypedRequest(BaseModel):
    prompt: str
    schema_name: str

@app.post("/v1/parse/json")
async def parse_json(request: ParseJSONRequest):
    """Parse JSON from text."""
    
    result = robust_parser.parse_json(request.text)
    
    if not result.success:
        raise HTTPException(400, result.error)
    
    return {
        "data": result.data,
        "method": result.method
    }

@app.post("/v1/parse/list")
async def parse_list(request: ParseJSONRequest):
    """Parse list from text."""
    
    result = robust_parser.parse_list(request.text)
    
    if not result.success:
        raise HTTPException(400, result.error)
    
    return {
        "data": result.data,
        "method": result.method
    }

@app.post("/v1/parse/typed")
async def parse_typed(request: ParseTypedRequest):
    """Parse and validate against schema."""
    
    # First parse JSON
    json_result = robust_parser.parse_json(request.text)
    
    if not json_result.success:
        raise HTTPException(400, f"JSON parse error: {json_result.error}")
    
    # Validate against schema
    instance, errors = schema_validator.validate(
        json_result.data,
        request.schema_name
    )
    
    if errors:
        raise HTTPException(400, {"validation_errors": errors})
    
    return {
        "data": instance.model_dump(),
        "schema": request.schema_name
    }

@app.post("/v1/generate/typed")
async def generate_typed(request: GenerateTypedRequest):
    """Generate and parse typed output."""
    
    if request.schema_name not in schema_validator.validators:
        raise HTTPException(400, f"Unknown schema: {request.schema_name}")
    
    if parsing_pipeline is None:
        raise HTTPException(503, "Parsing pipeline not configured")
    
    model_type = schema_validator.validators[request.schema_name]
    
    result = await parsing_pipeline.parse(
        request.prompt,
        model_type
    )
    
    if not result.success:
        raise HTTPException(400, {
            "errors": result.errors,
            "attempts": result.attempts
        })
    
    return {
        "data": result.data.model_dump(),
        "attempts": result.attempts,
        "method": result.final_method
    }

@app.get("/v1/schemas")
async def list_schemas():
    """List available schemas."""
    
    schemas = {}
    for name, model in schema_validator.validators.items():
        schemas[name] = model.model_json_schema()
    
    return {"schemas": schemas}

@app.get("/health")
async def health():
    return {"status": "healthy"}

Conclusion

Reliable output parsing is essential for production LLM applications. Use structured output modes (JSON mode, function calling) when available—they dramatically reduce parsing failures. Build robust parsers that try multiple extraction strategies: direct parsing, code block extraction, brace matching, and common error fixes. Validate parsed data against Pydantic schemas to catch type mismatches and constraint violations early. Implement repair loops that feed errors back to the LLM for correction. For streaming outputs, parse incrementally to provide partial results while waiting for completion. Monitor parsing success rates and failure modes to identify prompts that need improvement. The goal is transforming unreliable text outputs into structured data your application can trust.