Introduction
Documents are everywhere—PDFs, Word files, scanned images, spreadsheets. Extracting structured information from unstructured documents is one of the most valuable LLM applications. This guide covers building document processing pipelines: extracting text from various formats, chunking strategies for long documents, processing with LLMs for extraction and summarization, and handling edge cases like tables, images, and multi-column layouts. These patterns apply to invoice processing, contract analysis, research paper summarization, and any workflow involving document understanding.

Text Extraction
# pip install pypdf python-docx openpyxl
from pathlib import Path
from typing import Union
import io

def extract_from_pdf(file_path: Union[str, Path]) -> str:
    """Extract text from PDF."""
    from pypdf import PdfReader

    reader = PdfReader(file_path)
    text_parts = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            text_parts.append(text)
    return "\n\n".join(text_parts)
def extract_from_docx(file_path: Union[str, Path]) -> str:
    """Extract text from Word document."""
    from docx import Document

    doc = Document(file_path)
    text_parts = []
    for para in doc.paragraphs:
        if para.text.strip():
            text_parts.append(para.text)
    # Also extract from tables
    for table in doc.tables:
        for row in table.rows:
            row_text = " | ".join(cell.text for cell in row.cells)
            if row_text.strip():
                text_parts.append(row_text)
    return "\n\n".join(text_parts)
def extract_from_xlsx(file_path: Union[str, Path]) -> str:
    """Extract text from Excel spreadsheet."""
    from openpyxl import load_workbook

    wb = load_workbook(file_path, data_only=True)
    text_parts = []
    for sheet_name in wb.sheetnames:
        sheet = wb[sheet_name]
        text_parts.append(f"Sheet: {sheet_name}")
        for row in sheet.iter_rows(values_only=True):
            # Check against None so legitimate 0 / False cell values are kept
            row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
            if row_text.strip(" |"):
                text_parts.append(row_text)
    return "\n\n".join(text_parts)
def extract_text(file_path: Union[str, Path]) -> str:
    """Extract text from any supported document type."""
    path = Path(file_path)
    suffix = path.suffix.lower()
    extractors = {
        ".pdf": extract_from_pdf,
        ".docx": extract_from_docx,
        ".xlsx": extract_from_xlsx,
        ".txt": lambda p: Path(p).read_text(),
        ".md": lambda p: Path(p).read_text(),
    }
    if suffix not in extractors:
        raise ValueError(f"Unsupported file type: {suffix}")
    return extractors[suffix](path)

# Usage
text = extract_text("contract.pdf")
print(f"Extracted {len(text)} characters")
Smart Chunking
from dataclasses import dataclass
from typing import Iterator, Optional
import re

@dataclass
class Chunk:
    text: str
    index: int
    metadata: dict

class DocumentChunker:
    """Split documents into processable chunks."""

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separators: Optional[list[str]] = None
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators or ["\n\n", "\n", ". ", " "]

    def chunk_by_size(self, text: str) -> Iterator[Chunk]:
        """Simple size-based chunking with overlap."""
        start = 0
        index = 0
        while start < len(text):
            end = start + self.chunk_size
            # Try to break at a natural boundary
            if end < len(text):
                for sep in self.separators:
                    last_sep = text.rfind(sep, start, end)
                    if last_sep > start:
                        end = last_sep + len(sep)
                        break
            chunk_text = text[start:end].strip()
            if chunk_text:
                yield Chunk(
                    text=chunk_text,
                    index=index,
                    metadata={"start": start, "end": end}
                )
                index += 1
            if end >= len(text):
                # Stop here, otherwise the overlap step would re-emit the tail
                break
            # Always move forward, even when the overlap exceeds the chunk just emitted
            start = max(end - self.chunk_overlap, start + 1)
    def chunk_by_sections(self, text: str) -> Iterator[Chunk]:
        """Chunk by document sections (headers)."""
        # Split by markdown-style headers
        pattern = r'(^#{1,3}\s+.+$)'
        parts = re.split(pattern, text, flags=re.MULTILINE)
        current_header = ""
        current_content = []
        index = 0
        for part in parts:
            if re.match(r'^#{1,3}\s+', part):
                # This is a header
                if current_content:
                    yield Chunk(
                        text="\n".join(current_content),
                        index=index,
                        metadata={"header": current_header}
                    )
                    index += 1
                current_header = part.strip()
                current_content = [current_header]
            else:
                current_content.append(part.strip())
        # Don't forget the last section
        if current_content:
            yield Chunk(
                text="\n".join(current_content),
                index=index,
                metadata={"header": current_header}
            )

    def chunk_semantic(self, text: str) -> Iterator[Chunk]:
        """Chunk by semantic similarity (paragraph grouping)."""
        paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
        current_chunk = []
        current_size = 0
        index = 0
        for para in paragraphs:
            para_size = len(para)
            if current_size + para_size > self.chunk_size and current_chunk:
                yield Chunk(
                    text="\n\n".join(current_chunk),
                    index=index,
                    metadata={"paragraphs": len(current_chunk)}
                )
                index += 1
                # Keep last paragraph for context
                current_chunk = [current_chunk[-1]] if current_chunk else []
                current_size = len(current_chunk[0]) if current_chunk else 0
            current_chunk.append(para)
            current_size += para_size
        if current_chunk:
            yield Chunk(
                text="\n\n".join(current_chunk),
                index=index,
                metadata={"paragraphs": len(current_chunk)}
            )
# Usage
chunker = DocumentChunker(chunk_size=1500, chunk_overlap=200)
text = extract_text("long_document.pdf")
chunks = list(chunker.chunk_by_size(text))
print(f"Split into {len(chunks)} chunks")
LLM Document Processing
from openai import OpenAI
from pydantic import BaseModel
from typing import Any, Callable, Optional
import json

client = OpenAI()

class ExtractedEntity(BaseModel):
    name: str
    type: str
    value: str
    confidence: float

class DocumentSummary(BaseModel):
    title: str
    summary: str
    key_points: list[str]
    entities: list[ExtractedEntity]
def summarize_document(text: str, max_length: int = 500) -> str:
    """Summarize a document."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"Summarize documents concisely in under {max_length} words."
            },
            {
                "role": "user",
                "content": f"Summarize this document:\n\n{text[:10000]}"
            }
        ]
    )
    return response.choices[0].message.content
def extract_entities(text: str, entity_types: list[str]) -> list[ExtractedEntity]:
    """Extract specific entities from text."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""Extract entities of these types: {', '.join(entity_types)}.
Return JSON of the form: {{"entities": [{{"name": "...", "type": "...", "value": "...", "confidence": 0.0-1.0}}]}}"""
            },
            {
                "role": "user",
                "content": text[:8000]
            }
        ],
        response_format={"type": "json_object"}
    )
    data = json.loads(response.choices[0].message.content)
    # JSON mode returns an object; fall back gracefully if the model returns a bare list
    entities = data.get("entities", []) if isinstance(data, dict) else data
    return [ExtractedEntity(**e) for e in entities]
def process_document_chunks(
    chunks: list[Chunk],
    processor: Callable[[str], Any],
    combine: Optional[Callable[[list[dict]], Any]] = None
) -> Any:
    """Process document chunks and optionally combine results."""
    results = []
    for chunk in chunks:
        result = processor(chunk.text)
        results.append({
            "chunk_index": chunk.index,
            "result": result,
            "metadata": chunk.metadata
        })
    if combine:
        return combine(results)
    return results
# Usage
text = extract_text("research_paper.pdf")
chunks = list(DocumentChunker(chunk_size=2000).chunk_by_size(text))
# Summarize each chunk
chunk_summaries = process_document_chunks(
    chunks,
    processor=lambda t: summarize_document(t, max_length=100)
)
# Combine into final summary
all_summaries = "\n\n".join(r["result"] for r in chunk_summaries)
final_summary = summarize_document(all_summaries, max_length=300)
print(final_summary)
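The usage above joins the per-chunk summaries by hand; the combine parameter of process_document_chunks can fold that reduce step into the same call. A small sketch using only functions already defined here:

def combine_summaries(results: list[dict]) -> str:
    """Reduce step: merge per-chunk summaries into a single final summary."""
    joined = "\n\n".join(r["result"] for r in results)
    return summarize_document(joined, max_length=300)

final_summary = process_document_chunks(
    chunks,
    processor=lambda t: summarize_document(t, max_length=100),
    combine=combine_summaries
)
print(final_summary)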
Invoice Processing
from pydantic import BaseModel
from typing import Optional
from datetime import date

class LineItem(BaseModel):
    description: str
    quantity: float
    unit_price: float
    total: float

class Invoice(BaseModel):
    invoice_number: str
    invoice_date: Optional[str]
    due_date: Optional[str]
    vendor_name: str
    vendor_address: Optional[str]
    customer_name: Optional[str]
    line_items: list[LineItem]
    subtotal: float
    tax: Optional[float]
    total: float
    currency: str = "USD"
def extract_invoice(text: str) -> Invoice:
    """Extract structured invoice data from text."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """Extract invoice information into structured JSON.
Include: invoice_number, invoice_date, due_date, vendor_name, vendor_address,
customer_name, line_items (description, quantity, unit_price, total),
subtotal, tax, total, currency.
Use null for missing fields. Dates in YYYY-MM-DD format."""
            },
            {
                "role": "user",
                "content": f"Extract invoice data:\n\n{text}"
            }
        ],
        response_format={"type": "json_object"}
    )
    data = json.loads(response.choices[0].message.content)
    return Invoice(**data)
def process_invoice_batch(file_paths: list[str]) -> list[dict]:
    """Process multiple invoices."""
    results = []
    for path in file_paths:
        try:
            text = extract_text(path)
            invoice = extract_invoice(text)
            results.append({
                "file": path,
                "status": "success",
                "invoice": invoice.model_dump()
            })
        except Exception as e:
            results.append({
                "file": path,
                "status": "error",
                "error": str(e)
            })
    return results
# Usage
invoice_files = ["invoice1.pdf", "invoice2.pdf", "invoice3.pdf"]
results = process_invoice_batch(invoice_files)

# Export to CSV
import csv

with open("invoices.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=[
        "file", "invoice_number", "vendor_name", "total", "currency"
    ])
    writer.writeheader()
    for r in results:
        if r["status"] == "success":
            inv = r["invoice"]
            writer.writerow({
                "file": r["file"],
                "invoice_number": inv["invoice_number"],
                "vendor_name": inv["vendor_name"],
                "total": inv["total"],
                "currency": inv["currency"]
            })
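LLM extraction can misread numbers without raising an error, so it is worth cross-checking the arithmetic before trusting the output. A hedged sketch that flags inconsistent invoices for human review (the 0.01 tolerance is an arbitrary assumption; adjust it for your currency and rounding rules):

def validate_invoice(invoice: Invoice, tolerance: float = 0.01) -> list[str]:
    """Return a list of consistency problems; an empty list means the numbers add up."""
    problems = []
    line_sum = sum(item.total for item in invoice.line_items)
    if abs(line_sum - invoice.subtotal) > tolerance:
        problems.append(f"line items sum to {line_sum:.2f} but subtotal is {invoice.subtotal:.2f}")
    expected_total = invoice.subtotal + (invoice.tax or 0)
    if abs(expected_total - invoice.total) > tolerance:
        problems.append(f"subtotal + tax is {expected_total:.2f} but total is {invoice.total:.2f}")
    return problems

for r in results:
    if r["status"] == "success":
        issues = validate_invoice(Invoice(**r["invoice"]))
        if issues:
            print(f"{r['file']} needs review: {'; '.join(issues)}")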
Contract Analysis
from pydantic import BaseModel
from typing import Optional
from enum import Enum

class RiskLevel(str, Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"

class ContractClause(BaseModel):
    clause_type: str
    text: str
    risk_level: RiskLevel
    summary: str
    recommendations: list[str]

class ContractAnalysis(BaseModel):
    contract_type: str
    parties: list[str]
    effective_date: Optional[str]
    termination_date: Optional[str]
    key_terms: list[str]
    obligations: list[str]
    risky_clauses: list[ContractClause]
    overall_risk: RiskLevel
    summary: str
def analyze_contract(text: str) -> ContractAnalysis:
    """Analyze a contract for key terms and risks."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """You are a legal contract analyst. Analyze contracts for:
1. Contract type and parties
2. Key dates (effective, termination)
3. Key terms and obligations
4. Risky clauses with risk levels (low/medium/high)
5. Overall risk assessment
6. Summary and recommendations
Return structured JSON matching the ContractAnalysis schema."""
            },
            {
                "role": "user",
                "content": f"Analyze this contract:\n\n{text[:15000]}"
            }
        ],
        response_format={"type": "json_object"}
    )
    data = json.loads(response.choices[0].message.content)
    return ContractAnalysis(**data)
def compare_contracts(contract1: str, contract2: str) -> dict:
    """Compare two contracts for differences."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": """Compare two contracts and identify:
1. Key differences in terms
2. Added/removed clauses
3. Changed obligations
4. Risk implications of differences
Return the comparison as JSON."""
            },
            {
                "role": "user",
                "content": f"""Contract 1:
{contract1[:7000]}

Contract 2:
{contract2[:7000]}

Compare these contracts:"""
            }
        ],
        response_format={"type": "json_object"}
    )
    return json.loads(response.choices[0].message.content)
# Usage
contract_text = extract_text("service_agreement.pdf")
analysis = analyze_contract(contract_text)

print(f"Contract Type: {analysis.contract_type}")
print(f"Parties: {', '.join(analysis.parties)}")
print(f"Overall Risk: {analysis.overall_risk}")

for clause in analysis.risky_clauses:
    if clause.risk_level == RiskLevel.HIGH:
        print(f"\nHIGH RISK: {clause.clause_type}")
        print(f"  {clause.summary}")
        for rec in clause.recommendations:
            print(f"  - {rec}")
References
- pypdf: https://pypdf.readthedocs.io/
- python-docx: https://python-docx.readthedocs.io/
- Unstructured: https://unstructured.io/
- LangChain Document Loaders: https://python.langchain.com/docs/modules/data_connection/document_loaders/
Conclusion
Document processing with LLMs unlocks value from unstructured data at scale. Start with reliable text extraction—pypdf for PDFs, python-docx for Word files. Implement smart chunking that respects document structure rather than arbitrary character limits. Use structured output (JSON mode) for reliable entity extraction. Build specialized processors for common document types like invoices and contracts. For production systems, add error handling, validation, and human review workflows for high-stakes decisions. The combination of traditional document parsing and LLM understanding creates powerful automation for document-heavy workflows that previously required manual processing.