
Multi-Modal LLM Integration: Building Applications with Vision Capabilities

Introduction

Modern LLMs understand more than text. GPT-4V, Claude 3, and Gemini can process images alongside text, enabling applications that reason across modalities. Building multi-modal applications requires handling image encoding, managing mixed-content prompts, and designing interactions that leverage visual understanding. This guide covers practical patterns for integrating vision capabilities: encoding images for API calls, building prompts that combine text and images effectively, handling different image formats and sizes, and creating applications that extract information from visual content.

Figure: the multi-modal processing pipeline, from modal encoding through feature fusion to cross-modal reasoning.
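
At a high level the pieces fit together like this: encode an image, attach it to a message that mixes text and image parts, and send it through a provider-specific client. As a preview, here is a minimal sketch of that flow; ImageEncoder and UnifiedVisionClient are defined in the sections below, and the image path is a placeholder.

async def describe_photo(vision: "UnifiedVisionClient") -> str:
    # Encode the image (read, resize if needed, base64-encode)
    encoded = ImageEncoder().encode_file("photo.jpg")  # placeholder path

    # Send a mixed text-and-image request through the unified client
    response = await vision.analyze(
        prompt="What is shown in this image?",
        images=[encoded],
        provider="openai",
    )
    return response.content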

Image Encoding

from dataclasses import dataclass
from typing import Any, Optional, Union
from pathlib import Path
import base64
import httpx
from PIL import Image
import io

@dataclass
class EncodedImage:
    """Encoded image ready for API."""
    
    data: str
    media_type: str
    source_type: str  # "base64" or "url"
    original_size: Optional[tuple[int, int]] = None
    encoded_size: int = 0

class ImageEncoder:
    """Encode images for multi-modal LLM APIs."""
    
    SUPPORTED_FORMATS = {"png", "jpeg", "jpg", "gif", "webp"}
    MAX_SIZE = (2048, 2048)  # Max dimensions for most APIs
    
    def encode_file(self, path: Union[str, Path]) -> EncodedImage:
        """Encode image from file path."""
        
        path = Path(path)
        
        if not path.exists():
            raise FileNotFoundError(f"Image not found: {path}")
        
        suffix = path.suffix.lower().lstrip(".")
        if suffix not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {suffix}")
        
        # Read and optionally resize
        with Image.open(path) as img:
            original_size = img.size
            
            # Resize if too large
            if img.width > self.MAX_SIZE[0] or img.height > self.MAX_SIZE[1]:
                img.thumbnail(self.MAX_SIZE, Image.Resampling.LANCZOS)
            
            # Convert to bytes
            buffer = io.BytesIO()
            format_map = {"jpg": "JPEG", "jpeg": "JPEG", "png": "PNG", "gif": "GIF", "webp": "WEBP"}
            img.save(buffer, format=format_map.get(suffix, "PNG"))
            image_bytes = buffer.getvalue()
        
        # Encode to base64
        encoded = base64.standard_b64encode(image_bytes).decode("utf-8")
        
        media_type = f"image/{suffix}" if suffix != "jpg" else "image/jpeg"
        
        return EncodedImage(
            data=encoded,
            media_type=media_type,
            source_type="base64",
            original_size=original_size,
            encoded_size=len(encoded)
        )
    
    def encode_bytes(self, data: bytes, media_type: str = "image/png") -> EncodedImage:
        """Encode image from bytes."""
        
        # Optionally resize
        with Image.open(io.BytesIO(data)) as img:
            original_size = img.size
            
            if img.width > self.MAX_SIZE[0] or img.height > self.MAX_SIZE[1]:
                img.thumbnail(self.MAX_SIZE, Image.Resampling.LANCZOS)
                buffer = io.BytesIO()
                img.save(buffer, format=img.format or "PNG")
                data = buffer.getvalue()
        
        encoded = base64.standard_b64encode(data).decode("utf-8")
        
        return EncodedImage(
            data=encoded,
            media_type=media_type,
            source_type="base64",
            original_size=original_size,
            encoded_size=len(encoded)
        )
    
    async def encode_url(self, url: str, download: bool = False) -> EncodedImage:
        """Encode image from URL."""
        
        if not download:
            # Return URL reference (for APIs that support it)
            return EncodedImage(
                data=url,
                media_type="image/url",
                source_type="url"
            )
        
        # Download and encode
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
            response.raise_for_status()
            
            content_type = response.headers.get("content-type", "image/png")
            media_type = content_type.split(";")[0]
            
            return self.encode_bytes(response.content, media_type)
    
    def resize_for_detail(
        self,
        path: Union[str, Path],
        detail: str = "auto"
    ) -> EncodedImage:
        """Resize image based on detail level."""
        
        path = Path(path)
        
        with Image.open(path) as img:
            original_size = img.size
            
            if detail == "low":
                # Low detail: 512x512 max
                img.thumbnail((512, 512), Image.Resampling.LANCZOS)
            elif detail == "high":
                # High detail: 2048x2048 max
                img.thumbnail((2048, 2048), Image.Resampling.LANCZOS)
            else:
                # Auto: let API decide
                img.thumbnail(self.MAX_SIZE, Image.Resampling.LANCZOS)
            
            buffer = io.BytesIO()
            img.save(buffer, format=img.format or "PNG")
            image_bytes = buffer.getvalue()
        
        encoded = base64.standard_b64encode(image_bytes).decode("utf-8")
        suffix = path.suffix.lower().lstrip(".")
        media_type = f"image/{suffix}" if suffix != "jpg" else "image/jpeg"
        
        return EncodedImage(
            data=encoded,
            media_type=media_type,
            source_type="base64",
            original_size=original_size,
            encoded_size=len(encoded)
        )
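
A quick usage sketch (the paths below are placeholders):

encoder = ImageEncoder()

# Full-resolution encode, capped at MAX_SIZE
encoded = encoder.encode_file("screenshot.png")
print(encoded.media_type)     # e.g. "image/png"
print(encoded.original_size)  # e.g. (1920, 1080)
print(encoded.encoded_size)   # length of the base64 string

# Cheaper low-detail variant for coarse-grained questions
low = encoder.resize_for_detail("screenshot.png", detail="low")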

Multi-Modal Message Building

from dataclasses import dataclass, field
from typing import Any, Optional, Union
from enum import Enum

class ContentType(Enum):
    """Content types for multi-modal messages."""
    
    TEXT = "text"
    IMAGE = "image"
    IMAGE_URL = "image_url"

@dataclass
class ContentPart:
    """Part of a multi-modal message."""
    
    type: ContentType
    content: str
    detail: str = "auto"  # For images: low, high, auto

@dataclass
class MultiModalMessage:
    """Multi-modal message with text and images."""
    
    role: str
    parts: list[ContentPart] = field(default_factory=list)
    
    def add_text(self, text: str) -> "MultiModalMessage":
        """Add text content."""
        self.parts.append(ContentPart(type=ContentType.TEXT, content=text))
        return self
    
    def add_image(
        self,
        encoded: EncodedImage,
        detail: str = "auto"
    ) -> "MultiModalMessage":
        """Add encoded image."""
        
        if encoded.source_type == "url":
            self.parts.append(ContentPart(
                type=ContentType.IMAGE_URL,
                content=encoded.data,
                detail=detail
            ))
        else:
            self.parts.append(ContentPart(
                type=ContentType.IMAGE,
                content=f"data:{encoded.media_type};base64,{encoded.data}",
                detail=detail
            ))
        
        return self

class MessageBuilder:
    """Build multi-modal messages for different APIs."""
    
    def __init__(self):
        self.encoder = ImageEncoder()
    
    def build_openai_message(self, message: MultiModalMessage) -> dict:
        """Build message for OpenAI API."""
        
        content = []
        
        for part in message.parts:
            if part.type == ContentType.TEXT:
                content.append({
                    "type": "text",
                    "text": part.content
                })
            elif part.type in (ContentType.IMAGE, ContentType.IMAGE_URL):
                # OpenAI uses the same image_url structure for data URLs and plain URLs
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": part.content,
                        "detail": part.detail
                    }
                })
        
        return {
            "role": message.role,
            "content": content
        }
    
    def build_anthropic_message(self, message: MultiModalMessage) -> dict:
        """Build message for Anthropic API."""
        
        content = []
        
        for part in message.parts:
            if part.type == ContentType.TEXT:
                content.append({
                    "type": "text",
                    "text": part.content
                })
            elif part.type == ContentType.IMAGE:
                # Extract base64 data and media type
                if part.content.startswith("data:"):
                    media_type, data = part.content.split(";base64,")
                    media_type = media_type.replace("data:", "")
                else:
                    media_type = "image/png"
                    data = part.content
                
                content.append({
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": data
                    }
                })
            elif part.type == ContentType.IMAGE_URL:
                # Anthropic also accepts URL image sources directly
                content.append({
                    "type": "image",
                    "source": {
                        "type": "url",
                        "url": part.content
                    }
                })
        
        return {
            "role": message.role,
            "content": content
        }
    
    def build_gemini_message(self, message: MultiModalMessage) -> dict:
        """Build message for Google Gemini API."""
        
        parts = []
        
        for part in message.parts:
            if part.type == ContentType.TEXT:
                parts.append({"text": part.content})
            elif part.type == ContentType.IMAGE:
                if part.content.startswith("data:"):
                    media_type, data = part.content.split(";base64,")
                    media_type = media_type.replace("data:", "")
                else:
                    media_type = "image/png"
                    data = part.content
                
                parts.append({
                    "inline_data": {
                        "mime_type": media_type,
                        "data": data
                    }
                })
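            # Note: IMAGE_URL parts are skipped here; Gemini's inline_data needs
            # base64 bytes, so encode remote images with download=True first.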
        
        return {
            "role": "user" if message.role == "user" else "model",
            "parts": parts
        }
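
The same MultiModalMessage renders into each provider's wire format; a short sketch (the path is a placeholder):

encoder = ImageEncoder()
builder = MessageBuilder()

message = MultiModalMessage(role="user")
message.add_text("Summarize the text in this screenshot.")
message.add_image(encoder.encode_file("screenshot.png"), detail="high")

openai_msg = builder.build_openai_message(message)        # {"role": ..., "content": [...]}
anthropic_msg = builder.build_anthropic_message(message)  # base64 source blocks
gemini_msg = builder.build_gemini_message(message)        # {"role": ..., "parts": [...]}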

Vision API Clients

from dataclasses import dataclass
from typing import Any, Optional
import asyncio

@dataclass
class VisionResponse:
    """Response from vision API."""
    
    content: str
    model: str
    usage: Optional[dict] = None

class OpenAIVisionClient:
    """OpenAI GPT-4 Vision client."""
    
    def __init__(self, client: Any, model: str = "gpt-4o"):
        self.client = client
        self.model = model
        self.builder = MessageBuilder()
    
    async def analyze(
        self,
        prompt: str,
        images: list[EncodedImage],
        system_prompt: Optional[str] = None,
        detail: str = "auto"
    ) -> VisionResponse:
        """Analyze images with text prompt."""
        
        # Build multi-modal message
        message = MultiModalMessage(role="user")
        message.add_text(prompt)
        
        for image in images:
            message.add_image(image, detail)
        
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        
        messages.append(self.builder.build_openai_message(message))
        
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=4096
        )
        
        return VisionResponse(
            content=response.choices[0].message.content,
            model=self.model,
            usage={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )

class AnthropicVisionClient:
    """Anthropic Claude Vision client."""
    
    def __init__(self, client: Any, model: str = "claude-3-5-sonnet-20241022"):
        self.client = client
        self.model = model
        self.builder = MessageBuilder()
    
    async def analyze(
        self,
        prompt: str,
        images: list[EncodedImage],
        system_prompt: Optional[str] = None,
        detail: str = "auto"  # accepted for interface parity; the Anthropic API has no detail setting
    ) -> VisionResponse:
        """Analyze images with text prompt."""
        
        message = MultiModalMessage(role="user")
        message.add_text(prompt)
        
        for image in images:
            message.add_image(image)
        
        response = await self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            system=system_prompt or "",
            messages=[self.builder.build_anthropic_message(message)]
        )
        
        return VisionResponse(
            content=response.content[0].text,
            model=self.model,
            usage={
                "input_tokens": response.usage.input_tokens,
                "output_tokens": response.usage.output_tokens
            }
        )

class UnifiedVisionClient:
    """Unified client for multiple vision APIs."""
    
    def __init__(
        self,
        openai_client: Any = None,
        anthropic_client: Any = None
    ):
        self.clients = {}
        
        if openai_client:
            self.clients["openai"] = OpenAIVisionClient(openai_client)
        
        if anthropic_client:
            self.clients["anthropic"] = AnthropicVisionClient(anthropic_client)
    
    async def analyze(
        self,
        prompt: str,
        images: list[EncodedImage],
        provider: str = "openai",
        system_prompt: str = None,
        **kwargs
    ) -> VisionResponse:
        """Analyze using specified provider."""
        
        if provider not in self.clients:
            raise ValueError(f"Provider not configured: {provider}")
        
        client = self.clients[provider]
        return await client.analyze(prompt, images, system_prompt, **kwargs)
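
Wiring the unified client to the official SDKs might look like the following sketch. It assumes the openai and anthropic Python packages are installed, OPENAI_API_KEY and ANTHROPIC_API_KEY are set in the environment, and the image path is a placeholder.

import asyncio
from openai import AsyncOpenAI
from anthropic import AsyncAnthropic

async def main() -> None:
    vision = UnifiedVisionClient(
        openai_client=AsyncOpenAI(),        # reads OPENAI_API_KEY
        anthropic_client=AsyncAnthropic(),  # reads ANTHROPIC_API_KEY
    )

    encoded = ImageEncoder().encode_file("diagram.png")
    response = await vision.analyze(
        prompt="Describe this diagram.",
        images=[encoded],
        provider="anthropic",
    )
    print(response.content)

asyncio.run(main())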

Vision Applications

from dataclasses import dataclass
from typing import Any, Optional
from pydantic import BaseModel
import json

class ExtractedData(BaseModel):
    """Base model for extracted data."""
    pass

class DocumentInfo(ExtractedData):
    """Extracted document information."""
    
    title: Optional[str] = None
    date: Optional[str] = None
    author: Optional[str] = None
    summary: str
    key_points: list[str]

class ChartData(ExtractedData):
    """Extracted chart data."""
    
    chart_type: str
    title: Optional[str] = None
    x_axis: Optional[str] = None
    y_axis: Optional[str] = None
    data_points: list[dict]
    insights: list[str]

class ImageDescription(ExtractedData):
    """Detailed image description."""
    
    main_subject: str
    objects: list[str]
    colors: list[str]
    mood: Optional[str] = None
    text_content: list[str]
    description: str

class VisionExtractor:
    """Extract structured data from images."""
    
    def __init__(self, client: UnifiedVisionClient):
        self.client = client
        self.encoder = ImageEncoder()
    
    async def extract_document_info(
        self,
        image_path: str,
        provider: str = "openai"
    ) -> DocumentInfo:
        """Extract information from document image."""
        
        encoded = self.encoder.encode_file(image_path)
        
        prompt = """Analyze this document image and extract:
1. Title (if visible)
2. Date (if visible)
3. Author (if visible)
4. A brief summary of the content
5. Key points or main takeaways

Respond in JSON format:
{
    "title": "...",
    "date": "...",
    "author": "...",
    "summary": "...",
    "key_points": ["...", "..."]
}"""
        
        response = await self.client.analyze(
            prompt=prompt,
            images=[encoded],
            provider=provider
        )
        
        # Parse JSON response
        data = json.loads(response.content)
        return DocumentInfo(**data)
    
    async def extract_chart_data(
        self,
        image_path: str,
        provider: str = "openai"
    ) -> ChartData:
        """Extract data from chart image."""
        
        encoded = self.encoder.encode_file(image_path)
        
        prompt = """Analyze this chart/graph and extract:
1. Chart type (bar, line, pie, scatter, etc.)
2. Title
3. X-axis label
4. Y-axis label
5. Data points (approximate values)
6. Key insights

Respond in JSON format:
{
    "chart_type": "...",
    "title": "...",
    "x_axis": "...",
    "y_axis": "...",
    "data_points": [{"label": "...", "value": ...}],
    "insights": ["...", "..."]
}"""
        
        response = await self.client.analyze(
            prompt=prompt,
            images=[encoded],
            provider=provider
        )
        
        data = json.loads(response.content)
        return ChartData(**data)
    
    async def describe_image(
        self,
        image_path: str,
        provider: str = "openai"
    ) -> ImageDescription:
        """Generate detailed image description."""
        
        encoded = self.encoder.encode_file(image_path)
        
        prompt = """Describe this image in detail:
1. Main subject
2. Objects visible
3. Dominant colors
4. Mood/atmosphere
5. Any text content
6. Overall description

Respond in JSON format:
{
    "main_subject": "...",
    "objects": ["...", "..."],
    "colors": ["...", "..."],
    "mood": "...",
    "text_content": ["...", "..."],
    "description": "..."
}"""
        
        response = await self.client.analyze(
            prompt=prompt,
            images=[encoded],
            provider=provider
        )
        
        data = json.loads(response.content)
        return ImageDescription(**data)
    
    async def compare_images(
        self,
        image_paths: list[str],
        comparison_prompt: Optional[str] = None,
        provider: str = "openai"
    ) -> str:
        """Compare multiple images."""
        
        encoded_images = [
            self.encoder.encode_file(path)
            for path in image_paths
        ]
        
        prompt = comparison_prompt or """Compare these images and describe:
1. Similarities between them
2. Differences between them
3. Which image is better quality (if applicable)
4. Any notable observations"""
        
        response = await self.client.analyze(
            prompt=prompt,
            images=encoded_images,
            provider=provider
        )
        
        return response.content
    
    async def ocr_extract(
        self,
        image_path: str,
        provider: str = "openai"
    ) -> str:
        """Extract all text from image."""
        
        encoded = self.encoder.encode_file(image_path)
        
        prompt = """Extract ALL text visible in this image.
Preserve the original formatting and structure as much as possible.
Include headers, paragraphs, lists, tables, etc.
Return only the extracted text, no commentary."""
        
        response = await self.client.analyze(
            prompt=prompt,
            images=[encoded],
            provider=provider,
            detail="high"
        )
        
        return response.content
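
A usage sketch for the extractor (the file names are placeholders). Note that the extract_* methods call json.loads directly, which assumes the model returns bare JSON; in practice you may want to strip Markdown code fences or use a provider's structured-output mode before parsing.

async def run_extraction(vision: UnifiedVisionClient) -> None:
    extractor = VisionExtractor(vision)

    doc = await extractor.extract_document_info("contract_scan.png")
    print(doc.summary, doc.key_points)

    chart = await extractor.extract_chart_data("q3_revenue.png")
    print(chart.chart_type, chart.data_points)

    text = await extractor.ocr_extract("receipt.jpg")
    print(text)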

Production Vision Service

from fastapi import FastAPI, HTTPException, UploadFile, File
from pydantic import BaseModel
from typing import Optional
import tempfile
import os

app = FastAPI()

# Initialize components
encoder = ImageEncoder()
vision_client = None  # Initialize with API clients
extractor = None  # Initialize with vision client

class AnalyzeRequest(BaseModel):
    prompt: str
    image_urls: list[str] = []
    provider: str = "openai"
    detail: str = "auto"

class ExtractRequest(BaseModel):
    extraction_type: str  # document, chart, description, ocr
    provider: str = "openai"

@app.post("/v1/analyze")
async def analyze_images(request: AnalyzeRequest):
    """Analyze images with custom prompt."""
    
    # Encode images from URLs
    encoded_images = []
    for url in request.image_urls:
        encoded = await encoder.encode_url(url, download=True)
        encoded_images.append(encoded)
    
    response = await vision_client.analyze(
        prompt=request.prompt,
        images=encoded_images,
        provider=request.provider,
        detail=request.detail
    )
    
    return {
        "content": response.content,
        "model": response.model,
        "usage": response.usage
    }

@app.post("/v1/analyze/upload")
async def analyze_uploaded_images(
    prompt: str,
    files: list[UploadFile] = File(...),
    provider: str = "openai",
    detail: str = "auto"
):
    """Analyze uploaded images."""
    
    encoded_images = []
    
    for file in files:
        content = await file.read()
        media_type = file.content_type or "image/png"
        encoded = encoder.encode_bytes(content, media_type)
        encoded_images.append(encoded)
    
    response = await vision_client.analyze(
        prompt=prompt,
        images=encoded_images,
        provider=provider,
        detail=detail
    )
    
    return {
        "content": response.content,
        "model": response.model,
        "usage": response.usage
    }

@app.post("/v1/extract/document")
async def extract_document(file: UploadFile = File(...), provider: str = "openai"):
    """Extract document information."""
    
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    
    try:
        result = await extractor.extract_document_info(tmp_path, provider)
        return result.model_dump()
    finally:
        os.unlink(tmp_path)

@app.post("/v1/extract/chart")
async def extract_chart(file: UploadFile = File(...), provider: str = "openai"):
    """Extract chart data."""
    
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    
    try:
        result = await extractor.extract_chart_data(tmp_path, provider)
        return result.model_dump()
    finally:
        os.unlink(tmp_path)

@app.post("/v1/extract/ocr")
async def extract_text(file: UploadFile = File(...), provider: str = "openai"):
    """Extract text from image."""
    
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    
    try:
        text = await extractor.ocr_extract(tmp_path, provider)
        return {"text": text}
    finally:
        os.unlink(tmp_path)

@app.post("/v1/compare")
async def compare_images(
    files: list[UploadFile] = File(...),
    prompt: Optional[str] = None,
    provider: str = "openai"
):
    """Compare multiple images."""
    
    tmp_paths = []
    
    try:
        for file in files:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                content = await file.read()
                tmp.write(content)
                tmp_paths.append(tmp.name)
        
        result = await extractor.compare_images(tmp_paths, prompt, provider)
        return {"comparison": result}
    finally:
        for path in tmp_paths:
            os.unlink(path)

@app.get("/health")
async def health():
    return {"status": "healthy"}
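
The module-level vision_client and extractor above are left as None; one way to wire them up is a startup hook. A sketch, assuming the openai and anthropic SDKs and API keys in the environment:

from openai import AsyncOpenAI
from anthropic import AsyncAnthropic

@app.on_event("startup")
async def init_clients() -> None:
    """Initialize the shared vision client and extractor at startup."""
    global vision_client, extractor
    vision_client = UnifiedVisionClient(
        openai_client=AsyncOpenAI(),
        anthropic_client=AsyncAnthropic(),
    )
    extractor = VisionExtractor(vision_client)

Newer FastAPI releases prefer the lifespan API, but the startup event shown here still works. Run the service with uvicorn (for example, uvicorn main:app if the module is main.py) and the endpoints above become available.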

Conclusion

Multi-modal LLMs open up powerful new application possibilities. Start with proper image encoding—resize images appropriately for your use case, using lower detail for simple tasks and higher detail when precision matters. Build messages that combine text and images effectively, placing images near the text that references them. Use provider-specific message formats since OpenAI, Anthropic, and Google have different structures. For extraction tasks, provide clear JSON schemas in your prompts to get structured output you can parse reliably. Consider cost implications: high-detail images use more tokens, so use low detail when sufficient. Build unified clients that abstract provider differences, making it easy to switch between models or use the best model for each task. The key insight is that vision capabilities work best when you’re specific about what you want—tell the model exactly what to look for and how to format its response.