Introduction: Modern LLMs understand more than text. GPT-4V, Claude 3, and Gemini can process images alongside text, enabling applications that reason across modalities. Building multi-modal applications requires handling image encoding, managing mixed-content prompts, and designing interactions that leverage visual understanding. This guide covers practical patterns for integrating vision capabilities: encoding images for API calls, building prompts that combine text and images effectively, handling different image formats and sizes, and creating applications that extract information from visual content.
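Before building abstractions, it helps to see what a raw vision call looks like. The sketch below is a minimal example assuming the official openai Python package, an OPENAI_API_KEY in the environment, and a placeholder file invoice.png; everything that follows in this guide is structured plumbing around this same pattern.

import base64
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Inline the image as a base64 data URL
with open("invoice.png", "rb") as f:
    image_b64 = base64.standard_b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is the total amount on this invoice?"},
            {"type": "image_url", "image_url": {
                "url": f"data:image/png;base64,{image_b64}",
                "detail": "auto",
            }},
        ],
    }],
    max_tokens=512,
)
print(response.choices[0].message.content)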

Image Encoding
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union
import base64
import io

import httpx
from PIL import Image


@dataclass
class EncodedImage:
    """Encoded image ready for API."""
    data: str
    media_type: str
    source_type: str  # "base64" or "url"
    original_size: Optional[tuple[int, int]] = None
    encoded_size: int = 0


class ImageEncoder:
    """Encode images for multi-modal LLM APIs."""

    SUPPORTED_FORMATS = {"png", "jpeg", "jpg", "gif", "webp"}
    MAX_SIZE = (2048, 2048)  # Max dimensions for most APIs

    def encode_file(self, path: Union[str, Path]) -> EncodedImage:
        """Encode image from file path."""
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Image not found: {path}")
        suffix = path.suffix.lower().lstrip(".")
        if suffix not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {suffix}")

        # Read and optionally resize
        with Image.open(path) as img:
            original_size = img.size
            # Resize if too large
            if img.width > self.MAX_SIZE[0] or img.height > self.MAX_SIZE[1]:
                img.thumbnail(self.MAX_SIZE, Image.Resampling.LANCZOS)
            # Convert to bytes
            buffer = io.BytesIO()
            format_map = {"jpg": "JPEG", "jpeg": "JPEG", "png": "PNG", "gif": "GIF", "webp": "WEBP"}
            img.save(buffer, format=format_map.get(suffix, "PNG"))
            image_bytes = buffer.getvalue()

        # Encode to base64
        encoded = base64.standard_b64encode(image_bytes).decode("utf-8")
        media_type = f"image/{suffix}" if suffix != "jpg" else "image/jpeg"
        return EncodedImage(
            data=encoded,
            media_type=media_type,
            source_type="base64",
            original_size=original_size,
            encoded_size=len(encoded)
        )

    def encode_bytes(self, data: bytes, media_type: str = "image/png") -> EncodedImage:
        """Encode image from bytes."""
        # Optionally resize; re-encode only when the image was actually resized
        with Image.open(io.BytesIO(data)) as img:
            original_size = img.size
            if img.width > self.MAX_SIZE[0] or img.height > self.MAX_SIZE[1]:
                img.thumbnail(self.MAX_SIZE, Image.Resampling.LANCZOS)
                buffer = io.BytesIO()
                img.save(buffer, format=img.format or "PNG")
                data = buffer.getvalue()

        encoded = base64.standard_b64encode(data).decode("utf-8")
        return EncodedImage(
            data=encoded,
            media_type=media_type,
            source_type="base64",
            original_size=original_size,
            encoded_size=len(encoded)
        )

    async def encode_url(self, url: str, download: bool = False) -> EncodedImage:
        """Encode image from URL."""
        if not download:
            # Return URL reference (for APIs that support it)
            return EncodedImage(
                data=url,
                media_type="image/url",
                source_type="url"
            )

        # Download and encode
        async with httpx.AsyncClient() as client:
            response = await client.get(url)
            response.raise_for_status()
            content_type = response.headers.get("content-type", "image/png")
            media_type = content_type.split(";")[0]
            return self.encode_bytes(response.content, media_type)

    def resize_for_detail(
        self,
        path: Union[str, Path],
        detail: str = "auto"
    ) -> EncodedImage:
        """Resize image based on detail level."""
        path = Path(path)
        with Image.open(path) as img:
            original_size = img.size
            if detail == "low":
                # Low detail: 512x512 max
                img.thumbnail((512, 512), Image.Resampling.LANCZOS)
            elif detail == "high":
                # High detail: 2048x2048 max
                img.thumbnail((2048, 2048), Image.Resampling.LANCZOS)
            else:
                # Auto: let API decide
                img.thumbnail(self.MAX_SIZE, Image.Resampling.LANCZOS)

            buffer = io.BytesIO()
            img.save(buffer, format=img.format or "PNG")
            image_bytes = buffer.getvalue()

        encoded = base64.standard_b64encode(image_bytes).decode("utf-8")
        suffix = path.suffix.lower().lstrip(".")
        media_type = f"image/{suffix}" if suffix != "jpg" else "image/jpeg"
        return EncodedImage(
            data=encoded,
            media_type=media_type,
            source_type="base64",
            original_size=original_size,
            encoded_size=len(encoded)
        )
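A brief usage sketch of the encoder; the file paths and URL here are placeholders:

import asyncio

encoder = ImageEncoder()

# Local file: resized to at most 2048x2048, then base64-encoded
receipt = encoder.encode_file("receipt.jpg")
print(receipt.media_type, receipt.original_size, receipt.encoded_size)

# Remote image: either pass the URL through, or download and inline it as base64
inlined = asyncio.run(encoder.encode_url("https://example.com/diagram.png", download=True))

# Detail-aware resize: low detail keeps the payload (and token cost) small
small = encoder.resize_for_detail("receipt.jpg", detail="low")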
Multi-Modal Message Building
from dataclasses import dataclass, field
from enum import Enum


class ContentType(Enum):
    """Content types for multi-modal messages."""
    TEXT = "text"
    IMAGE = "image"
    IMAGE_URL = "image_url"


@dataclass
class ContentPart:
    """Part of a multi-modal message."""
    type: ContentType
    content: str
    detail: str = "auto"  # For images: low, high, auto


@dataclass
class MultiModalMessage:
    """Multi-modal message with text and images."""
    role: str
    parts: list[ContentPart] = field(default_factory=list)

    def add_text(self, text: str) -> "MultiModalMessage":
        """Add text content."""
        self.parts.append(ContentPart(type=ContentType.TEXT, content=text))
        return self

    def add_image(
        self,
        encoded: EncodedImage,
        detail: str = "auto"
    ) -> "MultiModalMessage":
        """Add encoded image."""
        if encoded.source_type == "url":
            self.parts.append(ContentPart(
                type=ContentType.IMAGE_URL,
                content=encoded.data,
                detail=detail
            ))
        else:
            self.parts.append(ContentPart(
                type=ContentType.IMAGE,
                content=f"data:{encoded.media_type};base64,{encoded.data}",
                detail=detail
            ))
        return self


class MessageBuilder:
    """Build multi-modal messages for different APIs."""

    def __init__(self):
        self.encoder = ImageEncoder()

    def build_openai_message(self, message: MultiModalMessage) -> dict:
        """Build message for OpenAI API."""
        content = []
        for part in message.parts:
            if part.type == ContentType.TEXT:
                content.append({
                    "type": "text",
                    "text": part.content
                })
            elif part.type in (ContentType.IMAGE, ContentType.IMAGE_URL):
                # OpenAI uses the same image_url shape for data URLs and remote URLs
                content.append({
                    "type": "image_url",
                    "image_url": {
                        "url": part.content,
                        "detail": part.detail
                    }
                })
        return {
            "role": message.role,
            "content": content
        }

    def build_anthropic_message(self, message: MultiModalMessage) -> dict:
        """Build message for Anthropic API."""
        content = []
        for part in message.parts:
            if part.type == ContentType.TEXT:
                content.append({
                    "type": "text",
                    "text": part.content
                })
            elif part.type == ContentType.IMAGE:
                # Extract base64 data and media type from the data URL
                if part.content.startswith("data:"):
                    media_type, data = part.content.split(";base64,")
                    media_type = media_type.replace("data:", "")
                else:
                    media_type = "image/png"
                    data = part.content
                content.append({
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": data
                    }
                })
            elif part.type == ContentType.IMAGE_URL:
                # Anthropic also accepts URL image sources directly
                content.append({
                    "type": "image",
                    "source": {
                        "type": "url",
                        "url": part.content
                    }
                })
        return {
            "role": message.role,
            "content": content
        }

    def build_gemini_message(self, message: MultiModalMessage) -> dict:
        """Build message for Google Gemini API."""
        parts = []
        for part in message.parts:
            if part.type == ContentType.TEXT:
                parts.append({"text": part.content})
            elif part.type == ContentType.IMAGE:
                if part.content.startswith("data:"):
                    media_type, data = part.content.split(";base64,")
                    media_type = media_type.replace("data:", "")
                else:
                    media_type = "image/png"
                    data = part.content
                parts.append({
                    "inline_data": {
                        "mime_type": media_type,
                        "data": data
                    }
                })
        return {
            "role": "user" if message.role == "user" else "model",
            "parts": parts
        }
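One logical message, three wire formats. A short sketch (chart.png is a placeholder path) showing how the same MultiModalMessage serializes differently per provider:

encoder = ImageEncoder()
builder = MessageBuilder()

message = (
    MultiModalMessage(role="user")
    .add_text("What trend does this chart show?")
    .add_image(encoder.encode_file("chart.png"), detail="high")
)

openai_msg = builder.build_openai_message(message)        # content list with image_url entries
anthropic_msg = builder.build_anthropic_message(message)  # content list with base64 image sources
gemini_msg = builder.build_gemini_message(message)        # parts list with inline_data entries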
Vision API Clients
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class VisionResponse:
    """Response from vision API."""
    content: str
    model: str
    usage: Optional[dict] = None


class OpenAIVisionClient:
    """OpenAI GPT-4 Vision client."""

    def __init__(self, client: Any, model: str = "gpt-4o"):
        self.client = client
        self.model = model
        self.builder = MessageBuilder()

    async def analyze(
        self,
        prompt: str,
        images: list[EncodedImage],
        system_prompt: Optional[str] = None,
        detail: str = "auto"
    ) -> VisionResponse:
        """Analyze images with text prompt."""
        # Build multi-modal message
        message = MultiModalMessage(role="user")
        message.add_text(prompt)
        for image in images:
            message.add_image(image, detail)

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append(self.builder.build_openai_message(message))

        response = await self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=4096
        )
        return VisionResponse(
            content=response.choices[0].message.content,
            model=self.model,
            usage={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )


class AnthropicVisionClient:
    """Anthropic Claude Vision client."""

    def __init__(self, client: Any, model: str = "claude-3-5-sonnet-20241022"):
        self.client = client
        self.model = model
        self.builder = MessageBuilder()

    async def analyze(
        self,
        prompt: str,
        images: list[EncodedImage],
        system_prompt: Optional[str] = None,
        detail: str = "auto"  # accepted for parity with the OpenAI client; Anthropic has no detail setting
    ) -> VisionResponse:
        """Analyze images with text prompt."""
        message = MultiModalMessage(role="user")
        message.add_text(prompt)
        for image in images:
            message.add_image(image)

        response = await self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            system=system_prompt or "",
            messages=[self.builder.build_anthropic_message(message)]
        )
        return VisionResponse(
            content=response.content[0].text,
            model=self.model,
            usage={
                "input_tokens": response.usage.input_tokens,
                "output_tokens": response.usage.output_tokens
            }
        )


class UnifiedVisionClient:
    """Unified client for multiple vision APIs."""

    def __init__(
        self,
        openai_client: Any = None,
        anthropic_client: Any = None
    ):
        self.clients = {}
        if openai_client:
            self.clients["openai"] = OpenAIVisionClient(openai_client)
        if anthropic_client:
            self.clients["anthropic"] = AnthropicVisionClient(anthropic_client)

    async def analyze(
        self,
        prompt: str,
        images: list[EncodedImage],
        provider: str = "openai",
        system_prompt: Optional[str] = None,
        **kwargs
    ) -> VisionResponse:
        """Analyze using specified provider."""
        if provider not in self.clients:
            raise ValueError(f"Provider not configured: {provider}")
        client = self.clients[provider]
        return await client.analyze(prompt, images, system_prompt, **kwargs)
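Wiring the unified client together, sketched with the async SDK clients (AsyncOpenAI and AsyncAnthropic, both reading API keys from the environment); dashboard.png is a placeholder path. Note that the wrappers above await the SDK calls, so they need the async client variants:

import asyncio
from openai import AsyncOpenAI
from anthropic import AsyncAnthropic

async def main():
    vision = UnifiedVisionClient(
        openai_client=AsyncOpenAI(),
        anthropic_client=AsyncAnthropic(),
    )
    image = ImageEncoder().encode_file("dashboard.png")
    result = await vision.analyze(
        prompt="Summarize the key metrics shown here.",
        images=[image],
        provider="anthropic",
    )
    print(result.content)
    print(result.usage)

asyncio.run(main())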
Vision Applications
from typing import Optional
from pydantic import BaseModel
import json


class ExtractedData(BaseModel):
    """Base model for extracted data."""
    pass


class DocumentInfo(ExtractedData):
    """Extracted document information."""
    title: Optional[str] = None
    date: Optional[str] = None
    author: Optional[str] = None
    summary: str
    key_points: list[str]


class ChartData(ExtractedData):
    """Extracted chart data."""
    chart_type: str
    title: Optional[str] = None
    x_axis: Optional[str] = None
    y_axis: Optional[str] = None
    data_points: list[dict]
    insights: list[str]


class ImageDescription(ExtractedData):
    """Detailed image description."""
    main_subject: str
    objects: list[str]
    colors: list[str]
    mood: Optional[str] = None
    text_content: list[str]
    description: str


class VisionExtractor:
    """Extract structured data from images."""

    def __init__(self, client: UnifiedVisionClient):
        self.client = client
        self.encoder = ImageEncoder()

    async def extract_document_info(
        self,
        image_path: str,
        provider: str = "openai"
    ) -> DocumentInfo:
        """Extract information from document image."""
        encoded = self.encoder.encode_file(image_path)
        prompt = """Analyze this document image and extract:
1. Title (if visible)
2. Date (if visible)
3. Author (if visible)
4. A brief summary of the content
5. Key points or main takeaways
Respond in JSON format:
{
"title": "...",
"date": "...",
"author": "...",
"summary": "...",
"key_points": ["...", "..."]
}"""
        response = await self.client.analyze(
            prompt=prompt,
            images=[encoded],
            provider=provider
        )
        # Parse JSON response (assumes the model returned bare JSON)
        data = json.loads(response.content)
        return DocumentInfo(**data)

    async def extract_chart_data(
        self,
        image_path: str,
        provider: str = "openai"
    ) -> ChartData:
        """Extract data from chart image."""
        encoded = self.encoder.encode_file(image_path)
        prompt = """Analyze this chart/graph and extract:
1. Chart type (bar, line, pie, scatter, etc.)
2. Title
3. X-axis label
4. Y-axis label
5. Data points (approximate values)
6. Key insights
Respond in JSON format:
{
"chart_type": "...",
"title": "...",
"x_axis": "...",
"y_axis": "...",
"data_points": [{"label": "...", "value": ...}],
"insights": ["...", "..."]
}"""
        response = await self.client.analyze(
            prompt=prompt,
            images=[encoded],
            provider=provider
        )
        data = json.loads(response.content)
        return ChartData(**data)

    async def describe_image(
        self,
        image_path: str,
        provider: str = "openai"
    ) -> ImageDescription:
        """Generate detailed image description."""
        encoded = self.encoder.encode_file(image_path)
        prompt = """Describe this image in detail:
1. Main subject
2. Objects visible
3. Dominant colors
4. Mood/atmosphere
5. Any text content
6. Overall description
Respond in JSON format:
{
"main_subject": "...",
"objects": ["...", "..."],
"colors": ["...", "..."],
"mood": "...",
"text_content": ["...", "..."],
"description": "..."
}"""
        response = await self.client.analyze(
            prompt=prompt,
            images=[encoded],
            provider=provider
        )
        data = json.loads(response.content)
        return ImageDescription(**data)

    async def compare_images(
        self,
        image_paths: list[str],
        comparison_prompt: Optional[str] = None,
        provider: str = "openai"
    ) -> str:
        """Compare multiple images."""
        encoded_images = [
            self.encoder.encode_file(path)
            for path in image_paths
        ]
        prompt = comparison_prompt or """Compare these images and describe:
1. Similarities between them
2. Differences between them
3. Which image is better quality (if applicable)
4. Any notable observations"""
        response = await self.client.analyze(
            prompt=prompt,
            images=encoded_images,
            provider=provider
        )
        return response.content

    async def ocr_extract(
        self,
        image_path: str,
        provider: str = "openai"
    ) -> str:
        """Extract all text from image."""
        encoded = self.encoder.encode_file(image_path)
        prompt = """Extract ALL text visible in this image.
Preserve the original formatting and structure as much as possible.
Include headers, paragraphs, lists, tables, etc.
Return only the extracted text, no commentary."""
        response = await self.client.analyze(
            prompt=prompt,
            images=[encoded],
            provider=provider,
            detail="high"
        )
        return response.content
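The extractors above call json.loads directly, which assumes the model returns bare JSON. Models sometimes wrap their answer in a Markdown code fence, so a small defensive parser is worth keeping nearby. This is a sketch, not part of the classes above, and parse_model_json is a name introduced here for illustration:

import json
import re

def parse_model_json(text: str) -> dict:
    """Parse JSON from a model response, tolerating an optional ```json fence."""
    text = text.strip()
    fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL)
    if fenced:
        text = fenced.group(1)
    return json.loads(text)

# Drop-in replacement inside the extractors:
# data = parse_model_json(response.content)

Another option is the provider's native structured-output support (JSON mode or tool calling), which removes the parsing step entirely.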
Production Vision Service
from fastapi import FastAPI, File, UploadFile
from pydantic import BaseModel
from typing import Optional
import os
import tempfile

app = FastAPI()

# Initialize components
encoder = ImageEncoder()
vision_client = None  # Initialize with API clients
extractor = None      # Initialize with vision client


class AnalyzeRequest(BaseModel):
    prompt: str
    image_urls: list[str] = []
    provider: str = "openai"
    detail: str = "auto"


class ExtractRequest(BaseModel):
    extraction_type: str  # document, chart, description, ocr
    provider: str = "openai"


@app.post("/v1/analyze")
async def analyze_images(request: AnalyzeRequest):
    """Analyze images with custom prompt."""
    # Encode images from URLs
    encoded_images = []
    for url in request.image_urls:
        encoded = await encoder.encode_url(url, download=True)
        encoded_images.append(encoded)

    response = await vision_client.analyze(
        prompt=request.prompt,
        images=encoded_images,
        provider=request.provider,
        detail=request.detail
    )
    return {
        "content": response.content,
        "model": response.model,
        "usage": response.usage
    }


@app.post("/v1/analyze/upload")
async def analyze_uploaded_images(
    prompt: str,
    files: list[UploadFile] = File(...),
    provider: str = "openai",
    detail: str = "auto"
):
    """Analyze uploaded images."""
    encoded_images = []
    for file in files:
        content = await file.read()
        media_type = file.content_type or "image/png"
        encoded = encoder.encode_bytes(content, media_type)
        encoded_images.append(encoded)

    response = await vision_client.analyze(
        prompt=prompt,
        images=encoded_images,
        provider=provider,
        detail=detail
    )
    return {
        "content": response.content,
        "model": response.model,
        "usage": response.usage
    }


@app.post("/v1/extract/document")
async def extract_document(file: UploadFile = File(...), provider: str = "openai"):
    """Extract document information."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    try:
        result = await extractor.extract_document_info(tmp_path, provider)
        return result.model_dump()
    finally:
        os.unlink(tmp_path)


@app.post("/v1/extract/chart")
async def extract_chart(file: UploadFile = File(...), provider: str = "openai"):
    """Extract chart data."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    try:
        result = await extractor.extract_chart_data(tmp_path, provider)
        return result.model_dump()
    finally:
        os.unlink(tmp_path)


@app.post("/v1/extract/ocr")
async def extract_text(file: UploadFile = File(...), provider: str = "openai"):
    """Extract text from image."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    try:
        text = await extractor.ocr_extract(tmp_path, provider)
        return {"text": text}
    finally:
        os.unlink(tmp_path)


@app.post("/v1/compare")
async def compare_images(
    files: list[UploadFile] = File(...),
    prompt: Optional[str] = None,
    provider: str = "openai"
):
    """Compare multiple images."""
    tmp_paths = []
    try:
        for file in files:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                content = await file.read()
                tmp.write(content)
                tmp_paths.append(tmp.name)
        result = await extractor.compare_images(tmp_paths, prompt, provider)
        return {"comparison": result}
    finally:
        for path in tmp_paths:
            os.unlink(path)


@app.get("/health")
async def health():
    return {"status": "healthy"}
References
- OpenAI Vision: https://platform.openai.com/docs/guides/vision
- Anthropic Vision: https://docs.anthropic.com/en/docs/build-with-claude/vision
- Google Gemini Vision: https://ai.google.dev/gemini-api/docs/vision
- Pillow (PIL): https://pillow.readthedocs.io/
Conclusion
Multi-modal LLMs open up powerful new application possibilities. Start with proper image encoding—resize images appropriately for your use case, using lower detail for simple tasks and higher detail when precision matters. Build messages that combine text and images effectively, placing images near the text that references them. Use provider-specific message formats since OpenAI, Anthropic, and Google have different structures. For extraction tasks, provide clear JSON schemas in your prompts to get structured output you can parse reliably. Consider cost implications: high-detail images use more tokens, so use low detail when sufficient. Build unified clients that abstract provider differences, making it easy to switch between models or use the best model for each task. The key insight is that vision capabilities work best when you’re specific about what you want—tell the model exactly what to look for and how to format its response.
