Introduction
The era of text-only LLMs is ending. Modern vision-language models like GPT-4V, Claude 3, and Gemini can see images, understand diagrams, read documents, and reason about visual content alongside text. This opens entirely new application categories: document understanding, visual Q&A, image-based search, accessibility tools, and creative applications. This guide covers building multi-modal AI applications using the latest APIs, from basic image understanding to complex document processing pipelines, with practical code examples for each major provider.

OpenAI Vision API
from openai import OpenAI
import base64
from pathlib import Path

client = OpenAI()

def encode_image(image_path: str) -> str:
    """Encode image to base64."""
    with open(image_path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")

def analyze_image(image_path: str, prompt: str) -> str:
    """Analyze an image with GPT-4o."""
    base64_image = encode_image(image_path)

    # Determine media type
    suffix = Path(image_path).suffix.lower()
    media_types = {".jpg": "jpeg", ".jpeg": "jpeg", ".png": "png", ".gif": "gif", ".webp": "webp"}
    media_type = media_types.get(suffix, "jpeg")

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{media_type};base64,{base64_image}",
                            "detail": "high"  # "low", "high", or "auto"
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    return response.choices[0].message.content

# Analyze from URL
def analyze_image_url(url: str, prompt: str) -> str:
    """Analyze an image from a URL."""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": url}}
                ]
            }
        ],
        max_tokens=1000
    )
    return response.choices[0].message.content

# Multiple images
def compare_images(image_paths: list[str], prompt: str) -> str:
    """Compare multiple images."""
    content = [{"type": "text", "text": prompt}]
    for path in image_paths:
        base64_image = encode_image(path)
        content.append({
            "type": "image_url",
            # Assumes JPEG input; reuse the media-type lookup above for other formats
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
        })

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=1500
    )
    return response.choices[0].message.content

# Usage examples
result = analyze_image("diagram.png", "Explain this architecture diagram in detail.")
print(result)

comparison = compare_images(
    ["before.png", "after.png"],
    "What are the differences between these two UI designs?"
)
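A note on cost: the detail parameter shown above controls how much of the image the model actually processes, and high-detail requests consume substantially more tokens than low-detail ones. A common optimization is to downscale images before encoding them. The sketch below is one way to do that with Pillow; encode_image_resized and its max_side/quality defaults are illustrative choices for this post, not part of the OpenAI API, and it reuses the client created above.

from io import BytesIO
import base64

from PIL import Image

def encode_image_resized(image_path: str, max_side: int = 2048, quality: int = 85) -> str:
    """Downscale and re-encode an image as JPEG before sending it to a vision model.

    max_side and quality are illustrative defaults, not API requirements.
    """
    img = Image.open(image_path).convert("RGB")  # JPEG has no alpha channel
    img.thumbnail((max_side, max_side))          # shrinks in place, preserving aspect ratio
    buf = BytesIO()
    img.save(buf, format="JPEG", quality=quality)
    return base64.standard_b64encode(buf.getvalue()).decode("utf-8")

# "low" detail is usually enough for coarse tasks like classification or summaries;
# reserve "high" for small text or fine-grained layout.
base64_image = encode_image_resized("diagram.png")
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Give a one-paragraph summary of this diagram."},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{base64_image}",
                    "detail": "low"
                }
            }
        ]
    }],
    max_tokens=300
)
print(response.choices[0].message.content)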
Claude Vision API
import anthropic
import base64

client = anthropic.Anthropic()

def analyze_with_claude(image_path: str, prompt: str) -> str:
    """Analyze image with Claude 3."""
    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")

    # Determine media type
    if image_path.endswith(".png"):
        media_type = "image/png"
    elif image_path.endswith(".gif"):
        media_type = "image/gif"
    elif image_path.endswith(".webp"):
        media_type = "image/webp"
    else:
        media_type = "image/jpeg"

    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ],
            }
        ],
    )
    return message.content[0].text
# Document understanding with Claude
def extract_from_document(image_path: str) -> dict:
    """Extract structured data from a document image."""
    prompt = """Analyze this document and extract:
    1. Document type (invoice, receipt, form, etc.)
    2. Key fields and values
    3. Any tables or structured data
    4. Important dates and amounts
    Return as JSON."""
    result = analyze_with_claude(image_path, prompt)

    import json
    try:
        return json.loads(result)
    except json.JSONDecodeError:
        # Fall back to the raw response if the model didn't return valid JSON
        return {"raw_text": result}

# Chart/graph analysis
def analyze_chart(image_path: str) -> str:
    """Analyze a chart or graph."""
    prompt = """Analyze this chart/graph:
    1. What type of visualization is this?
    2. What data is being presented?
    3. What are the key trends or insights?
    4. Are there any anomalies or notable patterns?
    Provide a detailed analysis."""
    return analyze_with_claude(image_path, prompt)
Google Gemini Vision
import google.generativeai as genai
from PIL import Image

genai.configure(api_key="your-api-key")

def analyze_with_gemini(image_path: str, prompt: str) -> str:
    """Analyze image with Gemini."""
    model = genai.GenerativeModel("gemini-1.5-pro")
    image = Image.open(image_path)
    response = model.generate_content([prompt, image])
    return response.text

# Video analysis with Gemini
def analyze_video(video_path: str, prompt: str) -> str:
    """Analyze video with Gemini (supports up to 1 hour)."""
    model = genai.GenerativeModel("gemini-1.5-pro")

    # Upload video
    video_file = genai.upload_file(video_path)

    # Wait for processing
    import time
    while video_file.state.name == "PROCESSING":
        time.sleep(5)
        video_file = genai.get_file(video_file.name)

    response = model.generate_content([prompt, video_file])
    return response.text

# Multi-turn visual conversation
def visual_conversation():
    """Have a multi-turn conversation about an image."""
    model = genai.GenerativeModel("gemini-1.5-pro")
    chat = model.start_chat()
    image = Image.open("architecture.png")

    # First turn with image
    response = chat.send_message([
        "Here's an architecture diagram. What components do you see?",
        image
    ])
    print(response.text)

    # Follow-up questions (image context maintained)
    response = chat.send_message("What are the potential bottlenecks?")
    print(response.text)

    response = chat.send_message("How would you improve the scalability?")
    print(response.text)
Document Processing Pipeline
from dataclasses import dataclass
import json

from openai import OpenAI

client = OpenAI()

@dataclass
class DocumentResult:
    doc_type: str
    extracted_fields: dict
    tables: list[dict]
    confidence: float
    raw_text: str

class DocumentProcessor:
    """Process documents using vision models."""

    def __init__(self, model: str = "gpt-4o"):
        self.model = model

    def process(self, image_path: str) -> DocumentResult:
        """Process a document image."""
        # Step 1: Classify document type
        doc_type = self._classify_document(image_path)

        # Step 2: Extract fields based on type
        fields = self._extract_fields(image_path, doc_type)

        # Step 3: Extract tables if present
        tables = self._extract_tables(image_path)

        return DocumentResult(
            doc_type=doc_type,
            extracted_fields=fields,
            tables=tables,
            confidence=0.9,  # placeholder; a production pipeline would estimate this
            raw_text=""
        )

    def _classify_document(self, image_path: str) -> str:
        """Classify document type."""
        # encode_image() is the helper from the OpenAI Vision API section above
        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Classify this document. Return only the type: invoice, receipt, form, contract, letter, report, or other."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            max_tokens=50
        )
        return response.choices[0].message.content.strip().lower()

    def _extract_fields(self, image_path: str, doc_type: str) -> dict:
        """Extract fields based on document type."""
        field_prompts = {
            "invoice": "Extract: invoice_number, date, due_date, vendor_name, total_amount, line_items",
            "receipt": "Extract: store_name, date, items, subtotal, tax, total",
            "form": "Extract all filled fields and their values",
            "contract": "Extract: parties, effective_date, term, key_terms"
        }
        prompt = field_prompts.get(doc_type, "Extract all key information")

        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"{prompt}. Return as JSON."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            response_format={"type": "json_object"},
            max_tokens=1000
        )
        return json.loads(response.choices[0].message.content)

    def _extract_tables(self, image_path: str) -> list[dict]:
        """Extract tables from the document."""
        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract any tables from this document. Return as JSON array of tables, each with headers and rows."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            response_format={"type": "json_object"},
            max_tokens=2000
        )
        result = json.loads(response.choices[0].message.content)
        return result.get("tables", [])

# Usage
processor = DocumentProcessor()
result = processor.process("invoice.png")
print(f"Document type: {result.doc_type}")
print(f"Fields: {result.extracted_fields}")
References
- OpenAI Vision: https://platform.openai.com/docs/guides/vision
- Claude Vision: https://docs.anthropic.com/claude/docs/vision
- Gemini Vision: https://ai.google.dev/gemini-api/docs/vision
- LlamaIndex Multi-Modal: https://docs.llamaindex.ai/en/stable/module_guides/models/multi_modal/
Conclusion
Multi-modal AI transforms what’s possible with LLM applications. Document processing that once required complex OCR pipelines now works with a single API call. Visual Q&A enables natural interaction with images and diagrams. Video understanding opens new possibilities for content analysis and accessibility. The key is choosing the right model for your use case: GPT-4o excels at general vision tasks, Claude 3 is strong at document understanding and reasoning, and Gemini handles long videos uniquely well. Start with simple image analysis, then build toward complex pipelines that combine vision with text processing. Remember that vision tokens are more expensive than text—optimize by using appropriate detail levels and preprocessing images to reasonable sizes. The multi-modal future is here, and the applications are limited only by imagination.
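To make the model-selection advice concrete, here is a tiny routing sketch that maps a task label to the provider and model used in the examples above; the task labels and the choose_model helper are assumptions made for this post, not an established API.

# Illustrative routing table: task label -> (provider, model) as used earlier in this guide
VISION_MODEL_ROUTES = {
    "general": ("openai", "gpt-4o"),                          # general-purpose image understanding
    "document": ("anthropic", "claude-3-5-sonnet-20241022"),  # document extraction and reasoning
    "video": ("google", "gemini-1.5-pro"),                    # long-video analysis
}

def choose_model(task: str) -> tuple[str, str]:
    """Return (provider, model) for a task label, defaulting to the general-purpose route."""
    return VISION_MODEL_ROUTES.get(task, VISION_MODEL_ROUTES["general"])

provider, model = choose_model("document")
print(provider, model)  # anthropic claude-3-5-sonnet-20241022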