Multi-Modal AI: Building Applications with Vision-Language Models

Introduction

The era of text-only LLMs is ending. Modern vision-language models like GPT-4V, Claude 3, and Gemini can see images, understand diagrams, read documents, and reason about visual content alongside text. This opens entirely new application categories: document understanding, visual Q&A, image-based search, accessibility tools, and creative applications. This guide covers building multi-modal AI applications using the latest APIs, from basic image understanding to complex document processing pipelines, with practical code examples for each major provider.


OpenAI Vision API

from openai import OpenAI
import base64
from pathlib import Path

client = OpenAI()

def encode_image(image_path: str) -> str:
    """Encode image to base64."""
    with open(image_path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")

def analyze_image(image_path: str, prompt: str) -> str:
    """Analyze an image with GPT-4V."""
    
    base64_image = encode_image(image_path)
    
    # Determine media type
    suffix = Path(image_path).suffix.lower()
    media_types = {".jpg": "jpeg", ".jpeg": "jpeg", ".png": "png", ".gif": "gif", ".webp": "webp"}
    media_type = media_types.get(suffix, "jpeg")
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{media_type};base64,{base64_image}",
                            "detail": "high"  # "low", "high", or "auto"
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    
    return response.choices[0].message.content

# Analyze from URL
def analyze_image_url(url: str, prompt: str) -> str:
    """Analyze an image from URL."""
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": url}}
                ]
            }
        ],
        max_tokens=1000
    )
    
    return response.choices[0].message.content

# Multiple images
def compare_images(image_paths: list[str], prompt: str) -> str:
    """Compare multiple images."""
    
    content = [{"type": "text", "text": prompt}]
    
    for path in image_paths:
        base64_image = encode_image(path)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
        })
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=1500
    )
    
    return response.choices[0].message.content

# Usage examples
result = analyze_image("diagram.png", "Explain this architecture diagram in detail.")
print(result)

comparison = compare_images(
    ["before.png", "after.png"],
    "What are the differences between these two UI designs?"
)

Claude Vision API

import anthropic
import base64

client = anthropic.Anthropic()

def analyze_with_claude(image_path: str, prompt: str) -> str:
    """Analyze image with Claude 3."""
    
    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")
    
    # Determine media type
    if image_path.endswith(".png"):
        media_type = "image/png"
    elif image_path.endswith(".gif"):
        media_type = "image/gif"
    elif image_path.endswith(".webp"):
        media_type = "image/webp"
    else:
        media_type = "image/jpeg"
    
    message = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt
                    }
                ],
            }
        ],
    )
    
    return message.content[0].text

# Document understanding with Claude
def extract_from_document(image_path: str) -> dict:
    """Extract structured data from document image."""
    
    prompt = """Analyze this document and extract:
1. Document type (invoice, receipt, form, etc.)
2. Key fields and values
3. Any tables or structured data
4. Important dates and amounts

Return as JSON."""
    
    result = analyze_with_claude(image_path, prompt)
    
    import json
    try:
        return json.loads(result)
    except json.JSONDecodeError:
        # The model may wrap the JSON in prose or code fences; fall back to raw text
        return {"raw_text": result}

# Chart/graph analysis
def analyze_chart(image_path: str) -> str:
    """Analyze a chart or graph."""
    
    prompt = """Analyze this chart/graph:
1. What type of visualization is this?
2. What data is being presented?
3. What are the key trends or insights?
4. Are there any anomalies or notable patterns?

Provide a detailed analysis."""
    
    return analyze_with_claude(image_path, prompt)
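
As with the OpenAI helpers, these functions can be called directly; the file names below are placeholders:

# Usage examples
invoice = extract_from_document("invoice_scan.png")
print(invoice)

chart_summary = analyze_chart("quarterly_sales.png")
print(chart_summary)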

Google Gemini Vision

import google.generativeai as genai
from PIL import Image

genai.configure(api_key="your-api-key")

def analyze_with_gemini(image_path: str, prompt: str) -> str:
    """Analyze image with Gemini."""
    
    model = genai.GenerativeModel("gemini-1.5-pro")
    
    image = Image.open(image_path)
    
    response = model.generate_content([prompt, image])
    
    return response.text

# Video analysis with Gemini
def analyze_video(video_path: str, prompt: str) -> str:
    """Analyze video with Gemini (supports up to 1 hour)."""
    
    model = genai.GenerativeModel("gemini-1.5-pro")
    
    # Upload video
    video_file = genai.upload_file(video_path)
    
    # Wait for the uploaded file to finish processing
    import time
    while video_file.state.name == "PROCESSING":
        time.sleep(5)
        video_file = genai.get_file(video_file.name)
    
    if video_file.state.name == "FAILED":
        raise ValueError(f"Video processing failed for {video_file.name}")
    
    response = model.generate_content([prompt, video_file])
    
    return response.text

# Multi-turn visual conversation
def visual_conversation():
    """Have a multi-turn conversation about an image."""
    
    model = genai.GenerativeModel("gemini-1.5-pro")
    chat = model.start_chat()
    
    image = Image.open("architecture.png")
    
    # First turn with image
    response = chat.send_message([
        "Here's an architecture diagram. What components do you see?",
        image
    ])
    print(response.text)
    
    # Follow-up questions (image context maintained)
    response = chat.send_message("What are the potential bottlenecks?")
    print(response.text)
    
    response = chat.send_message("How would you improve the scalability?")
    print(response.text)
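
A quick usage sketch for the helpers above; the file names and prompts are placeholders:

# Usage examples
description = analyze_with_gemini("floor_plan.png", "Describe the layout shown in this image.")
print(description)

video_summary = analyze_video("product_demo.mp4", "Summarize the key steps demonstrated in this video.")
print(video_summary)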

Document Processing Pipeline

from dataclasses import dataclass
import base64
import json
from openai import OpenAI

client = OpenAI()

def encode_image(image_path: str) -> str:
    """Encode image to base64 (same helper as in the OpenAI section above)."""
    with open(image_path, "rb") as f:
        return base64.standard_b64encode(f.read()).decode("utf-8")

@dataclass
class DocumentResult:
    doc_type: str
    extracted_fields: dict
    tables: list[dict]
    confidence: float
    raw_text: str

class DocumentProcessor:
    """Process documents using vision models."""
    
    def __init__(self, model: str = "gpt-4o"):
        self.model = model
    
    def process(self, image_path: str) -> DocumentResult:
        """Process a document image."""
        
        # Step 1: Classify document type
        doc_type = self._classify_document(image_path)
        
        # Step 2: Extract fields based on type
        fields = self._extract_fields(image_path, doc_type)
        
        # Step 3: Extract tables if present
        tables = self._extract_tables(image_path)
        
        return DocumentResult(
            doc_type=doc_type,
            extracted_fields=fields,
            tables=tables,
            confidence=0.9,  # placeholder; the API does not return calibrated confidence scores
            raw_text=""      # left empty here; could be filled by a separate full-text extraction prompt
        )
    
    def _classify_document(self, image_path: str) -> str:
        """Classify document type."""
        
        base64_image = encode_image(image_path)
        
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Classify this document. Return only the type: invoice, receipt, form, contract, letter, report, or other."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            max_tokens=50
        )
        
        return response.choices[0].message.content.strip().lower()
    
    def _extract_fields(self, image_path: str, doc_type: str) -> dict:
        """Extract fields based on document type."""
        
        field_prompts = {
            "invoice": "Extract: invoice_number, date, due_date, vendor_name, total_amount, line_items",
            "receipt": "Extract: store_name, date, items, subtotal, tax, total",
            "form": "Extract all filled fields and their values",
            "contract": "Extract: parties, effective_date, term, key_terms"
        }
        
        prompt = field_prompts.get(doc_type, "Extract all key information")
        
        base64_image = encode_image(image_path)
        
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": f"{prompt}. Return as JSON."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            response_format={"type": "json_object"},
            max_tokens=1000
        )
        
        return json.loads(response.choices[0].message.content)
    
    def _extract_tables(self, image_path: str) -> list[dict]:
        """Extract tables from document."""
        
        base64_image = encode_image(image_path)
        
        response = client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Extract any tables from this document. Return as JSON array of tables, each with headers and rows."
                        },
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }
            ],
            response_format={"type": "json_object"},
            max_tokens=2000
        )
        
        result = json.loads(response.choices[0].message.content)
        return result.get("tables", [])

# Usage
processor = DocumentProcessor()
result = processor.process("invoice.png")
print(f"Document type: {result.doc_type}")
print(f"Fields: {result.extracted_fields}")

Conclusion

Multi-modal AI transforms what’s possible with LLM applications. Document processing that once required complex OCR pipelines now works with a single API call. Visual Q&A enables natural interaction with images and diagrams. Video understanding opens new possibilities for content analysis and accessibility. The key is choosing the right model for your use case: GPT-4o excels at general vision tasks, Claude 3 is strong at document understanding and reasoning, and Gemini handles long videos uniquely well. Start with simple image analysis, then build toward complex pipelines that combine vision with text processing. Remember that vision tokens are more expensive than text—optimize by using appropriate detail levels and preprocessing images to reasonable sizes. The multi-modal future is here, and the applications are limited only by imagination.
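
For the preprocessing point above, here is a minimal sketch using Pillow (the 1024-pixel cap and JPEG quality are illustrative values, not provider recommendations):

from io import BytesIO
import base64
from PIL import Image

def preprocess_image(image_path: str, max_side: int = 1024, quality: int = 85) -> str:
    """Downscale and re-encode an image, then return it as base64."""
    image = Image.open(image_path)
    image.thumbnail((max_side, max_side))  # shrinks in place, preserving aspect ratio
    if image.mode != "RGB":
        image = image.convert("RGB")  # JPEG has no alpha channel
    buffer = BytesIO()
    image.save(buffer, format="JPEG", quality=quality)
    return base64.standard_b64encode(buffer.getvalue()).decode("utf-8")

# Drop-in replacement for encode_image() in the earlier examples (media type becomes image/jpeg)
base64_image = preprocess_image("large_scan.png")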

