Multi-Modal AI: Building Applications with Vision, Audio, and Text

Introduction: Multi-modal AI combines text, images, audio, and video understanding in a single model. GPT-4V, Claude 3, and Gemini can analyze images, extract text from screenshots, understand charts, and reason about visual content. This guide covers building multi-modal applications: image analysis and description, document understanding with vision, combining OCR with LLM reasoning, audio transcription and analysis, and building applications that seamlessly handle multiple input types. These patterns unlock use cases that were impossible with text-only models.

Image Analysis with GPT-4V

from openai import OpenAI
import base64
from pathlib import Path

client = OpenAI()

def encode_image(image_path: str) -> str:
    """Encode image to base64."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

def analyze_image(
    image_path: str,
    prompt: str = "Describe this image in detail.",
    model: str = "gpt-4o"
) -> str:
    """Analyze an image with GPT-4V."""
    
    # Determine media type
    suffix = Path(image_path).suffix.lower()
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp"
    }
    media_type = media_types.get(suffix, "image/jpeg")
    
    # Encode image
    base64_image = encode_image(image_path)
    
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{base64_image}",
                            "detail": "high"  # "low", "high", or "auto"
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    
    return response.choices[0].message.content

def analyze_image_url(
    image_url: str,
    prompt: str = "Describe this image."
) -> str:
    """Analyze an image from URL."""
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url}
                    }
                ]
            }
        ]
    )
    
    return response.choices[0].message.content

# Usage
description = analyze_image(
    "product_photo.jpg",
    "Describe this product image for an e-commerce listing. Include color, material, and key features."
)
print(description)

Multiple Image Comparison

def compare_images(
    image_paths: list[str],
    prompt: str = "Compare these images and describe the differences."
) -> str:
    """Compare multiple images."""
    
    content = [{"type": "text", "text": prompt}]
    
    for path in image_paths:
        base64_image = encode_image(path)
        suffix = Path(path).suffix.lower()
        media_type = "image/jpeg" if suffix in [".jpg", ".jpeg"] else "image/png"
        
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:{media_type};base64,{base64_image}"
            }
        })
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content}],
        max_tokens=1500
    )
    
    return response.choices[0].message.content

# Usage - Compare before/after images
comparison = compare_images(
    ["before.jpg", "after.jpg"],
    "Compare these before and after images. What changes were made?"
)

# Usage - Product comparison
comparison = compare_images(
    ["product_a.jpg", "product_b.jpg", "product_c.jpg"],
    "Compare these three products. Create a comparison table with features, pros, and cons."
)

Document Understanding

from pydantic import BaseModel
from typing import Optional
import json

class ExtractedDocument(BaseModel):
    document_type: str
    title: Optional[str]
    date: Optional[str]
    key_fields: dict
    tables: list[dict]
    summary: str

def extract_document_data(
    image_path: str,
    document_type: str = "auto"
) -> ExtractedDocument:
    """Extract structured data from document image."""
    
    prompt = f"""Analyze this document image and extract all relevant information.

Document type hint: {document_type}

Extract:
1. Document type (invoice, receipt, form, contract, etc.)
2. Title or header
3. Date if present
4. All key fields and their values
5. Any tables with their data
6. Brief summary

Return as JSON with schema:
{{
    "document_type": "string",
    "title": "string or null",
    "date": "string or null",
    "key_fields": {{"field_name": "value"}},
    "tables": [{{"headers": [], "rows": [[]]}}],
    "summary": "string"
}}"""
    
    base64_image = encode_image(image_path)
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                            "detail": "high"
                        }
                    }
                ]
            }
        ],
        response_format={"type": "json_object"}
    )
    
    data = json.loads(response.choices[0].message.content)
    return ExtractedDocument(**data)

def process_invoice(image_path: str) -> dict:
    """Extract invoice-specific data."""
    
    prompt = """Extract invoice data from this image.

Return JSON with:
{
    "invoice_number": "string",
    "invoice_date": "YYYY-MM-DD",
    "due_date": "YYYY-MM-DD or null",
    "vendor": {"name": "", "address": ""},
    "customer": {"name": "", "address": ""},
    "line_items": [{"description": "", "quantity": 0, "unit_price": 0, "total": 0}],
    "subtotal": 0,
    "tax": 0,
    "total": 0,
    "currency": "USD"
}"""
    
    base64_image = encode_image(image_path)
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                            "detail": "high"
                        }
                    }
                ]
            }
        ],
        response_format={"type": "json_object"}
    )
    
    return json.loads(response.choices[0].message.content)

# Usage
invoice_data = process_invoice("invoice_scan.png")
print(f"Invoice #{invoice_data['invoice_number']}")
print(f"Total: {invoice_data['currency']} {invoice_data['total']}")

Chart and Graph Analysis

def analyze_chart(
    image_path: str,
    questions: list[str] = None
) -> dict:
    """Analyze a chart or graph image."""
    
    base_prompt = """Analyze this chart/graph image.

Extract:
1. Chart type (bar, line, pie, scatter, etc.)
2. Title and axis labels
3. Data series and their values (estimate if needed)
4. Key trends and insights
5. Any notable outliers or patterns"""
    
    if questions:
        base_prompt += "\n\nAlso answer these specific questions:\n"
        for i, q in enumerate(questions, 1):
            base_prompt += f"{i}. {q}\n"
    
    base_prompt += """

Return JSON:
{
    "chart_type": "string",
    "title": "string",
    "x_axis": "string",
    "y_axis": "string",
    "data_series": [{"name": "", "values": []}],
    "insights": ["string"],
    "answers": ["string"] // if questions provided
}"""
    
    base64_image = encode_image(image_path)
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": base_prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{base64_image}",
                            "detail": "high"
                        }
                    }
                ]
            }
        ],
        response_format={"type": "json_object"}
    )
    
    return json.loads(response.choices[0].message.content)

# Usage
chart_analysis = analyze_chart(
    "sales_chart.png",
    questions=[
        "What month had the highest sales?",
        "What is the overall trend?",
        "Are there any seasonal patterns?"
    ]
)

print(f"Chart type: {chart_analysis['chart_type']}")
for insight in chart_analysis['insights']:
    print(f"- {insight}")

Audio Transcription and Analysis

def transcribe_audio(
    audio_path: str,
    language: str = None,
    prompt: str = None
) -> dict:
    """Transcribe audio using Whisper."""
    
    with open(audio_path, "rb") as audio_file:
        kwargs = {"model": "whisper-1", "file": audio_file}
        
        if language:
            kwargs["language"] = language
        if prompt:
            kwargs["prompt"] = prompt  # Helps with domain-specific terms
        
        response = client.audio.transcriptions.create(**kwargs)
    
    return {"text": response.text}

def transcribe_with_timestamps(audio_path: str) -> dict:
    """Transcribe with word-level timestamps."""
    
    with open(audio_path, "rb") as audio_file:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="verbose_json",
            timestamp_granularities=["word", "segment"]
        )
    
    return {
        "text": response.text,
        "segments": response.segments,
        "words": response.words
    }

def analyze_audio_content(audio_path: str, analysis_type: str = "summary") -> str:
    """Transcribe and analyze audio content."""
    
    # First transcribe
    transcription = transcribe_audio(audio_path)
    text = transcription["text"]
    
    # Then analyze with LLM
    prompts = {
        "summary": f"Summarize this transcript in 3-5 bullet points:\n\n{text}",
        "action_items": f"Extract action items and next steps from this meeting transcript:\n\n{text}",
        "sentiment": f"Analyze the sentiment and tone of this conversation:\n\n{text}",
        "key_topics": f"Identify the main topics discussed in this transcript:\n\n{text}"
    }
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompts.get(analysis_type, prompts["summary"])}]
    )
    
    return response.choices[0].message.content

# Usage
transcript = transcribe_audio("meeting.mp3")
print(f"Transcript: {transcript['text'][:500]}...")

action_items = analyze_audio_content("meeting.mp3", "action_items")
print(f"Action items:\n{action_items}")

Multi-Modal RAG

from dataclasses import dataclass
from typing import Union
from enum import Enum

class ContentType(str, Enum):
    TEXT = "text"
    IMAGE = "image"
    AUDIO = "audio"

@dataclass
class MultiModalDocument:
    id: str
    content_type: ContentType
    content: Union[str, bytes]
    metadata: dict
    embedding: list[float] = None

class MultiModalRAG:
    """RAG system supporting text, images, and audio."""
    
    def __init__(self):
        self.documents: list[MultiModalDocument] = []
    
    def _get_text_embedding(self, text: str) -> list[float]:
        """Get embedding for text."""
        response = client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        )
        return response.data[0].embedding
    
    def _describe_image(self, image_path: str) -> str:
        """Get text description of image for embedding."""
        return analyze_image(
            image_path,
            "Describe this image in detail for search indexing. Include all visible text, objects, colors, and context."
        )
    
    def _transcribe_audio(self, audio_path: str) -> str:
        """Get text from audio for embedding."""
        result = transcribe_audio(audio_path)
        return result["text"]
    
    def add_document(
        self,
        doc_id: str,
        content_type: ContentType,
        content_path: str,
        metadata: dict = None
    ):
        """Add a document of any type."""
        
        # Convert to text for embedding
        if content_type == ContentType.TEXT:
            with open(content_path) as f:
                text = f.read()
        elif content_type == ContentType.IMAGE:
            text = self._describe_image(content_path)
        elif content_type == ContentType.AUDIO:
            text = self._transcribe_audio(content_path)
        
        # Get embedding
        embedding = self._get_text_embedding(text[:8000])
        
        doc = MultiModalDocument(
            id=doc_id,
            content_type=content_type,
            content=text,
            metadata=metadata or {},
            embedding=embedding
        )
        
        self.documents.append(doc)
    
    def search(self, query: str, k: int = 5) -> list[MultiModalDocument]:
        """Search across all document types."""
        
        query_embedding = self._get_text_embedding(query)
        
        # Calculate similarities
        import numpy as np
        
        scored = []
        for doc in self.documents:
            similarity = np.dot(query_embedding, doc.embedding) / (
                np.linalg.norm(query_embedding) * np.linalg.norm(doc.embedding)
            )
            scored.append((doc, similarity))
        
        # Sort by similarity
        scored.sort(key=lambda x: x[1], reverse=True)
        
        return [doc for doc, _ in scored[:k]]
    
    def query(self, question: str, k: int = 3) -> str:
        """Query with multi-modal context."""
        
        # Retrieve relevant documents
        docs = self.search(question, k=k)
        
        # Build context
        context_parts = []
        for doc in docs:
            prefix = f"[{doc.content_type.value.upper()}]"
            context_parts.append(f"{prefix}: {doc.content[:2000]}")
        
        context = "\n\n".join(context_parts)
        
        # Generate answer
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": f"Answer based on this context:\n\n{context}"
                },
                {"role": "user", "content": question}
            ]
        )
        
        return response.choices[0].message.content

# Usage
rag = MultiModalRAG()

# Add different content types
rag.add_document("doc1", ContentType.TEXT, "report.txt")
rag.add_document("img1", ContentType.IMAGE, "diagram.png")
rag.add_document("audio1", ContentType.AUDIO, "meeting.mp3")

# Query across all types
answer = rag.query("What were the main points discussed about the architecture?")

Production Multi-Modal Service

from fastapi import FastAPI, UploadFile, File, Form
from pydantic import BaseModel
from typing import Optional
import tempfile
import os

app = FastAPI()

class AnalysisResponse(BaseModel):
    content_type: str
    analysis: dict
    text_content: Optional[str]

@app.post("/analyze/image", response_model=AnalysisResponse)
async def analyze_image_endpoint(
    file: UploadFile = File(...),
    prompt: str = Form(default="Describe this image in detail.")
):
    """Analyze an uploaded image."""
    
    # Save temporarily
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    
    try:
        description = analyze_image(tmp_path, prompt)
        
        return AnalysisResponse(
            content_type="image",
            analysis={"description": description},
            text_content=description
        )
    finally:
        os.unlink(tmp_path)

@app.post("/analyze/document", response_model=AnalysisResponse)
async def analyze_document_endpoint(
    file: UploadFile = File(...),
    document_type: str = Form(default="auto")
):
    """Extract data from document image."""
    
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    
    try:
        extracted = extract_document_data(tmp_path, document_type)
        
        return AnalysisResponse(
            content_type="document",
            analysis=extracted.model_dump(),
            text_content=extracted.summary
        )
    finally:
        os.unlink(tmp_path)

@app.post("/analyze/audio", response_model=AnalysisResponse)
async def analyze_audio_endpoint(
    file: UploadFile = File(...),
    analysis_type: str = Form(default="summary")
):
    """Transcribe and analyze audio."""
    
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        content = await file.read()
        tmp.write(content)
        tmp_path = tmp.name
    
    try:
        transcript = transcribe_audio(tmp_path)
        analysis = analyze_audio_content(tmp_path, analysis_type)
        
        return AnalysisResponse(
            content_type="audio",
            analysis={
                "transcript": transcript["text"],
                "analysis": analysis
            },
            text_content=transcript["text"]
        )
    finally:
        os.unlink(tmp_path)

References

GPT-4 Vision: https://platform.openai.com/docs/guides/vision
Whisper API: https://platform.openai.com/docs/guides/speech-to-text
Claude Vision: https://docs.anthropic.com/claude/docs/vision
Gemini Multi-Modal: https://ai.google.dev/docs/multimodal_concepts

Conclusion

Multi-modal AI opens new possibilities for applications that understand the world beyond text. Use vision models for document processing, product analysis, and chart understanding. Combine audio transcription with LLM analysis for meeting summaries and content extraction. Build multi-modal RAG systems that search across text, images, and audio. The key is converting all modalities to a common representation (text or embeddings) for unified processing. As multi-modal models improve, expect even tighter integration between modalities and new capabilities like video understanding and real-time audio conversation.

Discover more from Code, Cloud & Context

Subscribe to get the latest posts sent to your email.

Searching in

Code, Cloud & Context

Categories

Archives

A sample text widget