Introduction
Multimodal AI processes and generates content across multiple modalities—text, images, audio, and video. This capability enables applications that were previously impossible: describing images, generating images from text, transcribing and understanding audio, and creating unified experiences that combine all these modalities. This guide covers the practical aspects of building multimodal applications: vision-language models for image understanding, text-to-image generation, speech recognition and synthesis, video analysis, and multimodal RAG systems. Whether you're building an assistant that can see and hear, a content generation pipeline, or an accessibility tool, these patterns will help you harness the power of multimodal AI effectively.

Vision-Language Models
from dataclasses import dataclass, field
from typing import Any, Optional, Union
from abc import ABC, abstractmethod
import base64
from pathlib import Path

@dataclass
class ImageInput:
    """Image input for vision models."""
    source: str  # URL, file path, or base64
    source_type: str = "auto"  # "url", "file", "base64", "auto"

    def to_base64(self) -> str:
        """Convert to base64."""
        if self.source_type == "base64":
            return self.source
        # Check for a local file before assuming base64 so relative paths work
        if self.source_type == "file" or (
            self.source_type == "auto" and Path(self.source).exists()
        ):
            with open(self.source, "rb") as f:
                return base64.b64encode(f.read()).decode()
        if self.source_type == "url" or self.source.startswith("http"):
            # URL - fetch and encode
            import requests
            response = requests.get(self.source)
            response.raise_for_status()
            return base64.b64encode(response.content).decode()
        # Fall back to treating the string as base64 data
        return self.source

    def to_url(self) -> str:
        """Get as URL or data URL."""
        if self.source.startswith("http"):
            return self.source
        # Convert to data URL (assumes JPEG; adjust the media type for other formats)
        b64 = self.to_base64()
        return f"data:image/jpeg;base64,{b64}"

@dataclass
class VisionResponse:
    """Response from vision model."""
    text: str
    model: str
    usage: dict = field(default_factory=dict)

class VisionModel(ABC):
    """Abstract vision-language model."""

    @abstractmethod
    async def analyze(
        self,
        image: ImageInput,
        prompt: str,
        **kwargs
    ) -> VisionResponse:
        """Analyze image with prompt."""
        pass

class OpenAIVision(VisionModel):
    """OpenAI GPT-4 Vision."""

    def __init__(self, api_key: str, model: str = "gpt-4-vision-preview"):
        from openai import AsyncOpenAI
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model

    async def analyze(
        self,
        image: ImageInput,
        prompt: str,
        max_tokens: int = 1000,
        detail: str = "auto"
    ) -> VisionResponse:
        """Analyze with GPT-4V."""
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image.to_url(),
                                "detail": detail
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ],
            max_tokens=max_tokens
        )
        return VisionResponse(
            text=response.choices[0].message.content,
            model=self.model,
            usage={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens
            }
        )

class ClaudeVision(VisionModel):
    """Anthropic Claude Vision."""

    def __init__(self, api_key: str, model: str = "claude-3-opus-20240229"):
        from anthropic import AsyncAnthropic
        self.client = AsyncAnthropic(api_key=api_key)
        self.model = model

    async def analyze(
        self,
        image: ImageInput,
        prompt: str,
        max_tokens: int = 1000
    ) -> VisionResponse:
        """Analyze with Claude."""
        response = await self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": image.to_base64()
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )
        return VisionResponse(
            text=response.content[0].text,
            model=self.model,
            usage={
                "input_tokens": response.usage.input_tokens,
                "output_tokens": response.usage.output_tokens
            }
        )

class GeminiVision(VisionModel):
    """Google Gemini Vision."""

    def __init__(self, api_key: str, model: str = "gemini-pro-vision"):
        import google.generativeai as genai
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model)

    async def analyze(
        self,
        image: ImageInput,
        prompt: str,
        **kwargs
    ) -> VisionResponse:
        """Analyze with Gemini."""
        from PIL import Image
        import io

        # Load image
        if image.source.startswith("http"):
            import requests
            response = requests.get(image.source)
            img = Image.open(io.BytesIO(response.content))
        else:
            img = Image.open(image.source)

        response = self.model.generate_content([prompt, img])
        return VisionResponse(
            text=response.text,
            model="gemini-pro-vision"
        )

class MultiImageAnalyzer:
    """Analyze multiple images together."""

    def __init__(self, vision_model: VisionModel):
        self.model = vision_model

    async def compare(
        self,
        images: list[ImageInput],
        prompt: str = "Compare these images and describe the differences."
    ) -> VisionResponse:
        """Compare multiple images."""
        # For models that support multiple images, a model-specific
        # implementation would pass them in a single request.
        # Fallback: analyze each and combine.
        analyses = []
        for i, img in enumerate(images):
            result = await self.model.analyze(
                img,
                f"Describe image {i+1} in detail."
            )
            analyses.append(f"Image {i+1}: {result.text}")

        combined = "\n\n".join(analyses)
        return VisionResponse(
            text=combined,
            model="multi-image"
        )

    async def extract_text(self, image: ImageInput) -> str:
        """Extract text from image (OCR)."""
        result = await self.model.analyze(
            image,
            "Extract all text visible in this image. Return only the extracted text, preserving formatting where possible."
        )
        return result.text

    async def describe_for_accessibility(self, image: ImageInput) -> str:
        """Generate accessibility description."""
        result = await self.model.analyze(
            image,
            """Describe this image for someone who cannot see it. Include:
1. Main subject and action
2. Important details and context
3. Text visible in the image
4. Colors and composition if relevant
Keep the description concise but informative."""
        )
        return result.text
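To make the abstraction concrete, here is a minimal usage sketch of the vision classes above. The OPENAI_API_KEY environment variable and the image URL are placeholders, not values from the original post.

import asyncio
import os

async def main():
    # Hypothetical values: substitute your own key and image
    vision = OpenAIVision(api_key=os.environ["OPENAI_API_KEY"])
    image = ImageInput(source="https://example.com/photo.jpg")

    caption = await vision.analyze(image, "Describe this image in one sentence.")
    print(caption.text)

    analyzer = MultiImageAnalyzer(vision)
    alt_text = await analyzer.describe_for_accessibility(image)
    print(alt_text)

asyncio.run(main())

Because every provider sits behind the same VisionModel interface, swapping in ClaudeVision or GeminiVision only changes the constructor line.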
Text-to-Image Generation
from dataclasses import dataclass, field
from typing import Any, Optional
from abc import ABC, abstractmethod

@dataclass
class GeneratedImage:
    """Generated image result."""
    url: Optional[str] = None
    base64: Optional[str] = None
    revised_prompt: Optional[str] = None
    model: Optional[str] = None

class ImageGenerator(ABC):
    """Abstract image generator."""

    @abstractmethod
    async def generate(
        self,
        prompt: str,
        **kwargs
    ) -> GeneratedImage:
        """Generate image from prompt."""
        pass

class DALLEGenerator(ImageGenerator):
    """OpenAI DALL-E generator."""

    def __init__(self, api_key: str, model: str = "dall-e-3"):
        from openai import AsyncOpenAI
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model

    async def generate(
        self,
        prompt: str,
        size: str = "1024x1024",
        quality: str = "standard",
        style: str = "vivid",
        n: int = 1
    ) -> GeneratedImage:
        """Generate with DALL-E."""
        response = await self.client.images.generate(
            model=self.model,
            prompt=prompt,
            size=size,
            quality=quality,
            style=style,
            n=n,
            response_format="url"
        )
        return GeneratedImage(
            url=response.data[0].url,
            revised_prompt=response.data[0].revised_prompt,
            model=self.model
        )

class StableDiffusionGenerator(ImageGenerator):
    """Stable Diffusion generator."""

    def __init__(self, model_id: str = "stabilityai/stable-diffusion-xl-base-1.0"):
        from diffusers import StableDiffusionXLPipeline
        import torch

        self.pipe = StableDiffusionXLPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            use_safetensors=True
        )
        self.pipe.to("cuda")

    async def generate(
        self,
        prompt: str,
        negative_prompt: Optional[str] = None,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        width: int = 1024,
        height: int = 1024
    ) -> GeneratedImage:
        """Generate with Stable Diffusion."""
        import io
        import base64

        image = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            width=width,
            height=height
        ).images[0]

        # Convert to base64
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        b64 = base64.b64encode(buffer.getvalue()).decode()

        return GeneratedImage(
            base64=b64,
            model="stable-diffusion-xl"
        )

class ImageEditor:
    """Edit images with AI."""

    def __init__(self, api_key: str):
        from openai import AsyncOpenAI
        self.client = AsyncOpenAI(api_key=api_key)

    async def edit(
        self,
        image: ImageInput,
        mask: ImageInput,
        prompt: str,
        size: str = "1024x1024"
    ) -> GeneratedImage:
        """Edit image with mask."""
        # Use context managers so the file handles are closed after the request
        with open(image.source, "rb") as image_file, open(mask.source, "rb") as mask_file:
            response = await self.client.images.edit(
                model="dall-e-2",
                image=image_file,
                mask=mask_file,
                prompt=prompt,
                size=size,
                n=1
            )
        return GeneratedImage(
            url=response.data[0].url,
            model="dall-e-2-edit"
        )

    async def variation(
        self,
        image: ImageInput,
        n: int = 1,
        size: str = "1024x1024"
    ) -> list[GeneratedImage]:
        """Generate variations of image."""
        with open(image.source, "rb") as image_file:
            response = await self.client.images.create_variation(
                image=image_file,
                n=n,
                size=size
            )
        return [
            GeneratedImage(url=data.url, model="dall-e-2-variation")
            for data in response.data
        ]

class PromptEnhancer:
    """Enhance prompts for better image generation."""

    def __init__(self, llm_client: Any):
        self.llm = llm_client

    async def enhance(self, prompt: str, style: Optional[str] = None) -> str:
        """Enhance prompt for image generation."""
        enhancement_prompt = f"""Enhance this image generation prompt to be more detailed and effective.
Add specific details about composition, lighting, style, and quality.

Original prompt: {prompt}
{"Style preference: " + style if style else ""}

Enhanced prompt (just the prompt, no explanation):"""
        response = await self.llm.complete(enhancement_prompt)
        return response.content.strip()

    async def generate_negative_prompt(self, prompt: str) -> str:
        """Generate negative prompt."""
        neg_prompt = f"""Generate a negative prompt for this image generation prompt.
List things to avoid: artifacts, quality issues, unwanted elements.

Prompt: {prompt}

Negative prompt (comma-separated list):"""
        response = await self.llm.complete(neg_prompt)
        return response.content.strip()
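A short sketch of wiring the enhancer and generator together. The llm_client with an async complete() method is the same assumed client used throughout this post, and the API key, sizes, and topic are placeholders.

import os

async def generate_poster(llm_client, topic: str) -> str:
    # Hypothetical helper: enhance the raw prompt, then generate with DALL-E 3
    enhancer = PromptEnhancer(llm_client)
    generator = DALLEGenerator(api_key=os.environ["OPENAI_API_KEY"])

    enhanced = await enhancer.enhance(f"A poster about {topic}", style="flat illustration")
    image = await generator.generate(enhanced, size="1024x1792", quality="hd")
    return image.url

Running the raw prompt through PromptEnhancer first tends to matter more for Stable Diffusion, where there is no built-in prompt rewriting, than for DALL-E 3, which revises prompts server-side.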
Speech and Audio Processing
from dataclasses import dataclass, field
from typing import Any, Optional, AsyncIterator
from abc import ABC, abstractmethod
from pathlib import Path

@dataclass
class TranscriptionResult:
    """Speech transcription result."""
    text: str
    language: Optional[str] = None
    duration_seconds: float = 0
    segments: list[dict] = field(default_factory=list)
    words: list[dict] = field(default_factory=list)

@dataclass
class SpeechResult:
    """Text-to-speech result."""
    audio_data: bytes
    format: str = "mp3"
    duration_seconds: float = 0

class SpeechToText(ABC):
    """Abstract speech-to-text."""

    @abstractmethod
    async def transcribe(
        self,
        audio_path: str,
        **kwargs
    ) -> TranscriptionResult:
        """Transcribe audio to text."""
        pass

class WhisperSTT(SpeechToText):
    """OpenAI Whisper speech-to-text."""

    def __init__(self, api_key: str, model: str = "whisper-1"):
        from openai import AsyncOpenAI
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model

    async def transcribe(
        self,
        audio_path: str,
        language: Optional[str] = None,
        response_format: str = "verbose_json",
        timestamp_granularities: Optional[list[str]] = None
    ) -> TranscriptionResult:
        """Transcribe with Whisper."""
        with open(audio_path, "rb") as audio_file:
            response = await self.client.audio.transcriptions.create(
                model=self.model,
                file=audio_file,
                language=language,
                response_format=response_format,
                timestamp_granularities=timestamp_granularities or ["segment"]
            )
        return TranscriptionResult(
            text=response.text,
            language=response.language,
            duration_seconds=response.duration,
            segments=[
                {
                    "start": s.start,
                    "end": s.end,
                    "text": s.text
                }
                for s in response.segments
            ] if hasattr(response, 'segments') else [],
            words=[
                {
                    "start": w.start,
                    "end": w.end,
                    "word": w.word
                }
                for w in response.words
            ] if hasattr(response, 'words') else []
        )

class LocalWhisperSTT(SpeechToText):
    """Local Whisper model."""

    def __init__(self, model_size: str = "base"):
        import whisper
        self.model = whisper.load_model(model_size)

    async def transcribe(
        self,
        audio_path: str,
        language: Optional[str] = None,
        **kwargs
    ) -> TranscriptionResult:
        """Transcribe with local Whisper."""
        # Note: whisper's transcribe() is blocking and compute-bound;
        # in an async service, consider running it in a thread executor.
        result = self.model.transcribe(
            audio_path,
            language=language
        )
        return TranscriptionResult(
            text=result["text"],
            language=result["language"],
            segments=[
                {
                    "start": s["start"],
                    "end": s["end"],
                    "text": s["text"]
                }
                for s in result["segments"]
            ]
        )

class TextToSpeech(ABC):
    """Abstract text-to-speech."""

    @abstractmethod
    async def synthesize(
        self,
        text: str,
        **kwargs
    ) -> SpeechResult:
        """Synthesize speech from text."""
        pass

class OpenAITTS(TextToSpeech):
    """OpenAI text-to-speech."""

    def __init__(self, api_key: str, model: str = "tts-1"):
        from openai import AsyncOpenAI
        self.client = AsyncOpenAI(api_key=api_key)
        self.model = model

    async def synthesize(
        self,
        text: str,
        voice: str = "alloy",
        speed: float = 1.0,
        response_format: str = "mp3"
    ) -> SpeechResult:
        """Synthesize with OpenAI TTS."""
        response = await self.client.audio.speech.create(
            model=self.model,
            voice=voice,
            input=text,
            speed=speed,
            response_format=response_format
        )
        return SpeechResult(
            audio_data=response.content,
            format=response_format
        )

class ElevenLabsTTS(TextToSpeech):
    """ElevenLabs text-to-speech."""

    def __init__(self, api_key: str):
        from elevenlabs import AsyncElevenLabs
        self.client = AsyncElevenLabs(api_key=api_key)

    async def synthesize(
        self,
        text: str,
        voice_id: str = "21m00Tcm4TlvDq8ikWAM",
        model_id: str = "eleven_monolingual_v1"
    ) -> SpeechResult:
        """Synthesize with ElevenLabs."""
        audio = await self.client.generate(
            text=text,
            voice=voice_id,
            model=model_id
        )
        return SpeechResult(
            audio_data=audio,
            format="mp3"
        )

class RealtimeSpeech:
    """Real-time speech processing."""

    def __init__(self, stt: SpeechToText, tts: TextToSpeech):
        self.stt = stt
        self.tts = tts

    async def process_stream(
        self,
        audio_stream: AsyncIterator[bytes],
        processor: callable
    ) -> AsyncIterator[bytes]:
        """Process audio stream in real-time."""
        import os
        import tempfile
        import wave

        buffer = b""
        chunk_duration = 5  # seconds
        sample_rate = 16000  # assumes 16 kHz, 16-bit mono PCM input

        async for chunk in audio_stream:
            buffer += chunk
            # Process when buffer is large enough
            if len(buffer) > chunk_duration * sample_rate * 2:
                # Write a proper WAV file (header + PCM frames), not raw bytes
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                    temp_path = f.name
                with wave.open(temp_path, "wb") as wav_file:
                    wav_file.setnchannels(1)
                    wav_file.setsampwidth(2)
                    wav_file.setframerate(sample_rate)
                    wav_file.writeframes(buffer)
                try:
                    # Transcribe
                    result = await self.stt.transcribe(temp_path)
                    # Process text
                    response_text = await processor(result.text)
                    # Synthesize response
                    speech = await self.tts.synthesize(response_text)
                    yield speech.audio_data
                finally:
                    os.unlink(temp_path)
                buffer = b""
Video Analysis
from dataclasses import dataclass, field
from typing import Any, Optional
from pathlib import Path
import asyncio

@dataclass
class VideoFrame:
    """A frame from video."""
    index: int
    timestamp_seconds: float
    image: Any  # PIL Image or numpy array

@dataclass
class VideoAnalysis:
    """Video analysis result."""
    summary: str
    frame_descriptions: list[dict] = field(default_factory=list)
    detected_objects: list[str] = field(default_factory=list)
    transcript: Optional[str] = None
    duration_seconds: float = 0

class VideoAnalyzer:
    """Analyze video content."""

    def __init__(
        self,
        vision_model: VisionModel,
        stt: SpeechToText = None
    ):
        self.vision = vision_model
        self.stt = stt

    async def analyze(
        self,
        video_path: str,
        sample_rate: int = 1,  # frames per second
        max_frames: int = 10
    ) -> VideoAnalysis:
        """Analyze video content."""
        # Extract frames
        frames = self._extract_frames(video_path, sample_rate, max_frames)

        # Analyze frames
        frame_descriptions = []
        for frame in frames:
            # Convert to ImageInput
            import io
            import base64
            buffer = io.BytesIO()
            frame.image.save(buffer, format="JPEG")
            b64 = base64.b64encode(buffer.getvalue()).decode()
            image_input = ImageInput(source=b64, source_type="base64")

            result = await self.vision.analyze(
                image_input,
                "Describe what's happening in this video frame."
            )
            frame_descriptions.append({
                "timestamp": frame.timestamp_seconds,
                "description": result.text
            })

        # Extract audio transcript if STT available
        transcript = None
        if self.stt:
            audio_path = self._extract_audio(video_path)
            if audio_path:
                transcript_result = await self.stt.transcribe(audio_path)
                transcript = transcript_result.text

        # Generate summary
        summary = await self._generate_summary(frame_descriptions, transcript)

        return VideoAnalysis(
            summary=summary,
            frame_descriptions=frame_descriptions,
            transcript=transcript,
            duration_seconds=frames[-1].timestamp_seconds if frames else 0
        )

    def _extract_frames(
        self,
        video_path: str,
        sample_rate: int,
        max_frames: int
    ) -> list[VideoFrame]:
        """Extract frames from video."""
        import cv2
        from PIL import Image

        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS) or 30  # fall back if FPS is unavailable
        # Guard against a zero interval when sample_rate exceeds the video FPS
        frame_interval = max(1, int(fps / sample_rate))

        frames = []
        frame_idx = 0
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx % frame_interval == 0:
                # Convert BGR to RGB
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                pil_image = Image.fromarray(rgb_frame)
                frames.append(VideoFrame(
                    index=len(frames),
                    timestamp_seconds=frame_idx / fps,
                    image=pil_image
                ))
            frame_idx += 1

        cap.release()
        return frames

    def _extract_audio(self, video_path: str) -> Optional[str]:
        """Extract audio from video."""
        import os
        import subprocess
        import tempfile

        # mkstemp avoids the race condition of the deprecated tempfile.mktemp
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            subprocess.run([
                "ffmpeg", "-y", "-i", video_path,
                "-vn", "-acodec", "pcm_s16le",
                "-ar", "16000", "-ac", "1",
                output_path
            ], check=True, capture_output=True)
            return output_path
        except subprocess.CalledProcessError:
            return None

    async def _generate_summary(
        self,
        frame_descriptions: list[dict],
        transcript: Optional[str]
    ) -> str:
        """Generate video summary."""
        descriptions = "\n".join(
            f"[{d['timestamp']:.1f}s] {d['description']}"
            for d in frame_descriptions
        )
        prompt = f"""Summarize this video based on the frame descriptions and transcript.

Frame descriptions:
{descriptions}

{"Transcript: " + transcript if transcript else "No audio transcript available."}

Summary:"""
        # Placeholder: an LLM call with `prompt` would produce the real summary
        return f"Video summary based on {len(frame_descriptions)} frames."

class VideoQA:
    """Question answering over video."""

    def __init__(self, analyzer: VideoAnalyzer, llm_client: Any):
        self.analyzer = analyzer
        self.llm = llm_client
        self.cached_analyses: dict[str, VideoAnalysis] = {}

    async def ask(self, video_path: str, question: str) -> str:
        """Answer question about video."""
        # Get or create analysis
        if video_path not in self.cached_analyses:
            self.cached_analyses[video_path] = await self.analyzer.analyze(video_path)
        analysis = self.cached_analyses[video_path]

        # Build context
        context = f"""Video Summary: {analysis.summary}

Frame Details:
{chr(10).join(f"[{d['timestamp']:.1f}s] {d['description']}" for d in analysis.frame_descriptions)}

{"Transcript: " + analysis.transcript if analysis.transcript else ""}"""

        prompt = f"""Based on this video analysis, answer the question.

{context}

Question: {question}

Answer:"""
        response = await self.llm.complete(prompt)
        return response.content
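A short usage sketch tying the pieces together. The llm_client is the same assumed async client used elsewhere in this post, and the video path and question are placeholders.

import asyncio
import os

async def ask_about_video(llm_client, video_path: str, question: str) -> str:
    # Hypothetical wiring: vision + speech-to-text feed the video QA helper
    vision = OpenAIVision(api_key=os.environ["OPENAI_API_KEY"])
    stt = WhisperSTT(api_key=os.environ["OPENAI_API_KEY"])
    analyzer = VideoAnalyzer(vision, stt=stt)
    qa = VideoQA(analyzer, llm_client)
    return await qa.ask(video_path, question)

# answer = asyncio.run(ask_about_video(my_llm, "demo.mp4", "What product is shown?"))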
Multimodal RAG
from dataclasses import dataclass, field
from typing import Any, Optional, Union
from enum import Enum

class ModalityType(Enum):
    """Content modality types."""
    TEXT = "text"
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"

@dataclass
class MultimodalDocument:
    """Document with multiple modalities."""
    id: str
    modality: ModalityType
    content: Any  # Text string, image path, audio path, etc.
    embedding: Any = None
    metadata: dict = field(default_factory=dict)
    text_description: Optional[str] = None  # Text description for non-text content

class MultimodalEmbedder:
    """Embed multiple modalities."""

    def __init__(
        self,
        text_model: Any,
        vision_model: VisionModel = None,
        clip_model: Any = None
    ):
        self.text_model = text_model
        self.vision_model = vision_model
        self.clip_model = clip_model

    async def embed(self, document: MultimodalDocument) -> Any:
        """Embed document based on modality."""
        if document.modality == ModalityType.TEXT:
            return self.text_model.embed(document.content)

        elif document.modality == ModalityType.IMAGE:
            if self.clip_model:
                # Use CLIP for image embedding
                return self._embed_image_clip(document.content)
            elif self.vision_model:
                # Generate description and embed text
                description = await self._describe_image(document.content)
                document.text_description = description
                return self.text_model.embed(description)

        elif document.modality == ModalityType.AUDIO:
            # Transcribe and embed text (an STT step would populate text_description)
            return self.text_model.embed(document.text_description or "")

        elif document.modality == ModalityType.VIDEO:
            # Use video description
            return self.text_model.embed(document.text_description or "")

        return None

    def _embed_image_clip(self, image_path: str) -> Any:
        """Embed image with CLIP."""
        from PIL import Image
        import torch

        image = Image.open(image_path)
        inputs = self.clip_model.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            features = self.clip_model.model.get_image_features(**inputs)
        return features.numpy()[0]

    async def _describe_image(self, image_path: str) -> str:
        """Generate text description of image."""
        image_input = ImageInput(source=image_path, source_type="file")
        result = await self.vision_model.analyze(
            image_input,
            "Describe this image in detail for search indexing."
        )
        return result.text

class MultimodalVectorStore:
    """Vector store for multimodal content."""

    def __init__(self, embedder: MultimodalEmbedder):
        self.embedder = embedder
        self.documents: list[MultimodalDocument] = []
        self.embeddings: list[Any] = []

    async def add(self, document: MultimodalDocument):
        """Add document to store."""
        embedding = await self.embedder.embed(document)
        document.embedding = embedding
        self.documents.append(document)
        self.embeddings.append(embedding)

    async def search(
        self,
        query: str,
        modality_filter: ModalityType = None,
        k: int = 5
    ) -> list[MultimodalDocument]:
        """Search for relevant documents."""
        import numpy as np

        # Embed query
        query_embedding = self.embedder.text_model.embed(query)

        # Calculate similarities (dot product equals cosine similarity
        # only if the embeddings are L2-normalized)
        similarities = []
        for i, (doc, emb) in enumerate(zip(self.documents, self.embeddings)):
            if modality_filter and doc.modality != modality_filter:
                continue
            if emb is not None:
                sim = np.dot(query_embedding, emb)
                similarities.append((i, sim))

        # Sort by similarity
        similarities.sort(key=lambda x: x[1], reverse=True)

        # Return top-k
        return [self.documents[i] for i, _ in similarities[:k]]

class MultimodalRAG:
    """RAG system for multimodal content."""

    def __init__(
        self,
        vector_store: MultimodalVectorStore,
        llm_client: Any,
        vision_model: VisionModel = None
    ):
        self.store = vector_store
        self.llm = llm_client
        self.vision = vision_model

    async def query(
        self,
        question: str,
        include_images: bool = True
    ) -> dict:
        """Query multimodal knowledge base."""
        # Retrieve relevant documents
        documents = await self.store.search(question, k=5)

        # Build context
        context_parts = []
        images = []
        for doc in documents:
            if doc.modality == ModalityType.TEXT:
                context_parts.append(f"[Text] {doc.content}")
            elif doc.modality == ModalityType.IMAGE:
                context_parts.append(f"[Image] {doc.text_description}")
                if include_images:
                    images.append(doc.content)
            elif doc.modality == ModalityType.AUDIO:
                context_parts.append(f"[Audio Transcript] {doc.text_description}")
            elif doc.modality == ModalityType.VIDEO:
                context_parts.append(f"[Video Summary] {doc.text_description}")

        context = "\n\n".join(context_parts)

        # Generate answer
        prompt = f"""Answer the question based on the following multimodal context.

Context:
{context}

Question: {question}

Answer:"""
        response = await self.llm.complete(prompt)

        return {
            "answer": response.content,
            "sources": [
                {
                    "id": doc.id,
                    "modality": doc.modality.value,
                    "preview": doc.text_description or str(doc.content)[:100]
                }
                for doc in documents
            ],
            "images": images if include_images else []
        }
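An end-to-end sketch of indexing mixed content and querying it. The text_embedder and llm_client are assumed stand-ins for whatever embedding and chat clients you already use, and the document contents are invented examples.

import os

async def build_and_query(text_embedder, llm_client) -> dict:
    # Hypothetical corpus: one text note and one image, indexed together
    vision = OpenAIVision(api_key=os.environ["OPENAI_API_KEY"])
    embedder = MultimodalEmbedder(text_model=text_embedder, vision_model=vision)
    store = MultimodalVectorStore(embedder)

    await store.add(MultimodalDocument(
        id="note-1", modality=ModalityType.TEXT,
        content="The Q3 launch slipped two weeks due to supply issues."))
    await store.add(MultimodalDocument(
        id="img-1", modality=ModalityType.IMAGE, content="charts/q3_timeline.png"))

    rag = MultimodalRAG(store, llm_client, vision_model=vision)
    return await rag.query("Why did the Q3 launch slip?")

Because the image is indexed via its generated text description, the same text embedder serves both documents; swapping in a CLIP model changes only the embedder configuration.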
Production Multimodal Service
from fastapi import FastAPI, HTTPException, UploadFile, File
from pydantic import BaseModel
from typing import Optional, Any
import asyncio
import tempfile
import os

app = FastAPI()

class AnalyzeImageRequest(BaseModel):
    image_url: Optional[str] = None
    prompt: str = "Describe this image."

class GenerateImageRequest(BaseModel):
    prompt: str
    size: str = "1024x1024"
    style: str = "vivid"

class TranscribeRequest(BaseModel):
    language: Optional[str] = None

class SynthesizeRequest(BaseModel):
    text: str
    voice: str = "alloy"

# Initialize components (simplified)
class MockVision:
    async def analyze(self, image, prompt):
        return type('obj', (object,), {'text': f'Analysis of image: {prompt}'})()

class MockGenerator:
    async def generate(self, prompt, **kwargs):
        return type('obj', (object,), {'url': 'https://example.com/image.png'})()

vision_model = MockVision()
image_generator = MockGenerator()

@app.post("/v1/vision/analyze")
async def analyze_image(request: AnalyzeImageRequest) -> dict:
    """Analyze image with vision model."""
    if not request.image_url:
        raise HTTPException(status_code=400, detail="Image URL required")

    image_input = ImageInput(source=request.image_url)
    result = await vision_model.analyze(image_input, request.prompt)

    return {
        "description": result.text,
        "prompt": request.prompt
    }

@app.post("/v1/vision/analyze-upload")
async def analyze_uploaded_image(
    file: UploadFile = File(...),
    prompt: str = "Describe this image."
) -> dict:
    """Analyze uploaded image."""
    # Save uploaded file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as f:
        content = await file.read()
        f.write(content)
        temp_path = f.name

    try:
        image_input = ImageInput(source=temp_path, source_type="file")
        result = await vision_model.analyze(image_input, prompt)
        return {
            "description": result.text,
            "filename": file.filename
        }
    finally:
        os.unlink(temp_path)

@app.post("/v1/image/generate")
async def generate_image(request: GenerateImageRequest) -> dict:
    """Generate image from prompt."""
    result = await image_generator.generate(
        prompt=request.prompt,
        size=request.size,
        style=request.style
    )
    return {
        "url": result.url,
        "prompt": request.prompt
    }

@app.post("/v1/audio/transcribe")
async def transcribe_audio(
    file: UploadFile = File(...),
    language: Optional[str] = None
) -> dict:
    """Transcribe audio file."""
    # Would use STT model
    return {
        "text": "Transcribed text would appear here",
        "language": language or "en"
    }

@app.post("/v1/audio/synthesize")
async def synthesize_speech(request: SynthesizeRequest) -> dict:
    """Synthesize speech from text."""
    # Would use TTS model
    return {
        "audio_url": "https://example.com/audio.mp3",
        "text": request.text
    }

@app.get("/health")
async def health():
    return {"status": "healthy"}
References
- OpenAI Vision: https://platform.openai.com/docs/guides/vision
- DALL-E: https://platform.openai.com/docs/guides/images
- Whisper: https://github.com/openai/whisper
- CLIP: https://github.com/openai/CLIP
Conclusion
Multimodal AI opens possibilities that single-modality systems cannot achieve. For vision-language work, choose the model that fits the task: GPT-4V and Claude excel at complex reasoning over images, while CLIP is better for similarity search and classification. For image generation, DALL-E 3 produces high-quality results with good prompt following, while Stable Diffusion offers more control and local deployment options.

Speech processing has become remarkably accessible—Whisper provides excellent transcription across languages, and modern TTS systems produce natural-sounding speech. Video analysis requires thoughtful frame sampling; analyzing every frame is expensive and often unnecessary.

For multimodal RAG, the key insight is that you can often convert non-text modalities to text descriptions and use standard text retrieval, though native multimodal embeddings like CLIP provide better results for image-heavy applications. In production, consider the latency implications of multimodal processing—vision and audio models are typically slower than text models, so design your architecture with appropriate timeouts and async processing. The most powerful applications combine modalities thoughtfully, using each where it adds value rather than forcing multimodal processing everywhere.
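As one illustration of the latency point, here is a minimal sketch of guarding a slow vision call with a timeout; the 30-second budget and fallback message are arbitrary placeholders, not recommendations from any provider.

import asyncio

async def analyze_with_timeout(vision: VisionModel, image: ImageInput, prompt: str, budget_s: float = 30.0) -> VisionResponse:
    # Fail fast instead of letting a slow multimodal call block the request path
    try:
        return await asyncio.wait_for(vision.analyze(image, prompt), timeout=budget_s)
    except asyncio.TimeoutError:
        return VisionResponse(text="Analysis timed out; try a smaller image or lower detail.", model="timeout")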