Every major cloud provider now offers managed services for LLM operations. But they’re not created equal—each has different strengths, pricing models, and integration patterns.
I’ve deployed LLM applications across all three major clouds. Here’s an honest comparison of what works, what doesn’t, and how to choose.
Series Navigation: Part 7: MLOps/LLMOps Fundamentals → Part 8: Cloud Platforms (You are here) → Part 9: DIY Implementation
Cloud LLMOps Architecture Overview
AWS: Amazon Bedrock & SageMaker
AWS offers the most comprehensive suite, but it’s spread across multiple services that you need to integrate yourself.
AWS LLMOps Architecture
AWS Implementation Example
```python
# aws_bedrock_rag.py
import json
from typing import Dict

import boto3


class AWSBedrockRAG:
    """Production RAG implementation using AWS Bedrock."""

    def __init__(self, region: str = "us-east-1"):
        self.bedrock = boto3.client("bedrock-runtime", region_name=region)
        self.bedrock_agent = boto3.client("bedrock-agent-runtime", region_name=region)
        # OpenSearch Serverless client (the vector store behind Bedrock Knowledge Bases)
        self.opensearch = boto3.client("opensearchserverless", region_name=region)

    def invoke_model(self, prompt: str,
                     model_id: str = "anthropic.claude-3-sonnet-20240229-v1:0",
                     max_tokens: int = 2000, temperature: float = 0) -> str:
        """Invoke a Bedrock foundation model."""
        body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": [{"role": "user", "content": prompt}]
        })
        response = self.bedrock.invoke_model(
            modelId=model_id,
            body=body,
            contentType="application/json",
            accept="application/json"
        )
        result = json.loads(response["body"].read())
        return result["content"][0]["text"]

    def retrieve_and_generate(self, query: str, knowledge_base_id: str) -> Dict:
        """Use a Bedrock Knowledge Base for RAG."""
        response = self.bedrock_agent.retrieve_and_generate(
            input={"text": query},
            retrieveAndGenerateConfiguration={
                "type": "KNOWLEDGE_BASE",
                "knowledgeBaseConfiguration": {
                    "knowledgeBaseId": knowledge_base_id,
                    "modelArn": "arn:aws:bedrock:us-east-1::foundation-model/anthropic.claude-3-sonnet-20240229-v1:0",
                    "retrievalConfiguration": {
                        "vectorSearchConfiguration": {
                            "numberOfResults": 5
                        }
                    }
                }
            }
        )
        return {
            "answer": response["output"]["text"],
            "citations": response.get("citations", [])
        }

    def invoke_with_guardrails(self, prompt: str, guardrail_id: str,
                               guardrail_version: str) -> Dict:
        """Invoke the model with Bedrock Guardrails applied."""
        body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 2000,
            "messages": [{"role": "user", "content": prompt}]
        })
        response = self.bedrock.invoke_model(
            modelId="anthropic.claude-3-sonnet-20240229-v1:0",
            body=body,
            contentType="application/json",
            accept="application/json",
            guardrailIdentifier=guardrail_id,
            guardrailVersion=guardrail_version
        )
        result = json.loads(response["body"].read())
        return {
            "response": result["content"][0]["text"],
            # If the guardrail intervened, the action is reported in the response body
            "guardrail_action": result.get("amazon-bedrock-guardrailAction", "NONE")
        }


# Usage
rag = AWSBedrockRAG()

# Simple generation
response = rag.invoke_model("Explain quantum computing in simple terms.")

# RAG with a Knowledge Base
result = rag.retrieve_and_generate(
    query="What is our refund policy?",
    knowledge_base_id="KB12345"
)

# With Guardrails
safe_response = rag.invoke_with_guardrails(
    prompt="User query here",
    guardrail_id="gr-abc123",
    guardrail_version="1"
)
```
Azure: Azure OpenAI & Azure ML
Azure has the strongest enterprise story—especially if you need OpenAI models with enterprise compliance (HIPAA, SOC2, etc.).
Azure LLMOps Architecture
Azure Implementation Example
```python
# azure_openai_rag.py
import os

from openai import AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery


class AzureRAGPipeline:
    """Production RAG using Azure OpenAI and Azure AI Search."""

    def __init__(self):
        # Use managed identity in production
        credential = DefaultAzureCredential()

        # Azure OpenAI client (Entra ID auth via a bearer-token provider)
        token_provider = get_bearer_token_provider(
            credential, "https://cognitiveservices.azure.com/.default"
        )
        self.openai = AzureOpenAI(
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            api_version="2024-06-01",
            azure_ad_token_provider=token_provider
        )

        # Azure AI Search client
        self.search = SearchClient(
            endpoint=os.environ["AZURE_SEARCH_ENDPOINT"],
            index_name=os.environ["AZURE_SEARCH_INDEX"],
            credential=credential
        )

    def get_embedding(self, text: str) -> list[float]:
        """Generate an embedding using Azure OpenAI."""
        response = self.openai.embeddings.create(
            model="text-embedding-3-large",  # deployment name
            input=text
        )
        return response.data[0].embedding

    def hybrid_search(self, query: str, top_k: int = 5) -> list[dict]:
        """Hybrid search combining vector and keyword retrieval."""
        # Get the query embedding
        query_vector = self.get_embedding(query)

        # Create the vector query
        vector_query = VectorizedQuery(
            vector=query_vector,
            k_nearest_neighbors=top_k,
            fields="content_vector"
        )

        # Execute the hybrid search
        results = self.search.search(
            search_text=query,              # keyword search
            vector_queries=[vector_query],  # vector search
            query_type="semantic",          # enable semantic ranking
            semantic_configuration_name="my-semantic-config",
            top=top_k,
            select=["content", "title", "source"]
        )

        return [
            {
                "content": r["content"],
                "title": r["title"],
                "source": r["source"],
                "score": r["@search.score"],
                "reranker_score": r.get("@search.reranker_score")
            }
            for r in results
        ]

    def generate_response(self, query: str, context: list[dict]) -> str:
        """Generate a response using Azure OpenAI with the retrieved context."""
        context_text = "\n\n".join(
            f"[{doc['title']}]\n{doc['content']}" for doc in context
        )

        response = self.openai.chat.completions.create(
            model="gpt-4o",  # deployment name
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Answer based on the provided context. "
                        "Cite sources using [Source Title] format. "
                        "If unsure, say so."
                    )
                },
                {
                    "role": "user",
                    "content": f"Context:\n{context_text}\n\nQuestion: {query}"
                }
            ],
            temperature=0,
            max_tokens=1000
        )
        return response.choices[0].message.content

    def rag_query(self, query: str) -> dict:
        """Full RAG pipeline."""
        # 1. Retrieve relevant documents
        docs = self.hybrid_search(query)

        # 2. Generate a grounded response
        answer = self.generate_response(query, docs)

        return {
            "answer": answer,
            "sources": [{"title": d["title"], "source": d["source"]} for d in docs]
        }


# Usage
pipeline = AzureRAGPipeline()
result = pipeline.rag_query("What are the compliance requirements for HIPAA?")
```
Google Cloud: Vertex AI
GCP’s Vertex AI offers the most integrated experience—everything in one platform with Gemini models that have industry-leading context windows.
GCP LLMOps Architecture
GCP Implementation Example
```python
# vertex_ai_rag.py
import vertexai
from vertexai.generative_models import GenerativeModel, Tool, grounding
from vertexai.language_models import TextEmbeddingModel
from google.cloud import discoveryengine_v1 as discoveryengine


class VertexAIRAG:
    """Production RAG using Vertex AI."""

    def __init__(self, project_id: str, location: str = "us-central1"):
        vertexai.init(project=project_id, location=location)
        self.project_id = project_id
        self.location = location

        # Initialize models
        self.model = GenerativeModel("gemini-2.5-pro")
        self.embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-004")

    def generate(self, prompt: str, temperature: float = 0) -> str:
        """Generate using Gemini 2.5."""
        response = self.model.generate_content(
            prompt,
            generation_config={
                "temperature": temperature,
                "max_output_tokens": 2048,
            }
        )
        return response.text

    def generate_with_grounding(self, prompt: str,
                                data_store_id: str | None = None,
                                google_search: bool = False) -> dict:
        """Generate with grounding (your own data store or Google Search)."""
        tools = []

        if data_store_id:
            # Ground on your own Vertex AI Search data store
            tools.append(Tool.from_retrieval(
                grounding.Retrieval(
                    grounding.VertexAISearch(
                        datastore=f"projects/{self.project_id}/locations/global/collections/default_collection/dataStores/{data_store_id}"
                    )
                )
            ))

        if google_search:
            # Ground on Google Search
            tools.append(Tool.from_google_search_retrieval(
                grounding.GoogleSearchRetrieval()
            ))

        response = self.model.generate_content(
            prompt,
            tools=tools,
            generation_config={"temperature": 0}
        )
        return {
            "text": response.text,
            "grounding_metadata": response.candidates[0].grounding_metadata
        }

    def create_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Create embeddings using Vertex AI."""
        embeddings = self.embedding_model.get_embeddings(texts)
        return [e.values for e in embeddings]

    def search_data_store(self, query: str, data_store_id: str,
                          top_k: int = 5) -> list[dict]:
        """Search documents using Vertex AI Search."""
        client = discoveryengine.SearchServiceClient()
        serving_config = (
            f"projects/{self.project_id}/locations/global/"
            f"collections/default_collection/dataStores/{data_store_id}/"
            "servingConfigs/default_search"
        )

        request = discoveryengine.SearchRequest(
            serving_config=serving_config,
            query=query,
            page_size=top_k,
            content_search_spec=discoveryengine.SearchRequest.ContentSearchSpec(
                snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(
                    return_snippet=True
                ),
                summary_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(
                    summary_result_count=3,
                    include_citations=True
                )
            )
        )
        response = client.search(request)

        results = []
        for result in response.results:
            doc = result.document
            results.append({
                "id": doc.id,
                "content": doc.derived_struct_data.get("snippets", [{}])[0].get("snippet", ""),
                "title": doc.derived_struct_data.get("title", ""),
                "link": doc.derived_struct_data.get("link", "")
            })
        return results


# Usage
rag = VertexAIRAG(project_id="my-project")

# Simple generation
response = rag.generate("Explain Kubernetes networking.")

# Generation grounded on your own data store
result = rag.generate_with_grounding(
    prompt="What are our security policies?",
    data_store_id="company-docs-store"
)

# Generation grounded on Google Search
result = rag.generate_with_grounding(
    prompt="What are the latest developments in quantum computing?",
    google_search=True
)
```
Cloud Comparison Summary
| Feature | AWS | Azure | GCP |
|---|---|---|---|
| Primary LLM Service | Bedrock | Azure OpenAI | Vertex AI |
| Model Variety | Claude, Llama, Titan, Mistral | GPT-4o, GPT-4 Turbo | Gemini 2.5, Claude, Llama |
| Max Context | 200K (Claude 3) | 128K (GPT-4o) | 1M (Gemini 2.5 Pro), 2M (Gemini 1.5 Pro) |
| RAG Solution | Knowledge Bases + Kendra | AI Search | Vertex AI Search |
| Prompt Management | Manual | Prompt Flow | Vertex AI Studio |
| Enterprise Compliance | Good | Excellent | Good |
| Fine-tuning | SageMaker | Azure ML | Vertex AI |
| Pricing Model | Pay-per-token | Pay-per-token + PTU | Pay-per-token + committed |
Key Takeaways
- Azure: Best for enterprise compliance and OpenAI models, with Prompt Flow for LLMOps
- AWS: Most model variety with Bedrock, but requires more integration work
- GCP: Most integrated platform; Gemini's 1M-token context is game-changing for large docs
- Multi-cloud: Use LiteLLM or a similar abstraction layer to smooth over provider differences (see the sketch below)
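To make the multi-cloud point concrete, here's a minimal sketch of calling all three providers through LiteLLM's `completion()` interface. The model IDs and deployment names are illustrative placeholders (swap in whatever you've actually deployed), and it assumes your AWS, Azure, and GCP credentials are already configured in the environment.

```python
# multi_cloud_litellm.py – minimal sketch, assuming LiteLLM is installed and
# AWS/Azure/GCP credentials are configured in the environment.
from litellm import completion

PROMPT = [{"role": "user", "content": "Summarize our refund policy in two sentences."}]

# AWS Bedrock (same Claude model ID used earlier in this post)
bedrock_reply = completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=PROMPT,
)

# Azure OpenAI (the part after "azure/" is your deployment name – placeholder here)
azure_reply = completion(
    model="azure/gpt-4o",
    messages=PROMPT,
)

# Google Vertex AI (Gemini)
vertex_reply = completion(
    model="vertex_ai/gemini-2.5-pro",
    messages=PROMPT,
)

# LiteLLM normalizes every provider's response to the OpenAI format
for reply in (bedrock_reply, azure_reply, vertex_reply):
    print(reply.choices[0].message.content)
```

The payoff is that routing, fallbacks, and cost tracking live in one place, so switching providers becomes a config change rather than a rewrite.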
What’s Next
In Part 9, we’ll build our own LLMOps platform using open-source tools—Kubernetes, GitHub Actions, MLflow, and more. Complete control, no vendor lock-in.
References & Further Reading
- Amazon Bedrock Documentation – docs.aws.amazon.com/bedrock
- Azure OpenAI Service – learn.microsoft.com
- Vertex AI Documentation – cloud.google.com/vertex-ai
- Azure AI Search – learn.microsoft.com/azure/search
- AWS RAG Best Practices – AWS ML Blog
Which cloud are you using for LLMOps? Share your experience on GitHub or LinkedIn.