Introduction: Memory is what transforms a stateless LLM into a contextually aware assistant. Without memory, every interaction starts from scratch—the model has no knowledge of previous conversations, user preferences, or accumulated context. This guide covers the memory architectures that enable persistent, intelligent AI systems: conversation buffers for recent context, summary memory for long conversations, vector-based retrieval for semantic search over history, entity memory for tracking facts about people and things, and hierarchical memory systems that combine multiple approaches. Whether you’re building a customer support bot that remembers past issues, a personal assistant that learns user preferences, or an agent that accumulates knowledge over time, these patterns will help you design memory systems that scale.

Conversation Buffer Memory
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Optional


@dataclass
class Message:
    """A message in conversation history."""
    role: str  # "user", "assistant", "system"
    content: str
    timestamp: datetime = field(default_factory=datetime.now)
    metadata: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Convert to API format."""
        return {"role": self.role, "content": self.content}


class Memory(ABC):
    """Abstract memory interface."""

    @abstractmethod
    def add(self, message: Message):
        """Add message to memory."""
        pass

    @abstractmethod
    def get_context(self) -> list[Message]:
        """Get messages for context."""
        pass

    @abstractmethod
    def clear(self):
        """Clear memory."""
        pass


class BufferMemory(Memory):
    """Simple buffer that keeps all messages (optionally capped)."""

    def __init__(self, max_messages: Optional[int] = None):
        self.messages: list[Message] = []
        self.max_messages = max_messages

    def add(self, message: Message):
        """Add message to buffer."""
        self.messages.append(message)
        if self.max_messages and len(self.messages) > self.max_messages:
            self.messages = self.messages[-self.max_messages:]

    def get_context(self) -> list[Message]:
        """Get all messages."""
        return self.messages.copy()

    def clear(self):
        """Clear buffer."""
        self.messages = []


class WindowBufferMemory(Memory):
    """Sliding window buffer."""

    def __init__(self, window_size: int = 10):
        self.messages: list[Message] = []
        self.window_size = window_size

    def add(self, message: Message):
        """Add message, maintaining window."""
        self.messages.append(message)
        if len(self.messages) > self.window_size:
            self.messages = self.messages[-self.window_size:]

    def get_context(self) -> list[Message]:
        """Get windowed messages."""
        return self.messages.copy()

    def clear(self):
        """Clear buffer."""
        self.messages = []


class TokenBufferMemory(Memory):
    """Buffer that limits by token count."""

    def __init__(self, max_tokens: int, token_counter: Any):
        self.messages: list[Message] = []
        self.max_tokens = max_tokens
        self.counter = token_counter

    def add(self, message: Message):
        """Add message, trimming to token limit."""
        self.messages.append(message)
        self._trim_to_limit()

    def _trim_to_limit(self):
        """Trim oldest messages to stay under limit."""
        while self._count_tokens() > self.max_tokens and len(self.messages) > 1:
            self.messages.pop(0)

    def _count_tokens(self) -> int:
        """Count total tokens in buffer."""
        return sum(self.counter.count(msg.content) for msg in self.messages)

    def get_context(self) -> list[Message]:
        """Get messages within token limit."""
        return self.messages.copy()

    def clear(self):
        """Clear buffer."""
        self.messages = []


class ConversationTurnMemory(Memory):
    """Memory organized by conversation turns."""

    def __init__(self, max_turns: int = 5):
        self.turns: list[tuple[Message, Message]] = []
        self.max_turns = max_turns
        self.pending_user_message: Optional[Message] = None

    def add(self, message: Message):
        """Add message, pairing user/assistant."""
        if message.role == "user":
            self.pending_user_message = message
        elif message.role == "assistant" and self.pending_user_message:
            self.turns.append((self.pending_user_message, message))
            self.pending_user_message = None
            if len(self.turns) > self.max_turns:
                self.turns = self.turns[-self.max_turns:]

    def get_context(self) -> list[Message]:
        """Get messages from turns."""
        messages = []
        for user_msg, assistant_msg in self.turns:
            messages.append(user_msg)
            messages.append(assistant_msg)
        if self.pending_user_message:
            messages.append(self.pending_user_message)
        return messages

    def clear(self):
        """Clear turns."""
        self.turns = []
        self.pending_user_message = None
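A quick usage sketch of the token-limited buffer. The SimpleTokenCounter below is a hypothetical stand-in; in practice you would wrap a real tokenizer such as tiktoken:

class SimpleTokenCounter:
    # Hypothetical counter: approximates tokens by whitespace-separated words.
    def count(self, text: str) -> int:
        return len(text.split())

memory = TokenBufferMemory(max_tokens=50, token_counter=SimpleTokenCounter())
memory.add(Message(role="user", content="What's the return policy?"))
memory.add(Message(role="assistant", content="Returns are accepted within 30 days."))
payload = [m.to_dict() for m in memory.get_context()]  # ready for a chat API call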
Summary Memory
import asyncio
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional


@dataclass
class ConversationSummary:
    """Summary of conversation history."""
    summary: str
    message_count: int
    last_updated: Optional[datetime] = None

    def __post_init__(self):
        if self.last_updated is None:
            self.last_updated = datetime.now()


class SummaryMemory(Memory):
    """Memory that summarizes old messages."""

    def __init__(
        self,
        llm_client: Any,
        buffer_size: int = 5,
        summary_prompt: Optional[str] = None,
    ):
        self.llm = llm_client
        self.buffer_size = buffer_size
        self.summary_prompt = summary_prompt or self._default_summary_prompt()
        self.summary: Optional[ConversationSummary] = None
        self.recent_messages: list[Message] = []

    def _default_summary_prompt(self) -> str:
        return """Summarize the following conversation, preserving key information:

{conversation}

Previous summary (if any): {previous_summary}

Provide a concise summary that captures:
- Main topics discussed
- Key decisions or conclusions
- Important facts mentioned
- User preferences or requests"""

    def add(self, message: Message):
        """Add message, summarizing when buffer full."""
        self.recent_messages.append(message)
        if len(self.recent_messages) > self.buffer_size:
            # Trigger summarization in the background. Note: create_task
            # requires a running event loop, so add() must be called from
            # async code.
            asyncio.create_task(self._summarize())

    async def _summarize(self):
        """Summarize old messages."""
        # Keep the most recent half of the buffer; summarize the rest
        to_summarize = self.recent_messages[:-self.buffer_size // 2]
        self.recent_messages = self.recent_messages[-self.buffer_size // 2:]
        # Format conversation
        conversation = "\n".join(
            f"{m.role}: {m.content}" for m in to_summarize
        )
        previous = self.summary.summary if self.summary else "None"
        prompt = self.summary_prompt.format(
            conversation=conversation,
            previous_summary=previous,
        )
        response = await self.llm.complete(prompt)
        self.summary = ConversationSummary(
            summary=response.content,
            message_count=(self.summary.message_count if self.summary else 0)
            + len(to_summarize),
        )

    def get_context(self) -> list[Message]:
        """Get summary + recent messages."""
        messages = []
        if self.summary:
            messages.append(Message(
                role="system",
                content=f"Previous conversation summary:\n{self.summary.summary}",
            ))
        messages.extend(self.recent_messages)
        return messages

    def clear(self):
        """Clear memory."""
        self.summary = None
        self.recent_messages = []
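Because add() schedules summarization with asyncio.create_task, it has to be driven from async code. A minimal sketch, assuming an llm_client whose async complete(prompt) method returns an object with a .content attribute (the interface the classes above already rely on); StubLLM is a hypothetical stand-in:

import asyncio

class StubLLM:
    # Hypothetical stand-in for a real LLM client.
    async def complete(self, prompt: str):
        class Response:
            content = "The user asked about pricing; the assistant explained the tiers."
        return Response()

async def demo():
    memory = SummaryMemory(llm_client=StubLLM(), buffer_size=4)
    for i in range(6):
        memory.add(Message(role="user", content=f"question {i}"))
        memory.add(Message(role="assistant", content=f"answer {i}"))
        await asyncio.sleep(0)  # yield so the background summarization task can run
    print(memory.get_context()[0].content)  # the summary message comes first

asyncio.run(demo())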
class ProgressiveSummaryMemory(Memory):
    """Memory with progressive summarization levels."""

    def __init__(
        self,
        llm_client: Any,
        levels: Optional[list[int]] = None,
    ):
        self.llm = llm_client
        self.levels = levels or [5, 20, 100]  # Messages per level
        self.summaries: list[str] = []  # One per level
        self.current_buffer: list[Message] = []

    def add(self, message: Message):
        """Add message with progressive summarization."""
        self.current_buffer.append(message)
        # Check levels lowest-first and schedule at most one summarization,
        # so concurrent tasks don't both mutate the buffer
        for i, threshold in enumerate(self.levels):
            if len(self.current_buffer) >= threshold:
                asyncio.create_task(self._summarize_level(i))
                break

    async def _summarize_level(self, level: int):
        """Summarize at a specific level."""
        threshold = self.levels[level]
        to_summarize = self.current_buffer[:threshold]
        self.current_buffer = self.current_buffer[threshold:]
        # Include the previous summary at this level if it exists
        previous = self.summaries[level] if level < len(self.summaries) else ""
        prompt = f"""Create a {"detailed" if level == 0 else "high-level"} summary:

Previous summary: {previous or "None"}

New messages:
{self._format_messages(to_summarize)}

Summary:"""
        response = await self.llm.complete(prompt)
        if level >= len(self.summaries):
            self.summaries.append(response.content)
        else:
            self.summaries[level] = response.content

    def _format_messages(self, messages: list[Message]) -> str:
        return "\n".join(f"{m.role}: {m.content}" for m in messages)

    def get_context(self) -> list[Message]:
        """Get hierarchical context."""
        messages = []
        # Add summaries from highest to lowest level
        for i, summary in enumerate(reversed(self.summaries)):
            level = len(self.summaries) - 1 - i
            level_name = ["Recent", "Session", "Long-term"][min(level, 2)]
            messages.append(Message(
                role="system",
                content=f"{level_name} context:\n{summary}",
            ))
        # Add current buffer
        messages.extend(self.current_buffer)
        return messages

    def clear(self):
        """Clear all memory."""
        self.summaries = []
        self.current_buffer = []


class IncrementalSummaryMemory(Memory):
    """Memory that updates summary incrementally."""

    def __init__(self, llm_client: Any, update_frequency: int = 3):
        self.llm = llm_client
        self.update_frequency = update_frequency
        self.summary = ""
        self.recent_messages: list[Message] = []
        self.message_count = 0

    def add(self, message: Message):
        """Add message with incremental summary updates."""
        self.recent_messages.append(message)
        self.message_count += 1
        if self.message_count % self.update_frequency == 0:
            asyncio.create_task(self._update_summary())

    async def _update_summary(self):
        """Incrementally update summary."""
        new_content = self._format_messages(
            self.recent_messages[-self.update_frequency:]
        )
        prompt = f"""Update this conversation summary with new information:

Current summary:
{self.summary or "No summary yet."}

New messages:
{new_content}

Updated summary (preserve important old information, add new):"""
        response = await self.llm.complete(prompt)
        self.summary = response.content

    def _format_messages(self, messages: list[Message]) -> str:
        return "\n".join(f"{m.role}: {m.content}" for m in messages)

    def get_context(self) -> list[Message]:
        """Get summary + recent messages."""
        messages = []
        if self.summary:
            messages.append(Message(
                role="system",
                content=f"Conversation summary:\n{self.summary}",
            ))
        # Only include very recent messages
        messages.extend(self.recent_messages[-3:])
        return messages

    def clear(self):
        """Clear memory."""
        self.summary = ""
        self.recent_messages = []
        self.message_count = 0
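IncrementalSummaryMemory follows the same async pattern; a brief sketch reusing the StubLLM stand-in from the previous example:

async def incremental_demo():
    memory = IncrementalSummaryMemory(llm_client=StubLLM(), update_frequency=2)
    memory.add(Message(role="user", content="I prefer dark mode."))
    memory.add(Message(role="assistant", content="Noted: dark mode."))
    await asyncio.sleep(0)  # let the background summary update run
    print(memory.summary)

asyncio.run(incremental_demo())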
Vector-Based Memory
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Optional

import numpy as np


@dataclass
class MemoryItem:
    """Item stored in vector memory."""
    id: str
    content: str
    embedding: Optional[np.ndarray] = None
    timestamp: datetime = field(default_factory=datetime.now)
    importance: float = 0.5
    access_count: int = 0
    last_accessed: Optional[datetime] = None
    metadata: dict = field(default_factory=dict)


class VectorMemory(Memory):
    """Memory with semantic retrieval."""

    def __init__(
        self,
        embedding_model: Any,
        max_items: int = 1000,
        retrieval_k: int = 5,
    ):
        self.embedder = embedding_model
        self.max_items = max_items
        self.retrieval_k = retrieval_k
        self.items: list[MemoryItem] = []
        self.embeddings: Optional[np.ndarray] = None

    def add(self, message: Message):
        """Add message to vector memory."""
        # Create embedding
        embedding = self.embedder.embed(message.content)
        item = MemoryItem(
            id=str(uuid.uuid4()),
            content=f"{message.role}: {message.content}",
            embedding=embedding,
            metadata={"role": message.role},
        )
        self.items.append(item)
        # Update embedding matrix
        if self.embeddings is None:
            self.embeddings = embedding.reshape(1, -1)
        else:
            self.embeddings = np.vstack([self.embeddings, embedding])
        # Evict if over capacity
        if len(self.items) > self.max_items:
            self._evict()

    def _evict(self):
        """Evict the least important item."""
        # Score by importance, recency, and access frequency
        scores = []
        now = datetime.now()
        for item in self.items:
            recency = 1.0 / (1 + (now - item.timestamp).total_seconds() / 3600)
            score = (
                item.importance * 0.5
                + recency * 0.3
                + (item.access_count / 10) * 0.2
            )
            scores.append(score)
        # Remove the lowest-scoring item
        min_idx = int(np.argmin(scores))
        del self.items[min_idx]
        self.embeddings = np.delete(self.embeddings, min_idx, axis=0)

    def retrieve(self, query: str, k: Optional[int] = None) -> list[MemoryItem]:
        """Retrieve relevant memories."""
        if not self.items:
            return []
        k = k or self.retrieval_k
        # Embed query
        query_embedding = self.embedder.embed(query)
        # Dot-product similarity; this equals cosine similarity when the
        # embedder returns unit-normalized vectors
        similarities = np.dot(self.embeddings, query_embedding)
        # Get top-k indices, most similar first
        top_indices = np.argsort(similarities)[-k:][::-1]
        # Update access stats and collect results
        results = []
        for idx in top_indices:
            item = self.items[idx]
            item.access_count += 1
            item.last_accessed = datetime.now()
            results.append(item)
        return results

    def get_context(self) -> list[Message]:
        """Get recent messages (for interface compatibility)."""
        recent = sorted(self.items, key=lambda x: x.timestamp, reverse=True)[:5]
        return [
            Message(role=item.metadata.get("role", "user"), content=item.content)
            for item in recent
        ]

    def clear(self):
        """Clear memory."""
        self.items = []
        self.embeddings = None
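A retrieval sketch with a toy embedder. The ToyEmbedder is purely illustrative: it hashes text to deterministic (within one process) pseudo-random unit vectors, so the ranking it produces is arbitrary; the point is the mechanics. A real system would use a sentence-embedding model, and the vectors are normalized here so the dot product in retrieve() behaves as cosine similarity:

class ToyEmbedder:
    # Hypothetical embedder: hashes text to a deterministic unit vector.
    def embed(self, text: str) -> np.ndarray:
        rng = np.random.default_rng(abs(hash(text)) % (2**32))
        vector = rng.standard_normal(64)
        return vector / np.linalg.norm(vector)

memory = VectorMemory(embedding_model=ToyEmbedder(), retrieval_k=2)
memory.add(Message(role="user", content="My favorite color is teal."))
memory.add(Message(role="user", content="I live in Lisbon."))
for item in memory.retrieve("Where does the user live?"):
    print(item.content)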
class HybridVectorMemory(Memory):
    """Combine buffer and vector memory."""

    def __init__(
        self,
        embedding_model: Any,
        buffer_size: int = 10,
        vector_size: int = 1000,
        retrieval_k: int = 3,
    ):
        self.buffer = BufferMemory(max_messages=buffer_size)
        self.vector = VectorMemory(
            embedding_model=embedding_model,
            max_items=vector_size,
            retrieval_k=retrieval_k,
        )

    def add(self, message: Message):
        """Add to both memories."""
        self.buffer.add(message)
        self.vector.add(message)

    def get_context(self, query: Optional[str] = None) -> list[Message]:
        """Get buffer + relevant vector memories."""
        messages = []
        # Add relevant memories from the vector store
        if query:
            relevant = self.vector.retrieve(query)
            if relevant:
                memory_text = "\n".join(item.content for item in relevant)
                messages.append(Message(
                    role="system",
                    content=f"Relevant past context:\n{memory_text}",
                ))
        # Add recent buffer
        messages.extend(self.buffer.get_context())
        return messages

    def clear(self):
        """Clear both memories."""
        self.buffer.clear()
        self.vector.clear()


class TimeWeightedVectorMemory(VectorMemory):
    """Vector memory with time-based weighting."""

    def __init__(
        self,
        embedding_model: Any,
        decay_rate: float = 0.01,
        **kwargs,
    ):
        super().__init__(embedding_model, **kwargs)
        self.decay_rate = decay_rate

    def retrieve(self, query: str, k: Optional[int] = None) -> list[MemoryItem]:
        """Retrieve with time weighting."""
        if not self.items:
            return []
        k = k or self.retrieval_k
        # Embed query
        query_embedding = self.embedder.embed(query)
        # Calculate base semantic similarities
        similarities = np.dot(self.embeddings, query_embedding)
        # Exponential time decay: weight = exp(-decay_rate * hours_ago)
        now = datetime.now()
        time_weights = []
        for item in self.items:
            hours_ago = (now - item.timestamp).total_seconds() / 3600
            time_weights.append(np.exp(-self.decay_rate * hours_ago))
        # Combine similarity and time weight
        scores = similarities * np.array(time_weights)
        # Get top-k
        top_indices = np.argsort(scores)[-k:][::-1]
        return [self.items[idx] for idx in top_indices]
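To calibrate decay_rate, it helps to work through the exponential: with the default decay_rate of 0.01, a memory from 24 hours ago keeps exp(-0.24) ≈ 0.79 of its similarity score, one from a week ago keeps exp(-1.68) ≈ 0.19, and one from a month ago is effectively suppressed at exp(-7.2) ≈ 0.0007. Raise the rate for assistants that should strongly favor the current session; lower it when older context should stay competitive.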
Entity Memory
import asyncio
import json
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Optional


@dataclass
class Entity:
    """An entity tracked in memory."""
    name: str
    entity_type: str  # "person", "organization", "concept", etc.
    attributes: dict = field(default_factory=dict)
    relationships: list[tuple[str, str]] = field(default_factory=list)  # (relation, target_entity)
    first_mentioned: datetime = field(default_factory=datetime.now)
    last_mentioned: datetime = field(default_factory=datetime.now)
    mention_count: int = 1


class EntityMemory(Memory):
    """Memory that tracks entities and their attributes."""

    def __init__(self, llm_client: Any):
        self.llm = llm_client
        self.entities: dict[str, Entity] = {}
        self.recent_messages: list[Message] = []

    def add(self, message: Message):
        """Extract and store entities from message."""
        self.recent_messages.append(message)
        # Extract entities in the background (requires a running event loop)
        asyncio.create_task(self._extract_entities(message))

    async def _extract_entities(self, message: Message):
        """Extract entities using the LLM."""
        prompt = f"""Extract entities from this message:

"{message.content}"

For each entity, provide:
- Name
- Type (person, organization, place, concept, product)
- Attributes mentioned
- Relationships to other entities

Output as JSON:
{{
    "entities": [
        {{
            "name": "...",
            "type": "...",
            "attributes": {{"key": "value"}},
            "relationships": [["relation", "target_entity"]]
        }}
    ]
}}"""
        response = await self.llm.complete(prompt)
        try:
            data = json.loads(response.content)
            for entity_data in data.get("entities", []):
                self._update_entity(entity_data)
        except json.JSONDecodeError:
            pass  # Skip malformed extraction output

    def _update_entity(self, entity_data: dict):
        """Update or create entity."""
        name = entity_data["name"]
        if name in self.entities:
            entity = self.entities[name]
            entity.attributes.update(entity_data.get("attributes", {}))
            entity.relationships.extend(entity_data.get("relationships", []))
            entity.last_mentioned = datetime.now()
            entity.mention_count += 1
        else:
            self.entities[name] = Entity(
                name=name,
                entity_type=entity_data.get("type", "unknown"),
                attributes=entity_data.get("attributes", {}),
                relationships=entity_data.get("relationships", []),
            )

    def get_entity(self, name: str) -> Optional[Entity]:
        """Get entity by name."""
        return self.entities.get(name)

    def get_related_entities(self, name: str) -> list[Entity]:
        """Get entities related to the given entity."""
        entity = self.entities.get(name)
        if not entity:
            return []
        related = []
        for relation, target in entity.relationships:
            if target in self.entities:
                related.append(self.entities[target])
        return related

    def get_context(self) -> list[Message]:
        """Get entity context + recent messages."""
        messages = []
        if self.entities:
            entity_summary = self._summarize_entities()
            messages.append(Message(
                role="system",
                content=f"Known entities:\n{entity_summary}",
            ))
        messages.extend(self.recent_messages[-5:])
        return messages

    def _summarize_entities(self) -> str:
        """Summarize known entities."""
        lines = []
        for name, entity in self.entities.items():
            attrs = ", ".join(f"{k}: {v}" for k, v in entity.attributes.items())
            lines.append(f"- {name} ({entity.entity_type}): {attrs}")
        return "\n".join(lines)

    def clear(self):
        """Clear memory."""
        self.entities = {}
        self.recent_messages = []
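Entity extraction normally runs through the LLM, but the bookkeeping can be exercised directly via the internal _update_entity helper (used here purely for illustration, so no client is needed):

memory = EntityMemory(llm_client=None)  # llm unused when updating directly
memory._update_entity({
    "name": "Alice",
    "type": "person",
    "attributes": {"role": "engineer"},
    "relationships": [["works_at", "Acme"]],
})
memory._update_entity({"name": "Acme", "type": "organization"})
print(memory.get_entity("Alice").attributes)                   # {'role': 'engineer'}
print([e.name for e in memory.get_related_entities("Alice")])  # ['Acme']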
class KnowledgeGraphMemory(Memory):
    """Memory organized as a knowledge graph."""

    def __init__(self, llm_client: Any):
        self.llm = llm_client
        self.nodes: dict[str, dict] = {}  # node_id -> {type, attributes}
        self.edges: list[tuple[str, str, str]] = []  # (source, relation, target)
        self.recent_messages: list[Message] = []

    def add(self, message: Message):
        """Extract knowledge graph triples."""
        self.recent_messages.append(message)
        asyncio.create_task(self._extract_triples(message))

    async def _extract_triples(self, message: Message):
        """Extract knowledge graph triples using the LLM."""
        prompt = f"""Extract knowledge graph triples from:

"{message.content}"

Output as JSON:
{{
    "triples": [
        {{"subject": "...", "predicate": "...", "object": "..."}}
    ]
}}"""
        response = await self.llm.complete(prompt)
        try:
            data = json.loads(response.content)
            for triple in data.get("triples", []):
                self._add_triple(
                    triple["subject"],
                    triple["predicate"],
                    triple["object"],
                )
        except (json.JSONDecodeError, KeyError):
            pass  # Skip malformed extraction output

    def _add_triple(self, subject: str, predicate: str, obj: str):
        """Add a triple to the graph."""
        # Add nodes
        if subject not in self.nodes:
            self.nodes[subject] = {"type": "entity", "attributes": {}}
        if obj not in self.nodes:
            self.nodes[obj] = {"type": "entity", "attributes": {}}
        # Add edge
        self.edges.append((subject, predicate, obj))

    def query(
        self,
        subject: Optional[str] = None,
        predicate: Optional[str] = None,
        obj: Optional[str] = None,
    ) -> list[tuple]:
        """Query the knowledge graph, filtering on any combination of fields."""
        results = []
        for s, p, o in self.edges:
            if subject and s != subject:
                continue
            if predicate and p != predicate:
                continue
            if obj and o != obj:
                continue
            results.append((s, p, o))
        return results

    def get_context(self) -> list[Message]:
        """Get graph context."""
        messages = []
        if self.edges:
            graph_summary = self._summarize_graph()
            messages.append(Message(
                role="system",
                content=f"Known facts:\n{graph_summary}",
            ))
        messages.extend(self.recent_messages[-5:])
        return messages

    def _summarize_graph(self) -> str:
        """Summarize the knowledge graph."""
        lines = []
        for s, p, o in self.edges[-20:]:  # Last 20 facts
            lines.append(f"- {s} {p} {o}")
        return "\n".join(lines)

    def clear(self):
        """Clear graph."""
        self.nodes = {}
        self.edges = []
        self.recent_messages = []
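Querying works on any combination of subject, predicate, and object. Again adding triples directly via the internal helper for illustration:

graph = KnowledgeGraphMemory(llm_client=None)
graph._add_triple("Alice", "works_at", "Acme")
graph._add_triple("Alice", "manages", "Bob")
graph._add_triple("Bob", "works_at", "Acme")
print(graph.query(subject="Alice"))        # all facts about Alice
print(graph.query(predicate="works_at"))   # everyone's employer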
Hierarchical Memory Systems
import asyncio
from typing import Any, Optional


class HierarchicalMemory(Memory):
    """Multi-level memory system."""

    def __init__(
        self,
        llm_client: Any,
        embedding_model: Any,
        working_memory_size: int = 10,
        short_term_size: int = 100,
        long_term_size: int = 10000,
    ):
        self.llm = llm_client
        # Working memory (immediate context)
        self.working = BufferMemory(max_messages=working_memory_size)
        # Short-term memory (recent session)
        self.short_term = SummaryMemory(llm_client, buffer_size=short_term_size)
        # Long-term memory (persistent)
        self.long_term = VectorMemory(
            embedding_model=embedding_model,
            max_items=long_term_size,
        )
        # Consolidation threshold
        self.consolidation_threshold = 50
        self.messages_since_consolidation = 0

    def add(self, message: Message):
        """Add to all memory levels."""
        # Always add to working memory
        self.working.add(message)
        # Add to short-term
        self.short_term.add(message)
        # Periodically consolidate to long-term
        self.messages_since_consolidation += 1
        if self.messages_since_consolidation >= self.consolidation_threshold:
            asyncio.create_task(self._consolidate())

    async def _consolidate(self):
        """Consolidate important memories to long-term."""
        self.messages_since_consolidation = 0
        # Get summary from short-term
        context = self.short_term.get_context()
        # Extract important facts
        prompt = """Extract the most important facts and information from this conversation that should be remembered long-term:

{context}

List key facts, one per line:"""
        response = await self.llm.complete(prompt.format(
            context="\n".join(m.content for m in context)
        ))
        # Store each fact in long-term memory
        for line in response.content.split("\n"):
            if line.strip():
                self.long_term.add(Message(
                    role="system",
                    content=line.strip(),
                ))

    def get_context(self, query: Optional[str] = None) -> list[Message]:
        """Get hierarchical context."""
        messages = []
        # Long-term relevant memories
        if query:
            relevant = self.long_term.retrieve(query, k=3)
            if relevant:
                memory_text = "\n".join(item.content for item in relevant)
                messages.append(Message(
                    role="system",
                    content=f"Relevant long-term memories:\n{memory_text}",
                ))
        # Short-term summary
        short_term_context = self.short_term.get_context()
        if short_term_context:
            messages.extend(short_term_context[:1])  # Just the summary
        # Working memory (recent messages)
        messages.extend(self.working.get_context())
        return messages

    def clear(self):
        """Clear working and short-term memory. Long-term is persistent."""
        self.working.clear()
        self.short_term.clear()


class SessionMemory:
    """Memory scoped to sessions."""

    def __init__(self, llm_client: Any, embedding_model: Any):
        self.llm = llm_client
        self.embedder = embedding_model
        self.sessions: dict[str, HierarchicalMemory] = {}
        self.session_summaries: dict[str, str] = {}
        self.current_session: Optional[str] = None

    def start_session(self, session_id: str):
        """Start a new session."""
        self.current_session = session_id
        if session_id not in self.sessions:
            self.sessions[session_id] = HierarchicalMemory(
                self.llm,
                self.embedder,
            )

    def end_session(self, session_id: Optional[str] = None):
        """End session and create summary."""
        session_id = session_id or self.current_session
        if session_id in self.sessions:
            asyncio.create_task(self._summarize_session(session_id))

    async def _summarize_session(self, session_id: str):
        """Create session summary."""
        memory = self.sessions[session_id]
        context = memory.get_context()
        conversation = "\n".join(m.content for m in context)
        prompt = f"""Summarize this session:

{conversation}

Provide a brief summary of:
1. Main topics discussed
2. Key outcomes or decisions
3. Any action items or follow-ups"""
        response = await self.llm.complete(prompt)
        self.session_summaries[session_id] = response.content

    def add(self, message: Message):
        """Add to current session."""
        if self.current_session and self.current_session in self.sessions:
            self.sessions[self.current_session].add(message)

    def get_context(self, query: Optional[str] = None) -> list[Message]:
        """Get context from current session + relevant past sessions."""
        messages = []
        # Add recent past session summaries
        if self.session_summaries:
            summaries = "\n\n".join(
                f"Session {sid}: {summary}"
                for sid, summary in list(self.session_summaries.items())[-3:]
            )
            messages.append(Message(
                role="system",
                content=f"Previous session summaries:\n{summaries}",
            ))
        # Add current session context
        if self.current_session and self.current_session in self.sessions:
            messages.extend(self.sessions[self.current_session].get_context(query))
        return messages


class ReflectiveMemory(Memory):
    """Memory that reflects on and learns from experiences."""

    def __init__(self, llm_client: Any, embedding_model: Any):
        self.llm = llm_client
        self.base_memory = HierarchicalMemory(llm_client, embedding_model)
        self.reflections: list[str] = []
        self.lessons_learned: list[str] = []
        self.reflection_interval = 20
        self.message_count = 0

    def add(self, message: Message):
        """Add message and periodically reflect."""
        self.base_memory.add(message)
        self.message_count += 1
        if self.message_count % self.reflection_interval == 0:
            asyncio.create_task(self._reflect())

    async def _reflect(self):
        """Reflect on recent interactions."""
        context = self.base_memory.get_context()
        conversation = "\n".join(m.content for m in context)
        prompt = f"""Reflect on these recent interactions:

{conversation}

Consider:
1. What patterns do you notice?
2. What worked well?
3. What could be improved?
4. What lessons should be remembered?

Provide insights:"""
        response = await self.llm.complete(prompt)
        self.reflections.append(response.content)
        # Extract lessons
        lessons_prompt = f"""From this reflection, extract specific lessons to remember:

{response.content}

List lessons as actionable guidelines:"""
        lessons_response = await self.llm.complete(lessons_prompt)
        for line in lessons_response.content.split("\n"):
            if line.strip():
                self.lessons_learned.append(line.strip())

    def get_context(self, query: Optional[str] = None) -> list[Message]:
        """Get context with lessons learned."""
        messages = []
        # Add lessons learned
        if self.lessons_learned:
            lessons = "\n".join(self.lessons_learned[-10:])
            messages.append(Message(
                role="system",
                content=f"Lessons learned:\n{lessons}",
            ))
        # Add base memory context
        messages.extend(self.base_memory.get_context(query))
        return messages

    def clear(self):
        """Clear memory but keep lessons."""
        self.base_memory.clear()
        self.reflections = []
        # lessons_learned is intentionally retained
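A session lifecycle sketch, reusing the StubLLM and ToyEmbedder stand-ins from earlier (both hypothetical), driven from async code so the background summarization task can run:

async def session_demo():
    memory = SessionMemory(llm_client=StubLLM(), embedding_model=ToyEmbedder())
    memory.start_session("support-2024-06-01")
    memory.add(Message(role="user", content="My invoice total looks wrong."))
    memory.add(Message(role="assistant", content="Let me check that for you."))
    context = memory.get_context(query="billing issues")
    memory.end_session()    # schedules the session summary
    await asyncio.sleep(0)  # let the summarization task complete
    print(memory.session_summaries)

asyncio.run(session_demo())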
Production Memory Service
import uuid
from datetime import datetime
from typing import Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Reuses the Message and BufferMemory classes defined earlier.

app = FastAPI()


class MessageRequest(BaseModel):
    session_id: str
    role: str
    content: str


class ContextRequest(BaseModel):
    session_id: str
    query: Optional[str] = None
    max_messages: int = 10


class SessionCreate(BaseModel):
    user_id: str
    metadata: dict = {}


# In-memory storage; use a database or Redis in production
sessions: dict[str, dict] = {}
memories: dict[str, BufferMemory] = {}


@app.post("/v1/sessions")
async def create_session(request: SessionCreate) -> dict:
    """Create a new memory session."""
    session_id = str(uuid.uuid4())
    sessions[session_id] = {
        "user_id": request.user_id,
        "metadata": request.metadata,
        "created_at": datetime.now().isoformat(),
    }
    memories[session_id] = BufferMemory(max_messages=100)
    return {"session_id": session_id}


@app.post("/v1/messages")
async def add_message(request: MessageRequest) -> dict:
    """Add message to session memory."""
    if request.session_id not in memories:
        raise HTTPException(status_code=404, detail="Session not found")
    message = Message(role=request.role, content=request.content)
    memories[request.session_id].add(message)
    return {
        "status": "added",
        "message_count": len(memories[request.session_id].messages),
    }


@app.post("/v1/context")
async def get_context(request: ContextRequest) -> dict:
    """Get context for session."""
    if request.session_id not in memories:
        raise HTTPException(status_code=404, detail="Session not found")
    memory = memories[request.session_id]
    context = memory.get_context()
    # Limit messages
    context = context[-request.max_messages:]
    return {
        "messages": [m.to_dict() for m in context],
        "total_messages": len(memory.messages),
    }


@app.delete("/v1/sessions/{session_id}")
async def delete_session(session_id: str) -> dict:
    """Delete session and its memory."""
    if session_id in sessions:
        del sessions[session_id]
    if session_id in memories:
        del memories[session_id]
    return {"status": "deleted"}


@app.get("/v1/sessions/{session_id}")
async def get_session(session_id: str) -> dict:
    """Get session info."""
    if session_id not in sessions:
        raise HTTPException(status_code=404, detail="Session not found")
    return {
        **sessions[session_id],
        "message_count": len(memories.get(session_id, BufferMemory()).messages),
    }


@app.get("/health")
async def health():
    return {"status": "healthy"}
References
- LangChain Memory: https://python.langchain.com/docs/modules/memory/
- MemGPT: https://github.com/cpacker/MemGPT
- Generative Agents: https://arxiv.org/abs/2304.03442
- Mem0: https://github.com/mem0ai/mem0
Conclusion
Memory transforms LLMs from stateless responders into contextually aware assistants. Start with simple buffer memory for short conversations—it’s often sufficient and adds minimal complexity. Move to summary memory when conversations exceed context limits; a good summary preserves essential information while staying compact. Implement vector-based retrieval when you need to access relevant information from large histories; semantic search finds what matters regardless of when it was mentioned. Use entity memory when tracking facts about specific people, places, or things is important for your use case. For production systems, consider hierarchical memory that combines working memory (immediate context), short-term memory (session summary), and long-term memory (persistent knowledge).

The key insight is that different memory types serve different purposes: buffers for recency, summaries for compression, vectors for relevance, and entities for structured facts. Design your memory system based on what your application needs to remember and how it needs to access that information. Remember that memory adds latency and cost—every retrieval is an embedding call, every summary is an LLM call. Profile your memory system and optimize the hot paths.