Introduction: LLMs have no inherent memory—each API call is stateless. The model doesn’t remember your previous conversation, your user’s preferences, or the context you established five messages ago. Memory is something you build on top. This guide covers implementing different memory strategies for LLM applications: buffer memory for recent context, summary memory for long conversations, entity memory for tracking key facts, and vector memory for semantic retrieval. Understanding when to use each approach—and how to combine them—is essential for building conversational AI that feels coherent across extended interactions.
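To make that statelessness concrete: the model only sees whatever you put in the messages array on each call, so every memory strategy below ultimately reduces to deciding what to re-send. A minimal illustration using the OpenAI client that the rest of this guide builds on:

from openai import OpenAI

client = OpenAI()

history = [{"role": "user", "content": "My name is Sam."}]
reply = client.chat.completions.create(model="gpt-4o-mini", messages=history)
history.append({"role": "assistant", "content": reply.choices[0].message.content})

# The model only "remembers" Sam because we re-send the earlier turns:
history.append({"role": "user", "content": "What's my name?"})
reply = client.chat.completions.create(model="gpt-4o-mini", messages=history)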

Buffer Memory
from dataclasses import dataclass, field
from datetime import datetime

@dataclass
class Message:
    role: str  # "user", "assistant", "system"
    content: str
    timestamp: datetime = field(default_factory=datetime.now)
    metadata: dict = field(default_factory=dict)

class BufferMemory:
    """Simple buffer memory that keeps the last N messages."""

    def __init__(self, max_messages: int = 20):
        self.messages: list[Message] = []
        self.max_messages = max_messages

    def add(self, role: str, content: str, **metadata):
        """Add a message to memory."""
        self.messages.append(Message(role=role, content=content, metadata=metadata))
        self._trim()

    def _trim(self):
        """Trim to the maximum message count."""
        if len(self.messages) > self.max_messages:
            self.messages = self.messages[-self.max_messages:]

    def get_messages(self) -> list[dict]:
        """Get messages in LLM context format."""
        return [{"role": m.role, "content": m.content} for m in self.messages]

    def clear(self):
        """Clear all messages."""
        self.messages = []

class SlidingWindowMemory(BufferMemory):
    """Buffer memory with a token-aware sliding window."""

    def __init__(self, max_tokens: int = 4000, max_messages: int = 100):
        super().__init__(max_messages=max_messages)
        self.max_tokens = max_tokens

    def _estimate_tokens(self, text: str) -> int:
        """Rough token estimate (~4 characters per token)."""
        return len(text) // 4

    def get_messages(self) -> list[dict]:
        """Get as many recent messages as fit in the token budget."""
        result = []
        total_tokens = 0
        # Work backwards from the most recent message
        for msg in reversed(self.messages):
            msg_tokens = self._estimate_tokens(msg.content)
            if total_tokens + msg_tokens > self.max_tokens:
                break
            result.insert(0, {"role": msg.role, "content": msg.content})
            total_tokens += msg_tokens
        return result

# Usage
memory = SlidingWindowMemory(max_tokens=4000)
memory.add("user", "What is Python?")
memory.add("assistant", "Python is a high-level programming language...")
memory.add("user", "What are its main features?")

# Get context for the next LLM call
context = memory.get_messages()
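The 4-characters-per-token heuristic is deliberately rough and drifts for code, non-English text, and numbers. If you can take a dependency, tiktoken counts exactly; a sketch, assuming tiktoken is installed:

import tiktoken

class TiktokenWindowMemory(SlidingWindowMemory):
    """Sliding window that counts tokens exactly with tiktoken."""

    def __init__(self, max_tokens: int = 4000, encoding: str = "cl100k_base"):
        super().__init__(max_tokens=max_tokens)
        self._enc = tiktoken.get_encoding(encoding)

    def _estimate_tokens(self, text: str) -> int:
        return len(self._enc.encode(text))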
Summary Memory
from openai import OpenAI

client = OpenAI()

class SummaryMemory:
    """Memory that maintains a running summary of the conversation."""

    def __init__(self, summary_threshold: int = 10):
        self.messages: list[Message] = []
        self.summary: str = ""
        self.summary_threshold = summary_threshold
        self.messages_since_summary = 0

    def add(self, role: str, content: str):
        """Add a message and summarize once the threshold is reached."""
        self.messages.append(Message(role=role, content=content))
        self.messages_since_summary += 1
        if self.messages_since_summary >= self.summary_threshold:
            self._update_summary()

    def _update_summary(self):
        """Fold older messages into the running summary."""
        to_summarize = self.messages[:-2]  # Keep the last 2 messages verbatim
        if not to_summarize:
            return
        conversation_text = "\n".join(
            f"{m.role}: {m.content}" for m in to_summarize
        )
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "Summarize this conversation concisely, preserving key facts, decisions, and context needed for continuation."
                },
                {
                    "role": "user",
                    "content": f"Previous summary:\n{self.summary}\n\nNew conversation:\n{conversation_text}"
                }
            ],
            max_tokens=500
        )
        self.summary = response.choices[0].message.content
        # Keep only the recent messages
        self.messages = self.messages[-2:]
        self.messages_since_summary = 0

    def get_context(self) -> list[dict]:
        """Get LLM context: the summary plus recent verbatim messages."""
        context = []
        if self.summary:
            context.append({
                "role": "system",
                "content": f"Conversation summary:\n{self.summary}"
            })
        context.extend(
            {"role": m.role, "content": m.content} for m in self.messages
        )
        return context
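Usage mirrors the buffer classes: write every turn through add() and send get_context() as the message list. A minimal sketch (the conversation content is illustrative):

memory = SummaryMemory(summary_threshold=10)
memory.add("user", "Plan a three-day trip to Kyoto.")
memory.add("assistant", "Day 1: Fushimi Inari at dawn, then ...")
# ...after 10 messages, older turns are folded into memory.summary

memory.add("user", "Swap day 2 and day 3.")
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=memory.get_context()
)
memory.add("assistant", response.choices[0].message.content)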
# Progressive summarization for very long conversations
class ProgressiveSummaryMemory:
    """Multi-level summary memory for extended conversations."""

    def __init__(self):
        self.recent_messages: list[Message] = []  # Last 5 messages, verbatim
        self.short_summary: str = ""  # Summary of the last ~20 messages
        self.long_summary: str = ""  # Summary of the entire conversation
        self.message_count = 0

    def add(self, role: str, content: str):
        """Add a message with progressive summarization."""
        self.recent_messages.append(Message(role=role, content=content))
        self.message_count += 1
        # Keep the 5 most recent messages; fold overflow into the short summary
        if len(self.recent_messages) > 5:
            overflow = self.recent_messages[:-5]
            self.recent_messages = self.recent_messages[-5:]
            self._update_short_summary(overflow)
        # Periodically compress into the long summary
        if self.message_count % 50 == 0:
            self._update_long_summary()

    def _update_short_summary(self, messages: list[Message]):
        """Update the short-term summary."""
        text = "\n".join(f"{m.role}: {m.content}" for m in messages)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "Update this summary with new messages. Be concise."},
                {"role": "user", "content": f"Current summary:\n{self.short_summary}\n\nNew messages:\n{text}"}
            ],
            max_tokens=300
        )
        self.short_summary = response.choices[0].message.content

    def _update_long_summary(self):
        """Compress the short summary into the long summary."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "Merge these summaries into one comprehensive summary."},
                {"role": "user", "content": f"Long-term summary:\n{self.long_summary}\n\nRecent summary:\n{self.short_summary}"}
            ],
            max_tokens=500
        )
        self.long_summary = response.choices[0].message.content
        self.short_summary = ""  # Reset the short summary
Entity Memory
import json

@dataclass
class Entity:
    name: str
    type: str  # "person", "organization", "concept", etc.
    facts: list[str]
    last_mentioned: datetime

class EntityMemory:
    """Memory that tracks entities and facts about them."""

    def __init__(self):
        self.entities: dict[str, Entity] = {}

    def extract_entities(self, text: str) -> list[dict]:
        """Extract entities from text using an LLM."""
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Extract entities and facts from the text.\n"
                        'Return JSON: {"entities": [{"name": "...", "type": "person|org|concept|product", "facts": ["fact1", "fact2"]}]}'
                    )
                },
                {"role": "user", "content": text}
            ],
            response_format={"type": "json_object"}
        )
        result = json.loads(response.choices[0].message.content)
        return result.get("entities", [])

    def update(self, text: str):
        """Update entity memory from new text."""
        for entity_data in self.extract_entities(text):
            name = entity_data["name"].lower()
            if name in self.entities:
                # Update the existing entity
                entity = self.entities[name]
                entity.facts.extend(entity_data["facts"])
                # Deduplicate preserving order; keep the 10 most recent facts
                entity.facts = list(dict.fromkeys(entity.facts))[-10:]
                entity.last_mentioned = datetime.now()
            else:
                # Create a new entity
                self.entities[name] = Entity(
                    name=entity_data["name"],
                    type=entity_data["type"],
                    facts=entity_data["facts"],
                    last_mentioned=datetime.now()
                )

    def get_relevant_entities(self, query: str, top_k: int = 5) -> list[Entity]:
        """Get entities relevant to the query."""
        # Simple keyword matching (embeddings would match better)
        query_lower = query.lower()
        query_words = {w for w in query_lower.split() if len(w) > 3}
        relevant = []
        for name, entity in self.entities.items():
            # Relevant if the entity name appears in the query,
            # or a substantive query word appears in one of its facts
            if name in query_lower or any(
                w in fact.lower() for fact in entity.facts for w in query_words
            ):
                relevant.append(entity)
        # Most recently mentioned first
        relevant.sort(key=lambda e: e.last_mentioned, reverse=True)
        return relevant[:top_k]

    def get_context_string(self, query: str) -> str:
        """Get entity context as a string for the LLM."""
        entities = self.get_relevant_entities(query)
        if not entities:
            return ""
        lines = ["Known information:"]
        for entity in entities:
            facts_str = "; ".join(entity.facts[:5])
            lines.append(f"- {entity.name} ({entity.type}): {facts_str}")
        return "\n".join(lines)

# Combined memory with entities
class ConversationWithEntities:
    """Conversation memory that also tracks entities."""

    def __init__(self):
        self.buffer = SlidingWindowMemory(max_tokens=3000)
        self.entities = EntityMemory()

    def add(self, role: str, content: str):
        """Add a message and extract entities from it."""
        self.buffer.add(role, content)
        self.entities.update(content)

    def get_context(self, current_query: str) -> list[dict]:
        """Get full context: entity facts plus conversation history."""
        context = []
        entity_context = self.entities.get_context_string(current_query)
        if entity_context:
            context.append({"role": "system", "content": entity_context})
        context.extend(self.buffer.get_messages())
        return context
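Wiring it together looks like this (the names and facts are illustrative, and note that every add() now costs an extra extraction call):

convo = ConversationWithEntities()
convo.add("user", "I'm Maya and I run a bakery called Crumb & Co in Portland.")
convo.add("assistant", "Nice to meet you, Maya! How can I help with Crumb & Co?")

# Later, stored entity facts are injected ahead of the recent history
context = convo.get_context("What should Maya bake next?")
# context[0] is a system message along the lines of:
#   Known information:
#   - maya (person): runs a bakery called Crumb & Co; based in Portland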
Vector Memory
import numpy as np

class VectorMemory:
    """Semantic memory using vector embeddings."""

    def __init__(self, model: str = "text-embedding-3-small"):
        self.model = model
        # Each entry: {"text": ..., "embedding": ..., "metadata": ..., "timestamp": ...}
        self.memories: list[dict] = []

    def _embed(self, text: str) -> list[float]:
        """Get the embedding for a text."""
        response = client.embeddings.create(model=self.model, input=text)
        return response.data[0].embedding

    def add(self, text: str, metadata: dict | None = None):
        """Add a text to memory."""
        self.memories.append({
            "text": text,
            "embedding": self._embed(text),
            "metadata": metadata or {},
            "timestamp": datetime.now()
        })

    def search(self, query: str, top_k: int = 5, threshold: float = 0.7) -> list[dict]:
        """Find memories semantically similar to the query."""
        if not self.memories:
            return []
        query_embedding = self._embed(query)
        # Score every stored memory against the query
        results = []
        for memory in self.memories:
            similarity = self._cosine_similarity(query_embedding, memory["embedding"])
            if similarity >= threshold:
                results.append({
                    "text": memory["text"],
                    "similarity": similarity,
                    "metadata": memory["metadata"]
                })
        # Highest similarity first
        results.sort(key=lambda x: x["similarity"], reverse=True)
        return results[:top_k]

    def _cosine_similarity(self, a: list[float], b: list[float]) -> float:
        """Cosine similarity between two vectors."""
        a, b = np.array(a), np.array(b)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
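A quick usage sketch (the stored strings are illustrative; each add() and search() makes one embedding API call, and similarity scales are model-dependent, so the threshold usually needs tuning):

vector = VectorMemory()
vector.add("user: I deploy to a Raspberry Pi 4 with 4 GB of RAM")
vector.add("assistant: On that hardware, prefer small quantized models.")

# Hundreds of messages later, long after the buffer has scrolled past it:
hits = vector.search("what are my hardware constraints?", top_k=3, threshold=0.3)
for hit in hits:
    print(f'{hit["similarity"]:.2f}  {hit["text"]}')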
# Hybrid memory combining all approaches
class HybridMemory:
    """Combines buffer, summary, entity, and vector memory."""

    def __init__(self):
        self.buffer = SlidingWindowMemory(max_tokens=2000)
        self.summary = SummaryMemory(summary_threshold=15)
        self.entities = EntityMemory()
        self.vector = VectorMemory()

    def add(self, role: str, content: str):
        """Add a message to all memory systems."""
        self.buffer.add(role, content)    # Recent context
        self.summary.add(role, content)   # Long-term compression
        self.entities.update(content)     # Entity extraction
        self.vector.add(f"{role}: {content}", {"role": role})  # Semantic search

    def get_context(self, current_query: str, max_tokens: int = 4000) -> list[dict]:
        """Assemble context from all memory systems within a token budget."""
        context = []
        token_budget = max_tokens

        # 1. The summary, if one exists (the most compressed view)
        if self.summary.summary:
            summary_text = f"Conversation summary:\n{self.summary.summary}"
            context.append({"role": "system", "content": summary_text})
            token_budget -= len(summary_text) // 4

        # 2. Relevant entity facts
        entity_context = self.entities.get_context_string(current_query)
        if entity_context:
            context.append({"role": "system", "content": entity_context})
            token_budget -= len(entity_context) // 4

        # 3. Semantically relevant past messages
        relevant = self.vector.search(current_query, top_k=3)
        if relevant:
            relevant_text = "Relevant past context:\n" + "\n".join(r["text"] for r in relevant)
            context.append({"role": "system", "content": relevant_text})
            token_budget -= len(relevant_text) // 4

        # 4. Recent buffer messages, filled newest-first so the freshest
        #    turns survive when the remaining budget is tight
        recent = []
        for msg in reversed(self.buffer.get_messages()):
            msg_tokens = len(msg["content"]) // 4
            if msg_tokens > token_budget:
                break
            recent.insert(0, msg)
            token_budget -= msg_tokens
        context.extend(recent)
        return context
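A full turn with HybridMemory then looks like this (a sketch; chat_turn is our wrapper name, and note that each turn now adds an extraction call and an embedding call on top of the completion itself):

memory = HybridMemory()

def chat_turn(user_input: str) -> str:
    """Store the user message, assemble context, call the model, store the reply."""
    memory.add("user", user_input)
    messages = memory.get_context(user_input)
    response = client.chat.completions.create(model="gpt-4o-mini", messages=messages)
    reply = response.choices[0].message.content
    memory.add("assistant", reply)
    return reply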
References
- LangChain Memory: https://python.langchain.com/docs/modules/memory/
- LlamaIndex Chat Memory: https://docs.llamaindex.ai/en/stable/module_guides/deploying/chat_engines/
- Mem0: https://github.com/mem0ai/mem0
Conclusion
Memory transforms stateless LLM calls into coherent conversations. Buffer memory handles short interactions but fails for long conversations. Summary memory compresses history but loses detail. Entity memory tracks facts but misses conversational flow. Vector memory enables semantic retrieval but adds latency. The best approach combines multiple strategies: buffer for immediate context, summary for compression, entities for key facts, and vectors for semantic search. Start simple with buffer memory, add summarization when conversations exceed context limits, and layer in entity and vector memory as your application matures. Remember that memory adds cost (summarization calls, embedding calls) and latency—profile your application to find the right balance between context richness and performance.