Skip to content

RAG System Template

📖 7 min read · resources · templates · rag
Complete working example of retrieval-augmented generation with Claude
Key Takeaways
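  • Complete RAG implementation with Chroma DB and its built-in embeddings (reranking is listed under Next Steps)
  • Ingests plain-text documents out of the box; PDF and web loading are left as a next step
  • Small, readable starting point — add error handling and caching before production use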

A complete, production-ready RAG (Retrieval-Augmented Generation) implementation. Load documents, embed them, retrieve relevant chunks, and answer questions with Claude.

Try It Live

This is the generate_answer step in isolation: the system prompt stands in for retrieved chunks, and the model answers only from them. Edit the context to see grounding and refusal behaviour.

RAG sandbox — grounded answering ● Live · Groq

Demo runs on Groq's free open models (rate-limited). Cost figures estimate what the same token counts would cost on the listed API models.


Quick Start

1. Install Dependencies

Terminal window
pip install anthropic python-dotenv chromadb

2. Set Up Environment

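Create .env (the example script below does not load this file itself — call load_dotenv() from python-dotenv at startup, or export the variable in your shell):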

ANTHROPIC_API_KEY=your-key-here

3. Run the Example

Terminal window
python rag_system.py

Full Implementation

import os
from typing import Optional
import chromadb
from anthropic import Anthropic
# Initialize clients once at import time; RAGSystem uses both as module-level globals.
# NOTE(review): Anthropic() is given no api_key here, so it presumably picks up
# ANTHROPIC_API_KEY from the process environment — confirm it is set before import.
client = Anthropic()
# NOTE(review): chromadb.Client() is presumably the in-memory (non-persistent)
# client, so documents would need to be re-added on every run — confirm.
chroma_client = chromadb.Client()
class RAGSystem:
    """Minimal retrieval-augmented generation pipeline over a Chroma collection.

    Documents are stored in (and embedded by) a Chroma collection. A question is
    answered by retrieving the closest documents and passing them to Claude as
    context.

    Relies on the module-level ``client`` (Anthropic) and ``chroma_client``
    (Chroma) objects.
    """

    _SYSTEM_PROMPT = (
        "You are a helpful assistant answering questions based on provided documents.\n"
        "- Answer based ONLY on the provided context\n"
        "- If the answer isn't in the context, say \"I don't have that information\"\n"
        "- Cite which document you're referencing when relevant\n"
        "- Be concise and direct"
    )

    def __init__(
        self,
        collection_name: str = "documents",
        model: str = "claude-sonnet-4-6",
    ):
        """Open (or create) the collection and remember which model to call.

        Args:
            collection_name: Name of the Chroma collection to use.
            model: Anthropic model identifier used by generate_answer().
        """
        self.collection = chroma_client.get_or_create_collection(
            name=collection_name,
            # Configure the index for cosine distance.
            metadata={"hnsw:space": "cosine"},
        )
        self.model = model

    def add_documents(self, documents: list[dict], batch_size: int = 100) -> None:
        """
        Add documents to the knowledge base.

        Args:
            documents: List of dicts with 'id', 'text' and optional 'metadata'
                Example: [
                    {
                        "id": "doc1",
                        "text": "The capital of France is Paris...",
                        "metadata": {"source": "geography.txt"}
                    }
                ]
            batch_size: Maximum number of documents sent to Chroma per call.

        Raises:
            ValueError: If batch_size is smaller than 1.
            KeyError: If a document lacks 'id' or 'text'.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # Chroma embeds each document with its built-in model (all-MiniLM-L6-v2).
        # For production-grade retrieval, pass your own embedding_function when
        # creating the collection (e.g. OpenAI/Cohere embeddings).
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            # Chroma's metadata validation rejects an empty dict, so a missing
            # or empty 'metadata' is sent as None instead of {}.
            metadatas = [doc.get("metadata") or None for doc in batch]
            self.collection.add(
                ids=[doc["id"] for doc in batch],
                documents=[doc["text"] for doc in batch],
                metadatas=metadatas if any(metadatas) else None,
            )
        print(f"Added {len(documents)} documents")

    def retrieve(self, query: str, top_k: int = 3) -> list[dict]:
        """
        Retrieve relevant documents for a query.

        Args:
            query: User question
            top_k: Maximum number of results to return

        Returns:
            List of dicts with 'text', 'metadata' (always a dict) and
            'distance', in the order Chroma returned them. Empty when the
            collection holds no documents or top_k is below 1.
        """
        # Never ask for more results than are stored (or fewer than one):
        # depending on the Chroma release that either warns or raises.
        n_results = min(top_k, self.collection.count())
        if n_results < 1:
            return []

        results = self.collection.query(
            query_texts=[query],
            n_results=n_results,
        )
        # One query text was sent, so only the first result row is relevant.
        texts = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]
        return [
            # Documents stored without metadata come back as None; normalise
            # so callers can always use .get() on it.
            {"text": text, "metadata": metadata or {}, "distance": distance}
            for text, metadata, distance in zip(texts, metadatas, distances)
        ]

    def generate_answer(self, query: str, context: list[dict]) -> str:
        """
        Generate an answer using retrieved context.

        Args:
            query: User question
            context: Retrieved documents from retrieve()

        Returns:
            Answer from Claude (the text blocks of the response, concatenated)
        """
        # Format context for prompt
        context_text = "\n\n".join(
            f"Document: {(doc.get('metadata') or {}).get('source', 'Unknown')}\n{doc['text']}"
            for doc in context
        )
        response = client.messages.create(
            model=self.model,
            max_tokens=1024,
            system=self._SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": f"Context:\n{context_text}\n\nQuestion: {query}",
                }
            ],
        )
        # Do not assume the first content block exists or is a text block.
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )

    def answer_question(self, query: str, top_k: int = 3) -> dict:
        """
        End-to-end: retrieve documents and generate answer.

        Args:
            query: User question
            top_k: Number of documents to retrieve

        Returns:
            Dict with 'answer' and 'sources' (one source label per retrieved
            document, 'Unknown' when a document has no source)
        """
        # Step 1: Retrieve relevant documents
        retrieved_docs = self.retrieve(query, top_k=top_k)
        if not retrieved_docs:
            return {"answer": "No relevant documents found.", "sources": []}

        # Step 2: Generate answer
        answer = self.generate_answer(query, retrieved_docs)

        # Step 3: Return with sources
        return {
            "answer": answer,
            "sources": [
                (doc.get("metadata") or {}).get("source", "Unknown")
                for doc in retrieved_docs
            ],
        }
def main():
    """Run the demo: index three sample documents, then answer three questions."""
    knowledge_base = RAGSystem(collection_name="my_knowledge_base")

    # (id, source file, text) for each sample document.
    samples = [
        (
            "doc1",
            "python_intro.txt",
            "Python is a high-level programming language known for its simplicity\n"
            "and readability. It was created by Guido van Rossum in 1991. Python supports\n"
            "multiple programming paradigms and has a comprehensive standard library.",
        ),
        (
            "doc2",
            "ml_basics.txt",
            "Machine learning is a subset of artificial intelligence that enables\n"
            "systems to learn and improve from experience without being explicitly programmed.\n"
            "Common ML algorithms include supervised learning, unsupervised learning, and\n"
            "reinforcement learning.",
        ),
        (
            "doc3",
            "claude_info.txt",
            "Claude is an AI assistant made by Anthropic. It can help with writing,\n"
            "analysis, coding, math, and creative projects. Claude uses Constitutional AI\n"
            "to be helpful, harmless, and honest.",
        ),
    ]
    knowledge_base.add_documents([
        {"id": doc_id, "text": body, "metadata": {"source": origin}}
        for doc_id, origin, body in samples
    ])

    # Ask each example question and show the answer with its sources.
    for prompt in (
        "Who created Python?",
        "What are the main types of machine learning?",
        "What is Claude?",
    ):
        print(f"\nQuestion: {prompt}")
        outcome = knowledge_base.answer_question(prompt)
        print(f"Answer: {outcome['answer']}")
        print(f"Sources: {', '.join(outcome['sources'])}")


if __name__ == "__main__":
    main()

Customization Guide

Use a Real Vector Database (Production)

Replace Chroma with Pinecone:

from pinecone import Pinecone
from openai import OpenAI
# Initialize clients (Pinecone SDK v3+, OpenAI SDK v1+).
# `text`, `doc_id` and `query` are placeholders for your own values; `os` must
# be imported for the API-key lookup.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("my-index")
openai_client = OpenAI()
EMBEDDING_MODEL = "text-embedding-3-small"

# Embed a document with OpenAI
resp = openai_client.embeddings.create(
    input=text,
    model=EMBEDDING_MODEL,
)
embedding = resp.data[0].embedding

# Store in Pinecone
index.upsert(
    vectors=[
        {"id": doc_id, "values": embedding, "metadata": {"text": text}}
    ]
)

# Embed the query with the SAME model used for the documents; vectors from
# different embedding models are not comparable.
query_resp = openai_client.embeddings.create(
    input=query,
    model=EMBEDDING_MODEL,
)
query_embedding = query_resp.data[0].embedding

# Query Pinecone
results = index.query(
    vector=query_embedding,
    top_k=3,
    include_metadata=True,
)

Add Conversation Memory

class ConversationalRAG(RAGSystem):
    """RAGSystem variant that feeds recent conversation turns back into the prompt."""

    def __init__(self, collection_name: str = "documents"):
        """Set up the underlying RAGSystem and start with an empty history.

        Args:
            collection_name: Name of the Chroma collection to use.
        """
        super().__init__(collection_name)
        # One {"user": ..., "assistant": ...} dict per completed turn, oldest first.
        self.conversation_history = []

    def answer_with_memory(self, query: str, top_k: int = 3) -> dict:
        """Answer a question using retrieved documents plus the last five turns.

        Args:
            query: User question
            top_k: Number of documents to retrieve

        Returns:
            Dict with 'answer' and 'sources' (one source label per retrieved
            document, 'Unknown' when a document has no source)
        """
        # Build context from conversation history
        history_context = "\n".join(
            f"User: {msg['user']}\nAssistant: {msg['assistant']}"
            for msg in self.conversation_history[-5:]  # Last 5 turns
        )

        # Retrieve documents
        retrieved = self.retrieve(query, top_k=top_k)
        # Resolve each label once so the prompt and the returned sources agree,
        # and so a document without metadata or source yields "Unknown"
        # instead of None (which would break ", ".join(sources)).
        sources = [
            (d.get("metadata") or {}).get("source", "Unknown")
            for d in retrieved
        ]
        context_text = "\n\n".join(
            f"Document: {source}\n{d['text']}"
            for source, d in zip(sources, retrieved)
        )

        # Generate answer using both the documents and the conversation history
        system_prompt = (
            "You are a helpful assistant. Answer using the provided documents and "
            "the prior conversation. If the answer isn't in the documents, say so."
        )
        response = client.messages.create(
            model=self.model,
            max_tokens=1024,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": (
                        f"Conversation so far:\n{history_context}\n\n"
                        f"Documents:\n{context_text}\n\n"
                        f"New question: {query}"
                    ),
                }
            ],
        )
        answer = response.content[0].text

        # Store in history
        self.conversation_history.append({"user": query, "assistant": answer})
        return {"answer": answer, "sources": sources}

Add Confidence Scoring

def answer_with_confidence(self, query: str) -> dict:
    """Answer a question and report how close the retrieved documents were.

    Args:
        query: User question

    Returns:
        Dict with 'answer', 'retrieval_confidence' (float in [0, 1]; higher
        means the retrieved documents were closer to the query),
        'llm_confidence' and 'sources'.
    """
    retrieved = self.retrieve(query, top_k=3)
    if not retrieved:
        # Nothing to average and nothing to ground an answer on; avoid the
        # division by zero below and skip the model call.
        return {
            "answer": "No relevant documents found.",
            "retrieval_confidence": 0.0,
            "llm_confidence": "low",
            "sources": [],
        }

    # Calculate retrieval confidence (lower distance = higher confidence),
    # clamped so the score always stays within [0, 1].
    avg_distance = sum(d["distance"] for d in retrieved) / len(retrieved)
    retrieval_confidence = 1 - max(0.0, min(avg_distance, 1.0))

    answer = self.generate_answer(query, retrieved)

    # Placeholder: a real implementation could parse this from the answer.
    llm_confidence = "high"

    return {
        "answer": answer,
        "retrieval_confidence": retrieval_confidence,
        "llm_confidence": llm_confidence,
        "sources": [
            (d.get("metadata") or {}).get("source", "Unknown")
            for d in retrieved
        ],
    }

Testing Locally

Terminal window
# Run the built-in demo (the sample documents and questions defined in main())
python rag_system.py
# Try your own documents and question without editing the file
python -c "
from rag_system import RAGSystem
rag = RAGSystem()
rag.add_documents([
{
'id': 'custom1',
'text': 'Your custom document here...',
'metadata': {'source': 'custom.txt'}
}
])
result = rag.answer_question('Your question here?')
print(result)
"

Common Issues

Issue | Solution
"Chroma database not found" | Database created automatically on first run
"API key not found" | Check .env file and ANTHROPIC_API_KEY
"No relevant documents" | Add more documents or adjust retrieval top_k
"Answer seems generic" | Ensure documents are specific and well-chunked

Next Steps

  1. Load real documents: Replace sample docs with PDFs, web content, etc.
  2. Switch to production vector DB: Use Pinecone, Weaviate, or Qdrant
  3. Add reranking: Use a cross-encoder to rerank top-10 to top-3
  4. Monitor quality: Log retrieval quality and user feedback
  5. Fine-tune if needed: Train Claude on your domain if accuracy is critical

See Also: