RAG System Template
A complete, production-ready RAG (Retrieval-Augmented Generation) implementation. Load documents, embed them, retrieve relevant chunks, and answer questions with Claude.
Try It Live
This is the generate_answer step in isolation: the system prompt stands in for retrieved chunks, and the model answers only from them. Edit the context to see grounding and refusal behaviour.
RAG sandbox — grounded answering ● Live · Groq
Demo runs on Groq's free open models (rate-limited). Cost figures estimate what the same token counts would cost on the listed API models.
Quick Start
1. Install Dependencies
pip install anthropic python-dotenv chromadb

2. Set Up Environment
Create .env:
ANTHROPIC_API_KEY=your-key-here

3. Run the Example
python rag_system.py

Full Implementation
import osfrom typing import Optionalimport chromadbfrom anthropic import Anthropic
# Module-level clients shared by every RAGSystem instance.
# chromadb.Client() is the vector store handle; Anthropic() is the LLM client
# and expects ANTHROPIC_API_KEY to be available in the environment.
chroma_client = chromadb.Client()
client = Anthropic()
class RAGSystem:
    """Minimal retrieval-augmented generation pipeline on Chroma and Claude.

    Documents are stored in a Chroma collection configured for cosine
    distance, the closest chunks are retrieved for each question, and Claude
    is asked to answer from those chunks only.
    """

    # Instructions that keep the model grounded in the retrieved context.
    _SYSTEM_PROMPT = """You are a helpful assistant answering questions based on provided documents.
- Answer based ONLY on the provided context
- If the answer isn't in the context, say "I don't have that information"
- Cite which document you're referencing when relevant
- Be concise and direct"""

    def __init__(
        self,
        collection_name: str = "documents",
        model: str = "claude-sonnet-4-6",
    ):
        """
        Open (or create) the backing collection.

        Args:
            collection_name: Name of the Chroma collection to use
            model: Claude model identifier used when generating answers
        """
        self.collection = chroma_client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"},
        )
        self.model = model

    @staticmethod
    def _source_of(doc: dict) -> str:
        """Return the document's 'source' metadata, or 'Unknown' if absent."""
        return (doc.get("metadata") or {}).get("source", "Unknown")

    def add_documents(self, documents: list[dict]) -> None:
        """
        Add documents to the knowledge base.

        Args:
            documents: List of dicts with 'id', 'text' and optional 'metadata'
                Example:
                [
                    {
                        "id": "doc1",
                        "text": "The capital of France is Paris...",
                        "metadata": {"source": "geography.txt"}
                    }
                ]
        """
        # Chroma embeds each document with its built-in model (all-MiniLM-L6-v2).
        # For production-grade retrieval, pass your own embedding_function when
        # creating the collection (e.g. OpenAI/Cohere embeddings).
        if documents:
            # One batched call instead of one call per document. Missing or
            # empty metadata is sent as None because Chroma's validation
            # rejects an empty metadata dict.
            self.collection.add(
                ids=[doc["id"] for doc in documents],
                documents=[doc["text"] for doc in documents],
                metadatas=[doc.get("metadata") or None for doc in documents],
            )
        print(f"Added {len(documents)} documents")

    def retrieve(self, query: str, top_k: int = 3) -> list[dict]:
        """
        Retrieve relevant documents for a query.

        Args:
            query: User question
            top_k: Number of results to return

        Returns:
            List of dicts with 'text', 'metadata' and 'distance', in the
            order returned by the collection
        """
        results = self.collection.query(query_texts=[query], n_results=top_k)

        # The query was issued for a single text, so take the first (only)
        # result list of each field.
        texts = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]

        # Records stored without metadata come back as None; normalise to {}
        # so callers can always use .get() on it.
        return [
            {"text": text, "metadata": metadata or {}, "distance": distance}
            for text, metadata, distance in zip(texts, metadatas, distances)
        ]

    def generate_answer(self, query: str, context: list[dict]) -> str:
        """
        Generate an answer using retrieved context.

        Args:
            query: User question
            context: Retrieved documents from retrieve()

        Returns:
            Answer text from Claude (empty string if the response carries
            no text blocks)
        """
        # Format context for prompt: one labelled section per document.
        context_text = "\n\n".join(
            f"Document: {self._source_of(doc)}\n{doc['text']}" for doc in context
        )

        response = client.messages.create(
            model=self.model,
            max_tokens=1024,
            system=self._SYSTEM_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": f"Context:\n{context_text}\n\nQuestion: {query}",
                }
            ],
        )

        # Join every text block rather than indexing content[0], which fails
        # on an empty response or a non-text first block.
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )

    def answer_question(self, query: str, top_k: int = 3) -> dict:
        """
        End-to-end: retrieve documents and generate answer.

        Args:
            query: User question
            top_k: Number of documents to retrieve

        Returns:
            Dict with 'answer' and 'sources'
        """
        # Step 1: Retrieve relevant documents
        retrieved_docs = self.retrieve(query, top_k=top_k)
        if not retrieved_docs:
            # Nothing to ground an answer on, so skip the model call.
            return {"answer": "No relevant documents found.", "sources": []}

        # Step 2: Generate answer
        answer = self.generate_answer(query, retrieved_docs)

        # Step 3: Return with sources
        return {
            "answer": answer,
            "sources": [self._source_of(doc) for doc in retrieved_docs],
        }
def main():
    """Run the demo: index three sample documents, then answer three questions."""
    knowledge_base = RAGSystem(collection_name="my_knowledge_base")

    # (id, source file, text) for each sample document.
    samples = [
        (
            "doc1",
            "python_intro.txt",
            "Python is a high-level programming language known for its "
            "simplicity and readability. It was created by Guido van Rossum "
            "in 1991. Python supports multiple programming paradigms and has "
            "a comprehensive standard library.",
        ),
        (
            "doc2",
            "ml_basics.txt",
            "Machine learning is a subset of artificial intelligence that "
            "enables systems to learn and improve from experience without "
            "being explicitly programmed. Common ML algorithms include "
            "supervised learning, unsupervised learning, and reinforcement "
            "learning.",
        ),
        (
            "doc3",
            "claude_info.txt",
            "Claude is an AI assistant made by Anthropic. It can help with "
            "writing, analysis, coding, math, and creative projects. Claude "
            "uses Constitutional AI to be helpful, harmless, and honest.",
        ),
    ]
    knowledge_base.add_documents(
        [
            {"id": doc_id, "text": body, "metadata": {"source": source}}
            for doc_id, source, body in samples
        ]
    )

    # Ask one question per sample document and show the answer and its sources.
    for prompt in (
        "Who created Python?",
        "What are the main types of machine learning?",
        "What is Claude?",
    ):
        print(f"\nQuestion: {prompt}")
        outcome = knowledge_base.answer_question(prompt)
        print(f"Answer: {outcome['answer']}")
        print(f"Sources: {', '.join(outcome['sources'])}")
if __name__ == "__main__":
    main()

Customization Guide
Use a Real Vector Database (Production)
Replace Chroma with Pinecone:
from pinecone import Pineconefrom openai import OpenAI
# Clients for the embedding model and the vector store
# (Pinecone SDK v3+, OpenAI SDK v1+).
openai_client = OpenAI()
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("my-index")

# Embed one document with OpenAI. `text` (and `doc_id` below) are expected
# to be defined by the surrounding code.
embedding_response = openai_client.embeddings.create(
    model="text-embedding-3-small",
    input=text,
)
embedding = embedding_response.data[0].embedding

# Store the vector in Pinecone, keeping the raw text alongside it as metadata.
record = {"id": doc_id, "values": embedding, "metadata": {"text": text}}
index.upsert(vectors=[record])
# Query Pinecone
results = index.query(
    vector=query_embedding,
    top_k=3,
    include_metadata=True,
)

Add Conversation Memory
class ConversationalRAG(RAGSystem):
    """RAGSystem variant that also shows Claude the most recent conversation turns."""

    def __init__(self, collection_name: str = "documents"):
        super().__init__(collection_name)
        # One {"user": ..., "assistant": ...} dict per completed turn, oldest first.
        self.conversation_history = []

    def answer_with_memory(
        self, query: str, top_k: int = 3, history_turns: int = 5
    ) -> dict:
        """
        Answer a question using retrieved documents plus recent conversation.

        Args:
            query: User question
            top_k: Number of documents to retrieve
            history_turns: How many of the latest turns to include in the prompt

        Returns:
            Dict with 'answer' and 'sources'; the turn is appended to
            conversation_history before returning
        """
        # Build context from conversation history. A plain [-0:] slice would
        # return the whole list, so zero/negative windows are handled explicitly.
        recent_turns = (
            self.conversation_history[-history_turns:] if history_turns > 0 else []
        )
        history_context = "\n".join(
            f"User: {turn['user']}\nAssistant: {turn['assistant']}"
            for turn in recent_turns
        )

        # Retrieve documents
        retrieved = self.retrieve(query, top_k=top_k)

        # Same "Unknown" fallback as answer_question(), and tolerant of records
        # whose metadata is None, so callers can safely ', '.join() the sources.
        sources = [
            (doc.get("metadata") or {}).get("source", "Unknown") for doc in retrieved
        ]
        context_text = "\n\n".join(
            f"Document: {source}\n{doc['text']}"
            for source, doc in zip(sources, retrieved)
        )

        # Generate answer using both the documents and the conversation history
        system_prompt = (
            "You are a helpful assistant. Answer using the provided documents and "
            "the prior conversation. If the answer isn't in the documents, say so."
        )
        response = client.messages.create(
            model=self.model,
            max_tokens=1024,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": (
                        f"Conversation so far:\n{history_context}\n\n"
                        f"Documents:\n{context_text}\n\n"
                        f"New question: {query}"
                    ),
                }
            ],
        )
        answer = response.content[0].text

        # Store in history
        self.conversation_history.append({"user": query, "assistant": answer})

        return {"answer": answer, "sources": sources}


# Add Confidence Scoring
def answer_with_confidence(self, query: str, top_k: int = 3) -> dict:
    """
    Answer a question and report how confident the retrieval step was.

    Args:
        query: User question
        top_k: Number of documents to retrieve

    Returns:
        Dict with 'answer', 'retrieval_confidence' (float in [0, 1]),
        'llm_confidence' and 'sources'
    """
    retrieved = self.retrieve(query, top_k=top_k)

    if not retrieved:
        # Averaging over zero documents would divide by zero; report the
        # lowest confidence instead of asking the model to answer from nothing.
        return {
            "answer": "No relevant documents found.",
            "retrieval_confidence": 0.0,
            "llm_confidence": "low",
            "sources": [],
        }

    # Calculate retrieval confidence (lower distance = higher confidence).
    # The average is clamped to [0, 1] so the score stays in that range.
    avg_distance = sum(d["distance"] for d in retrieved) / len(retrieved)
    retrieval_confidence = 1 - min(max(avg_distance, 0), 1)

    answer = self.generate_answer(query, retrieved)

    # Placeholder: a real implementation could parse this from the answer.
    llm_confidence = "high"

    return {
        "answer": answer,
        "retrieval_confidence": retrieval_confidence,
        "llm_confidence": llm_confidence,
        # Fall back to "Unknown" when a document has no 'source' metadata.
        "sources": [
            (d.get("metadata") or {}).get("source", "Unknown") for d in retrieved
        ],
    }


# Testing Locally
# Test with different queries
python rag_system.py
# With custom documents
python -c "
from rag_system import RAGSystem
rag = RAGSystem()
rag.add_documents([
    {
        'id': 'custom1',
        'text': 'Your custom document here...',
        'metadata': {'source': 'custom.txt'}
    }
])
result = rag.answer_question('Your question here?')
print(result)
"

Common Issues
| Issue | Solution |
|---|---|
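| "Chroma database not found" | No setup needed: `chromadb.Client()` creates an in-memory database automatically on each run |
| "API key not found" | Check the .env file and ANTHROPIC_API_KEY. The script as written never loads .env, so either export the variable in your shell or call `load_dotenv()` (from python-dotenv) before creating the client |
| "No relevant documents" | Add more documents or adjust retrieval top_k |
| "Answer seems generic" | Ensure documents are specific and well-chunked |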
Next Steps
- Load real documents: Replace sample docs with PDFs, web content, etc.
- Switch to production vector DB: Use Pinecone, Weaviate, or Qdrant
- Add reranking: Use a cross-encoder to rerank top-10 to top-3
- Monitor quality: Log retrieval quality and user feedback
- Fine-tune if needed: Train Claude on your domain if accuracy is critical
See Also:
- RAG Architecture - Technical deep dive
- Case Study: Customer Support - RAG in production