RAG System Template

📖 7 min read · resources / templates / rag

Complete working example of retrieval-augmented generation with Claude

Key Takeaways

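Complete RAG implementation with Chroma DB and its built-in embeddings (reranking is covered under Next Steps)
Ingests plain-text documents out of the box; PDF and web ingestion are left as Next Steps
A starting point for production: add error handling and caching before you deploy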

A complete, production-ready RAG (Retrieval-Augmented Generation) implementation. Load documents, embed them, retrieve relevant chunks, and answer questions with Claude.

Try It Live

This is the generate_answer step in isolation: the system prompt stands in for retrieved chunks, and the model answers only from them. Edit the context to see grounding and refusal behaviour.

RAG sandbox — grounded answering ● Live · Groq

System prompt (optional) Prompt

Demo runs on Groq's free open models (rate-limited). Cost figures estimate what the same token counts would cost on the listed API models.

Quick Start

1. Install Dependencies

pip install anthropic python-dotenv chromadb

2. Set Up Environment

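Create .env (the script below does not load this file by itself — either export the variable in your shell, or add `from dotenv import load_dotenv; load_dotenv()` at the top of rag_system.py):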

ANTHROPIC_API_KEY=your-key-here

3. Run the Example

python rag_system.py

Full Implementation

import os
from typing import Optional
import chromadb
from anthropic import Anthropic

# Initialize clients
# Anthropic() is called with no arguments, so the API key is expected to come
# from the environment (ANTHROPIC_API_KEY, per the setup steps). Nothing in
# this script loads a .env file.
client = Anthropic()
# chromadb.Client() is created with no persistence settings; presumably this
# is an in-memory store that starts empty on every run — confirm against the
# Chroma docs before relying on data surviving a restart.
chroma_client = chromadb.Client()

class RAGSystem:
    """Minimal retrieval-augmented generation pipeline over a Chroma collection.

    Documents are stored in a Chroma collection (which also embeds them);
    questions are answered by retrieving the closest documents and passing
    them to Claude as grounding context.
    """

    def __init__(self, collection_name: str = "documents"):
        """
        Open (or create) the collection used as the knowledge base.

        Args:
            collection_name: Name of the Chroma collection to use
        """
        self.collection = chroma_client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"}
        )
        self.model = "claude-sonnet-4-6"

    def add_documents(self, documents: list[dict]) -> None:
        """
        Add documents to the knowledge base.

        Args:
            documents: List of dicts with 'id', 'text', and optional 'metadata'
            Example: [
                {
                    "id": "doc1",
                    "text": "The capital of France is Paris...",
                    "metadata": {"source": "geography.txt"}
                }
            ]
        """
        # Chroma embeds each document with its built-in model (all-MiniLM-L6-v2).
        # For production-grade retrieval, pass your own embedding_function when
        # creating the collection (e.g. OpenAI/Cohere embeddings).
        if documents:
            # One batched call instead of one call per document. The guard is
            # needed because Chroma rejects an empty batch.
            self.collection.add(
                ids=[doc["id"] for doc in documents],
                documents=[doc["text"] for doc in documents],
                # Chroma rejects empty metadata dicts; None means "no metadata".
                metadatas=[doc.get("metadata") or None for doc in documents],
            )
        print(f"Added {len(documents)} documents")

    def retrieve(self, query: str, top_k: int = 3) -> list[dict]:
        """
        Retrieve relevant documents for a query.

        Args:
            query: User question
            top_k: Maximum number of results to return

        Returns:
            List of dicts with 'text', 'metadata' (always a dict) and
            'distance', closest first. Empty when the collection is empty
            or top_k is not positive.
        """
        # Never ask for more results than the collection holds, and skip the
        # query entirely when there is nothing to search.
        n_results = min(top_k, self.collection.count())
        if n_results <= 0:
            return []

        results = self.collection.query(
            query_texts=[query],
            n_results=n_results
        )

        # Results are nested one list per query text; we sent exactly one.
        return [
            {
                "text": doc_text,
                # Documents stored without metadata come back as None.
                "metadata": metadata or {},
                "distance": distance,
            }
            for doc_text, metadata, distance in zip(
                results["documents"][0],
                results["metadatas"][0],
                results["distances"][0],
            )
        ]

    def generate_answer(self, query: str, context: list[dict]) -> str:
        """
        Generate an answer using retrieved context.

        Args:
            query: User question
            context: Retrieved documents from retrieve()

        Returns:
            Answer from Claude
        """
        # Format context for prompt; tolerate entries whose metadata is None.
        context_text = "\n\n".join([
            f"Document: {(doc['metadata'] or {}).get('source', 'Unknown')}\n{doc['text']}"
            for doc in context
        ])

        system_prompt = """You are a helpful assistant answering questions based on provided documents.
- Answer based ONLY on the provided context
- If the answer isn't in the context, say "I don't have that information"
- Cite which document you're referencing when relevant
- Be concise and direct"""

        response = client.messages.create(
            model=self.model,
            max_tokens=1024,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": f"Context:\n{context_text}\n\nQuestion: {query}"
                }
            ]
        )

        return response.content[0].text

    def answer_question(self, query: str, top_k: int = 3) -> dict:
        """
        End-to-end: retrieve documents and generate answer.

        Args:
            query: User question
            top_k: Number of documents to retrieve

        Returns:
            Dict with 'answer' (str) and 'sources' (list of source names,
            'Unknown' where a document has no 'source' metadata)
        """
        # Step 1: Retrieve relevant documents
        retrieved_docs = self.retrieve(query, top_k=top_k)

        # Nothing to ground an answer on: skip the model call entirely.
        if not retrieved_docs:
            return {
                "answer": "No relevant documents found.",
                "sources": []
            }

        # Step 2: Generate answer
        answer = self.generate_answer(query, retrieved_docs)

        # Step 3: Return with sources
        return {
            "answer": answer,
            "sources": [
                (doc["metadata"] or {}).get("source", "Unknown")
                for doc in retrieved_docs
            ]
        }

def main():
    """Run the demo: index three sample documents, then answer three questions."""
    sample_docs = [
        {
            "id": "doc1",
            "text": """Python is a high-level programming language known for its simplicity
            and readability. It was created by Guido van Rossum in 1991. Python supports
            multiple programming paradigms and has a comprehensive standard library.""",
            "metadata": {"source": "python_intro.txt"},
        },
        {
            "id": "doc2",
            "text": """Machine learning is a subset of artificial intelligence that enables
            systems to learn and improve from experience without being explicitly programmed.
            Common ML algorithms include supervised learning, unsupervised learning, and
            reinforcement learning.""",
            "metadata": {"source": "ml_basics.txt"},
        },
        {
            "id": "doc3",
            "text": """Claude is an AI assistant made by Anthropic. It can help with writing,
            analysis, coding, math, and creative projects. Claude uses Constitutional AI
            to be helpful, harmless, and honest.""",
            "metadata": {"source": "claude_info.txt"},
        },
    ]
    demo_questions = (
        "Who created Python?",
        "What are the main types of machine learning?",
        "What is Claude?",
    )

    # Build the knowledge base and load the samples into it.
    knowledge_base = RAGSystem(collection_name="my_knowledge_base")
    knowledge_base.add_documents(sample_docs)

    # Echo each question before answering so output stays readable even if
    # the answer step fails part-way through.
    for prompt in demo_questions:
        print(f"\nQuestion: {prompt}")
        outcome = knowledge_base.answer_question(prompt)
        source_list = ", ".join(outcome["sources"])
        print(f"Answer: {outcome['answer']}")
        print(f"Sources: {source_list}")


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Customization Guide

Use a Real Vector Database (Production)

Replace Chroma with Pinecone:

import os

from pinecone import Pinecone
from openai import OpenAI

# NOTE: `text`, `doc_id`, and `query` are placeholders for your own document
# text, document id, and user question.

# Documents and queries must be embedded with the same model, otherwise the
# vectors are not comparable.
EMBEDDING_MODEL = "text-embedding-3-small"

# Initialize clients (Pinecone SDK v3+, OpenAI SDK v1+)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index("my-index")
openai_client = OpenAI()

# Embed a document with OpenAI
resp = openai_client.embeddings.create(
    input=text,
    model=EMBEDDING_MODEL,
)
embedding = resp.data[0].embedding

# Store in Pinecone (the raw text rides along as metadata so it can be
# returned with query results)
index.upsert(
    vectors=[
        {"id": doc_id, "values": embedding, "metadata": {"text": text}}
    ]
)

# Embed the query the same way as the documents
query_resp = openai_client.embeddings.create(
    input=query,
    model=EMBEDDING_MODEL,
)
query_embedding = query_resp.data[0].embedding

# Query Pinecone
results = index.query(
    vector=query_embedding,
    top_k=3,
    include_metadata=True,
)

Add Conversation Memory

class ConversationalRAG(RAGSystem):
    """RAGSystem variant that feeds recent question/answer turns back into the prompt."""

    def __init__(self, collection_name: str = "documents"):
        """
        Set up the underlying RAG system with an empty conversation history.

        Args:
            collection_name: Name of the Chroma collection to use
        """
        super().__init__(collection_name)
        # Each entry is {"user": <question>, "assistant": <answer>}, oldest first.
        self.conversation_history = []

    def answer_with_memory(self, query: str, top_k: int = 3) -> dict:
        """
        Answer a question using retrieved documents plus recent conversation.

        Args:
            query: User question
            top_k: Number of documents to retrieve

        Returns:
            Dict with 'answer' (str) and 'sources' (list of source names,
            'Unknown' where a document has no 'source' metadata)
        """
        # Build context from conversation history
        history_context = "\n".join([
            f"User: {msg['user']}\nAssistant: {msg['assistant']}"
            for msg in self.conversation_history[-5:]  # Last 5 turns
        ])

        # Retrieve documents; the model is still called when nothing is
        # retrieved, since a follow-up may be answerable from the history.
        retrieved = self.retrieve(query, top_k=top_k)
        # Same fallback as RAGSystem.answer_question, so callers can always
        # join the sources into a string.
        sources = [
            (d["metadata"] or {}).get("source", "Unknown")
            for d in retrieved
        ]
        context_text = "\n\n".join(
            f"Document: {source}\n{d['text']}"
            for source, d in zip(sources, retrieved)
        )

        # Generate answer using both the documents and the conversation history
        system_prompt = (
            "You are a helpful assistant. Answer using the provided documents and "
            "the prior conversation. If the answer isn't in the documents, say so."
        )
        response = client.messages.create(
            model=self.model,
            max_tokens=1024,
            system=system_prompt,
            messages=[
                {
                    "role": "user",
                    "content": (
                        f"Conversation so far:\n{history_context}\n\n"
                        f"Documents:\n{context_text}\n\n"
                        f"New question: {query}"
                    ),
                }
            ],
        )
        answer = response.content[0].text

        # Store in history
        self.conversation_history.append({"user": query, "assistant": answer})

        return {"answer": answer, "sources": sources}

Add Confidence Scoring

def answer_with_confidence(self, query: str, top_k: int = 3) -> dict:
    """
    Answer a question and report how confident the retrieval step was.

    Intended to be added to RAGSystem as a method: it relies on
    self.retrieve() and self.generate_answer().

    Args:
        query: User question
        top_k: Number of documents to retrieve

    Returns:
        Dict with 'answer', 'retrieval_confidence' (float in [0, 1]),
        'llm_confidence' (placeholder label) and 'sources'
    """
    retrieved = self.retrieve(query, top_k=top_k)

    # Nothing retrieved: there is no distance to average and nothing to
    # ground an answer on, so report zero confidence without calling the model.
    if not retrieved:
        return {
            "answer": "No relevant documents found.",
            "retrieval_confidence": 0.0,
            "llm_confidence": "low",
            "sources": [],
        }

    # Calculate retrieval confidence (lower distance = higher confidence).
    # The average is clamped to [0, 1] so the score stays in range even when
    # distances exceed 1 or dip slightly below 0 from rounding.
    avg_distance = sum(d["distance"] for d in retrieved) / len(retrieved)
    retrieval_confidence = 1 - min(max(avg_distance, 0), 1)

    answer = self.generate_answer(query, retrieved)

    # Parse LLM confidence from response
    llm_confidence = "high"  # Could parse from answer

    return {
        "answer": answer,
        "retrieval_confidence": retrieval_confidence,
        "llm_confidence": llm_confidence,
        # Fall back to "Unknown" so documents without a source don't raise.
        "sources": [
            (d.get("metadata") or {}).get("source", "Unknown")
            for d in retrieved
        ],
    }

Testing Locally

# Run the built-in demo: indexes the sample documents defined in main() and
# answers the sample questions (edit `questions` in main() to try others)
python rag_system.py

# Try your own document and question without editing the file
python -c "
from rag_system import RAGSystem
rag = RAGSystem()
rag.add_documents([
    {
        'id': 'custom1',
        'text': 'Your custom document here...',
        'metadata': {'source': 'custom.txt'}
    }
])
result = rag.answer_question('Your question here?')
print(result)
"

Common Issues

Issue	Solution
“Chroma database not found”	Database created automatically on first run
“API key not found”	Check `.env` file and `ANTHROPIC_API_KEY`
“No relevant documents”	Add more documents or adjust retrieval top_k
“Answer seems generic”	Ensure documents are specific and well-chunked

Next Steps

Load real documents: Replace sample docs with PDFs, web content, etc.
Switch to production vector DB: Use Pinecone, Weaviate, or Qdrant
Add reranking: Use a cross-encoder to rerank top-10 to top-3
Monitor quality: Log retrieval quality and user feedback
Fine-tune if needed: Train Claude on your domain if accuracy is critical