Retrieval-Augmented Generation (RAG) enhances LLMs by retrieving relevant context from your documents before generating responses. This guide walks through building a RAG pipeline with LangChain, from document loading and chunking through conversational retrieval, a production vector store, and evaluation.
RAG Architecture Overview
- Load and split documents
- Generate embeddings
- Store in vector database
- Retrieve relevant chunks
- Generate response with context
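Before reaching for LangChain, the loop itself is easy to see in plain Python. The sketch below is a toy illustration of steps 2 through 5 only: fake_embed and the in-memory list are stand-ins I'm inventing for a real embedding model and vector database, and the final prompt would normally be sent to an LLM.

import math

# Toy "embedding": a character-frequency vector (stand-in for a real embedding model)
def fake_embed(text):
    vec = [0.0] * 26
    for ch in text.lower():
        if ch.isalpha():
            vec[ord(ch) - ord("a")] += 1.0
    return vec

# Cosine similarity between two vectors
def cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0

# Steps 2-3: embed chunks and "store" them in a plain list (stand-in for a vector DB)
toy_chunks = ["RAG retrieves context before generating.", "Embeddings map text to vectors."]
toy_store = [(chunk, fake_embed(chunk)) for chunk in toy_chunks]

# Step 4: retrieve the chunk most similar to the question
question = "What context does RAG retrieve?"
best_chunk = max(toy_store, key=lambda item: cosine(fake_embed(question), item[1]))[0]

# Step 5: in a real system this prompt is sent to the LLM
toy_prompt = f"Context: {best_chunk}\n\nQuestion: {question}\nAnswer:"
print(toy_prompt)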
Setting Up
pip install langchain langchain-openai chromadb tiktoken pypdf
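Both the embedding model and the chat model used below authenticate with the OPENAI_API_KEY environment variable, so set it before running any of the snippets (the value here is a placeholder):

import os

# Placeholder key; in practice load it from a .env file or secrets manager
os.environ["OPENAI_API_KEY"] = "sk-..."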
Document Loading and Splitting
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Load PDFs
loader = DirectoryLoader(
"./documents/",
glob="**/*.pdf",
loader_cls=PyPDFLoader
)
documents = loader.load()
# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
length_function=len,
    separators=["\n\n", "\n", " ", ""]
)
chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks from {len(documents)} documents")
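Before indexing anything, it's worth spot-checking a chunk: each one is a Document that keeps the text in page_content and provenance in metadata (PyPDFLoader records the source path and page number).

# Inspect the first chunk to sanity-check size and metadata
first = chunks[0]
print(first.page_content[:200])
print(first.metadata)  # e.g. {'source': 'documents/example.pdf', 'page': 0}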
Creating Embeddings and Vector Store
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# Create vector store
vectorstore = Chroma.from_documents(
documents=chunks,
embedding=embeddings,
persist_directory="./chroma_db"
)
# Explicitly persist to disk (recent Chroma versions persist automatically)
vectorstore.persist()
# Load existing vectorstore
vectorstore = Chroma(
persist_directory="./chroma_db",
embedding_function=embeddings
)
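With the store in place (or reloaded), a direct similarity search is the quickest way to confirm that indexing worked before building any chains:

# Sanity check: fetch the two chunks closest to a test query
hits = vectorstore.similarity_search("What is this document about?", k=2)
for hit in hits:
    print(hit.metadata.get("source"), "->", hit.page_content[:80])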
Building the RAG Chain
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
# Custom prompt template
template = """Use the following context to answer the question.
If you don't know the answer, say you don't know.
Context: {context}
Question: {question}
Answer:"""
prompt = PromptTemplate(
template=template,
input_variables=["context", "question"]
)
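# Quick check: render the template to see the exact prompt the LLM will receive
# (the context/question values below are placeholders)
print(prompt.format(context="(retrieved chunks go here)", question="What is RAG?"))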
# Initialize LLM
llm = ChatOpenAI(model="gpt-4", temperature=0)
# Create retriever
retriever = vectorstore.as_retriever(
search_type="similarity",
search_kwargs={"k": 4}
)
# Build RAG chain
qa_chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=retriever,
chain_type_kwargs={"prompt": prompt},
return_source_documents=True
)
# Query
result = qa_chain.invoke({"query": "What is the main topic of these documents?"})
print(result["result"])
print("\nSources:")
for doc in result["source_documents"]:
print(f"- {doc.metadata['source']}: {doc.page_content[:100]}...")
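A small helper keeps ad-hoc queries tidy; ask is just a convenience wrapper of my own around the chain above, not a LangChain API:

def ask(question):
    """Run the RAG chain and print the answer along with its sources."""
    result = qa_chain.invoke({"query": question})
    print(result["result"])
    for doc in result["source_documents"]:
        print(f"  source: {doc.metadata.get('source', 'unknown')}")
    return result

answer = ask("Summarize the key points of these documents.")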
Advanced: Conversational RAG
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
memory = ConversationBufferMemory(
memory_key="chat_history",
return_messages=True,
output_key="answer"
)
conversational_chain = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=retriever,
memory=memory,
return_source_documents=True
)
# Chat
response1 = conversational_chain.invoke({"question": "What is RAG?"})
response2 = conversational_chain.invoke({"question": "How does it improve LLMs?"})
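The follow-up question works because the chain first condenses the chat history and the new question into a standalone query before retrieving. You can inspect what the memory carries between turns:

print(response1["answer"])
print(response2["answer"])

# The buffer memory now holds both exchanges as messages
for message in memory.chat_memory.messages:
    print(f"{message.type}: {message.content[:80]}")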
Using Pinecone for Production
from langchain.vectorstores import Pinecone
import pinecone
# pinecone-client v2 style; v3+ clients replace init() with pinecone.Pinecone(api_key=...)
pinecone.init(
    api_key="your-api-key",  # load this from an environment variable in practice
    environment="your-environment"
)
vectorstore = Pinecone.from_documents(
documents=chunks,
embedding=embeddings,
index_name="my-rag-index"
)
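On later runs you will usually reconnect to the already-populated index rather than re-embed everything. With the pinecone-client v2 setup shown above, the wrapper's from_existing_index does that, and the resulting store drops into the same retriever and chain code used with Chroma:

# Reconnect to an existing index instead of re-embedding the corpus
vectorstore = Pinecone.from_existing_index(
    index_name="my-rag-index",
    embedding=embeddings
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})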
Evaluation and Optimization
from langchain.evaluation import load_evaluator
# Grade each retrieved chunk's relevance with an LLM judge (the "criteria" evaluator)
evaluator = load_evaluator("criteria", criteria="relevance", llm=llm)
# Test retrieval quality
def evaluate_retrieval(question):
    docs = retriever.get_relevant_documents(question)
    for i, doc in enumerate(docs):
        score = evaluator.evaluate_strings(
            input=question,
            prediction=doc.page_content
        )
        print(f"Doc {i+1} relevance: {score['score']}")
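Run the check against a handful of questions the corpus should be able to answer; consistently low scores usually point at chunking or the retriever's k rather than the LLM. For example:

# Spot-check retrieval quality on a few representative questions
for q in ["What is the main topic of these documents?", "What conclusions are drawn?"]:
    print(f"\nQuestion: {q}")
    evaluate_retrieval(q)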
RAG gives LLMs grounded access to your private data without retraining, and retrieval quality is usually the biggest lever on answer accuracy. Experiment with chunk sizes, overlap values, and retrieval strategies to find what works best for your documents!