Spaces:
Sleeping
Sleeping
# Utilities to build a RAG system to query information from the CAMELS cosmological simulations using LangChain
# Author: Pablo Villanueva Domingo
from langchain import hub | |
from langchain_chroma import Chroma | |
from langchain_core.output_parsers import StrOutputParser | |
from langchain_core.runnables import RunnablePassthrough | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import WebBaseLoader | |
# Load documentation from URLs
def load_docs(urls_path="urls.txt"):
    """Load the web pages listed in a URL file.

    Args:
        urls_path: Path of a text file holding one URL per line. Defaults to
            "urls.txt", resolved against the current working directory.

    Returns:
        The documents returned by ``WebBaseLoader(urls).load()``.

    Raises:
        OSError: If the URL file cannot be opened.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(urls_path, encoding="utf-8") as urls_file:
        stripped_lines = (line.strip() for line in urls_file)
        # Drop blank lines (and stray "\r" or spaces removed by strip) so that
        # an empty string is never handed to the loader as a URL.
        urls = [url for url in stripped_lines if url]

    # Fetch the contents of every listed page.
    loader = WebBaseLoader(urls)
    return loader.load()
# Join the page contents of the documents for processing
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in input order, separated by a blank line.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)
# Create a RAG chain
def RAG(llm, docs, embeddings):
    """Assemble a retrieval-augmented generation chain over ``docs``.

    The documents are split into overlapping chunks, the chunks are indexed in
    a Chroma vector store built with ``embeddings``, and the store's retriever
    feeds the "rlm/rag-prompt" hub prompt, whose output is passed to ``llm``.

    Args:
        llm: Model stage placed right after the prompt in the chain.
        docs: Documents to split and index.
        embeddings: Embedding object handed to ``Chroma.from_documents``.

    Returns:
        The composed chain: {context, question} -> prompt -> llm ->
        ``StrOutputParser``.
    """
    # Break the documents into overlapping chunks (size 1000, overlap 200).
    chunker = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = chunker.split_documents(docs)

    # Index the chunks and expose the index as a retriever.
    store = Chroma.from_documents(documents=chunks, embedding=embeddings)
    snippet_retriever = store.as_retriever()

    # Generic RAG prompt pulled from the LangChain hub.
    rag_prompt = hub.pull("rlm/rag-prompt")

    # "context" is the retrieved snippets joined into one string by
    # format_docs; "question" is the chain input forwarded unchanged.
    chain_inputs = {
        "context": snippet_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    return chain_inputs | rag_prompt | llm | StrOutputParser()