Spaces:
Sleeping
Sleeping
File size: 1,628 Bytes
46e28ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
# Utilities to build a RAG system to query information from the CAMELS cosmological simulations using Langchain
# Author: Pablo Villanueva Domingo
from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
# Load documentation from urls
def load_docs():
    """Load the documentation pages listed in ``urls.txt``.

    ``urls.txt`` is opened with a relative path and read line by line;
    every non-blank line is treated as one URL. All URLs are then handed
    to ``WebBaseLoader`` in a single call.

    Returns:
        The documents returned by ``WebBaseLoader(urls).load()``.

    Raises:
        OSError: If ``urls.txt`` cannot be opened or read.
    """
    # The context manager guarantees the file is closed even if reading fails.
    with open("urls.txt") as urlsfile:
        # Strip surrounding whitespace (this also handles "\r\n" endings) and
        # drop blank lines so no empty URL is passed to the loader.
        stripped_lines = (line.strip() for line in urlsfile)
        urls = [url for url in stripped_lines if url]

    # Fetch the content of every listed page.
    loader = WebBaseLoader(urls)
    return loader.load()
# Join content pages for processing
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the page contents separated by a blank line.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)
# Create a RAG chain
def RAG(llm, docs, embeddings, *, chunk_size=1000, chunk_overlap=200):
    """Build a retrieval-augmented generation chain over ``docs``.

    The documents are split into overlapping chunks, indexed in a Chroma
    vector store using ``embeddings``, and wired into a chain that
    retrieves snippets, fills the "rlm/rag-prompt" prompt pulled from the
    LangChain hub, calls ``llm`` and parses the output to a string.

    Args:
        llm: Language model runnable placed after the prompt in the chain.
        docs: Documents to split and index.
        embeddings: Embedding function passed to ``Chroma.from_documents``.
        chunk_size: Chunk size given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks given to the
            splitter.

    Returns:
        The composed chain. Its input is passed unchanged as the prompt's
        "question" and is also sent to the retriever to build "context".
    """
    # Split text
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(docs)

    # Create vector store
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

    # Retrieve and generate using the relevant snippets of the documents
    retriever = vectorstore.as_retriever()

    # Prompt basis example for RAG systems
    prompt = hub.pull("rlm/rag-prompt")

    # Create the chain: retrieved documents are joined into one context
    # string, while the incoming question is forwarded untouched.
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain
|