austinbv
committed on
Commit
·
2c9f2c4
1
Parent(s):
59bc1bb
End of Video 1
Browse files- .gitignore +1 -0
- .idea/misc.xml +3 -0
- app/rag_chain.py +45 -0
- app/server.py +3 -1
- config.py +2 -0
- importer/__init__.py +0 -0
- importer/load_and_process.py +37 -0
- poetry.lock +0 -0
- pyproject.toml +10 -1
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
__pycache__
|
|
|
|
1 |
__pycache__
|
2 |
+
.env
|
.idea/misc.xml
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
|
|
|
|
|
|
3 |
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (pdf_rag)" project-jdk-type="Python SDK" />
|
4 |
</project>
|
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
+
<component name="Black">
|
4 |
+
<option name="sdkName" value="Poetry (pdf_rag)" />
|
5 |
+
</component>
|
6 |
<component name="ProjectRootManager" version="2" project-jdk-name="Poetry (pdf_rag)" project-jdk-type="Python SDK" />
|
7 |
</project>
|
app/rag_chain.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
from operator import itemgetter
from typing import TypedDict

from dotenv import load_dotenv
from langchain_community.vectorstores.pgvector import PGVector
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from config import EMBEDDING_MODEL, PG_COLLECTION_NAME

load_dotenv()

# Vector store backed by Postgres/pgvector. The embedding model used here MUST
# match the one used at indexing time (importer/load_and_process.py pins
# EMBEDDING_MODEL), so pin it explicitly instead of relying on the library
# default staying in sync.
vector_store = PGVector(
    collection_name=PG_COLLECTION_NAME,
    connection_string=os.getenv("POSTGRES_URL"),
    embedding_function=OpenAIEmbeddings(model=EMBEDDING_MODEL),
)

# Retrieved documents are interpolated as {context}; the user's query as {question}.
template = """
Answer given the following context:
{context}

Question: {question}
"""

ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

# temperature=0 for reproducible answers; streaming=True so langserve can
# stream tokens back to the client.
llm = ChatOpenAI(temperature=0, model='gpt-4-1106-preview', streaming=True)


class RagInput(TypedDict):
    """Input schema for the RAG chain (used by langserve for validation/docs)."""

    question: str


# LCEL pipeline: fan the incoming question out to the retriever ("context")
# and pass it through unchanged ("question"), then prompt -> LLM -> plain string.
final_chain = (
    {
        "context": itemgetter("question") | vector_store.as_retriever(),
        "question": itemgetter("question"),
    }
    | ANSWER_PROMPT
    | llm
    | StrOutputParser()
).with_types(input_type=RagInput)
app/server.py
CHANGED
@@ -2,6 +2,8 @@ from fastapi import FastAPI
|
|
2 |
from fastapi.responses import RedirectResponse
|
3 |
from langserve import add_routes
|
4 |
|
|
|
|
|
5 |
app = FastAPI()
|
6 |
|
7 |
|
@@ -11,7 +13,7 @@ async def redirect_root_to_docs():
|
|
11 |
|
12 |
|
13 |
# Edit this to add the chain you want to add
|
14 |
-
add_routes(app,
|
15 |
|
16 |
if __name__ == "__main__":
|
17 |
import uvicorn
|
|
|
2 |
from fastapi.responses import RedirectResponse
|
3 |
from langserve import add_routes
|
4 |
|
5 |
+
from app.rag_chain import final_chain
|
6 |
+
|
7 |
app = FastAPI()
|
8 |
|
9 |
|
|
|
13 |
|
14 |
|
15 |
# Edit this to add the chain you want to add
|
16 |
+
add_routes(app, final_chain, path="/rag")
|
17 |
|
18 |
if __name__ == "__main__":
|
19 |
import uvicorn
|
config.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
# OpenAI embedding model used BOTH when indexing documents
# (importer/load_and_process.py) and when embedding queries at retrieval time
# (app/rag_chain.py) — the two must stay in sync for retrieval to work.
EMBEDDING_MODEL = 'text-embedding-ada-002'
# Name of the pgvector collection that holds the document embeddings.
PG_COLLECTION_NAME = "pdf_rag"
|
importer/__init__.py
ADDED
File without changes
|
importer/load_and_process.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain_community.vectorstores.pgvector import PGVector
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

from config import EMBEDDING_MODEL, PG_COLLECTION_NAME

load_dotenv()

# Load every PDF under ../source_docs (relative to this script's CWD) in
# parallel via unstructured.
loader = DirectoryLoader(
    os.path.abspath("../source_docs"),
    glob="**/*.pdf",
    use_multithreading=True,
    show_progress=True,
    max_concurrency=50,
    loader_cls=UnstructuredPDFLoader,
)
docs = loader.load()

# Single embeddings client pinned to the configured model; reused for both
# semantic chunking and indexing so chunk boundaries and stored vectors are
# produced by the same model.
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

text_splitter = SemanticChunker(embeddings=embeddings)

chunks = text_splitter.split_documents(docs)

# Write chunks into pgvector, replacing any previous contents of the
# collection. Prefer the POSTGRES_URL env var (the same variable
# app/rag_chain.py reads) so importer and server target the same database;
# the literal is kept only as a local-dev fallback.
PGVector.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name=PG_COLLECTION_NAME,
    connection_string=os.getenv(
        "POSTGRES_URL",
        "postgresql+psycopg://postgres@localhost:5432/pdf_rag_vectors",
    ),
    pre_delete_collection=True,
)
|
poetry.lock
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
CHANGED
@@ -9,10 +9,19 @@ packages = [
|
|
9 |
]
|
10 |
|
11 |
[tool.poetry.dependencies]
|
12 |
-
python = "
|
13 |
uvicorn = "^0.23.2"
|
14 |
langserve = {extras = ["server"], version = ">=0.0.30"}
|
15 |
pydantic = "<2"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
|
18 |
[tool.poetry.group.dev.dependencies]
|
|
|
9 |
]
|
10 |
|
11 |
[tool.poetry.dependencies]
|
12 |
+
python = ">=3.11,<3.12"
|
13 |
uvicorn = "^0.23.2"
|
14 |
langserve = {extras = ["server"], version = ">=0.0.30"}
|
15 |
pydantic = "<2"
|
16 |
+
tqdm = "^4.66.1"
|
17 |
+
unstructured = {extras = ["all-docs"], version = "^0.12.2"}
|
18 |
+
langchain-experimental = "^0.0.49"
|
19 |
+
python-dotenv = "^1.0.0"
|
20 |
+
openai = "^1.9.0"
|
21 |
+
tiktoken = "^0.5.2"
|
22 |
+
langchain-openai = "^0.0.3"
|
23 |
+
psycopg = "^3.1.17"
|
24 |
+
pgvector = "^0.2.4"
|
25 |
|
26 |
|
27 |
[tool.poetry.group.dev.dependencies]
|