austinbv committed on
Commit
2c9f2c4
·
1 Parent(s): 59bc1bb

End of Video 1

Browse files
.gitignore CHANGED
@@ -1 +1,2 @@
1
  __pycache__
 
 
1
  __pycache__
2
+ .env
.idea/misc.xml CHANGED
@@ -1,4 +1,7 @@
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
 
 
 
3
  <component name="ProjectRootManager" version="2" project-jdk-name="Poetry (pdf_rag)" project-jdk-type="Python SDK" />
4
  </project>
 
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Poetry (pdf_rag)" />
5
+ </component>
6
  <component name="ProjectRootManager" version="2" project-jdk-name="Poetry (pdf_rag)" project-jdk-type="Python SDK" />
7
  </project>
app/rag_chain.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
from operator import itemgetter
from typing import TypedDict

from dotenv import load_dotenv
from langchain_community.vectorstores.pgvector import PGVector
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from config import EMBEDDING_MODEL, PG_COLLECTION_NAME

# Pull POSTGRES_URL (and the OpenAI API key) from .env before any client
# is constructed.
load_dotenv()

# Retriever backend: the pgvector collection populated by
# importer/load_and_process.py. Query with the SAME embedding model used
# at index time (EMBEDDING_MODEL) so query vectors live in the same space
# as the stored document vectors.
vector_store = PGVector(
    collection_name=PG_COLLECTION_NAME,
    connection_string=os.getenv("POSTGRES_URL"),
    embedding_function=OpenAIEmbeddings(model=EMBEDDING_MODEL)
)

# Prompt: answer using only the retrieved context.
template = """
Answer given the following context:
{context}

Question: {question}
"""

ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

# temperature=0 for reproducible answers; streaming=True so langserve can
# stream tokens to the client.
llm = ChatOpenAI(temperature=0, model='gpt-4-1106-preview', streaming=True)


class RagInput(TypedDict):
    # Shape of the JSON payload accepted by the langserve endpoint.
    question: str


# RAG pipeline: fan the question out to (a) the retriever, which supplies
# {context}, and (b) straight through as {question}; fill the prompt, call
# the LLM, and parse the reply to a plain string.
final_chain = (
    {
        "context": itemgetter("question") | vector_store.as_retriever(),
        "question": itemgetter("question")
    }
    | ANSWER_PROMPT
    | llm
    | StrOutputParser()
).with_types(input_type=RagInput)
app/server.py CHANGED
@@ -2,6 +2,8 @@ from fastapi import FastAPI
2
  from fastapi.responses import RedirectResponse
3
  from langserve import add_routes
4
 
 
 
5
  app = FastAPI()
6
 
7
 
@@ -11,7 +13,7 @@ async def redirect_root_to_docs():
11
 
12
 
13
  # Edit this to add the chain you want to add
14
- add_routes(app, NotImplemented)
15
 
16
  if __name__ == "__main__":
17
  import uvicorn
 
2
  from fastapi.responses import RedirectResponse
3
  from langserve import add_routes
4
 
5
+ from app.rag_chain import final_chain
6
+
7
  app = FastAPI()
8
 
9
 
 
13
 
14
 
15
  # Edit this to add the chain you want to add
16
+ add_routes(app, final_chain, path="/rag")
17
 
18
  if __name__ == "__main__":
19
  import uvicorn
config.py ADDED
@@ -0,0 +1,2 @@
 
 
 
# OpenAI embedding model used at index time (importer/load_and_process.py);
# query-time code must embed with the same model for vectors to be comparable.
EMBEDDING_MODEL = 'text-embedding-ada-002'
# Name of the pgvector collection shared by the importer and the RAG chain.
PG_COLLECTION_NAME = "pdf_rag"
importer/__init__.py ADDED
File without changes
importer/load_and_process.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain_community.vectorstores.pgvector import PGVector
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

from config import EMBEDDING_MODEL, PG_COLLECTION_NAME

# Load POSTGRES_URL / OPENAI_API_KEY from .env before building any client.
load_dotenv()

# Recursively load every PDF under ../source_docs using unstructured's PDF
# parser; multithreaded with a progress bar because PDF parsing is slow.
loader = DirectoryLoader(
    os.path.abspath("../source_docs"),
    glob="**/*.pdf",
    use_multithreading=True,
    show_progress=True,
    max_concurrency=50,
    loader_cls=UnstructuredPDFLoader,
)
docs = loader.load()

# One embeddings client, reused for both chunking and storage, so the
# chunker's breakpoints and the stored vectors both use the model declared
# in config.py (the original built a second default-model client for the
# chunker, which could diverge from EMBEDDING_MODEL).
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# SemanticChunker splits on embedding-similarity breakpoints rather than
# fixed character counts.
text_splitter = SemanticChunker(embeddings=embeddings)

chunks = text_splitter.split_documents(docs)

# Write the chunk vectors into pgvector. The connection string comes from
# the same POSTGRES_URL env var that app/rag_chain.py reads, falling back
# to the previous hard-coded local default so existing setups keep working.
# pre_delete_collection=True makes re-runs idempotent (full re-index).
PGVector.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name=PG_COLLECTION_NAME,
    connection_string=os.getenv(
        "POSTGRES_URL",
        "postgresql+psycopg://postgres@localhost:5432/pdf_rag_vectors",
    ),
    pre_delete_collection=True,
)
poetry.lock CHANGED
The diff for this file is too large to render. See raw diff
 
pyproject.toml CHANGED
@@ -9,10 +9,19 @@ packages = [
9
  ]
10
 
11
  [tool.poetry.dependencies]
12
- python = "^3.11"
13
  uvicorn = "^0.23.2"
14
  langserve = {extras = ["server"], version = ">=0.0.30"}
15
  pydantic = "<2"
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  [tool.poetry.group.dev.dependencies]
 
9
  ]
10
 
11
  [tool.poetry.dependencies]
12
+ python = ">=3.11,<3.12"
13
  uvicorn = "^0.23.2"
14
  langserve = {extras = ["server"], version = ">=0.0.30"}
15
  pydantic = "<2"
16
+ tqdm = "^4.66.1"
17
+ unstructured = {extras = ["all-docs"], version = "^0.12.2"}
18
+ langchain-experimental = "^0.0.49"
19
+ python-dotenv = "^1.0.0"
20
+ openai = "^1.9.0"
21
+ tiktoken = "^0.5.2"
22
+ langchain-openai = "^0.0.3"
23
+ psycopg = "^3.1.17"
24
+ pgvector = "^0.2.4"
25
 
26
 
27
  [tool.poetry.group.dev.dependencies]