Tien09 committed
Commit bc3cd48 · Parent(s): 0643d88

Upload 13 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,22 @@
+FROM python:3.11.5
+
+WORKDIR /app
+
+COPY ./requirements.txt /app/requirements.txt
+
+RUN pip3 install --no-cache-dir -r /app/requirements.txt
+
+# Run as a non-root user (UID 1000, as expected on Hugging Face Spaces)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME /home/user
+ENV PATH $HOME/.local/bin:$PATH
+
+WORKDIR $HOME
+RUN mkdir app
+WORKDIR $HOME/app
+COPY . $HOME/app
+
+EXPOSE 7860
+CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]
+
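Note on the CMD line: `streamlit run` takes `--server.address` and `--server.port`, not `--host`. The image appears to target Hugging Face Spaces, which expects the app on port 7860. A minimal smoke test, assuming the image has been built and started locally (the image name is illustrative, not part of this commit):

    # Assumes something like `docker build -t personal-mba .` followed by
    # `docker run -p 7860:7860 personal-mba` (hypothetical image name).
    import urllib.request

    # Streamlit serves its index page at the root path once the app is up.
    with urllib.request.urlopen("http://localhost:7860", timeout=10) as resp:
        print("HTTP status:", resp.status)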
app.py ADDED
@@ -0,0 +1,128 @@
+# __import__('pysqlite3')
+# import sys
+# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+
+import os
+
+import streamlit as st
+import chromadb
+from chromadb.config import Settings
+from constants import CHROMA_SETTINGS
+from langchain.chains import RetrievalQA
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.llms import Ollama
+from langchain.callbacks.base import BaseCallbackHandler
+
+# Custom Streamlit callback handler to display LLM output in streaming mode
+class StreamHandler(BaseCallbackHandler):
+    def __init__(self, container, initial_text=""):
+        self.container = container
+        self.text = initial_text
+
+    def on_llm_new_token(self, token: str, **kwargs) -> None:
+        self.text += token
+        self.container.markdown(self.text)
+
+# Streamlit UI configuration
+def setup_page():
+    st.set_page_config(layout="wide")
+    st.markdown("<h2 style='text-align: center; color: white;'>Your Personal MBA</h2>", unsafe_allow_html=True)
+    url = 'https://personalmba.com/'
+    col1, col2, col3 = st.columns(3)
+    with col2:
+        st.markdown("""
+            <div style="text-align: center;">
+                <h5 style='color: white;'>Inspired by</h5>
+                <a href="%s">The Personal MBA by Josh Kaufman</a>
+            </div>
+        """ % url, unsafe_allow_html=True)
+    st.divider()
+
+# Read the environment variables used to configure the app
+def get_environment_variables():
+    model = os.environ.get("MODEL", "mistral")
+    embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME", "all-MiniLM-L6-v2")
+    persist_directory = os.environ.get("PERSIST_DIRECTORY", "db")
+    target_source_chunks = int(os.environ.get("TARGET_SOURCE_CHUNKS", 4))
+    return model, embeddings_model_name, persist_directory, target_source_chunks
+
+# Create the knowledge-base retriever
+def create_knowledge_base(embeddings_model_name, persist_directory, target_source_chunks):
+    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+    retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
+    return retriever
+
+# Handle a query when the user hits 'Enter' on a question
+def handle_query(query, model, retriever):
+    with st.chat_message('assistant'):
+        with st.spinner("Generating answer..."):
+            message_placeholder = st.empty()
+            stream_handler = StreamHandler(message_placeholder)
+            llm = Ollama(model=model, callbacks=[stream_handler])
+            qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)
+            res = qa(query)
+            answer = res['result']
+            message_placeholder.markdown(answer)
+    return answer
+
+# Store previous messages in session state, creating a 'memory' for the chat
+def initialize_session():
+    if 'messages' not in st.session_state:
+        st.session_state.messages = []
+
+# Display the message history
+def display_messages():
+    for message in st.session_state.messages:
+        with st.chat_message(message['role']):
+            st.markdown(message['content'])
+
+# Example questions shown when the user first loads the app; they disappear after the first query
+def show_examples():
+    examples = st.empty()
+    with examples.container():
+        with st.chat_message('assistant'):
+            st.markdown('Example questions:')
+            st.markdown(' - How do I know that I am making the right decisions?')
+            st.markdown(' - What are the key ideas in Chapter 6 "The Human Mind"?')
+            st.markdown(' - What are common traits shared by the most successful individuals in the world?')
+            st.markdown(' - I want to be a millionaire, build me a 5 year roadmap based on the top 0.01 percent of the human population.')
+            st.markdown('So, how may I help you today?')
+    return examples
+
+
+def main():
+    setup_page()
+    initialize_session()
+    display_messages()
+    examples = show_examples()
+    model, embeddings_model_name, persist_directory, target_source_chunks = get_environment_variables()
+    retriever = create_knowledge_base(embeddings_model_name, persist_directory, target_source_chunks)
+
+    query = st.chat_input(placeholder='Ask a question...')  # starts out empty
+
+    if query:  # the user entered a query and hit 'Enter'
+        examples.empty()
+
+        st.session_state.messages.append({  # add the query to the session state
+            'role': 'user',
+            'content': query
+        })
+
+        with st.chat_message('user'):
+            st.markdown(query)
+
+        answer = handle_query(query, model, retriever)
+
+        st.session_state.messages.append({  # add the answer to the session state
+            'role': 'assistant',
+            'content': answer
+        })
+
+if __name__ == "__main__":
+    main()
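For reference, the same retrieval pipeline can be exercised outside Streamlit. A minimal sketch, assuming the `db` directory from this commit is present and a local Ollama server has the `mistral` model pulled (the example question is illustrative):

    from langchain.chains import RetrievalQA
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.llms import Ollama
    from langchain.vectorstores import Chroma

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="db", embedding_function=embeddings)
    retriever = db.as_retriever(search_kwargs={"k": 4})  # top 4 chunks, as in app.py
    qa = RetrievalQA.from_chain_type(llm=Ollama(model="mistral"), chain_type="stuff", retriever=retriever)
    print(qa("What are mental models?")["result"])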
constants.py ADDED
@@ -0,0 +1,11 @@
+import os
+from chromadb.config import Settings
+
+# Define the folder for storing the database
+PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY', 'db')  # 'db' is the default
+
+# Define the Chroma settings
+CHROMA_SETTINGS = Settings(
+    persist_directory=PERSIST_DIRECTORY,
+    anonymized_telemetry=False
+)
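These settings can also be handed to a chromadb client directly to inspect the persisted store. A minimal sketch against the chromadb 0.4.x API pinned in requirements.txt:

    import chromadb
    from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY

    # Open the on-disk store and list its collections with their sizes
    client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=CHROMA_SETTINGS)
    for collection in client.list_collections():
        print(collection.name, collection.count())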
db/.DS_Store ADDED
Binary file (6.15 kB)
db/b7134628-7a61-4630-adf2-934fde432f96/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6606e50d663d8e0357beb09c60b4274ed698071f2275efa6f3fc3a64ec4aa739
+size 1676000
db/b7134628-7a61-4630-adf2-934fde432f96/header.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9de63d54afd49cbeeac013426226d9835e7c440647f9303475ea905ead14cd6
+size 100
db/b7134628-7a61-4630-adf2-934fde432f96/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dcba004a08f5ee7c4c87a677a5f0396391c4804a2e5fadaff4a6e518924247f
+size 55974
db/b7134628-7a61-4630-adf2-934fde432f96/length.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4769fa8c2a4c9da88b7df41a290fea06b511acca2281536194c748a4c89f38d3
+size 4000
db/b7134628-7a61-4630-adf2-934fde432f96/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc83519dd4ca9640feb086425393c54166a8b6e3f1f8f29ba36ac8b24fc5b5e7
+size 8148
db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b6b5db043f332c024782c3447ad509ab63ee077fae161c9806187bb168191b
+size 13254656
ingest.py ADDED
@@ -0,0 +1,164 @@
+import os
+import glob
+from typing import List
+from multiprocessing import Pool
+from tqdm import tqdm
+
+from langchain.document_loaders import (
+    CSVLoader,
+    EverNoteLoader,
+    PyMuPDFLoader,
+    TextLoader,
+    UnstructuredEmailLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+)
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.docstore.document import Document
+from constants import CHROMA_SETTINGS
+
+
+# Load environment variables
+persist_directory = os.environ.get('PERSIST_DIRECTORY', 'db')
+source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
+embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME', 'all-MiniLM-L6-v2')
+chunk_size = 500
+chunk_overlap = 50
+
+# Custom document loaders
+class MyElmLoader(UnstructuredEmailLoader):
+    """Wrapper that falls back to text/plain when the default loader does not work"""
+
+    def load(self) -> List[Document]:
+        """Wrapper adding a fallback for .eml files without HTML content"""
+        try:
+            try:
+                doc = UnstructuredEmailLoader.load(self)
+            except ValueError as e:
+                if 'text/html content not found in email' in str(e):
+                    # Try plain text
+                    self.unstructured_kwargs["content_source"] = "text/plain"
+                    doc = UnstructuredEmailLoader.load(self)
+                else:
+                    raise
+        except Exception as e:
+            # Add file_path to the exception message
+            raise type(e)(f"{self.file_path}: {e}") from e
+
+        return doc
+
+
+# Map file extensions to document loaders and their arguments
+LOADER_MAPPING = {
+    ".csv": (CSVLoader, {}),
+    # ".docx": (Docx2txtLoader, {}),
+    ".doc": (UnstructuredWordDocumentLoader, {}),
+    ".docx": (UnstructuredWordDocumentLoader, {}),
+    ".enex": (EverNoteLoader, {}),
+    ".eml": (MyElmLoader, {}),
+    ".epub": (UnstructuredEPubLoader, {}),
+    ".html": (UnstructuredHTMLLoader, {}),
+    ".md": (UnstructuredMarkdownLoader, {}),
+    ".odt": (UnstructuredODTLoader, {}),
+    ".pdf": (PyMuPDFLoader, {}),
+    ".ppt": (UnstructuredPowerPointLoader, {}),
+    ".pptx": (UnstructuredPowerPointLoader, {}),
+    ".txt": (TextLoader, {"encoding": "utf8"}),
+    # Add more mappings for other file extensions and loaders as needed
+}
+
+
+# Load a single document from the specified file path
+def load_single_document(file_path: str) -> List[Document]:  # returns a list of 'Document' objects
+    ext = "." + file_path.rsplit(".", 1)[-1]
+    if ext in LOADER_MAPPING:
+        loader_class, loader_args = LOADER_MAPPING[ext]
+        loader = loader_class(file_path, **loader_args)
+        return loader.load()
+
+    raise ValueError(f"Unsupported file extension '{ext}'")
+
+# Load all documents from a source directory, optionally ignoring specific files
+def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
+    """
+    Loads all documents from the source documents directory, ignoring specified files
+    """
+    all_files = []
+    for ext in LOADER_MAPPING:
+        all_files.extend(
+            glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
+        )
+    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]
+    # Parallel processing: call load_single_document for each file path
+    with Pool(processes=os.cpu_count()) as pool:
+        results = []
+        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
+            for docs in pool.imap_unordered(load_single_document, filtered_files):
+                results.extend(docs)
+                pbar.update()
+
+    return results
+
+# Load documents from source_directory and split them into chunks
+def process_documents(ignored_files: List[str] = []) -> List[Document]:
+    """
+    Load documents and split them into chunks
+    """
+    print(f"Loading documents from {source_directory}")
+    documents = load_documents(source_directory, ignored_files)
+    if not documents:
+        print("No new documents to load")
+        exit(0)
+    print(f"Loaded {len(documents)} new documents from {source_directory}")
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    texts = text_splitter.split_documents(documents)
+    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
+    return texts  # a list of split text chunks
+
+def does_vectorstore_exist(persist_directory: str) -> bool:
+    """
+    Checks if a vectorstore already exists
+    """
+    # chromadb 0.4.x (as pinned in requirements.txt) persists to a single
+    # chroma.sqlite3 file, which is the layout committed in this repo; the old
+    # duckdb+parquet check from earlier chroma versions never matches here
+    return os.path.exists(os.path.join(persist_directory, 'chroma.sqlite3'))
+
+def main():
+    # Create embeddings
+    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+
+    if does_vectorstore_exist(persist_directory):
+        # Update the existing local vectorstore
+        print(f"Appending to existing vectorstore at {persist_directory}")
+        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+        collection = db.get()
+        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
+        print("Creating embeddings. This may take a few minutes...")
+        db.add_documents(texts)
+    else:
+        # Create and store a new local vectorstore
+        print("Creating new vectorstore")
+        texts = process_documents()
+        print("Creating embeddings. This may take a few minutes...")
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
+    db.persist()
+    db = None
+
+    print("Ingestion complete! You can now run app.py to query your documents")
+
+
+if __name__ == "__main__":
+    main()
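After an ingest run, the store can be spot-checked with a similarity search. A minimal sketch, assuming the default `db` directory and embedding model (the query string is illustrative):

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import Chroma

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="db", embedding_function=embeddings)
    print(f"{len(db.get()['ids'])} chunks in the vectorstore")
    # Show the source and a preview of the two closest chunks
    for doc in db.similarity_search("How do I evaluate a market?", k=2):
        print(doc.metadata.get("source"), doc.page_content[:80])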
requirements.txt ADDED
@@ -0,0 +1,14 @@
+# python 3.11.5
+langchain==0.0.274
+chromadb==0.4.7
+urllib3==2.0.4
+PyMuPDF==1.23.5
+python-dotenv==1.0.0
+unstructured==0.10.8
+extract-msg==0.45.0
+tabulate==0.9.0
+pandoc==2.3
+pypandoc==1.11
+tqdm==4.66.1
+sentence_transformers==2.2.2
+streamlit==1.29.0
source_documents/The Personal MBA.pdf ADDED
Binary file (500 kB)