Tien09 committed
Commit bc3cd48 · Parent(s): 0643d88

Upload 13 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+db/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,22 @@
+FROM python:3.11.5
+
+WORKDIR /app
+
+COPY ./requirements.txt /app/requirements.txt
+
+RUN pip3 install --no-cache-dir -r /app/requirements.txt
+
+# Run as a non-root user (UID 1000, as expected on Hugging Face Spaces)
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME /home/user
+ENV PATH $HOME/.local/bin:$PATH
+
+WORKDIR $HOME
+RUN mkdir app
+WORKDIR $HOME/app
+COPY . $HOME/app
+
+EXPOSE 7860
+CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0", "--server.port", "7860"]
+
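Note on the CMD line: `streamlit run` takes `--server.address` and `--server.port`, not `--host`. The image appears to target Hugging Face Spaces, which expects the app on port 7860. A minimal smoke test, assuming the image has been built and started locally (the image name is illustrative, not part of this commit):

    # Assumes something like `docker build -t personal-mba .` followed by
    # `docker run -p 7860:7860 personal-mba` (hypothetical image name).
    import urllib.request

    # Streamlit serves its index page at the root path once the app is up.
    with urllib.request.urlopen("http://localhost:7860", timeout=10) as resp:
        print("HTTP status:", resp.status)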
app.py ADDED
@@ -0,0 +1,128 @@
+# __import__('pysqlite3')
+# import sys
+# sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
+
+import os
+
+import streamlit as st
+import chromadb
+from chromadb.config import Settings
+from constants import CHROMA_SETTINGS
+from langchain.chains import RetrievalQA
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.vectorstores import Chroma
+from langchain.llms import Ollama
+from langchain.callbacks.base import BaseCallbackHandler
+
+# Custom Streamlit callback handler to display LLM output in streaming mode
+class StreamHandler(BaseCallbackHandler):
+    def __init__(self, container, initial_text=""):
+        self.container = container
+        self.text = initial_text
+
+    def on_llm_new_token(self, token: str, **kwargs) -> None:
+        self.text += token
+        self.container.markdown(self.text)
+
+# Streamlit UI configuration
+def setup_page():
+    st.set_page_config(layout="wide")
+    st.markdown("<h2 style='text-align: center; color: white;'>Your Personal MBA</h2>", unsafe_allow_html=True)
+    url = 'https://personalmba.com/'
+    col1, col2, col3 = st.columns(3)
+    with col2:
+        st.markdown("""
+            <div style="text-align: center;">
+                <h5 style='color: white;'>Inspired by</h5>
+                <a href="%s">The Personal MBA by Josh Kaufman</a>
+            </div>
+        """ % url, unsafe_allow_html=True)
+    st.divider()
+
+# Read the environment variables used to configure the app
+def get_environment_variables():
+    model = os.environ.get("MODEL", "mistral")
+    embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME", "all-MiniLM-L6-v2")
+    persist_directory = os.environ.get("PERSIST_DIRECTORY", "db")
+    target_source_chunks = int(os.environ.get("TARGET_SOURCE_CHUNKS", 4))
+    return model, embeddings_model_name, persist_directory, target_source_chunks
+
+# Create the knowledge-base retriever
+def create_knowledge_base(embeddings_model_name, persist_directory, target_source_chunks):
+    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+    retriever = db.as_retriever(search_kwargs={"k": target_source_chunks})
+    return retriever
+
+# Handle a query when the user hits 'Enter' on a question
+def handle_query(query, model, retriever):
+    with st.chat_message('assistant'):
+        with st.spinner("Generating answer..."):
+            message_placeholder = st.empty()
+            stream_handler = StreamHandler(message_placeholder)
+            llm = Ollama(model=model, callbacks=[stream_handler])
+            qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=False)
+            res = qa(query)
+            answer = res['result']
+            message_placeholder.markdown(answer)
+    return answer
+
+# Store previous messages in session state, creating a 'memory' for the chat
+def initialize_session():
+    if 'messages' not in st.session_state:
+        st.session_state.messages = []
+
+# Display the message history
+def display_messages():
+    for message in st.session_state.messages:
+        with st.chat_message(message['role']):
+            st.markdown(message['content'])
+
+# Example questions shown when the user first loads the app; they disappear after the first query
+def show_examples():
+    examples = st.empty()
+    with examples.container():
+        with st.chat_message('assistant'):
+            st.markdown('Example questions:')
+            st.markdown(' - How do I know that I am making the right decisions?')
+            st.markdown(' - What are the key ideas in Chapter 6 "The Human Mind"?')
+            st.markdown(' - What are common traits shared by the most successful individuals in the world?')
+            st.markdown(' - I want to be a millionaire, build me a 5 year roadmap based on the top 0.01 percent of the human population.')
+            st.markdown('So, how may I help you today?')
+    return examples
+
+
+def main():
+    setup_page()
+    initialize_session()
+    display_messages()
+    examples = show_examples()
+    model, embeddings_model_name, persist_directory, target_source_chunks = get_environment_variables()
+    retriever = create_knowledge_base(embeddings_model_name, persist_directory, target_source_chunks)
+
+    query = st.chat_input(placeholder='Ask a question...')  # starts out empty
+
+    if query:  # the user entered a query and hit 'Enter'
+        examples.empty()
+
+        st.session_state.messages.append({  # add the query to the session state
+            'role': 'user',
+            'content': query
+        })
+
+        with st.chat_message('user'):
+            st.markdown(query)
+
+        answer = handle_query(query, model, retriever)
+
+        st.session_state.messages.append({  # add the answer to the session state
+            'role': 'assistant',
+            'content': answer
+        })
+
+if __name__ == "__main__":
+    main()
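For reference, the same retrieval pipeline can be exercised outside Streamlit. A minimal sketch, assuming the `db` directory from this commit is present and a local Ollama server has the `mistral` model pulled (the example question is illustrative):

    from langchain.chains import RetrievalQA
    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.llms import Ollama
    from langchain.vectorstores import Chroma

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="db", embedding_function=embeddings)
    retriever = db.as_retriever(search_kwargs={"k": 4})  # top 4 chunks, as in app.py
    qa = RetrievalQA.from_chain_type(llm=Ollama(model="mistral"), chain_type="stuff", retriever=retriever)
    print(qa("What are mental models?")["result"])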
constants.py ADDED
@@ -0,0 +1,11 @@
+import os
+from chromadb.config import Settings
+
+# Define the folder for storing the database
+PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY', 'db')  # 'db' is the default
+
+# Define the Chroma settings
+CHROMA_SETTINGS = Settings(
+    persist_directory=PERSIST_DIRECTORY,
+    anonymized_telemetry=False
+)
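These settings can also be handed to a chromadb client directly to inspect the persisted store. A minimal sketch against the chromadb 0.4.x API pinned in requirements.txt:

    import chromadb
    from constants import CHROMA_SETTINGS, PERSIST_DIRECTORY

    # Open the on-disk store and list its collections with their sizes
    client = chromadb.PersistentClient(path=PERSIST_DIRECTORY, settings=CHROMA_SETTINGS)
    for collection in client.list_collections():
        print(collection.name, collection.count())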
db/.DS_Store ADDED
Binary file (6.15 kB)
db/b7134628-7a61-4630-adf2-934fde432f96/data_level0.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6606e50d663d8e0357beb09c60b4274ed698071f2275efa6f3fc3a64ec4aa739
+size 1676000
db/b7134628-7a61-4630-adf2-934fde432f96/header.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9de63d54afd49cbeeac013426226d9835e7c440647f9303475ea905ead14cd6
+size 100
db/b7134628-7a61-4630-adf2-934fde432f96/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dcba004a08f5ee7c4c87a677a5f0396391c4804a2e5fadaff4a6e518924247f
+size 55974
db/b7134628-7a61-4630-adf2-934fde432f96/length.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4769fa8c2a4c9da88b7df41a290fea06b511acca2281536194c748a4c89f38d3
+size 4000
db/b7134628-7a61-4630-adf2-934fde432f96/link_lists.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc83519dd4ca9640feb086425393c54166a8b6e3f1f8f29ba36ac8b24fc5b5e7
+size 8148
db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b6b5db043f332c024782c3447ad509ab63ee077fae161c9806187bb168191b
+size 13254656
ingest.py ADDED
@@ -0,0 +1,164 @@
+import os
+import glob
+from typing import List
+from multiprocessing import Pool
+from tqdm import tqdm
+
+from langchain.document_loaders import (
+    CSVLoader,
+    EverNoteLoader,
+    PyMuPDFLoader,
+    TextLoader,
+    UnstructuredEmailLoader,
+    UnstructuredEPubLoader,
+    UnstructuredHTMLLoader,
+    UnstructuredMarkdownLoader,
+    UnstructuredODTLoader,
+    UnstructuredPowerPointLoader,
+    UnstructuredWordDocumentLoader,
+)
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain.docstore.document import Document
+from constants import CHROMA_SETTINGS
+
+
+# Load environment variables
+persist_directory = os.environ.get('PERSIST_DIRECTORY', 'db')
+source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
+embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME', 'all-MiniLM-L6-v2')
+chunk_size = 500
+chunk_overlap = 50
+
+# Custom document loaders
+class MyElmLoader(UnstructuredEmailLoader):
+    """Wrapper that falls back to text/plain when the default loader does not work"""
+
+    def load(self) -> List[Document]:
+        """Wrapper adding a fallback for .eml files without HTML content"""
+        try:
+            try:
+                doc = UnstructuredEmailLoader.load(self)
+            except ValueError as e:
+                if 'text/html content not found in email' in str(e):
+                    # Try plain text
+                    self.unstructured_kwargs["content_source"] = "text/plain"
+                    doc = UnstructuredEmailLoader.load(self)
+                else:
+                    raise
+        except Exception as e:
+            # Add file_path to the exception message
+            raise type(e)(f"{self.file_path}: {e}") from e
+
+        return doc
+
+
+# Map file extensions to document loaders and their arguments
+LOADER_MAPPING = {
+    ".csv": (CSVLoader, {}),
+    # ".docx": (Docx2txtLoader, {}),
+    ".doc": (UnstructuredWordDocumentLoader, {}),
+    ".docx": (UnstructuredWordDocumentLoader, {}),
+    ".enex": (EverNoteLoader, {}),
+    ".eml": (MyElmLoader, {}),
+    ".epub": (UnstructuredEPubLoader, {}),
+    ".html": (UnstructuredHTMLLoader, {}),
+    ".md": (UnstructuredMarkdownLoader, {}),
+    ".odt": (UnstructuredODTLoader, {}),
+    ".pdf": (PyMuPDFLoader, {}),
+    ".ppt": (UnstructuredPowerPointLoader, {}),
+    ".pptx": (UnstructuredPowerPointLoader, {}),
+    ".txt": (TextLoader, {"encoding": "utf8"}),
+    # Add more mappings for other file extensions and loaders as needed
+}
+
+
+# Load a single document from the specified file path
+def load_single_document(file_path: str) -> List[Document]:  # returns a list of 'Document' objects
+    ext = "." + file_path.rsplit(".", 1)[-1]
+    if ext in LOADER_MAPPING:
+        loader_class, loader_args = LOADER_MAPPING[ext]
+        loader = loader_class(file_path, **loader_args)
+        return loader.load()
+
+    raise ValueError(f"Unsupported file extension '{ext}'")
+
+# Load all documents from a source directory, optionally ignoring specific files
+def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
+    """
+    Loads all documents from the source documents directory, ignoring specified files
+    """
+    all_files = []
+    for ext in LOADER_MAPPING:
+        all_files.extend(
+            glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
+        )
+    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]
+    # Parallel processing: call load_single_document for each file path
+    with Pool(processes=os.cpu_count()) as pool:
+        results = []
+        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
+            for docs in pool.imap_unordered(load_single_document, filtered_files):
+                results.extend(docs)
+                pbar.update()
+
+    return results
+
+# Load documents from source_directory and split them into chunks
+def process_documents(ignored_files: List[str] = []) -> List[Document]:
+    """
+    Load documents and split them into chunks
+    """
+    print(f"Loading documents from {source_directory}")
+    documents = load_documents(source_directory, ignored_files)
+    if not documents:
+        print("No new documents to load")
+        exit(0)
+    print(f"Loaded {len(documents)} new documents from {source_directory}")
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    texts = text_splitter.split_documents(documents)
+    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} characters each)")
+    return texts  # a list of split text chunks
+
+def does_vectorstore_exist(persist_directory: str) -> bool:
+    """
+    Checks if a vectorstore already exists
+    """
+    # chromadb 0.4.x (as pinned in requirements.txt) persists to a single
+    # chroma.sqlite3 file, which is the layout committed in this repo; the old
+    # duckdb+parquet check from earlier chroma versions never matches here
+    return os.path.exists(os.path.join(persist_directory, 'chroma.sqlite3'))
+
+def main():
+    # Create embeddings
+    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
+
+    if does_vectorstore_exist(persist_directory):
+        # Update the existing local vectorstore
+        print(f"Appending to existing vectorstore at {persist_directory}")
+        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
+        collection = db.get()
+        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
+        print("Creating embeddings. This may take a few minutes...")
+        db.add_documents(texts)
+    else:
+        # Create and store a new local vectorstore
+        print("Creating new vectorstore")
+        texts = process_documents()
+        print("Creating embeddings. This may take a few minutes...")
+        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
+    db.persist()
+    db = None
+
+    print("Ingestion complete! You can now run app.py to query your documents")
+
+
+if __name__ == "__main__":
+    main()
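After an ingest run, the store can be spot-checked with a similarity search. A minimal sketch, assuming the default `db` directory and embedding model (the query string is illustrative):

    from langchain.embeddings import HuggingFaceEmbeddings
    from langchain.vectorstores import Chroma

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = Chroma(persist_directory="db", embedding_function=embeddings)
    print(f"{len(db.get()['ids'])} chunks in the vectorstore")
    # Show the source and a preview of the two closest chunks
    for doc in db.similarity_search("How do I evaluate a market?", k=2):
        print(doc.metadata.get("source"), doc.page_content[:80])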
requirements.txt ADDED
@@ -0,0 +1,14 @@
+# python 3.11.5
+langchain==0.0.274
+chromadb==0.4.7
+urllib3==2.0.4
+PyMuPDF==1.23.5
+python-dotenv==1.0.0
+unstructured==0.10.8
+extract-msg==0.45.0
+tabulate==0.9.0
+pandoc==2.3
+pypandoc==1.11
+tqdm==4.66.1
+sentence_transformers==2.2.2
+streamlit==1.29.0
source_documents/The Personal MBA.pdf ADDED
Binary file (500 kB)