XThomasBU committed on
Commit
db6b619
·
1 Parent(s): ce9ef3e

improvements and docs

Browse files
README.md CHANGED
@@ -12,3 +12,17 @@ DL4DS Tutor
12
  ===========
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ===========
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+
16
+ To run locally,
17
+
18
+ Clone the repository from: https://github.com/DL4DS/dl4ds_tutor
19
+
20
+ Put your data under the `storage/data` directory. Note: you can list URLs in the `urls.txt` file and place PDF files in the `storage/data` directory.
21
+
22
+ To create the Vector Database, run the following command:
23
+ ```python code/modules/vector_db.py```
24
+
25
+ To run the chainlit app, run the following command:
26
+ ```chainlit run code/main.py```
27
+
28
+ See the [docs](https://github.com/DL4DS/dl4ds_tutor/tree/main/docs) for more information.
code/.chainlit/config.toml DELETED
@@ -1,84 +0,0 @@
1
- [project]
2
- # Whether to enable telemetry (default: true). No personal data is collected.
3
- enable_telemetry = true
4
-
5
- # List of environment variables to be provided by each user to use the app.
6
- user_env = []
7
-
8
- # Duration (in seconds) during which the session is saved when the connection is lost
9
- session_timeout = 3600
10
-
11
- # Enable third parties caching (e.g LangChain cache)
12
- cache = false
13
-
14
- # Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
15
- # follow_symlink = false
16
-
17
- [features]
18
- # Show the prompt playground
19
- prompt_playground = true
20
-
21
- # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
22
- unsafe_allow_html = false
23
-
24
- # Process and display mathematical expressions. This can clash with "$" characters in messages.
25
- latex = false
26
-
27
- # Authorize users to upload files with messages
28
- multi_modal = true
29
-
30
- # Allows user to use speech to text
31
- [features.speech_to_text]
32
- enabled = false
33
- # See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
34
- # language = "en-US"
35
-
36
- [UI]
37
- # Name of the app and chatbot.
38
- name = "LLM Tutor"
39
-
40
- # Show the readme while the conversation is empty.
41
- show_readme_as_default = true
42
-
43
- # Description of the app and chatbot. This is used for HTML tags.
44
- # description = ""
45
-
46
- # Large size content are by default collapsed for a cleaner ui
47
- default_collapse_content = true
48
-
49
- # The default value for the expand messages settings.
50
- default_expand_messages = false
51
-
52
- # Hide the chain of thought details from the user in the UI.
53
- hide_cot = false
54
-
55
- # Link to your github repo. This will add a github button in the UI's header.
56
- # github = "https://github.com/DL4DS/dl4ds_tutor"
57
-
58
- # Specify a CSS file that can be used to customize the user interface.
59
- # The CSS file can be served from the public directory or via an external link.
60
- # custom_css = "/public/test.css"
61
-
62
- # Override default MUI light theme. (Check theme.ts)
63
- [UI.theme.light]
64
- #background = "#FAFAFA"
65
- #paper = "#FFFFFF"
66
-
67
- [UI.theme.light.primary]
68
- #main = "#F80061"
69
- #dark = "#980039"
70
- #light = "#FFE7EB"
71
-
72
- # Override default MUI dark theme. (Check theme.ts)
73
- [UI.theme.dark]
74
- #background = "#FAFAFA"
75
- #paper = "#FFFFFF"
76
-
77
- [UI.theme.dark.primary]
78
- #main = "#F80061"
79
- #dark = "#980039"
80
- #light = "#FFE7EB"
81
-
82
-
83
- [meta]
84
- generated_by = "0.7.700"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/config.yml CHANGED
@@ -10,6 +10,7 @@ embedding_options:
10
  search_top_k : 3 # int
11
  llm_params:
12
  use_history: False # bool
 
13
  llm_loader: 'local_llm' # str [local_llm, openai]
14
  openai_params:
15
  model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
 
10
  search_top_k : 3 # int
11
  llm_params:
12
  use_history: False # bool
13
+ memory_window: 3 # int
14
  llm_loader: 'local_llm' # str [local_llm, openai]
15
  openai_params:
16
  model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
code/modules/llm_tutor.py CHANGED
@@ -5,7 +5,7 @@ from langchain_community.embeddings import OpenAIEmbeddings
5
  from langchain.vectorstores import FAISS
6
  from langchain.chains import RetrievalQA, ConversationalRetrievalChain
7
  from langchain.llms import CTransformers
8
- from langchain.memory import ConversationBufferMemory
9
  from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
10
  import os
11
 
@@ -35,8 +35,9 @@ class LLMTutor:
35
  # Retrieval QA Chain
36
  def retrieval_qa_chain(self, llm, prompt, db):
37
  if self.config["llm_params"]["use_history"]:
38
- memory = ConversationBufferMemory(
39
- memory_key="chat_history", return_messages=True, output_key="answer"
 
40
  )
41
  qa_chain = ConversationalRetrievalChain.from_llm(
42
  llm=llm,
 
5
  from langchain.vectorstores import FAISS
6
  from langchain.chains import RetrievalQA, ConversationalRetrievalChain
7
  from langchain.llms import CTransformers
8
+ from langchain.memory import ConversationBufferWindowMemory
9
  from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
10
  import os
11
 
 
35
  # Retrieval QA Chain
36
  def retrieval_qa_chain(self, llm, prompt, db):
37
  if self.config["llm_params"]["use_history"]:
38
+ memory = ConversationBufferWindowMemory(
39
+ k = self.config["llm_params"]["memory_window"],
40
+ memory_key="chat_history", return_messages=True, output_key="answer"
41
  )
42
  qa_chain = ConversationalRetrievalChain.from_llm(
43
  llm=llm,
code/vectorstores/db_FAISS_sentence-transformers/all-MiniLM-L6-v2/index.faiss DELETED
Binary file (6.19 kB)
 
code/vectorstores/db_FAISS_sentence-transformers/all-MiniLM-L6-v2/index.pkl DELETED
Binary file (9.21 kB)
 
code/vectorstores/db_FAISS_text-embedding-ada-002/index.faiss DELETED
Binary file (24.6 kB)
 
code/vectorstores/db_FAISS_text-embedding-ada-002/index.pkl DELETED
Binary file (9.21 kB)
 
docs/README.md CHANGED
@@ -1,3 +1,43 @@
1
  # Documentation
2
 
3
- To be updated
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Documentation
2
 
3
+ ## File Structure:
4
+ - `docs/` - Documentation files
5
+ - `code/` - Code files
6
+ - `storage/` - Storage files
7
+ - `vectorstores/` - Vector Databases
8
+ - `.env` - Environment Variables
9
+ - `Dockerfile` - Dockerfile for Hugging Face
10
+ - `.chainlit` - Chainlit Configuration
11
+ - `chainlit.md` - Chainlit README
12
+ - `README.md` - Repository README
13
+ - `.gitignore` - Gitignore file
14
+ - `requirements.txt` - Python Requirements
15
+ - `.gitattributes` - Gitattributes file
16
+
17
+ ## Code Structure
18
+
19
+ - `code/main.py` - Main Chainlit App
20
+ - `code/config.yml` - Configuration file for setting embedding-related, vector-database-related, and chat-model-related parameters.
21
+ - `code/modules/vector_db.py` - Vector Database Creation
22
+ - `code/modules/chat_model_loader.py` - Chat Model Loader (Creates the Chat Model)
23
+ - `code/modules/constants.py` - Constants (Loads the Environment Variables, Prompts, Model Paths, etc.)
24
+ - `code/modules/data_loader.py` - Loads and Chunks the Data
25
+ - `code/modules/embedding_model.py` - Creates the Embedding Model to Embed the Data
26
+ - `code/modules/llm_tutor.py` - Creates the RAG LLM Tutor
27
+ - The function `qa_bot()` loads the vector database and the chat model, and sets the prompt to pass to the chat model.
28
+ - `code/modules/helpers.py` - Helper Functions
29
+
30
+ ## Storage and Vectorstores
31
+
32
+ - `storage/data/` - Data Storage (Put your PDF files under this directory, and URLs in the `urls.txt` file)
33
+ - `storage/models/` - Model Storage (Put your local LLMs under this directory)
34
+
35
+ - `vectorstores/` - Vector Databases (Stores the Vector Databases generated from `code/modules/vector_db.py`)
36
+
37
+
38
+ ## Useful Configurations
39
+ Set these in `code/config.yml`:
40
+ * ``["embedding_options"]["expand_urls"]`` - If set to True, fetches and reads the data from all links found under the provided URL. If set to False, reads only the data at the provided URL.
41
+ * ``["embedding_options"]["search_top_k"]`` - Number of sources that the retriever returns
42
+ * ``["llm_params"]["use_history"]`` - Whether to use history in the prompt or not
43
+ * ``["llm_params"]["memory_window"]`` - Number of interactions to keep track of in the history