improvements and docs
Browse files- README.md +14 -0
- code/.chainlit/config.toml +0 -84
- code/config.yml +1 -0
- code/modules/llm_tutor.py +4 -3
- code/vectorstores/db_FAISS_sentence-transformers/all-MiniLM-L6-v2/index.faiss +0 -0
- code/vectorstores/db_FAISS_sentence-transformers/all-MiniLM-L6-v2/index.pkl +0 -0
- code/vectorstores/db_FAISS_text-embedding-ada-002/index.faiss +0 -0
- code/vectorstores/db_FAISS_text-embedding-ada-002/index.pkl +0 -0
- docs/README.md +41 -1
README.md
CHANGED
@@ -12,3 +12,17 @@ DL4DS Tutor
|
|
12 |
===========
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
===========
|
13 |
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
15 |
+
|
16 |
+
To run locally,
|
17 |
+
|
18 |
+
Clone the repository from: https://github.com/DL4DS/dl4ds_tutor
|
19 |
+
|
20 |
+
Put your data under the `storage/data` directory. Note: add URLs to the `urls.txt` file, and place PDF files in the `storage/data` directory.
|
21 |
+
|
22 |
+
To create the Vector Database, run the following command:
|
23 |
+
```python code/modules/vector_db.py```
|
24 |
+
|
25 |
+
To run the chainlit app, run the following command:
|
26 |
+
```chainlit run code/main.py```
|
27 |
+
|
28 |
+
See the [docs](https://github.com/DL4DS/dl4ds_tutor/tree/main/docs) for more information.
|
code/.chainlit/config.toml
DELETED
@@ -1,84 +0,0 @@
|
|
1 |
-
[project]
|
2 |
-
# Whether to enable telemetry (default: true). No personal data is collected.
|
3 |
-
enable_telemetry = true
|
4 |
-
|
5 |
-
# List of environment variables to be provided by each user to use the app.
|
6 |
-
user_env = []
|
7 |
-
|
8 |
-
# Duration (in seconds) during which the session is saved when the connection is lost
|
9 |
-
session_timeout = 3600
|
10 |
-
|
11 |
-
# Enable third parties caching (e.g LangChain cache)
|
12 |
-
cache = false
|
13 |
-
|
14 |
-
# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
|
15 |
-
# follow_symlink = false
|
16 |
-
|
17 |
-
[features]
|
18 |
-
# Show the prompt playground
|
19 |
-
prompt_playground = true
|
20 |
-
|
21 |
-
# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
|
22 |
-
unsafe_allow_html = false
|
23 |
-
|
24 |
-
# Process and display mathematical expressions. This can clash with "$" characters in messages.
|
25 |
-
latex = false
|
26 |
-
|
27 |
-
# Authorize users to upload files with messages
|
28 |
-
multi_modal = true
|
29 |
-
|
30 |
-
# Allows user to use speech to text
|
31 |
-
[features.speech_to_text]
|
32 |
-
enabled = false
|
33 |
-
# See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
|
34 |
-
# language = "en-US"
|
35 |
-
|
36 |
-
[UI]
|
37 |
-
# Name of the app and chatbot.
|
38 |
-
name = "LLM Tutor"
|
39 |
-
|
40 |
-
# Show the readme while the conversation is empty.
|
41 |
-
show_readme_as_default = true
|
42 |
-
|
43 |
-
# Description of the app and chatbot. This is used for HTML tags.
|
44 |
-
# description = ""
|
45 |
-
|
46 |
-
# Large size content are by default collapsed for a cleaner ui
|
47 |
-
default_collapse_content = true
|
48 |
-
|
49 |
-
# The default value for the expand messages settings.
|
50 |
-
default_expand_messages = false
|
51 |
-
|
52 |
-
# Hide the chain of thought details from the user in the UI.
|
53 |
-
hide_cot = false
|
54 |
-
|
55 |
-
# Link to your github repo. This will add a github button in the UI's header.
|
56 |
-
# github = "https://github.com/DL4DS/dl4ds_tutor"
|
57 |
-
|
58 |
-
# Specify a CSS file that can be used to customize the user interface.
|
59 |
-
# The CSS file can be served from the public directory or via an external link.
|
60 |
-
# custom_css = "/public/test.css"
|
61 |
-
|
62 |
-
# Override default MUI light theme. (Check theme.ts)
|
63 |
-
[UI.theme.light]
|
64 |
-
#background = "#FAFAFA"
|
65 |
-
#paper = "#FFFFFF"
|
66 |
-
|
67 |
-
[UI.theme.light.primary]
|
68 |
-
#main = "#F80061"
|
69 |
-
#dark = "#980039"
|
70 |
-
#light = "#FFE7EB"
|
71 |
-
|
72 |
-
# Override default MUI dark theme. (Check theme.ts)
|
73 |
-
[UI.theme.dark]
|
74 |
-
#background = "#FAFAFA"
|
75 |
-
#paper = "#FFFFFF"
|
76 |
-
|
77 |
-
[UI.theme.dark.primary]
|
78 |
-
#main = "#F80061"
|
79 |
-
#dark = "#980039"
|
80 |
-
#light = "#FFE7EB"
|
81 |
-
|
82 |
-
|
83 |
-
[meta]
|
84 |
-
generated_by = "0.7.700"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
code/config.yml
CHANGED
@@ -10,6 +10,7 @@ embedding_options:
|
|
10 |
search_top_k : 3 # int
|
11 |
llm_params:
|
12 |
use_history: False # bool
|
|
|
13 |
llm_loader: 'local_llm' # str [local_llm, openai]
|
14 |
openai_params:
|
15 |
model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
|
|
|
10 |
search_top_k : 3 # int
|
11 |
llm_params:
|
12 |
use_history: False # bool
|
13 |
+
memory_window: 3 # int
|
14 |
llm_loader: 'local_llm' # str [local_llm, openai]
|
15 |
openai_params:
|
16 |
model: 'gpt-4' # str [gpt-3.5-turbo-1106, gpt-4]
|
code/modules/llm_tutor.py
CHANGED
@@ -5,7 +5,7 @@ from langchain_community.embeddings import OpenAIEmbeddings
|
|
5 |
from langchain.vectorstores import FAISS
|
6 |
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
|
7 |
from langchain.llms import CTransformers
|
8 |
-
from langchain.memory import
|
9 |
from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
|
10 |
import os
|
11 |
|
@@ -35,8 +35,9 @@ class LLMTutor:
|
|
35 |
# Retrieval QA Chain
|
36 |
def retrieval_qa_chain(self, llm, prompt, db):
|
37 |
if self.config["llm_params"]["use_history"]:
|
38 |
-
memory =
|
39 |
-
|
|
|
40 |
)
|
41 |
qa_chain = ConversationalRetrievalChain.from_llm(
|
42 |
llm=llm,
|
|
|
5 |
from langchain.vectorstores import FAISS
|
6 |
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
|
7 |
from langchain.llms import CTransformers
|
8 |
+
from langchain.memory import ConversationBufferWindowMemory
|
9 |
from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
|
10 |
import os
|
11 |
|
|
|
35 |
# Retrieval QA Chain
|
36 |
def retrieval_qa_chain(self, llm, prompt, db):
|
37 |
if self.config["llm_params"]["use_history"]:
|
38 |
+
memory = ConversationBufferWindowMemory(
|
39 |
+
k = self.config["llm_params"]["memory_window"],
|
40 |
+
memory_key="chat_history", return_messages=True, output_key="answer"
|
41 |
)
|
42 |
qa_chain = ConversationalRetrievalChain.from_llm(
|
43 |
llm=llm,
|
code/vectorstores/db_FAISS_sentence-transformers/all-MiniLM-L6-v2/index.faiss
DELETED
Binary file (6.19 kB)
|
|
code/vectorstores/db_FAISS_sentence-transformers/all-MiniLM-L6-v2/index.pkl
DELETED
Binary file (9.21 kB)
|
|
code/vectorstores/db_FAISS_text-embedding-ada-002/index.faiss
DELETED
Binary file (24.6 kB)
|
|
code/vectorstores/db_FAISS_text-embedding-ada-002/index.pkl
DELETED
Binary file (9.21 kB)
|
|
docs/README.md
CHANGED
@@ -1,3 +1,43 @@
|
|
1 |
# Documentation
|
2 |
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Documentation
|
2 |
|
3 |
+
## File Structure:
|
4 |
+
- `docs/` - Documentation files
|
5 |
+
- `code/` - Code files
|
6 |
+
- `storage/` - Storage files
|
7 |
+
- `vectorstores/` - Vector Databases
|
8 |
+
- `.env` - Environment Variables
|
9 |
+
- `Dockerfile` - Dockerfile for Hugging Face
|
10 |
+
- `.chainlit` - Chainlit Configuration
|
11 |
+
- `chainlit.md` - Chainlit README
|
12 |
+
- `README.md` - Repository README
|
13 |
+
- `.gitignore` - Gitignore file
|
14 |
+
- `requirements.txt` - Python Requirements
|
15 |
+
- `.gitattributes` - Gitattributes file
|
16 |
+
|
17 |
+
## Code Structure
|
18 |
+
|
19 |
+
- `code/main.py` - Main Chainlit App
|
20 |
+
- `code/config.yml` - Configuration file for setting embedding-related, vector-database-related, and chat-model-related parameters.
|
21 |
+
- `code/modules/vector_db.py` - Vector Database Creation
|
22 |
+
- `code/modules/chat_model_loader.py` - Chat Model Loader (Creates the Chat Model)
|
23 |
+
- `code/modules/constants.py` - Constants (Loads the Environment Variables, Prompts, Model Paths, etc.)
|
24 |
+
- `code/modules/data_loader.py` - Loads and Chunks the Data
|
25 |
+
- `code/modules/embedding_model.py` - Creates the Embedding Model to Embed the Data
|
26 |
+
- `code/modules/llm_tutor.py` - Creates the RAG LLM Tutor
|
27 |
+
- The Function `qa_bot()` loads the vector database and the chat model, and sets the prompt to pass to the chat model.
|
28 |
+
- `code/modules/helpers.py` - Helper Functions
|
29 |
+
|
30 |
+
## Storage and Vectorstores
|
31 |
+
|
32 |
+
- `storage/data/` - Data Storage (Put your pdf files under this directory, and urls in the urls.txt file)
|
33 |
+
- `storage/models/` - Model Storage (Put your local LLMs under this directory)
|
34 |
+
|
35 |
+
- `vectorstores/` - Vector Databases (Stores the Vector Databases generated from `code/modules/vector_db.py`)
|
36 |
+
|
37 |
+
|
38 |
+
## Useful Configurations
|
39 |
+
Set these in `code/config.yml`:
|
40 |
+
* ``["embedding_options"]["expand_urls"]`` - If set to True, gets and reads the data from all the links under the url provided. If set to False, only reads the data in the url provided.
|
41 |
+
* ``["embedding_options"]["search_top_k"]`` - Number of sources that the retriever returns
|
42 |
+
* ``["llm_params"]["use_history"]`` - Whether to use history in the prompt or not
|
43 |
+
* ``["llm_params"]["memory_window"]`` - Number of interactions to keep track of in the history
|