Merge pull request #58 from DL4DS/setup_and_format_instructions
Browse files
- .flake8 +3 -0
- .github/workflows/code_quality_check.yml +33 -0
- .gitignore +1 -0
- README.md +5 -2
- code/.chainlit/config.toml +3 -1
- code/__init__.py +0 -1
- code/main.py +24 -12
- code/modules/chat/chat_model_loader.py +1 -8
- code/modules/chat/langchain/__init__.py +0 -0
- code/modules/chat/langchain/langchain_rag.py +13 -8
- code/modules/chat/langchain/utils.py +3 -27
- code/modules/chat/llm_tutor.py +10 -7
- code/modules/chat_processor/literal_ai.py +1 -38
- code/modules/config/config.yml +1 -2
- code/modules/config/constants.py +6 -3
- code/modules/config/user_config.yml +3 -0
- code/modules/dataloader/data_loader.py +75 -49
- code/modules/dataloader/helpers.py +5 -3
- code/modules/dataloader/pdf_readers/gpt.py +27 -19
- code/modules/dataloader/pdf_readers/llama.py +24 -23
- code/modules/dataloader/webpage_crawler.py +5 -3
- code/modules/vectorstore/colbert.py +3 -2
- code/modules/vectorstore/embedding_model_loader.py +1 -7
- code/modules/vectorstore/faiss.py +10 -7
- code/modules/vectorstore/raptor.py +1 -4
- code/modules/vectorstore/store_manager.py +13 -7
- docs/README.md +0 -51
- docs/contribute.md +33 -0
- docs/setup.md +127 -0
- pyproject.toml +2 -0
- requirements.txt +3 -0
.flake8
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[flake8]
|
2 |
+
max-line-length = 88
|
3 |
+
extend-ignore = E203, E266, E501, W503
|
.github/workflows/code_quality_check.yml
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Code Quality and Security Checks
|
2 |
+
|
3 |
+
on:
|
4 |
+
push:
|
5 |
+
branches: [ main, dev_branch ]
|
6 |
+
pull_request:
|
7 |
+
branches: [ main, dev_branch ]
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
code-quality:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- uses: actions/checkout@v3
|
14 |
+
|
15 |
+
- name: Set up Python
|
16 |
+
uses: actions/setup-python@v4
|
17 |
+
with:
|
18 |
+
python-version: '3.11'
|
19 |
+
|
20 |
+
- name: Install dependencies
|
21 |
+
run: |
|
22 |
+
python -m pip install --upgrade pip
|
23 |
+
pip install flake8 black bandit
|
24 |
+
|
25 |
+
- name: Run Black
|
26 |
+
run: black --check .
|
27 |
+
|
28 |
+
- name: Run Flake8
|
29 |
+
run: flake8 .
|
30 |
+
|
31 |
+
- name: Run Bandit
|
32 |
+
run: |
|
33 |
+
bandit -r .
|
.gitignore
CHANGED
@@ -165,6 +165,7 @@ cython_debug/
|
|
165 |
.ragatouille/*
|
166 |
*/__pycache__/*
|
167 |
.chainlit/translations/
|
|
|
168 |
storage/logs/*
|
169 |
vectorstores/*
|
170 |
|
|
|
165 |
.ragatouille/*
|
166 |
*/__pycache__/*
|
167 |
.chainlit/translations/
|
168 |
+
code/.chainlit/translations/
|
169 |
storage/logs/*
|
170 |
vectorstores/*
|
171 |
|
README.md
CHANGED
@@ -15,6 +15,8 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
|
|
15 |
|
16 |
## Running Locally
|
17 |
|
|
|
|
|
18 |
1. **Clone the Repository**
|
19 |
```bash
|
20 |
git clone https://github.com/DL4DS/dl4ds_tutor
|
@@ -36,7 +38,6 @@ You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](htt
|
|
36 |
python -m modules.vectorstore.store_manager
|
37 |
```
|
38 |
- Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
|
39 |
-
- Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yaml` file, which will embed files from the storage directory every time you run the below chainlit command.
|
40 |
|
41 |
5. **Run the Chainlit App**
|
42 |
```bash
|
@@ -90,4 +91,6 @@ docker run -it --rm -p 8000:8000 dev
|
|
90 |
|
91 |
## Contributing
|
92 |
|
93 |
-
Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch.
|
|
|
|
|
|
15 |
|
16 |
## Running Locally
|
17 |
|
18 |
+
Please view `docs/setup.md` for more information on setting up the project.
|
19 |
+
|
20 |
1. **Clone the Repository**
|
21 |
```bash
|
22 |
git clone https://github.com/DL4DS/dl4ds_tutor
|
|
|
38 |
python -m modules.vectorstore.store_manager
|
39 |
```
|
40 |
- Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated.
|
|
|
41 |
|
42 |
5. **Run the Chainlit App**
|
43 |
```bash
|
|
|
91 |
|
92 |
## Contributing
|
93 |
|
94 |
+
Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch.
|
95 |
+
|
96 |
+
Please view `docs/contribute.md` for more information on contributing.
|
code/.chainlit/config.toml
CHANGED
@@ -49,6 +49,8 @@ auto_tag_thread = true
|
|
49 |
# Sample rate of the audio
|
50 |
sample_rate = 44100
|
51 |
|
|
|
|
|
52 |
[UI]
|
53 |
# Name of the assistant.
|
54 |
name = "AI Tutor"
|
@@ -115,4 +117,4 @@ custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/
|
|
115 |
#secondary = "#BDBDBD"
|
116 |
|
117 |
[meta]
|
118 |
-
generated_by = "1.1.
|
|
|
49 |
# Sample rate of the audio
|
50 |
sample_rate = 44100
|
51 |
|
52 |
+
edit_message = true
|
53 |
+
|
54 |
[UI]
|
55 |
# Name of the assistant.
|
56 |
name = "AI Tutor"
|
|
|
117 |
#secondary = "#BDBDBD"
|
118 |
|
119 |
[meta]
|
120 |
+
generated_by = "1.1.306"
|
code/__init__.py
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
from .modules import *
|
|
|
|
code/main.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import chainlit.data as cl_data
|
2 |
import asyncio
|
3 |
from modules.config.constants import (
|
4 |
-
LLAMA_PATH,
|
5 |
LITERAL_API_KEY_LOGGING,
|
6 |
LITERAL_API_URL,
|
7 |
)
|
@@ -9,7 +8,6 @@ from modules.chat_processor.literal_ai import CustomLiteralDataLayer
|
|
9 |
|
10 |
import json
|
11 |
import yaml
|
12 |
-
import os
|
13 |
from typing import Any, Dict, no_type_check
|
14 |
import chainlit as cl
|
15 |
from modules.chat.llm_tutor import LLMTutor
|
@@ -73,7 +71,14 @@ class Chatbot:
|
|
73 |
start_time = time.time()
|
74 |
|
75 |
llm_settings = cl.user_session.get("llm_settings", {})
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
llm_settings.get("chat_model"),
|
78 |
llm_settings.get("retriever_method"),
|
79 |
llm_settings.get("memory_window"),
|
@@ -113,8 +118,6 @@ class Chatbot:
|
|
113 |
),
|
114 |
)
|
115 |
|
116 |
-
tags = [chat_profile, self.config["vectorstore"]["db_option"]]
|
117 |
-
|
118 |
cl.user_session.set("chain", self.chain)
|
119 |
cl.user_session.set("llm_tutor", self.llm_tutor)
|
120 |
|
@@ -180,7 +183,7 @@ class Chatbot:
|
|
180 |
cl.input_widget.Select(
|
181 |
id="chunking_mode",
|
182 |
label="Chunking mode",
|
183 |
-
values=[
|
184 |
initial_index=1,
|
185 |
),
|
186 |
cl.input_widget.Switch(
|
@@ -241,7 +244,8 @@ class Chatbot:
|
|
241 |
) # see if the thread has any steps
|
242 |
if thread.steps or len(thread.steps) > 0:
|
243 |
return None
|
244 |
-
except:
|
|
|
245 |
return [
|
246 |
cl.Starter(
|
247 |
label="recording on CNNs?",
|
@@ -294,10 +298,18 @@ class Chatbot:
|
|
294 |
|
295 |
await self.make_llm_settings_widgets(self.config)
|
296 |
user = cl.user_session.get("user")
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
301 |
|
302 |
memory = cl.user_session.get("memory", [])
|
303 |
|
@@ -355,7 +367,7 @@ class Chatbot:
|
|
355 |
llm_settings = cl.user_session.get("llm_settings", {})
|
356 |
view_sources = llm_settings.get("view_sources", False)
|
357 |
stream = llm_settings.get("stream_response", False)
|
358 |
-
|
359 |
user_query_dict = {"input": message.content}
|
360 |
# Define the base configuration
|
361 |
chain_config = {
|
|
|
1 |
import chainlit.data as cl_data
|
2 |
import asyncio
|
3 |
from modules.config.constants import (
|
|
|
4 |
LITERAL_API_KEY_LOGGING,
|
5 |
LITERAL_API_URL,
|
6 |
)
|
|
|
8 |
|
9 |
import json
|
10 |
import yaml
|
|
|
11 |
from typing import Any, Dict, no_type_check
|
12 |
import chainlit as cl
|
13 |
from modules.chat.llm_tutor import LLMTutor
|
|
|
71 |
start_time = time.time()
|
72 |
|
73 |
llm_settings = cl.user_session.get("llm_settings", {})
|
74 |
+
(
|
75 |
+
chat_profile,
|
76 |
+
retriever_method,
|
77 |
+
memory_window,
|
78 |
+
llm_style,
|
79 |
+
generate_follow_up,
|
80 |
+
chunking_mode,
|
81 |
+
) = (
|
82 |
llm_settings.get("chat_model"),
|
83 |
llm_settings.get("retriever_method"),
|
84 |
llm_settings.get("memory_window"),
|
|
|
118 |
),
|
119 |
)
|
120 |
|
|
|
|
|
121 |
cl.user_session.set("chain", self.chain)
|
122 |
cl.user_session.set("llm_tutor", self.llm_tutor)
|
123 |
|
|
|
183 |
cl.input_widget.Select(
|
184 |
id="chunking_mode",
|
185 |
label="Chunking mode",
|
186 |
+
values=["fixed", "semantic"],
|
187 |
initial_index=1,
|
188 |
),
|
189 |
cl.input_widget.Switch(
|
|
|
244 |
) # see if the thread has any steps
|
245 |
if thread.steps or len(thread.steps) > 0:
|
246 |
return None
|
247 |
+
except Exception as e:
|
248 |
+
print(e)
|
249 |
return [
|
250 |
cl.Starter(
|
251 |
label="recording on CNNs?",
|
|
|
298 |
|
299 |
await self.make_llm_settings_widgets(self.config)
|
300 |
user = cl.user_session.get("user")
|
301 |
+
|
302 |
+
try:
|
303 |
+
self.user = {
|
304 |
+
"user_id": user.identifier,
|
305 |
+
"session_id": cl.context.session.thread_id,
|
306 |
+
}
|
307 |
+
except Exception as e:
|
308 |
+
print(e)
|
309 |
+
self.user = {
|
310 |
+
"user_id": "guest",
|
311 |
+
"session_id": cl.context.session.thread_id,
|
312 |
+
}
|
313 |
|
314 |
memory = cl.user_session.get("memory", [])
|
315 |
|
|
|
367 |
llm_settings = cl.user_session.get("llm_settings", {})
|
368 |
view_sources = llm_settings.get("view_sources", False)
|
369 |
stream = llm_settings.get("stream_response", False)
|
370 |
+
stream = False # Fix streaming
|
371 |
user_query_dict = {"input": message.content}
|
372 |
# Define the base configuration
|
373 |
chain_config = {
|
code/modules/chat/chat_model_loader.py
CHANGED
@@ -1,15 +1,8 @@
|
|
1 |
from langchain_openai import ChatOpenAI
|
2 |
-
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
|
3 |
-
from transformers import AutoTokenizer, TextStreamer
|
4 |
from langchain_community.llms import LlamaCpp
|
5 |
-
import torch
|
6 |
-
import transformers
|
7 |
import os
|
8 |
from pathlib import Path
|
9 |
from huggingface_hub import hf_hub_download
|
10 |
-
from langchain.callbacks.manager import CallbackManager
|
11 |
-
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
|
12 |
-
from modules.config.constants import LLAMA_PATH
|
13 |
|
14 |
|
15 |
class ChatModelLoader:
|
@@ -38,7 +31,7 @@ class ChatModelLoader:
|
|
38 |
self.config["llm_params"]["local_llm_params"]["model"]
|
39 |
)
|
40 |
llm = LlamaCpp(
|
41 |
-
model_path=
|
42 |
n_batch=n_batch,
|
43 |
n_ctx=2048,
|
44 |
f16_kv=True,
|
|
|
1 |
from langchain_openai import ChatOpenAI
|
|
|
|
|
2 |
from langchain_community.llms import LlamaCpp
|
|
|
|
|
3 |
import os
|
4 |
from pathlib import Path
|
5 |
from huggingface_hub import hf_hub_download
|
|
|
|
|
|
|
6 |
|
7 |
|
8 |
class ChatModelLoader:
|
|
|
31 |
self.config["llm_params"]["local_llm_params"]["model"]
|
32 |
)
|
33 |
llm = LlamaCpp(
|
34 |
+
model_path=model_path,
|
35 |
n_batch=n_batch,
|
36 |
n_ctx=2048,
|
37 |
f16_kv=True,
|
code/modules/chat/langchain/__init__.py
ADDED
File without changes
|
code/modules/chat/langchain/langchain_rag.py
CHANGED
@@ -1,17 +1,22 @@
|
|
1 |
from langchain_core.prompts import ChatPromptTemplate
|
2 |
|
3 |
-
from modules.chat.langchain.utils import
|
4 |
-
from
|
5 |
from modules.chat.base import BaseRAG
|
6 |
from langchain_core.prompts import PromptTemplate
|
7 |
-
from langchain.memory import
|
8 |
-
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
)
|
11 |
|
12 |
-
import chainlit as cl
|
13 |
-
from langchain_community.chat_models import ChatOpenAI
|
14 |
-
|
15 |
|
16 |
class Langchain_RAG_V1(BaseRAG):
|
17 |
|
|
|
1 |
from langchain_core.prompts import ChatPromptTemplate
|
2 |
|
3 |
+
# from modules.chat.langchain.utils import
|
4 |
+
from langchain_community.chat_message_histories import ChatMessageHistory
|
5 |
from modules.chat.base import BaseRAG
|
6 |
from langchain_core.prompts import PromptTemplate
|
7 |
+
from langchain.memory import ConversationBufferWindowMemory
|
8 |
+
from langchain_core.runnables.utils import ConfigurableFieldSpec
|
9 |
+
from .utils import (
|
10 |
+
CustomConversationalRetrievalChain,
|
11 |
+
create_history_aware_retriever,
|
12 |
+
create_stuff_documents_chain,
|
13 |
+
create_retrieval_chain,
|
14 |
+
return_questions,
|
15 |
+
CustomRunnableWithHistory,
|
16 |
+
BaseChatMessageHistory,
|
17 |
+
InMemoryHistory,
|
18 |
)
|
19 |
|
|
|
|
|
|
|
20 |
|
21 |
class Langchain_RAG_V1(BaseRAG):
|
22 |
|
code/modules/chat/langchain/utils.py
CHANGED
@@ -1,53 +1,29 @@
|
|
1 |
from typing import Any, Dict, List, Union, Tuple, Optional
|
2 |
-
from langchain_core.messages import (
|
3 |
-
BaseMessage,
|
4 |
-
AIMessage,
|
5 |
-
FunctionMessage,
|
6 |
-
HumanMessage,
|
7 |
-
)
|
8 |
-
|
9 |
from langchain_core.prompts.base import BasePromptTemplate, format_document
|
10 |
-
from langchain_core.prompts.chat import MessagesPlaceholder
|
11 |
from langchain_core.output_parsers import StrOutputParser
|
12 |
from langchain_core.output_parsers.base import BaseOutputParser
|
13 |
from langchain_core.retrievers import BaseRetriever, RetrieverOutput
|
14 |
from langchain_core.language_models import LanguageModelLike
|
15 |
from langchain_core.runnables import Runnable, RunnableBranch, RunnablePassthrough
|
16 |
from langchain_core.runnables.history import RunnableWithMessageHistory
|
17 |
-
from langchain_core.runnables.utils import ConfigurableFieldSpec
|
18 |
from langchain_core.chat_history import BaseChatMessageHistory
|
19 |
from langchain_core.pydantic_v1 import BaseModel, Field
|
20 |
from langchain.chains.combine_documents.base import (
|
21 |
DEFAULT_DOCUMENT_PROMPT,
|
22 |
DEFAULT_DOCUMENT_SEPARATOR,
|
23 |
DOCUMENTS_KEY,
|
24 |
-
BaseCombineDocumentsChain,
|
25 |
_validate_prompt,
|
26 |
)
|
27 |
-
from langchain.chains.llm import LLMChain
|
28 |
-
from langchain_core.callbacks import Callbacks
|
29 |
-
from langchain_core.documents import Document
|
30 |
-
|
31 |
-
|
32 |
-
CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
|
33 |
-
|
34 |
from langchain_core.runnables.config import RunnableConfig
|
35 |
-
from langchain_core.messages import BaseMessage
|
36 |
-
|
37 |
-
|
38 |
-
from langchain_core.output_parsers import StrOutputParser
|
39 |
from langchain_core.prompts import ChatPromptTemplate
|
40 |
from langchain_community.chat_models import ChatOpenAI
|
41 |
-
|
42 |
-
from langchain.chains import RetrievalQA, ConversationalRetrievalChain
|
43 |
-
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
44 |
-
|
45 |
-
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
|
46 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
47 |
import inspect
|
48 |
-
from langchain.chains.conversational_retrieval.base import _get_chat_history
|
49 |
from langchain_core.messages import BaseMessage
|
50 |
|
|
|
|
|
51 |
|
52 |
class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
|
53 |
|
|
|
1 |
from typing import Any, Dict, List, Union, Tuple, Optional
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
from langchain_core.prompts.base import BasePromptTemplate, format_document
|
|
|
3 |
from langchain_core.output_parsers import StrOutputParser
|
4 |
from langchain_core.output_parsers.base import BaseOutputParser
|
5 |
from langchain_core.retrievers import BaseRetriever, RetrieverOutput
|
6 |
from langchain_core.language_models import LanguageModelLike
|
7 |
from langchain_core.runnables import Runnable, RunnableBranch, RunnablePassthrough
|
8 |
from langchain_core.runnables.history import RunnableWithMessageHistory
|
|
|
9 |
from langchain_core.chat_history import BaseChatMessageHistory
|
10 |
from langchain_core.pydantic_v1 import BaseModel, Field
|
11 |
from langchain.chains.combine_documents.base import (
|
12 |
DEFAULT_DOCUMENT_PROMPT,
|
13 |
DEFAULT_DOCUMENT_SEPARATOR,
|
14 |
DOCUMENTS_KEY,
|
|
|
15 |
_validate_prompt,
|
16 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
from langchain_core.runnables.config import RunnableConfig
|
|
|
|
|
|
|
|
|
18 |
from langchain_core.prompts import ChatPromptTemplate
|
19 |
from langchain_community.chat_models import ChatOpenAI
|
20 |
+
from langchain.chains import ConversationalRetrievalChain
|
|
|
|
|
|
|
|
|
21 |
from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun
|
22 |
import inspect
|
|
|
23 |
from langchain_core.messages import BaseMessage
|
24 |
|
25 |
+
CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage]
|
26 |
+
|
27 |
|
28 |
class CustomConversationalRetrievalChain(ConversationalRetrievalChain):
|
29 |
|
code/modules/chat/llm_tutor.py
CHANGED
@@ -3,7 +3,6 @@ from modules.chat.chat_model_loader import ChatModelLoader
|
|
3 |
from modules.vectorstore.store_manager import VectorStoreManager
|
4 |
from modules.retriever.retriever import Retriever
|
5 |
from modules.chat.langchain.langchain_rag import (
|
6 |
-
Langchain_RAG_V1,
|
7 |
Langchain_RAG_V2,
|
8 |
QuestionGenerator,
|
9 |
)
|
@@ -28,9 +27,11 @@ class LLMTutor:
|
|
28 |
self.rephrase_prompt = get_prompt(
|
29 |
config, "rephrase"
|
30 |
) # Initialize rephrase_prompt
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
34 |
|
35 |
def update_llm(self, old_config, new_config):
|
36 |
"""
|
@@ -48,9 +49,11 @@ class LLMTutor:
|
|
48 |
self.vector_db = VectorStoreManager(
|
49 |
self.config, logger=self.logger
|
50 |
).load_database() # Reinitialize VectorStoreManager if vectorstore changes
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
54 |
|
55 |
if "llm_params.llm_style" in changes:
|
56 |
self.qa_prompt = get_prompt(
|
|
|
3 |
from modules.vectorstore.store_manager import VectorStoreManager
|
4 |
from modules.retriever.retriever import Retriever
|
5 |
from modules.chat.langchain.langchain_rag import (
|
|
|
6 |
Langchain_RAG_V2,
|
7 |
QuestionGenerator,
|
8 |
)
|
|
|
27 |
self.rephrase_prompt = get_prompt(
|
28 |
config, "rephrase"
|
29 |
) # Initialize rephrase_prompt
|
30 |
+
|
31 |
+
# TODO: Removed this functionality for now, don't know if we need it
|
32 |
+
# if self.config["vectorstore"]["embedd_files"]:
|
33 |
+
# self.vector_db.create_database()
|
34 |
+
# self.vector_db.save_database()
|
35 |
|
36 |
def update_llm(self, old_config, new_config):
|
37 |
"""
|
|
|
49 |
self.vector_db = VectorStoreManager(
|
50 |
self.config, logger=self.logger
|
51 |
).load_database() # Reinitialize VectorStoreManager if vectorstore changes
|
52 |
+
|
53 |
+
# TODO: Removed this functionality for now, don't know if we need it
|
54 |
+
# if self.config["vectorstore"]["embedd_files"]:
|
55 |
+
# self.vector_db.create_database()
|
56 |
+
# self.vector_db.save_database()
|
57 |
|
58 |
if "llm_params.llm_style" in changes:
|
59 |
self.qa_prompt = get_prompt(
|
code/modules/chat_processor/literal_ai.py
CHANGED
@@ -1,44 +1,7 @@
|
|
1 |
-
from chainlit.data import ChainlitDataLayer
|
2 |
|
3 |
|
4 |
# update custom methods here (Ref: https://github.com/Chainlit/chainlit/blob/4b533cd53173bcc24abe4341a7108f0070d60099/backend/chainlit/data/__init__.py)
|
5 |
class CustomLiteralDataLayer(ChainlitDataLayer):
|
6 |
def __init__(self, **kwargs):
|
7 |
super().__init__(**kwargs)
|
8 |
-
|
9 |
-
@queue_until_user_message()
|
10 |
-
async def create_step(self, step_dict: "StepDict"):
|
11 |
-
metadata = dict(
|
12 |
-
step_dict.get("metadata", {}),
|
13 |
-
**{
|
14 |
-
"waitForAnswer": step_dict.get("waitForAnswer"),
|
15 |
-
"language": step_dict.get("language"),
|
16 |
-
"showInput": step_dict.get("showInput"),
|
17 |
-
},
|
18 |
-
)
|
19 |
-
|
20 |
-
step: LiteralStepDict = {
|
21 |
-
"createdAt": step_dict.get("createdAt"),
|
22 |
-
"startTime": step_dict.get("start"),
|
23 |
-
"endTime": step_dict.get("end"),
|
24 |
-
"generation": step_dict.get("generation"),
|
25 |
-
"id": step_dict.get("id"),
|
26 |
-
"parentId": step_dict.get("parentId"),
|
27 |
-
"name": step_dict.get("name"),
|
28 |
-
"threadId": step_dict.get("threadId"),
|
29 |
-
"type": step_dict.get("type"),
|
30 |
-
"tags": step_dict.get("tags"),
|
31 |
-
"metadata": metadata,
|
32 |
-
}
|
33 |
-
if step_dict.get("input"):
|
34 |
-
step["input"] = {"content": step_dict.get("input")}
|
35 |
-
if step_dict.get("output"):
|
36 |
-
step["output"] = {"content": step_dict.get("output")}
|
37 |
-
if step_dict.get("isError"):
|
38 |
-
step["error"] = step_dict.get("output")
|
39 |
-
|
40 |
-
# print("\n\n\n")
|
41 |
-
# print("Step: ", step)
|
42 |
-
# print("\n\n\n")
|
43 |
-
|
44 |
-
await self.client.api.send_steps([step])
|
|
|
1 |
+
from chainlit.data import ChainlitDataLayer
|
2 |
|
3 |
|
4 |
# update custom methods here (Ref: https://github.com/Chainlit/chainlit/blob/4b533cd53173bcc24abe4341a7108f0070d60099/backend/chainlit/data/__init__.py)
|
5 |
class CustomLiteralDataLayer(ChainlitDataLayer):
|
6 |
def __init__(self, **kwargs):
|
7 |
super().__init__(**kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
code/modules/config/config.yml
CHANGED
@@ -4,7 +4,7 @@ device: 'cpu' # str [cuda, cpu]
|
|
4 |
|
5 |
vectorstore:
|
6 |
load_from_HF: True # bool
|
7 |
-
|
8 |
data_path: '../storage/data' # str
|
9 |
url_file_path: '../storage/data/urls.txt' # str
|
10 |
expand_urls: True # bool
|
@@ -37,7 +37,6 @@ llm_params:
|
|
37 |
temperature: 0.7 # float
|
38 |
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
39 |
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
40 |
-
pdf_reader: 'pymupdf' # str [llama, pymupdf, gpt]
|
41 |
stream: False # bool
|
42 |
pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
|
43 |
|
|
|
4 |
|
5 |
vectorstore:
|
6 |
load_from_HF: True # bool
|
7 |
+
reparse_files: True # bool
|
8 |
data_path: '../storage/data' # str
|
9 |
url_file_path: '../storage/data/urls.txt' # str
|
10 |
expand_urls: True # bool
|
|
|
37 |
temperature: 0.7 # float
|
38 |
repo_id: 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF' # HuggingFace repo id
|
39 |
filename: 'tinyllama-1.1b-chat-v1.0.Q5_0.gguf' # Specific name of gguf file in the repo
|
|
|
40 |
stream: False # bool
|
41 |
pdf_reader: 'gpt' # str [llama, pymupdf, gpt]
|
42 |
|
code/modules/config/constants.py
CHANGED
@@ -3,6 +3,8 @@ import os
|
|
3 |
|
4 |
load_dotenv()
|
5 |
|
|
|
|
|
6 |
# API Keys - Loaded from the .env file
|
7 |
|
8 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
@@ -14,10 +16,11 @@ LITERAL_API_URL = os.getenv("LITERAL_API_URL")
|
|
14 |
OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID")
|
15 |
OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET")
|
16 |
|
17 |
-
opening_message =
|
|
|
|
|
|
|
18 |
|
19 |
# Model Paths
|
20 |
|
21 |
LLAMA_PATH = "../storage/models/tinyllama"
|
22 |
-
|
23 |
-
RETRIEVER_HF_PATHS = {"RAGatouille": "XThomasBU/Colbert_Index"}
|
|
|
3 |
|
4 |
load_dotenv()
|
5 |
|
6 |
+
TIMEOUT = 60
|
7 |
+
|
8 |
# API Keys - Loaded from the .env file
|
9 |
|
10 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
|
16 |
OAUTH_GOOGLE_CLIENT_ID = os.getenv("OAUTH_GOOGLE_CLIENT_ID")
|
17 |
OAUTH_GOOGLE_CLIENT_SECRET = os.getenv("OAUTH_GOOGLE_CLIENT_SECRET")
|
18 |
|
19 |
+
opening_message = "Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!"
|
20 |
+
chat_end_message = (
|
21 |
+
"I hope I was able to help you. If you have any more questions, feel free to ask!"
|
22 |
+
)
|
23 |
|
24 |
# Model Paths
|
25 |
|
26 |
LLAMA_PATH = "../storage/models/tinyllama"
|
|
|
|
code/modules/config/user_config.yml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
retriever:
|
2 |
+
retriever_hf_paths:
|
3 |
+
RAGatouille: "XThomasBU/Colbert_Index"
|
code/modules/dataloader/data_loader.py
CHANGED
@@ -3,40 +3,26 @@ import re
|
|
3 |
import requests
|
4 |
import pysrt
|
5 |
from langchain_community.document_loaders import (
|
6 |
-
PyMuPDFLoader,
|
7 |
Docx2txtLoader,
|
8 |
YoutubeLoader,
|
9 |
-
WebBaseLoader,
|
10 |
TextLoader,
|
11 |
)
|
12 |
-
from langchain_community.document_loaders import UnstructuredMarkdownLoader
|
13 |
-
from llama_parse import LlamaParse
|
14 |
from langchain.schema import Document
|
15 |
import logging
|
16 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
17 |
from langchain_experimental.text_splitter import SemanticChunker
|
18 |
from langchain_openai.embeddings import OpenAIEmbeddings
|
19 |
-
from ragatouille import RAGPretrainedModel
|
20 |
-
from langchain.chains import LLMChain
|
21 |
-
from langchain_community.llms import OpenAI
|
22 |
-
from langchain import PromptTemplate
|
23 |
import json
|
24 |
from concurrent.futures import ThreadPoolExecutor
|
25 |
from urllib.parse import urljoin
|
26 |
import html2text
|
27 |
import bs4
|
28 |
-
import tempfile
|
29 |
import PyPDF2
|
30 |
from modules.dataloader.pdf_readers.base import PDFReader
|
31 |
from modules.dataloader.pdf_readers.llama import LlamaParser
|
32 |
from modules.dataloader.pdf_readers.gpt import GPTParser
|
33 |
-
|
34 |
-
|
35 |
-
from modules.dataloader.helpers import get_metadata, download_pdf_from_url
|
36 |
-
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
37 |
-
except:
|
38 |
-
from dataloader.helpers import get_metadata, download_pdf_from_url
|
39 |
-
from config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
40 |
|
41 |
logger = logging.getLogger(__name__)
|
42 |
BASE_DIR = os.getcwd()
|
@@ -47,7 +33,7 @@ class HTMLReader:
|
|
47 |
pass
|
48 |
|
49 |
def read_url(self, url):
|
50 |
-
response = requests.get(url)
|
51 |
if response.status_code == 200:
|
52 |
return response.text
|
53 |
else:
|
@@ -65,11 +51,13 @@ class HTMLReader:
|
|
65 |
href = href.replace("http", "https")
|
66 |
|
67 |
absolute_url = urljoin(base_url, href)
|
68 |
-
link[
|
69 |
|
70 |
-
resp = requests.head(absolute_url)
|
71 |
if resp.status_code != 200:
|
72 |
-
logger.warning(
|
|
|
|
|
73 |
|
74 |
return str(soup)
|
75 |
|
@@ -85,6 +73,7 @@ class HTMLReader:
|
|
85 |
else:
|
86 |
return None
|
87 |
|
|
|
88 |
class FileReader:
|
89 |
def __init__(self, logger, kind):
|
90 |
self.logger = logger
|
@@ -96,7 +85,9 @@ class FileReader:
|
|
96 |
else:
|
97 |
self.pdf_reader = PDFReader()
|
98 |
self.web_reader = HTMLReader()
|
99 |
-
self.logger.info(
|
|
|
|
|
100 |
|
101 |
def extract_text_from_pdf(self, pdf_path):
|
102 |
text = ""
|
@@ -137,7 +128,7 @@ class FileReader:
|
|
137 |
return [Document(page_content=self.web_reader.read_html(url))]
|
138 |
|
139 |
def read_tex_from_url(self, tex_url):
|
140 |
-
response = requests.get(tex_url)
|
141 |
if response.status_code == 200:
|
142 |
return [Document(page_content=response.text)]
|
143 |
else:
|
@@ -154,17 +145,20 @@ class ChunkProcessor:
|
|
154 |
self.document_metadata = {}
|
155 |
self.document_chunks_full = []
|
156 |
|
157 |
-
|
|
|
158 |
self.load_document_data()
|
159 |
|
160 |
if config["splitter_options"]["use_splitter"]:
|
161 |
if config["splitter_options"]["chunking_mode"] == "fixed":
|
162 |
if config["splitter_options"]["split_by_token"]:
|
163 |
-
self.splitter =
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
|
|
|
|
168 |
)
|
169 |
else:
|
170 |
self.splitter = RecursiveCharacterTextSplitter(
|
@@ -175,8 +169,7 @@ class ChunkProcessor:
|
|
175 |
)
|
176 |
else:
|
177 |
self.splitter = SemanticChunker(
|
178 |
-
OpenAIEmbeddings(),
|
179 |
-
breakpoint_threshold_type="percentile"
|
180 |
)
|
181 |
|
182 |
else:
|
@@ -203,7 +196,10 @@ class ChunkProcessor:
|
|
203 |
):
|
204 |
# TODO: Clear up this pipeline of re-adding metadata
|
205 |
documents = [Document(page_content=documents, source=source, page=page)]
|
206 |
-
if
|
|
|
|
|
|
|
207 |
document_chunks = documents
|
208 |
else:
|
209 |
document_chunks = self.splitter.split_documents(documents)
|
@@ -229,6 +225,20 @@ class ChunkProcessor:
|
|
229 |
"https://dl4ds.github.io/sp2024/lectures/",
|
230 |
"https://dl4ds.github.io/sp2024/schedule/",
|
231 |
) # For any additional metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
with ThreadPoolExecutor() as executor:
|
233 |
executor.map(
|
234 |
self.process_file,
|
@@ -298,6 +308,7 @@ class ChunkProcessor:
|
|
298 |
self.document_metadata[file_path] = file_metadata
|
299 |
|
300 |
def process_file(self, file_path, file_index, file_reader, addl_metadata):
|
|
|
301 |
file_name = os.path.basename(file_path)
|
302 |
|
303 |
file_type = file_name.split(".")[-1]
|
@@ -314,10 +325,13 @@ class ChunkProcessor:
|
|
314 |
return
|
315 |
|
316 |
try:
|
317 |
-
|
318 |
if file_path in self.document_data:
|
319 |
self.logger.warning(f"File {file_name} already processed")
|
320 |
-
documents = [
|
|
|
|
|
|
|
321 |
else:
|
322 |
documents = read_methods[file_type](file_path)
|
323 |
|
@@ -370,22 +384,31 @@ class ChunkProcessor:
|
|
370 |
json.dump(self.document_metadata, json_file, indent=4)
|
371 |
|
372 |
def load_document_data(self):
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
|
385 |
|
386 |
class DataLoader:
|
387 |
def __init__(self, config, logger=None):
|
388 |
-
self.file_reader = FileReader(
|
|
|
|
|
389 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
390 |
|
391 |
def get_chunks(self, uploaded_files, weblinks):
|
@@ -403,19 +426,22 @@ if __name__ == "__main__":
|
|
403 |
with open("../code/modules/config/config.yml", "r") as f:
|
404 |
config = yaml.safe_load(f)
|
405 |
|
406 |
-
STORAGE_DIR = os.path.join(BASE_DIR, config[
|
407 |
uploaded_files = [
|
408 |
-
os.path.join(STORAGE_DIR, file)
|
|
|
|
|
409 |
]
|
410 |
|
411 |
data_loader = DataLoader(config, logger=logger)
|
412 |
document_chunks, document_names, documents, document_metadata = (
|
413 |
data_loader.get_chunks(
|
414 |
-
[
|
|
|
|
|
415 |
[],
|
416 |
)
|
417 |
)
|
418 |
|
419 |
print(document_names[:5])
|
420 |
print(len(document_chunks))
|
421 |
-
|
|
|
3 |
import requests
|
4 |
import pysrt
|
5 |
from langchain_community.document_loaders import (
|
|
|
6 |
Docx2txtLoader,
|
7 |
YoutubeLoader,
|
|
|
8 |
TextLoader,
|
9 |
)
|
|
|
|
|
10 |
from langchain.schema import Document
|
11 |
import logging
|
12 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
13 |
from langchain_experimental.text_splitter import SemanticChunker
|
14 |
from langchain_openai.embeddings import OpenAIEmbeddings
|
|
|
|
|
|
|
|
|
15 |
import json
|
16 |
from concurrent.futures import ThreadPoolExecutor
|
17 |
from urllib.parse import urljoin
|
18 |
import html2text
|
19 |
import bs4
|
|
|
20 |
import PyPDF2
|
21 |
from modules.dataloader.pdf_readers.base import PDFReader
|
22 |
from modules.dataloader.pdf_readers.llama import LlamaParser
|
23 |
from modules.dataloader.pdf_readers.gpt import GPTParser
|
24 |
+
from modules.dataloader.helpers import get_metadata
|
25 |
+
from modules.config.constants import TIMEOUT
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
logger = logging.getLogger(__name__)
|
28 |
BASE_DIR = os.getcwd()
|
|
|
33 |
pass
|
34 |
|
35 |
def read_url(self, url):
|
36 |
+
response = requests.get(url, timeout=TIMEOUT)
|
37 |
if response.status_code == 200:
|
38 |
return response.text
|
39 |
else:
|
|
|
51 |
href = href.replace("http", "https")
|
52 |
|
53 |
absolute_url = urljoin(base_url, href)
|
54 |
+
link["href"] = absolute_url
|
55 |
|
56 |
+
resp = requests.head(absolute_url, timeout=TIMEOUT)
|
57 |
if resp.status_code != 200:
|
58 |
+
logger.warning(
|
59 |
+
f"Link {absolute_url} is broken. Status code: {resp.status_code}"
|
60 |
+
)
|
61 |
|
62 |
return str(soup)
|
63 |
|
|
|
73 |
else:
|
74 |
return None
|
75 |
|
76 |
+
|
77 |
class FileReader:
|
78 |
def __init__(self, logger, kind):
|
79 |
self.logger = logger
|
|
|
85 |
else:
|
86 |
self.pdf_reader = PDFReader()
|
87 |
self.web_reader = HTMLReader()
|
88 |
+
self.logger.info(
|
89 |
+
f"Initialized FileReader with {kind} PDF reader and HTML reader"
|
90 |
+
)
|
91 |
|
92 |
def extract_text_from_pdf(self, pdf_path):
|
93 |
text = ""
|
|
|
128 |
return [Document(page_content=self.web_reader.read_html(url))]
|
129 |
|
130 |
def read_tex_from_url(self, tex_url):
|
131 |
+
response = requests.get(tex_url, timeout=TIMEOUT)
|
132 |
if response.status_code == 200:
|
133 |
return [Document(page_content=response.text)]
|
134 |
else:
|
|
|
145 |
self.document_metadata = {}
|
146 |
self.document_chunks_full = []
|
147 |
|
148 |
+
# TODO: Fix when reparse_files is False
|
149 |
+
if not config["vectorstore"]["reparse_files"]:
|
150 |
self.load_document_data()
|
151 |
|
152 |
if config["splitter_options"]["use_splitter"]:
|
153 |
if config["splitter_options"]["chunking_mode"] == "fixed":
|
154 |
if config["splitter_options"]["split_by_token"]:
|
155 |
+
self.splitter = (
|
156 |
+
RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
157 |
+
chunk_size=config["splitter_options"]["chunk_size"],
|
158 |
+
chunk_overlap=config["splitter_options"]["chunk_overlap"],
|
159 |
+
separators=config["splitter_options"]["chunk_separators"],
|
160 |
+
disallowed_special=(),
|
161 |
+
)
|
162 |
)
|
163 |
else:
|
164 |
self.splitter = RecursiveCharacterTextSplitter(
|
|
|
169 |
)
|
170 |
else:
|
171 |
self.splitter = SemanticChunker(
|
172 |
+
OpenAIEmbeddings(), breakpoint_threshold_type="percentile"
|
|
|
173 |
)
|
174 |
|
175 |
else:
|
|
|
196 |
):
|
197 |
# TODO: Clear up this pipeline of re-adding metadata
|
198 |
documents = [Document(page_content=documents, source=source, page=page)]
|
199 |
+
if (
|
200 |
+
file_type == "pdf"
|
201 |
+
and self.config["splitter_options"]["chunking_mode"] == "fixed"
|
202 |
+
):
|
203 |
document_chunks = documents
|
204 |
else:
|
205 |
document_chunks = self.splitter.split_documents(documents)
|
|
|
225 |
"https://dl4ds.github.io/sp2024/lectures/",
|
226 |
"https://dl4ds.github.io/sp2024/schedule/",
|
227 |
) # For any additional metadata
|
228 |
+
|
229 |
+
# remove already processed files if reparse_files is False
|
230 |
+
if not self.config["vectorstore"]["reparse_files"]:
|
231 |
+
total_documents = len(uploaded_files) + len(weblinks)
|
232 |
+
uploaded_files = [
|
233 |
+
file_path
|
234 |
+
for file_path in uploaded_files
|
235 |
+
if file_path not in self.document_data
|
236 |
+
]
|
237 |
+
weblinks = [link for link in weblinks if link not in self.document_data]
|
238 |
+
print(
|
239 |
+
f"Total documents to process: {total_documents}, Documents already processed: {total_documents - len(uploaded_files) - len(weblinks)}"
|
240 |
+
)
|
241 |
+
|
242 |
with ThreadPoolExecutor() as executor:
|
243 |
executor.map(
|
244 |
self.process_file,
|
|
|
308 |
self.document_metadata[file_path] = file_metadata
|
309 |
|
310 |
def process_file(self, file_path, file_index, file_reader, addl_metadata):
|
311 |
+
print(f"Processing file {file_index + 1} : {file_path}")
|
312 |
file_name = os.path.basename(file_path)
|
313 |
|
314 |
file_type = file_name.split(".")[-1]
|
|
|
325 |
return
|
326 |
|
327 |
try:
|
328 |
+
|
329 |
if file_path in self.document_data:
|
330 |
self.logger.warning(f"File {file_name} already processed")
|
331 |
+
documents = [
|
332 |
+
Document(page_content=content)
|
333 |
+
for content in self.document_data[file_path].values()
|
334 |
+
]
|
335 |
else:
|
336 |
documents = read_methods[file_type](file_path)
|
337 |
|
|
|
384 |
json.dump(self.document_metadata, json_file, indent=4)
|
385 |
|
386 |
def load_document_data(self):
|
387 |
+
try:
|
388 |
+
with open(
|
389 |
+
f"{self.config['log_chunk_dir']}/docs/doc_content.json", "r"
|
390 |
+
) as json_file:
|
391 |
+
self.document_data = json.load(json_file)
|
392 |
+
with open(
|
393 |
+
f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
|
394 |
+
) as json_file:
|
395 |
+
self.document_metadata = json.load(json_file)
|
396 |
+
self.logger.info(
|
397 |
+
f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
|
398 |
+
)
|
399 |
+
except FileNotFoundError:
|
400 |
+
self.logger.warning(
|
401 |
+
f"Document content not found in {self.config['log_chunk_dir']}/docs/doc_content.json"
|
402 |
+
)
|
403 |
+
self.document_data = {}
|
404 |
+
self.document_metadata = {}
|
405 |
|
406 |
|
407 |
class DataLoader:
|
408 |
def __init__(self, config, logger=None):
|
409 |
+
self.file_reader = FileReader(
|
410 |
+
logger=logger, kind=config["llm_params"]["pdf_reader"]
|
411 |
+
)
|
412 |
self.chunk_processor = ChunkProcessor(config, logger=logger)
|
413 |
|
414 |
def get_chunks(self, uploaded_files, weblinks):
|
|
|
426 |
with open("../code/modules/config/config.yml", "r") as f:
|
427 |
config = yaml.safe_load(f)
|
428 |
|
429 |
+
STORAGE_DIR = os.path.join(BASE_DIR, config["vectorstore"]["data_path"])
|
430 |
uploaded_files = [
|
431 |
+
os.path.join(STORAGE_DIR, file)
|
432 |
+
for file in os.listdir(STORAGE_DIR)
|
433 |
+
if file != "urls.txt"
|
434 |
]
|
435 |
|
436 |
data_loader = DataLoader(config, logger=logger)
|
437 |
document_chunks, document_names, documents, document_metadata = (
|
438 |
data_loader.get_chunks(
|
439 |
+
[
|
440 |
+
"https://dl4ds.github.io/fa2024/static_files/discussion_slides/00_discussion.pdf"
|
441 |
+
],
|
442 |
[],
|
443 |
)
|
444 |
)
|
445 |
|
446 |
print(document_names[:5])
|
447 |
print(len(document_chunks))
|
|
code/modules/dataloader/helpers.py
CHANGED
@@ -2,6 +2,8 @@ import requests
|
|
2 |
from bs4 import BeautifulSoup
|
3 |
from urllib.parse import urlparse
|
4 |
import tempfile
|
|
|
|
|
5 |
|
6 |
def get_urls_from_file(file_path: str):
|
7 |
"""
|
@@ -26,11 +28,11 @@ def get_metadata(lectures_url, schedule_url):
|
|
26 |
lecture_metadata = {}
|
27 |
|
28 |
# Get the main lectures page content
|
29 |
-
r_lectures = requests.get(lectures_url)
|
30 |
soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
|
31 |
|
32 |
# Get the main schedule page content
|
33 |
-
r_schedule = requests.get(schedule_url)
|
34 |
soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
|
35 |
|
36 |
# Find all lecture blocks
|
@@ -118,7 +120,7 @@ def download_pdf_from_url(pdf_url):
|
|
118 |
Returns:
|
119 |
str: The local file path of the downloaded PDF file.
|
120 |
"""
|
121 |
-
response = requests.get(pdf_url)
|
122 |
if response.status_code == 200:
|
123 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
124 |
temp_file.write(response.content)
|
|
|
2 |
from bs4 import BeautifulSoup
|
3 |
from urllib.parse import urlparse
|
4 |
import tempfile
|
5 |
+
from modules.config.constants import TIMEOUT
|
6 |
+
|
7 |
|
8 |
def get_urls_from_file(file_path: str):
|
9 |
"""
|
|
|
28 |
lecture_metadata = {}
|
29 |
|
30 |
# Get the main lectures page content
|
31 |
+
r_lectures = requests.get(lectures_url, timeout=TIMEOUT)
|
32 |
soup_lectures = BeautifulSoup(r_lectures.text, "html.parser")
|
33 |
|
34 |
# Get the main schedule page content
|
35 |
+
r_schedule = requests.get(schedule_url, timeout=TIMEOUT)
|
36 |
soup_schedule = BeautifulSoup(r_schedule.text, "html.parser")
|
37 |
|
38 |
# Find all lecture blocks
|
|
|
120 |
Returns:
|
121 |
str: The local file path of the downloaded PDF file.
|
122 |
"""
|
123 |
+
response = requests.get(pdf_url, timeout=TIMEOUT)
|
124 |
if response.status_code == 200:
|
125 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
126 |
temp_file.write(response.content)
|
code/modules/dataloader/pdf_readers/gpt.py
CHANGED
@@ -6,6 +6,7 @@ from io import BytesIO
|
|
6 |
from openai import OpenAI
|
7 |
from pdf2image import convert_from_path
|
8 |
from langchain.schema import Document
|
|
|
9 |
|
10 |
|
11 |
class GPTParser:
|
@@ -19,9 +20,9 @@ class GPTParser:
|
|
19 |
self.api_key = os.getenv("OPENAI_API_KEY")
|
20 |
self.prompt = """
|
21 |
The provided documents are images of PDFs of lecture slides of deep learning material.
|
22 |
-
They contain LaTeX equations, images, and text.
|
23 |
The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
|
24 |
-
The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
|
25 |
For images, give a description and if you can, a source. Separate each page with '---'.
|
26 |
Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
|
27 |
"""
|
@@ -31,36 +32,45 @@ class GPTParser:
|
|
31 |
|
32 |
encoded_images = [self.encode_image(image) for image in images]
|
33 |
|
34 |
-
chunks = [encoded_images[i:i + 5] for i in range(0, len(encoded_images), 5)]
|
35 |
|
36 |
headers = {
|
37 |
"Content-Type": "application/json",
|
38 |
-
"Authorization": f"Bearer {self.api_key}"
|
39 |
}
|
40 |
|
41 |
output = ""
|
42 |
for chunk_num, chunk in enumerate(chunks):
|
43 |
-
content = [
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
content.insert(0, {"type": "text", "text": self.prompt})
|
47 |
|
48 |
payload = {
|
49 |
"model": "gpt-4o-mini",
|
50 |
-
"messages": [
|
51 |
-
{
|
52 |
-
"role": "user",
|
53 |
-
"content": content
|
54 |
-
}
|
55 |
-
],
|
56 |
}
|
57 |
|
58 |
response = requests.post(
|
59 |
-
"https://api.openai.com/v1/chat/completions",
|
|
|
|
|
|
|
|
|
60 |
|
61 |
resp = response.json()
|
62 |
|
63 |
-
chunk_output =
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
output += chunk_output + "\n---\n"
|
66 |
|
@@ -68,14 +78,12 @@ class GPTParser:
|
|
68 |
output = [doc for doc in output if doc.strip() != ""]
|
69 |
|
70 |
documents = [
|
71 |
-
Document(
|
72 |
-
|
73 |
-
metadata={"source": pdf_path, "page": i}
|
74 |
-
) for i, page in enumerate(output)
|
75 |
]
|
76 |
return documents
|
77 |
|
78 |
def encode_image(self, image):
|
79 |
buffered = BytesIO()
|
80 |
image.save(buffered, format="JPEG")
|
81 |
-
return base64.b64encode(buffered.getvalue()).decode(
|
|
|
6 |
from openai import OpenAI
|
7 |
from pdf2image import convert_from_path
|
8 |
from langchain.schema import Document
|
9 |
+
from modules.config.constants import TIMEOUT
|
10 |
|
11 |
|
12 |
class GPTParser:
|
|
|
20 |
self.api_key = os.getenv("OPENAI_API_KEY")
|
21 |
self.prompt = """
|
22 |
The provided documents are images of PDFs of lecture slides of deep learning material.
|
23 |
+
They contain LaTeX equations, images, and text.
|
24 |
The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
|
25 |
+
The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
|
26 |
For images, give a description and if you can, a source. Separate each page with '---'.
|
27 |
Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
|
28 |
"""
|
|
|
32 |
|
33 |
encoded_images = [self.encode_image(image) for image in images]
|
34 |
|
35 |
+
chunks = [encoded_images[i : i + 5] for i in range(0, len(encoded_images), 5)]
|
36 |
|
37 |
headers = {
|
38 |
"Content-Type": "application/json",
|
39 |
+
"Authorization": f"Bearer {self.api_key}",
|
40 |
}
|
41 |
|
42 |
output = ""
|
43 |
for chunk_num, chunk in enumerate(chunks):
|
44 |
+
content = [
|
45 |
+
{
|
46 |
+
"type": "image_url",
|
47 |
+
"image_url": {"url": f"data:image/jpeg;base64,{image}"},
|
48 |
+
}
|
49 |
+
for image in chunk
|
50 |
+
]
|
51 |
|
52 |
content.insert(0, {"type": "text", "text": self.prompt})
|
53 |
|
54 |
payload = {
|
55 |
"model": "gpt-4o-mini",
|
56 |
+
"messages": [{"role": "user", "content": content}],
|
|
|
|
|
|
|
|
|
|
|
57 |
}
|
58 |
|
59 |
response = requests.post(
|
60 |
+
"https://api.openai.com/v1/chat/completions",
|
61 |
+
headers=headers,
|
62 |
+
json=payload,
|
63 |
+
timeout=TIMEOUT,
|
64 |
+
)
|
65 |
|
66 |
resp = response.json()
|
67 |
|
68 |
+
chunk_output = (
|
69 |
+
resp["choices"][0]["message"]["content"]
|
70 |
+
.replace("```", "")
|
71 |
+
.replace("markdown", "")
|
72 |
+
.replace("````", "")
|
73 |
+
)
|
74 |
|
75 |
output += chunk_output + "\n---\n"
|
76 |
|
|
|
78 |
output = [doc for doc in output if doc.strip() != ""]
|
79 |
|
80 |
documents = [
|
81 |
+
Document(page_content=page, metadata={"source": pdf_path, "page": i})
|
82 |
+
for i, page in enumerate(output)
|
|
|
|
|
83 |
]
|
84 |
return documents
|
85 |
|
86 |
def encode_image(self, image):
|
87 |
buffered = BytesIO()
|
88 |
image.save(buffered, format="JPEG")
|
89 |
+
return base64.b64encode(buffered.getvalue()).decode("utf-8")
|
code/modules/dataloader/pdf_readers/llama.py
CHANGED
@@ -2,19 +2,18 @@ import os
|
|
2 |
import requests
|
3 |
from llama_parse import LlamaParse
|
4 |
from langchain.schema import Document
|
5 |
-
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY
|
6 |
from modules.dataloader.helpers import download_pdf_from_url
|
7 |
|
8 |
|
9 |
-
|
10 |
class LlamaParser:
|
11 |
def __init__(self):
|
12 |
self.GPT_API_KEY = OPENAI_API_KEY
|
13 |
self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
|
14 |
self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
|
15 |
self.headers = {
|
16 |
-
|
17 |
-
|
18 |
}
|
19 |
self.parser = LlamaParse(
|
20 |
api_key=LLAMA_CLOUD_API_KEY,
|
@@ -23,7 +22,7 @@ class LlamaParser:
|
|
23 |
language="en",
|
24 |
gpt4o_mode=False,
|
25 |
# gpt4o_api_key=OPENAI_API_KEY,
|
26 |
-
parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source."
|
27 |
)
|
28 |
|
29 |
def parse(self, pdf_path):
|
@@ -38,10 +37,8 @@ class LlamaParser:
|
|
38 |
pages = [page.strip() for page in pages]
|
39 |
|
40 |
documents = [
|
41 |
-
Document(
|
42 |
-
|
43 |
-
metadata={"source": pdf_path, "page": i}
|
44 |
-
) for i, page in enumerate(pages)
|
45 |
]
|
46 |
|
47 |
return documents
|
@@ -53,20 +50,30 @@ class LlamaParser:
|
|
53 |
}
|
54 |
|
55 |
files = [
|
56 |
-
(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
]
|
58 |
|
59 |
response = requests.request(
|
60 |
-
"POST", self.parse_url, headers=self.headers, data=payload, files=files
|
|
|
61 |
|
62 |
-
return response.json()[
|
63 |
|
64 |
async def get_result(self, job_id):
|
65 |
-
url =
|
|
|
|
|
66 |
|
67 |
response = requests.request("GET", url, headers=self.headers, data={})
|
68 |
|
69 |
-
return response.json()[
|
70 |
|
71 |
async def _parse(self, pdf_path):
|
72 |
job_id, status = self.make_request(pdf_path)
|
@@ -78,15 +85,9 @@ class LlamaParser:
|
|
78 |
|
79 |
result = await self.get_result(job_id)
|
80 |
|
81 |
-
documents = [
|
82 |
-
Document(
|
83 |
-
page_content=result,
|
84 |
-
metadata={"source": pdf_path}
|
85 |
-
)
|
86 |
-
]
|
87 |
|
88 |
return documents
|
89 |
|
90 |
-
async def _parse(self, pdf_path):
|
91 |
-
|
92 |
-
|
|
|
2 |
import requests
|
3 |
from llama_parse import LlamaParse
|
4 |
from langchain.schema import Document
|
5 |
+
from modules.config.constants import OPENAI_API_KEY, LLAMA_CLOUD_API_KEY, TIMEOUT
|
6 |
from modules.dataloader.helpers import download_pdf_from_url
|
7 |
|
8 |
|
|
|
9 |
class LlamaParser:
|
10 |
def __init__(self):
|
11 |
self.GPT_API_KEY = OPENAI_API_KEY
|
12 |
self.LLAMA_CLOUD_API_KEY = LLAMA_CLOUD_API_KEY
|
13 |
self.parse_url = "https://api.cloud.llamaindex.ai/api/parsing/upload"
|
14 |
self.headers = {
|
15 |
+
"Accept": "application/json",
|
16 |
+
"Authorization": f"Bearer {LLAMA_CLOUD_API_KEY}",
|
17 |
}
|
18 |
self.parser = LlamaParse(
|
19 |
api_key=LLAMA_CLOUD_API_KEY,
|
|
|
22 |
language="en",
|
23 |
gpt4o_mode=False,
|
24 |
# gpt4o_api_key=OPENAI_API_KEY,
|
25 |
+
parsing_instruction="The provided documents are PDFs of lecture slides of deep learning material. They contain LaTeX equations, images, and text. The goal is to extract the text, images and equations from the slides. The markdown should be clean and easy to read, and any math equation should be converted to LaTeX format, between $ signs. For images, if you can, give a description and a source.",
|
26 |
)
|
27 |
|
28 |
def parse(self, pdf_path):
|
|
|
37 |
pages = [page.strip() for page in pages]
|
38 |
|
39 |
documents = [
|
40 |
+
Document(page_content=page, metadata={"source": pdf_path, "page": i})
|
41 |
+
for i, page in enumerate(pages)
|
|
|
|
|
42 |
]
|
43 |
|
44 |
return documents
|
|
|
50 |
}
|
51 |
|
52 |
files = [
|
53 |
+
(
|
54 |
+
"file",
|
55 |
+
(
|
56 |
+
"file",
|
57 |
+
requests.get(pdf_url, timeout=TIMEOUT).content,
|
58 |
+
"application/octet-stream",
|
59 |
+
),
|
60 |
+
)
|
61 |
]
|
62 |
|
63 |
response = requests.request(
|
64 |
+
"POST", self.parse_url, headers=self.headers, data=payload, files=files
|
65 |
+
)
|
66 |
|
67 |
+
return response.json()["id"], response.json()["status"]
|
68 |
|
69 |
async def get_result(self, job_id):
|
70 |
+
url = (
|
71 |
+
f"https://api.cloud.llamaindex.ai/api/parsing/job/{job_id}/result/markdown"
|
72 |
+
)
|
73 |
|
74 |
response = requests.request("GET", url, headers=self.headers, data={})
|
75 |
|
76 |
+
return response.json()["markdown"]
|
77 |
|
78 |
async def _parse(self, pdf_path):
|
79 |
job_id, status = self.make_request(pdf_path)
|
|
|
85 |
|
86 |
result = await self.get_result(job_id)
|
87 |
|
88 |
+
documents = [Document(page_content=result, metadata={"source": pdf_path})]
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
return documents
|
91 |
|
92 |
+
# async def _parse(self, pdf_path):
|
93 |
+
# return await self._parse(pdf_path)
|
|
code/modules/dataloader/webpage_crawler.py
CHANGED
@@ -3,7 +3,9 @@ from aiohttp import ClientSession
|
|
3 |
import asyncio
|
4 |
import requests
|
5 |
from bs4 import BeautifulSoup
|
6 |
-
from urllib.parse import
|
|
|
|
|
7 |
|
8 |
class WebpageCrawler:
|
9 |
def __init__(self):
|
@@ -18,7 +20,7 @@ class WebpageCrawler:
|
|
18 |
|
19 |
def url_exists(self, url: str) -> bool:
|
20 |
try:
|
21 |
-
response = requests.head(url)
|
22 |
return response.status_code == 200
|
23 |
except requests.ConnectionError:
|
24 |
return False
|
@@ -88,7 +90,7 @@ class WebpageCrawler:
|
|
88 |
|
89 |
def is_webpage(self, url: str) -> bool:
|
90 |
try:
|
91 |
-
response = requests.head(url, allow_redirects=True)
|
92 |
content_type = response.headers.get("Content-Type", "").lower()
|
93 |
return "text/html" in content_type
|
94 |
except requests.RequestException:
|
|
|
3 |
import asyncio
|
4 |
import requests
|
5 |
from bs4 import BeautifulSoup
|
6 |
+
from urllib.parse import urljoin, urldefrag
|
7 |
+
from modules.config.constants import TIMEOUT
|
8 |
+
|
9 |
|
10 |
class WebpageCrawler:
|
11 |
def __init__(self):
|
|
|
20 |
|
21 |
def url_exists(self, url: str) -> bool:
|
22 |
try:
|
23 |
+
response = requests.head(url, timeout=TIMEOUT)
|
24 |
return response.status_code == 200
|
25 |
except requests.ConnectionError:
|
26 |
return False
|
|
|
90 |
|
91 |
def is_webpage(self, url: str) -> bool:
|
92 |
try:
|
93 |
+
response = requests.head(url, allow_redirects=True, timeout=TIMEOUT)
|
94 |
content_type = response.headers.get("Content-Type", "").lower()
|
95 |
return "text/html" in content_type
|
96 |
except requests.RequestException:
|
code/modules/vectorstore/colbert.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
from ragatouille import RAGPretrainedModel
|
2 |
from modules.vectorstore.base import VectorStoreBase
|
3 |
from langchain_core.retrievers import BaseRetriever
|
4 |
-
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
|
5 |
from langchain_core.documents import Document
|
6 |
-
from typing import Any, List
|
7 |
import os
|
8 |
import json
|
9 |
|
@@ -85,6 +85,7 @@ class ColbertVectorStore(VectorStoreBase):
|
|
85 |
document_ids=document_names,
|
86 |
document_metadatas=document_metadata,
|
87 |
)
|
|
|
88 |
self.colbert.set_document_count(len(document_names))
|
89 |
|
90 |
def load_database(self):
|
|
|
1 |
from ragatouille import RAGPretrainedModel
|
2 |
from modules.vectorstore.base import VectorStoreBase
|
3 |
from langchain_core.retrievers import BaseRetriever
|
4 |
+
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
|
5 |
from langchain_core.documents import Document
|
6 |
+
from typing import Any, List
|
7 |
import os
|
8 |
import json
|
9 |
|
|
|
85 |
document_ids=document_names,
|
86 |
document_metadatas=document_metadata,
|
87 |
)
|
88 |
+
print(f"Index created at {index_path}")
|
89 |
self.colbert.set_document_count(len(document_names))
|
90 |
|
91 |
def load_database(self):
|
code/modules/vectorstore/embedding_model_loader.py
CHANGED
@@ -1,9 +1,6 @@
|
|
1 |
from langchain_community.embeddings import OpenAIEmbeddings
|
2 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
3 |
-
from
|
4 |
-
|
5 |
-
from modules.config.constants import *
|
6 |
-
import os
|
7 |
|
8 |
|
9 |
class EmbeddingModelLoader:
|
@@ -28,8 +25,5 @@ class EmbeddingModelLoader:
|
|
28 |
"trust_remote_code": True,
|
29 |
},
|
30 |
)
|
31 |
-
# embedding_model = LlamaCppEmbeddings(
|
32 |
-
# model_path=os.path.abspath("storage/llama-7b.ggmlv3.q4_0.bin")
|
33 |
-
# )
|
34 |
|
35 |
return embedding_model
|
|
|
1 |
from langchain_community.embeddings import OpenAIEmbeddings
|
2 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
3 |
+
from modules.config.constants import OPENAI_API_KEY, HUGGINGFACE_TOKEN
|
|
|
|
|
|
|
4 |
|
5 |
|
6 |
class EmbeddingModelLoader:
|
|
|
25 |
"trust_remote_code": True,
|
26 |
},
|
27 |
)
|
|
|
|
|
|
|
28 |
|
29 |
return embedding_model
|
code/modules/vectorstore/faiss.py
CHANGED
@@ -14,10 +14,15 @@ class FaissVectorStore(VectorStoreBase):
|
|
14 |
def __init__(self, config):
|
15 |
self.config = config
|
16 |
self._init_vector_db()
|
17 |
-
self.local_path = os.path.join(
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
def _init_vector_db(self):
|
23 |
self.faiss = FAISS(
|
@@ -28,9 +33,7 @@ class FaissVectorStore(VectorStoreBase):
|
|
28 |
self.vectorstore = self.faiss.from_documents(
|
29 |
documents=document_chunks, embedding=embedding_model
|
30 |
)
|
31 |
-
self.vectorstore.save_local(
|
32 |
-
self.local_path
|
33 |
-
)
|
34 |
|
35 |
def load_database(self, embedding_model):
|
36 |
self.vectorstore = self.faiss.load_local(
|
|
|
14 |
def __init__(self, config):
|
15 |
self.config = config
|
16 |
self._init_vector_db()
|
17 |
+
self.local_path = os.path.join(
|
18 |
+
self.config["vectorstore"]["db_path"],
|
19 |
+
"db_"
|
20 |
+
+ self.config["vectorstore"]["db_option"]
|
21 |
+
+ "_"
|
22 |
+
+ self.config["vectorstore"]["model"]
|
23 |
+
+ "_"
|
24 |
+
+ config["splitter_options"]["chunking_mode"],
|
25 |
+
)
|
26 |
|
27 |
def _init_vector_db(self):
|
28 |
self.faiss = FAISS(
|
|
|
33 |
self.vectorstore = self.faiss.from_documents(
|
34 |
documents=document_chunks, embedding=embedding_model
|
35 |
)
|
36 |
+
self.vectorstore.save_local(self.local_path)
|
|
|
|
|
37 |
|
38 |
def load_database(self, embedding_model):
|
39 |
self.vectorstore = self.faiss.load_local(
|
code/modules/vectorstore/raptor.py
CHANGED
@@ -317,13 +317,10 @@ class RAPTORVectoreStore(VectorStoreBase):
|
|
317 |
print(f"--Generated {len(all_clusters)} clusters--")
|
318 |
|
319 |
# Summarization
|
320 |
-
template = """Here is content from the course DS598: Deep Learning for Data Science.
|
321 |
-
|
322 |
The content may be form webapge about the course, or lecture content, or any other relevant information.
|
323 |
If the content is in bullet points (from pdf lectre slides), you can summarize the bullet points.
|
324 |
-
|
325 |
Give a detailed summary of the content below.
|
326 |
-
|
327 |
Documentation:
|
328 |
{context}
|
329 |
"""
|
|
|
317 |
print(f"--Generated {len(all_clusters)} clusters--")
|
318 |
|
319 |
# Summarization
|
320 |
+
template = """Here is content from the course DS598: Deep Learning for Data Science.
|
|
|
321 |
The content may be form webapge about the course, or lecture content, or any other relevant information.
|
322 |
If the content is in bullet points (from pdf lectre slides), you can summarize the bullet points.
|
|
|
323 |
Give a detailed summary of the content below.
|
|
|
324 |
Documentation:
|
325 |
{context}
|
326 |
"""
|
code/modules/vectorstore/store_manager.py
CHANGED
@@ -1,9 +1,7 @@
|
|
1 |
from modules.vectorstore.vectorstore import VectorStore
|
2 |
-
from modules.
|
3 |
from modules.dataloader.webpage_crawler import WebpageCrawler
|
4 |
from modules.dataloader.data_loader import DataLoader
|
5 |
-
from modules.dataloader.helpers import *
|
6 |
-
from modules.config.constants import RETRIEVER_HF_PATHS
|
7 |
from modules.vectorstore.embedding_model_loader import EmbeddingModelLoader
|
8 |
import logging
|
9 |
import os
|
@@ -117,7 +115,7 @@ class VectorStoreManager:
|
|
117 |
)
|
118 |
num_documents = len(document_chunks)
|
119 |
self.logger.info(f"Number of documents in the DB: {num_documents}")
|
120 |
-
metadata_keys = list(document_metadata[0].keys())
|
121 |
self.logger.info(f"Metadata keys: {metadata_keys}")
|
122 |
self.logger.info("Completed loading data")
|
123 |
self.initialize_database(
|
@@ -170,13 +168,21 @@ if __name__ == "__main__":
|
|
170 |
|
171 |
with open("modules/config/config.yml", "r") as f:
|
172 |
config = yaml.safe_load(f)
|
|
|
|
|
173 |
print(config)
|
|
|
174 |
print(f"Trying to create database with config: {config}")
|
175 |
vector_db = VectorStoreManager(config)
|
176 |
if config["vectorstore"]["load_from_HF"]:
|
177 |
-
if
|
|
|
|
|
|
|
178 |
vector_db.load_from_HF(
|
179 |
-
HF_PATH=
|
|
|
|
|
180 |
)
|
181 |
else:
|
182 |
# print(f"HF_PATH not available for {config['vectorstore']['db_option']}")
|
@@ -189,7 +195,7 @@ if __name__ == "__main__":
|
|
189 |
vector_db.create_database()
|
190 |
print("Created database")
|
191 |
|
192 |
-
print(
|
193 |
vector_db = VectorStoreManager(config)
|
194 |
vector_db.load_database()
|
195 |
print("Loaded database")
|
|
|
1 |
from modules.vectorstore.vectorstore import VectorStore
|
2 |
+
from modules.dataloader.helpers import get_urls_from_file
|
3 |
from modules.dataloader.webpage_crawler import WebpageCrawler
|
4 |
from modules.dataloader.data_loader import DataLoader
|
|
|
|
|
5 |
from modules.vectorstore.embedding_model_loader import EmbeddingModelLoader
|
6 |
import logging
|
7 |
import os
|
|
|
115 |
)
|
116 |
num_documents = len(document_chunks)
|
117 |
self.logger.info(f"Number of documents in the DB: {num_documents}")
|
118 |
+
metadata_keys = list(document_metadata[0].keys()) if document_metadata else []
|
119 |
self.logger.info(f"Metadata keys: {metadata_keys}")
|
120 |
self.logger.info("Completed loading data")
|
121 |
self.initialize_database(
|
|
|
168 |
|
169 |
with open("modules/config/config.yml", "r") as f:
|
170 |
config = yaml.safe_load(f)
|
171 |
+
with open("modules/config/user_config.yml", "r") as f:
|
172 |
+
user_config = yaml.safe_load(f)
|
173 |
print(config)
|
174 |
+
print(user_config)
|
175 |
print(f"Trying to create database with config: {config}")
|
176 |
vector_db = VectorStoreManager(config)
|
177 |
if config["vectorstore"]["load_from_HF"]:
|
178 |
+
if (
|
179 |
+
config["vectorstore"]["db_option"]
|
180 |
+
in user_config["retriever"]["retriever_hf_paths"]
|
181 |
+
):
|
182 |
vector_db.load_from_HF(
|
183 |
+
HF_PATH=user_config["retriever"]["retriever_hf_paths"][
|
184 |
+
config["vectorstore"]["db_option"]
|
185 |
+
]
|
186 |
)
|
187 |
else:
|
188 |
# print(f"HF_PATH not available for {config['vectorstore']['db_option']}")
|
|
|
195 |
vector_db.create_database()
|
196 |
print("Created database")
|
197 |
|
198 |
+
print("Trying to load the database")
|
199 |
vector_db = VectorStoreManager(config)
|
200 |
vector_db.load_database()
|
201 |
print("Loaded database")
|
docs/README.md
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
# Documentation
|
2 |
-
|
3 |
-
## File Structure:
|
4 |
-
- `docs/` - Documentation files
|
5 |
-
- `code/` - Code files
|
6 |
-
- `storage/` - Storage files
|
7 |
-
- `vectorstores/` - Vector Databases
|
8 |
-
- `.env` - Environment Variables
|
9 |
-
- `Dockerfile` - Dockerfile for Hugging Face
|
10 |
-
- `.chainlit` - Chainlit Configuration
|
11 |
-
- `chainlit.md` - Chainlit README
|
12 |
-
- `README.md` - Repository README
|
13 |
-
- `.gitignore` - Gitignore file
|
14 |
-
- `requirements.txt` - Python Requirements
|
15 |
-
- `.gitattributes` - Gitattributes file
|
16 |
-
|
17 |
-
## Code Structure
|
18 |
-
|
19 |
-
- `code/main.py` - Main Chainlit App
|
20 |
-
- `code/config.yaml` - Configuration File to set Embedding related, Vector Database related, and Chat Model related parameters.
|
21 |
-
- `code/modules/vector_db.py` - Vector Database Creation
|
22 |
-
- `code/modules/chat_model_loader.py` - Chat Model Loader (Creates the Chat Model)
|
23 |
-
- `code/modules/constants.py` - Constants (Loads the Environment Variables, Prompts, Model Paths, etc.)
|
24 |
-
- `code/modules/data_loader.py` - Loads and Chunks the Data
|
25 |
-
- `code/modules/embedding_model.py` - Creates the Embedding Model to Embed the Data
|
26 |
-
- `code/modules/llm_tutor.py` - Creates the RAG LLM Tutor
|
27 |
-
- The Function `qa_bot()` loads the vector database and the chat model, and sets the prompt to pass to the chat model.
|
28 |
-
- `code/modules/helpers.py` - Helper Functions
|
29 |
-
|
30 |
-
## Storage and Vectorstores
|
31 |
-
|
32 |
-
- `storage/data/` - Data Storage (Put your pdf files under this directory, and urls in the urls.txt file)
|
33 |
-
- `storage/models/` - Model Storage (Put your local LLMs under this directory)
|
34 |
-
|
35 |
-
- `vectorstores/` - Vector Databases (Stores the Vector Databases generated from `code/modules/vector_db.py`)
|
36 |
-
|
37 |
-
|
38 |
-
## Useful Configurations
|
39 |
-
set these in `code/config.yaml`:
|
40 |
-
* ``["embedding_options"]["embedd_files"]`` - If set to True, embeds the files from the storage directory everytime you run the chainlit command. If set to False, uses the stored vector database.
|
41 |
-
* ``["embedding_options"]["expand_urls"]`` - If set to True, gets and reads the data from all the links under the url provided. If set to False, only reads the data in the url provided.
|
42 |
-
* ``["embedding_options"]["search_top_k"]`` - Number of sources that the retriever returns
|
43 |
-
* ``["llm_params]["use_history"]`` - Whether to use history in the prompt or not
|
44 |
-
* ``["llm_params]["memory_window"]`` - Number of interactions to keep a track of in the history
|
45 |
-
|
46 |
-
|
47 |
-
## LlamaCpp
|
48 |
-
* https://python.langchain.com/docs/integrations/llms/llamacpp
|
49 |
-
|
50 |
-
## Hugging Face Models
|
51 |
-
* Download the ``.gguf`` files for your Local LLM from Hugging Face (Example: https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/contribute.md
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
💡 **Please ensure formatting, linting, and security checks pass before submitting a pull request**
|
2 |
+
|
3 |
+
## Code Formatting
|
4 |
+
|
5 |
+
The codebase is formatted using [black](https://github.com/psf/black)
|
6 |
+
|
7 |
+
To format the codebase, run the following command:
|
8 |
+
|
9 |
+
```bash
|
10 |
+
black .
|
11 |
+
```
|
12 |
+
|
13 |
+
Please ensure that the code is formatted before submitting a pull request.
|
14 |
+
|
15 |
+
## Linting
|
16 |
+
|
17 |
+
The codebase is linted using [flake8](https://flake8.pycqa.org/en/latest/)
|
18 |
+
|
19 |
+
To view the linting errors, run the following command:
|
20 |
+
|
21 |
+
```bash
|
22 |
+
flake8 .
|
23 |
+
```
|
24 |
+
|
25 |
+
## Security and Vulnerabilities
|
26 |
+
|
27 |
+
The codebase is scanned for security vulnerabilities using [bandit](https://github.com/PyCQA/bandit)
|
28 |
+
|
29 |
+
To scan the codebase for security vulnerabilities, run the following command:
|
30 |
+
|
31 |
+
```bash
|
32 |
+
bandit -r .
|
33 |
+
```
|
docs/setup.md
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Initial Setup
|
2 |
+
|
3 |
+
⚠️ **Create the .env file inside the `code/` directory.**
|
4 |
+
|
5 |
+
## Python Environment
|
6 |
+
|
7 |
+
Python Version: 3.11
|
8 |
+
|
9 |
+
Create a virtual environment and install the required packages:
|
10 |
+
|
11 |
+
```bash
|
12 |
+
conda create -n ai_tutor python=3.11
|
13 |
+
conda activate ai_tutor
|
14 |
+
pip install -r requirements.txt
|
15 |
+
```
|
16 |
+
|
17 |
+
## Code Formatting
|
18 |
+
|
19 |
+
The codebase is formatted using [black](https://github.com/psf/black). If you make changes to the codebase, ensure that the code is formatted before submitting a pull request. More instructions can be found in `docs/contribute.md`.
|
20 |
+
|
21 |
+
## Google OAuth 2.0 Client ID and Secret
|
22 |
+
|
23 |
+
To set up the Google OAuth 2.0 Client ID and Secret, follow these steps:
|
24 |
+
|
25 |
+
1. Go to the [Google Cloud Console](https://console.cloud.google.com/apis/credentials).
|
26 |
+
2. Create a new project or select an existing one.
|
27 |
+
3. Navigate to the "Credentials" page.
|
28 |
+
4. Click on "Create Credentials" and select "OAuth 2.0 Client ID".
|
29 |
+
5. Configure the OAuth consent screen if you haven't already.
|
30 |
+
6. Choose "Web application" as the application type.
|
31 |
+
7. Configure the redirect URIs as needed.
|
32 |
+
8. Copy the generated `Client ID` and `Client Secret`.
|
33 |
+
|
34 |
+
Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
|
35 |
+
|
36 |
+
```bash
|
37 |
+
OAUTH_GOOGLE_CLIENT_ID=<your_client_id>
|
38 |
+
OAUTH_GOOGLE_CLIENT_SECRET=<your_client_secret>
|
39 |
+
```
|
40 |
+
|
41 |
+
## Literal AI API Key
|
42 |
+
|
43 |
+
To obtain the Literal AI API key:
|
44 |
+
|
45 |
+
1. Sign up or log in to [Literal AI](https://cloud.getliteral.ai/).
|
46 |
+
2. Navigate to the API Keys section under your account settings.
|
47 |
+
3. Create a new API key if necessary and copy it.
|
48 |
+
|
49 |
+
Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
|
50 |
+
|
51 |
+
```bash
|
52 |
+
LITERAL_API_KEY_LOGGING=<your_api_key>
|
53 |
+
LITERAL_API_URL=https://cloud.getliteral.ai
|
54 |
+
```
|
55 |
+
|
56 |
+
## LlamaCloud API Key
|
57 |
+
|
58 |
+
To obtain the LlamaCloud API Key:
|
59 |
+
|
60 |
+
1. Go to [LlamaCloud](https://cloud.llamaindex.ai/).
|
61 |
+
2. Sign up or log in to your account.
|
62 |
+
3. Navigate to the API section and generate a new API key if necessary.
|
63 |
+
|
64 |
+
Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
|
65 |
+
|
66 |
+
```bash
|
67 |
+
LLAMA_CLOUD_API_KEY=<your_api_key>
|
68 |
+
```
|
69 |
+
|
70 |
+
## Hugging Face Access Token
|
71 |
+
|
72 |
+
To obtain your Hugging Face access token:
|
73 |
+
|
74 |
+
1. Go to [Hugging Face settings](https://huggingface.co/settings/tokens).
|
75 |
+
2. Log in or create an account.
|
76 |
+
3. Generate a new token or use an existing one.
|
77 |
+
|
78 |
+
Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
|
79 |
+
|
80 |
+
```bash
|
81 |
+
HUGGINGFACE_TOKEN=<your-huggingface-token>
|
82 |
+
```
|
83 |
+
|
84 |
+
## Chainlit Authentication Secret
|
85 |
+
|
86 |
+
You must provide a JWT secret in the environment to use authentication. Run `chainlit create-secret` to generate one.
|
87 |
+
|
88 |
+
```bash
|
89 |
+
chainlit create-secret
|
90 |
+
```
|
91 |
+
|
92 |
+
Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
|
93 |
+
|
94 |
+
```bash
|
95 |
+
CHAINLIT_AUTH_SECRET=<your_jwt_secret>
|
96 |
+
CHAINLIT_URL=<your_chainlit_url> # Example: CHAINLIT_URL=http://localhost:8000
|
97 |
+
```
|
98 |
+
|
99 |
+
## OpenAI API Key
|
100 |
+
|
101 |
+
Set the following in the .env file (if running locally) or in secrets (if running on Hugging Face Spaces):
|
102 |
+
|
103 |
+
```bash
|
104 |
+
OPENAI_API_KEY=<your_openai_api_key>
|
105 |
+
```
|
106 |
+
|
107 |
+
## In a Nutshell
|
108 |
+
|
109 |
+
Your .env file (secrets in HuggingFace) should look like this:
|
110 |
+
|
111 |
+
```bash
|
112 |
+
CHAINLIT_AUTH_SECRET=<your_jwt_secret>
|
113 |
+
OPENAI_API_KEY=<your_openai_api_key>
|
114 |
+
HUGGINGFACE_TOKEN=<your-huggingface-token>
|
115 |
+
LITERAL_API_KEY_LOGGING=<your_api_key>
|
116 |
+
LITERAL_API_URL=https://cloud.getliteral.ai
|
117 |
+
OAUTH_GOOGLE_CLIENT_ID=<your_client_id>
|
118 |
+
OAUTH_GOOGLE_CLIENT_SECRET=<your_client_secret>
|
119 |
+
LLAMA_CLOUD_API_KEY=<your_api_key>
|
120 |
+
CHAINLIT_URL=<your_chainlit_url>
|
121 |
+
```
|
122 |
+
|
123 |
+
|
124 |
+
# Configuration
|
125 |
+
|
126 |
+
The configuration file `code/modules/config/config.yml` contains the parameters that control the behaviour of your app.
|
127 |
+
The configuration file `code/modules/config/user_config.yml` contains user-defined parameters.
|
pyproject.toml
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
[tool.black]
|
2 |
+
line-length = 88
|
requirements.txt
CHANGED
@@ -27,3 +27,6 @@ langchain_experimental
|
|
27 |
html2text
|
28 |
PyPDF2
|
29 |
pdf2image
|
|
|
|
|
|
|
|
27 |
html2text
|
28 |
PyPDF2
|
29 |
pdf2image
|
30 |
+
black
|
31 |
+
flake8
|
32 |
+
bandit
|