diff --git a/.chainlit/translations/en-US.json b/.chainlit/translations/en-US.json deleted file mode 100644 index 0bca720703849ee16b077215e24e7387cc8d50bf..0000000000000000000000000000000000000000 --- a/.chainlit/translations/en-US.json +++ /dev/null @@ -1,231 +0,0 @@ -{ - "components": { - "atoms": { - "buttons": { - "userButton": { - "menu": { - "settings": "Settings", - "settingsKey": "S", - "APIKeys": "API Keys", - "logout": "Logout" - } - } - } - }, - "molecules": { - "newChatButton": { - "newChat": "New Chat" - }, - "tasklist": { - "TaskList": { - "title": "\ud83d\uddd2\ufe0f Task List", - "loading": "Loading...", - "error": "An error occured" - } - }, - "attachments": { - "cancelUpload": "Cancel upload", - "removeAttachment": "Remove attachment" - }, - "newChatDialog": { - "createNewChat": "Create new chat?", - "clearChat": "This will clear the current messages and start a new chat.", - "cancel": "Cancel", - "confirm": "Confirm" - }, - "settingsModal": { - "settings": "Settings", - "expandMessages": "Expand Messages", - "hideChainOfThought": "Hide Chain of Thought", - "darkMode": "Dark Mode" - }, - "detailsButton": { - "using": "Using", - "running": "Running", - "took_one": "Took {{count}} step", - "took_other": "Took {{count}} steps" - }, - "auth": { - "authLogin": { - "title": "Login to access the app.", - "form": { - "email": "Email address", - "password": "Password", - "noAccount": "Don't have an account?", - "alreadyHaveAccount": "Already have an account?", - "signup": "Sign Up", - "signin": "Sign In", - "or": "OR", - "continue": "Continue", - "forgotPassword": "Forgot password?", - "passwordMustContain": "Your password must contain:", - "emailRequired": "email is a required field", - "passwordRequired": "password is a required field" - }, - "error": { - "default": "Unable to sign in.", - "signin": "Try signing in with a different account.", - "oauthsignin": "Try signing in with a different account.", - "redirect_uri_mismatch": "The redirect URI is not matching the oauth app configuration.", - "oauthcallbackerror": "Try signing in with a different account.", - "oauthcreateaccount": "Try signing in with a different account.", - "emailcreateaccount": "Try signing in with a different account.", - "callback": "Try signing in with a different account.", - "oauthaccountnotlinked": "To confirm your identity, sign in with the same account you used originally.", - "emailsignin": "The e-mail could not be sent.", - "emailverify": "Please verify your email, a new email has been sent.", - "credentialssignin": "Sign in failed. Check the details you provided are correct.", - "sessionrequired": "Please sign in to access this page." - } - }, - "authVerifyEmail": { - "almostThere": "You're almost there! 
We've sent an email to ", - "verifyEmailLink": "Please click on the link in that email to complete your signup.", - "didNotReceive": "Can't find the email?", - "resendEmail": "Resend email", - "goBack": "Go Back", - "emailSent": "Email sent successfully.", - "verifyEmail": "Verify your email address" - }, - "providerButton": { - "continue": "Continue with {{provider}}", - "signup": "Sign up with {{provider}}" - }, - "authResetPassword": { - "newPasswordRequired": "New password is a required field", - "passwordsMustMatch": "Passwords must match", - "confirmPasswordRequired": "Confirm password is a required field", - "newPassword": "New password", - "confirmPassword": "Confirm password", - "resetPassword": "Reset Password" - }, - "authForgotPassword": { - "email": "Email address", - "emailRequired": "email is a required field", - "emailSent": "Please check the email address {{email}} for instructions to reset your password.", - "enterEmail": "Enter your email address and we will send you instructions to reset your password.", - "resendEmail": "Resend email", - "continue": "Continue", - "goBack": "Go Back" - } - } - }, - "organisms": { - "chat": { - "history": { - "index": { - "showHistory": "Show history", - "lastInputs": "Last Inputs", - "noInputs": "Such empty...", - "loading": "Loading..." - } - }, - "inputBox": { - "input": { - "placeholder": "Type your message here..." - }, - "speechButton": { - "start": "Start recording", - "stop": "Stop recording" - }, - "SubmitButton": { - "sendMessage": "Send message", - "stopTask": "Stop Task" - }, - "UploadButton": { - "attachFiles": "Attach files" - }, - "waterMark": { - "text": "Built with" - } - }, - "Messages": { - "index": { - "running": "Running", - "executedSuccessfully": "executed successfully", - "failed": "failed", - "feedbackUpdated": "Feedback updated", - "updating": "Updating" - } - }, - "dropScreen": { - "dropYourFilesHere": "Drop your files here" - }, - "index": { - "failedToUpload": "Failed to upload", - "cancelledUploadOf": "Cancelled upload of", - "couldNotReachServer": "Could not reach the server", - "continuingChat": "Continuing previous chat" - }, - "settings": { - "settingsPanel": "Settings panel", - "reset": "Reset", - "cancel": "Cancel", - "confirm": "Confirm" - } - }, - "threadHistory": { - "sidebar": { - "filters": { - "FeedbackSelect": { - "feedbackAll": "Feedback: All", - "feedbackPositive": "Feedback: Positive", - "feedbackNegative": "Feedback: Negative" - }, - "SearchBar": { - "search": "Search" - } - }, - "DeleteThreadButton": { - "confirmMessage": "This will delete the thread as well as it's messages and elements.", - "cancel": "Cancel", - "confirm": "Confirm", - "deletingChat": "Deleting chat", - "chatDeleted": "Chat deleted" - }, - "index": { - "pastChats": "Past Chats" - }, - "ThreadList": { - "empty": "Empty...", - "today": "Today", - "yesterday": "Yesterday", - "previous7days": "Previous 7 days", - "previous30days": "Previous 30 days" - }, - "TriggerButton": { - "closeSidebar": "Close sidebar", - "openSidebar": "Open sidebar" - } - }, - "Thread": { - "backToChat": "Go back to chat", - "chatCreatedOn": "This chat was created on" - } - }, - "header": { - "chat": "Chat", - "readme": "Readme" - } - } - }, - "hooks": { - "useLLMProviders": { - "failedToFetchProviders": "Failed to fetch providers:" - } - }, - "pages": { - "Design": {}, - "Env": { - "savedSuccessfully": "Saved successfully", - "requiredApiKeys": "Required API Keys", - "requiredApiKeysInfo": "To use this app, the following API keys are required. 
The keys are stored on your device's local storage." - }, - "Page": { - "notPartOfProject": "You are not part of this project." - }, - "ResumeButton": { - "resumeChat": "Resume Chat" - } - } -} \ No newline at end of file diff --git a/.chainlit/translations/pt-BR.json b/.chainlit/translations/pt-BR.json deleted file mode 100644 index 5b428318bc4121b2078dbbe7fe9061e17bb8b576..0000000000000000000000000000000000000000 --- a/.chainlit/translations/pt-BR.json +++ /dev/null @@ -1,155 +0,0 @@ -{ - "components": { - "atoms": { - "buttons": { - "userButton": { - "menu": { - "settings": "Configura\u00e7\u00f5es", - "settingsKey": "S", - "APIKeys": "Chaves de API", - "logout": "Sair" - } - } - } - }, - "molecules": { - "newChatButton": { - "newChat": "Nova Conversa" - }, - "tasklist": { - "TaskList": { - "title": "\ud83d\uddd2\ufe0f Lista de Tarefas", - "loading": "Carregando...", - "error": "Ocorreu um erro" - } - }, - "attachments": { - "cancelUpload": "Cancelar envio", - "removeAttachment": "Remover anexo" - }, - "newChatDialog": { - "createNewChat": "Criar novo chat?", - "clearChat": "Isso limpar\u00e1 as mensagens atuais e iniciar\u00e1 uma nova conversa.", - "cancel": "Cancelar", - "confirm": "Confirmar" - }, - "settingsModal": { - "expandMessages": "Expandir Mensagens", - "hideChainOfThought": "Esconder Sequ\u00eancia de Pensamento", - "darkMode": "Modo Escuro" - } - }, - "organisms": { - "chat": { - "history": { - "index": { - "lastInputs": "\u00daltimas Entradas", - "noInputs": "Vazio...", - "loading": "Carregando..." - } - }, - "inputBox": { - "input": { - "placeholder": "Digite sua mensagem aqui..." - }, - "speechButton": { - "start": "Iniciar grava\u00e7\u00e3o", - "stop": "Parar grava\u00e7\u00e3o" - }, - "SubmitButton": { - "sendMessage": "Enviar mensagem", - "stopTask": "Parar Tarefa" - }, - "UploadButton": { - "attachFiles": "Anexar arquivos" - }, - "waterMark": { - "text": "Constru\u00eddo com" - } - }, - "Messages": { - "index": { - "running": "Executando", - "executedSuccessfully": "executado com sucesso", - "failed": "falhou", - "feedbackUpdated": "Feedback atualizado", - "updating": "Atualizando" - } - }, - "dropScreen": { - "dropYourFilesHere": "Solte seus arquivos aqui" - }, - "index": { - "failedToUpload": "Falha ao enviar", - "cancelledUploadOf": "Envio cancelado de", - "couldNotReachServer": "N\u00e3o foi poss\u00edvel conectar ao servidor", - "continuingChat": "Continuando o chat anterior" - }, - "settings": { - "settingsPanel": "Painel de Configura\u00e7\u00f5es", - "reset": "Redefinir", - "cancel": "Cancelar", - "confirm": "Confirmar" - } - }, - "threadHistory": { - "sidebar": { - "filters": { - "FeedbackSelect": { - "feedbackAll": "Feedback: Todos", - "feedbackPositive": "Feedback: Positivo", - "feedbackNegative": "Feedback: Negativo" - }, - "SearchBar": { - "search": "Buscar" - } - }, - "DeleteThreadButton": { - "confirmMessage": "Isso deletar\u00e1 a conversa, assim como suas mensagens e elementos.", - "cancel": "Cancelar", - "confirm": "Confirmar", - "deletingChat": "Deletando conversa", - "chatDeleted": "Conversa deletada" - }, - "index": { - "pastChats": "Conversas Anteriores" - }, - "ThreadList": { - "empty": "Vazio..." 
- }, - "TriggerButton": { - "closeSidebar": "Fechar barra lateral", - "openSidebar": "Abrir barra lateral" - } - }, - "Thread": { - "backToChat": "Voltar para a conversa", - "chatCreatedOn": "Esta conversa foi criada em" - } - }, - "header": { - "chat": "Conversa", - "readme": "Leia-me" - } - }, - "hooks": { - "useLLMProviders": { - "failedToFetchProviders": "Falha ao buscar provedores:" - } - }, - "pages": { - "Design": {}, - "Env": { - "savedSuccessfully": "Salvo com sucesso", - "requiredApiKeys": "Chaves de API necess\u00e1rias", - "requiredApiKeysInfo": "Para usar este aplicativo, as seguintes chaves de API s\u00e3o necess\u00e1rias. As chaves s\u00e3o armazenadas localmente em seu dispositivo." - }, - "Page": { - "notPartOfProject": "Voc\u00ea n\u00e3o faz parte deste projeto." - }, - "ResumeButton": { - "resumeChat": "Continuar Conversa" - } - } - } -} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 5b23c78d48aa2d7d973a8bdf36cd0ac1468fbdcd..73f36b044b7404127b8c5a1f6d04eb70ff040fd5 100644 --- a/.gitignore +++ b/.gitignore @@ -160,4 +160,12 @@ cython_debug/ #.idea/ # log files -*.log \ No newline at end of file +*.log + +.ragatouille/* +*/__pycache__/* +*/.chainlit/translations/* +storage/logs/* +vectorstores/* + +*/.files/* \ No newline at end of file diff --git a/Dockerfile.dev b/Dockerfile.dev index fb24ec749fb82b535004f34a93a5034c0d36356d..b154815021ec63d1b21e7b83ce4a0065fb9b1e55 100644 --- a/Dockerfile.dev +++ b/Dockerfile.dev @@ -10,7 +10,8 @@ RUN pip install --no-cache-dir -r /code/requirements.txt COPY . /code -RUN ls -R +# List the contents of the /code directory to verify files are copied correctly +RUN ls -R /code # Change permissions to allow writing to the directory RUN chmod -R 777 /code @@ -21,7 +22,10 @@ RUN mkdir /code/logs && chmod 777 /code/logs # Create a cache directory within the application's working directory RUN mkdir /.cache && chmod -R 777 /.cache +WORKDIR /code/code + # Expose the port the app runs on EXPOSE 8051 -CMD python code/modules/vector_db.py && chainlit run code/main.py --port 8051 \ No newline at end of file +# Default command to run the application +CMD ["sh", "-c", "python -m modules.vectorstore.store_manager && chainlit run main.py --host 0.0.0.0 --port 8051"] \ No newline at end of file diff --git a/README.md b/README.md index b3f2a6de1b316034c43a0dcf6b06b0fd4ab296e9..017d5b4f2f6a34e7d08e8ca6e74c7e0e4ba15685 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,84 @@ ---- -title: Dl4ds Tutor -emoji: πŸƒ -colorFrom: green -colorTo: red -sdk: docker -pinned: false -hf_oauth: true ---- +# DL4DS Tutor πŸƒ -DL4DS Tutor -=========== +Check out the configuration reference at [Hugging Face Spaces Config Reference](https://huggingface.co/docs/hub/spaces-config-reference). -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +You can find an implementation of the Tutor at [DL4DS Tutor on Hugging Face](https://dl4ds-dl4ds-tutor.hf.space/), which is hosted on Hugging Face [here](https://huggingface.co/spaces/dl4ds/dl4ds_tutor). -You can find an implementation of the Tutor at https://dl4ds-dl4ds-tutor.hf.space/, which is hosted on Hugging Face [here](https://huggingface.co/spaces/dl4ds/dl4ds_tutor) +## Running Locally -To run locally, +1. **Clone the Repository** + ```bash + git clone https://github.com/DL4DS/dl4ds_tutor + ``` -Clone the repository from: https://github.com/DL4DS/dl4ds_tutor +2. **Put your data under the `storage/data` directory** + - Add URLs in the `urls.txt` file. 
+ - Add other PDF files in the `storage/data` directory. -Put your data under the `storage/data` directory. Note: You can add urls in the urls.txt file, and other pdf files in the `storage/data` directory. +3. **To test Data Loading (Optional)** + ```bash + cd code + python -m modules.dataloader.data_loader + ``` -To create the Vector Database, run the following command: -```python code/modules/vector_db.py``` -(Note: You would need to run the above when you add new data to the `storage/data` directory, or if the ``storage/data/urls.txt`` file is updated. Or you can set ``["embedding_options"]["embedd_files"]`` to True in the `code/config.yaml` file, which would embed files from the storage directory everytime you run the below chainlit command.) +4. **Create the Vector Database** + ```bash + cd code + python -m modules.vectorstore.store_manager + ``` + - Note: You need to run the above command when you add new data to the `storage/data` directory, or if the `storage/data/urls.txt` file is updated. + - Alternatively, you can set `["vectorstore"]["embedd_files"]` to `True` in the `code/modules/config/config.yaml` file, which will embed files from the storage directory every time you run the below chainlit command. -To run the chainlit app, run the following command: -```chainlit run code/main.py``` +5. **Run the Chainlit App** + ```bash + chainlit run main.py + ``` See the [docs](https://github.com/DL4DS/dl4ds_tutor/tree/main/docs) for more information. +## File Structure + +```plaintext +code/ + β”œβ”€β”€ modules + β”‚ β”œβ”€β”€ chat # Contains the chatbot implementation + β”‚ β”œβ”€β”€ chat_processor # Contains the implementation to process and log the conversations + β”‚ β”œβ”€β”€ config # Contains the configuration files + β”‚ β”œβ”€β”€ dataloader # Contains the implementation to load the data from the storage directory + β”‚ β”œβ”€β”€ retriever # Contains the implementation to create the retriever + β”‚ └── vectorstore # Contains the implementation to create the vector database + β”œβ”€β”€ public + β”‚ β”œβ”€β”€ logo_dark.png # Dark theme logo + β”‚ β”œβ”€β”€ logo_light.png # Light theme logo + β”‚ └── test.css # Custom CSS file + └── main.py + + +docs/ # Contains the documentation to the codebase and methods used + +storage/ + β”œβ”€β”€ data # Store files and URLs here + β”œβ”€β”€ logs # Logs directory, includes logs on vector DB creation, tutor logs, and chunks logged in JSON files + └── models # Local LLMs are loaded from here + +vectorstores/ # Stores the created vector databases + +.env # This needs to be created, store the API keys here +``` +- `code/modules/vectorstore/vectorstore.py`: Instantiates the `VectorStore` class to create the vector database. +- `code/modules/vectorstore/store_manager.py`: Instantiates the `VectorStoreManager:` class to manage the vector database, and all associated methods. +- `code/modules/retriever/retriever.py`: Instantiates the `Retriever` class to create the retriever. + + +## Docker + +The HuggingFace Space is built using the `Dockerfile` in the repository. To run it locally, use the `Dockerfile.dev` file. + +```bash +docker build --tag dev -f Dockerfile.dev . +docker run -it --rm -p 8051:8051 dev +``` + ## Contributing -Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch. +Please create an issue if you have any suggestions or improvements, and start working on it by creating a branch and by making a pull request to the main branch. 
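The file structure above notes that a `.env` file has to be created to hold API keys. A minimal sketch of that file follows; it is based on the keys read in `code/modules/config/constants.py` from this patch (`OPENAI_API_KEY`, `HUGGINGFACE_TOKEN`, `LITERAL_API_KEY`), the values are placeholders, and which keys you actually need depends on the `llm_loader` and `chat_logging` options in `code/modules/config/config.yml`.

```bash
# Sketch of a .env file at the repository root (all values are placeholders)
OPENAI_API_KEY=your-openai-api-key        # used when llm_loader is set to 'openai'
HUGGINGFACE_TOKEN=your-huggingface-token  # loaded in constants.py for Hugging Face models
LITERAL_API_KEY=your-literal-ai-api-key   # only needed when chat_logging.log_chat is True
```

Keep this file out of version control, since it holds secrets.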
\ No newline at end of file diff --git a/.chainlit/config.toml b/code/.chainlit/config.toml similarity index 82% rename from .chainlit/config.toml rename to code/.chainlit/config.toml index 4cd8df03bccf4e6759fd82fc44734f07b1e8ffc9..5e27d3d519f60e9c75978d4d9784a83ad16d0626 100644 --- a/.chainlit/config.toml +++ b/code/.chainlit/config.toml @@ -19,9 +19,6 @@ allow_origins = ["*"] # follow_symlink = false [features] -# Show the prompt playground -prompt_playground = true - # Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript) unsafe_allow_html = false @@ -53,26 +50,20 @@ auto_tag_thread = true sample_rate = 44100 [UI] -# Name of the app and chatbot. +# Name of the assistant. name = "AI Tutor" -# Show the readme while the thread is empty. -show_readme_as_default = true - -# Description of the app and chatbot. This is used for HTML tags. -# description = "AI Tutor - DS598" +# Description of the assistant. This is used for HTML tags. +# description = "" # Large size content are by default collapsed for a cleaner ui default_collapse_content = true -# The default value for the expand messages settings. -default_expand_messages = false - # Hide the chain of thought details from the user in the UI. -hide_cot = false +hide_cot = true # Link to your github repo. This will add a github button in the UI's header. -# github = "" +# github = "https://github.com/DL4DS/dl4ds_tutor" # Specify a CSS file that can be used to customize the user interface. # The CSS file can be served from the public directory or via an external link. @@ -86,7 +77,7 @@ custom_css = "/public/test.css" # custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" # Specify a custom meta image url. -# custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png" +custom_meta_image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/f5/Boston_University_seal.svg/1200px-Boston_University_seal.svg.png" # Specify a custom build directory for the frontend. # This can be used to customize the frontend code. @@ -94,18 +85,21 @@ custom_css = "/public/test.css" # custom_build = "./public/build" [UI.theme] + default = "light" #layout = "wide" #font_family = "Inter, sans-serif" # Override default MUI light theme. (Check theme.ts) [UI.theme.light] - #background = "#FAFAFA" - #paper = "#FFFFFF" + background = "#FAFAFA" + paper = "#FFFFFF" [UI.theme.light.primary] - #main = "#F80061" - #dark = "#980039" - #light = "#FFE7EB" - + main = "#b22222" # Brighter shade of red + dark = "#8b0000" # Darker shade of the brighter red + light = "#ff6347" # Lighter shade of the brighter red + [UI.theme.light.text] + primary = "#212121" + secondary = "#616161" # Override default MUI dark theme. 
(Check theme.ts) [UI.theme.dark] background = "#1C1C1C" # Slightly lighter dark background color @@ -118,4 +112,4 @@ custom_css = "/public/test.css" [meta] -generated_by = "1.1.202" \ No newline at end of file +generated_by = "1.1.302" diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..270dcebaa5f4e79f101903087c3dfbd8dcfdddb3 --- /dev/null +++ b/code/__init__.py @@ -0,0 +1 @@ +from .modules import * diff --git a/chainlit.md b/code/chainlit.md similarity index 100% rename from chainlit.md rename to code/chainlit.md diff --git a/code/main.py b/code/main.py index c9808978413bce5515cc348347edd4f707858c63..78ce5bfa53d204ab37dcdf4f64abd037bde9f4e6 100644 --- a/code/main.py +++ b/code/main.py @@ -1,9 +1,8 @@ -from langchain.document_loaders import PyPDFLoader, DirectoryLoader -from langchain import PromptTemplate -from langchain.embeddings import HuggingFaceEmbeddings -from langchain.vectorstores import FAISS +from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader +from langchain_core.prompts import PromptTemplate +from langchain_community.embeddings import HuggingFaceEmbeddings +from langchain_community.vectorstores import FAISS from langchain.chains import RetrievalQA -from langchain.llms import CTransformers import chainlit as cl from langchain_community.chat_models import ChatOpenAI from langchain_community.embeddings import OpenAIEmbeddings @@ -11,27 +10,48 @@ import yaml import logging from dotenv import load_dotenv -from modules.llm_tutor import LLMTutor -from modules.constants import * -from modules.helpers import get_sources - +from modules.chat.llm_tutor import LLMTutor +from modules.config.constants import * +from modules.chat.helpers import get_sources +from modules.chat_processor.chat_processor import ChatProcessor +global logger +# Initialize logger logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") # Console Handler console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") console_handler.setFormatter(formatter) logger.addHandler(console_handler) -# File Handler -log_file_path = "log_file.log" # Change this to your desired log file path -file_handler = logging.FileHandler(log_file_path) -file_handler.setLevel(logging.INFO) -file_handler.setFormatter(formatter) -logger.addHandler(file_handler) + +@cl.set_starters +async def set_starters(): + return [ + cl.Starter( + label="recording on CNNs?", + message="Where can I find the recording for the lecture on Transfromers?", + icon="/public/adv-screen-recorder-svgrepo-com.svg", + ), + cl.Starter( + label="where's the slides?", + message="When are the lectures? 
I can't find the schedule.", + icon="/public/alarmy-svgrepo-com.svg", + ), + cl.Starter( + label="Due Date?", + message="When is the final project due?", + icon="/public/calendar-samsung-17-svgrepo-com.svg", + ), + cl.Starter( + label="Explain backprop.", + message="I didnt understand the math behind backprop, could you explain it?", + icon="/public/acastusphoton-svgrepo-com.svg", + ), + ] # Adding option to select the chat profile @@ -66,12 +86,26 @@ def rename(orig_author: str): # chainlit code @cl.on_chat_start async def start(): - with open("code/config.yml", "r") as f: + with open("modules/config/config.yml", "r") as f: config = yaml.safe_load(f) - print(config) - logger.info("Config file loaded") - logger.info(f"Config: {config}") - logger.info("Creating llm_tutor instance") + + # Ensure log directory exists + log_directory = config["log_dir"] + if not os.path.exists(log_directory): + os.makedirs(log_directory) + + # File Handler + log_file_path = ( + f"{log_directory}/tutor.log" # Change this to your desired log file path + ) + file_handler = logging.FileHandler(log_file_path, mode="w") + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + logger.info("Config file loaded") + logger.info(f"Config: {config}") + logger.info("Creating llm_tutor instance") chat_profile = cl.user_session.get("chat_profile") if chat_profile is not None: @@ -93,36 +127,50 @@ async def start(): llm_tutor = LLMTutor(config, logger=logger) chain = llm_tutor.qa_bot() - model = config["llm_params"]["local_llm_params"]["model"] - msg = cl.Message(content=f"Starting the bot {model}...") - await msg.send() - msg.content = opening_message - await msg.update() + # msg = cl.Message(content=f"Starting the bot {chat_profile}...") + # await msg.send() + # msg.content = opening_message + # await msg.update() + tags = [chat_profile, config["vectorstore"]["db_option"]] + chat_processor = ChatProcessor(config, tags=tags) cl.user_session.set("chain", chain) + cl.user_session.set("counter", 0) + cl.user_session.set("chat_processor", chat_processor) + + +@cl.on_chat_end +async def on_chat_end(): + await cl.Message(content="Sorry, I have to go now. 
Goodbye!").send() @cl.on_message async def main(message): + global logger user = cl.user_session.get("user") chain = cl.user_session.get("chain") - # cb = cl.AsyncLangchainCallbackHandler( - # stream_final_answer=True, answer_prefix_tokens=["FINAL", "ANSWER"] - # ) - # cb.answer_reached = True - # res=await chain.acall(message, callbacks=[cb]) - res = await chain.acall(message.content) - print(f"response: {res}") + + counter = cl.user_session.get("counter") + counter += 1 + cl.user_session.set("counter", counter) + + # if counter >= 3: # Ensure the counter condition is checked + # await cl.Message(content="Your credits are up!").send() + # await on_chat_end() # Call the on_chat_end function to handle the end of the chat + # return # Exit the function to stop further processing + # else: + + cb = cl.AsyncLangchainCallbackHandler() # TODO: fix streaming here + cb.answer_reached = True + + processor = cl.user_session.get("chat_processor") + res = await processor.rag(message.content, chain, cb) try: answer = res["answer"] except: answer = res["result"] - print(f"answer: {answer}") - - logger.info(f"Question: {res['question']}") - logger.info(f"History: {res['chat_history']}") - logger.info(f"Answer: {answer}\n") - answer_with_sources, source_elements = get_sources(res, answer) + answer_with_sources, source_elements, sources_dict = get_sources(res, answer) + processor._process(message.content, answer, sources_dict) await cl.Message(content=answer_with_sources, elements=source_elements).send() diff --git a/code/modules/chat/__init__.py b/code/modules/chat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/modules/chat_model_loader.py b/code/modules/chat/chat_model_loader.py similarity index 90% rename from code/modules/chat_model_loader.py rename to code/modules/chat/chat_model_loader.py index d5a19967f540f0037ed6a46456bf0ee9751f66ab..49d689a01684c058417247c41aebfe8dad5c60f4 100644 --- a/code/modules/chat_model_loader.py +++ b/code/modules/chat/chat_model_loader.py @@ -1,8 +1,7 @@ from langchain_community.chat_models import ChatOpenAI -from langchain.llms import CTransformers -from langchain.llms.huggingface_pipeline import HuggingFacePipeline +from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline from transformers import AutoTokenizer, TextStreamer -from langchain.llms import LlamaCpp +from langchain_community.llms import LlamaCpp import torch import transformers import os diff --git a/code/modules/chat/helpers.py b/code/modules/chat/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..669cedadae70ca7d6918c4a076581b4b1a4afe26 --- /dev/null +++ b/code/modules/chat/helpers.py @@ -0,0 +1,104 @@ +from modules.config.constants import * +import chainlit as cl +from langchain_core.prompts import PromptTemplate + + +def get_sources(res, answer): + source_elements = [] + source_dict = {} # Dictionary to store URL elements + + for idx, source in enumerate(res["source_documents"]): + source_metadata = source.metadata + url = source_metadata.get("source", "N/A") + score = source_metadata.get("score", "N/A") + page = source_metadata.get("page", 1) + + lecture_tldr = source_metadata.get("tldr", "N/A") + lecture_recording = source_metadata.get("lecture_recording", "N/A") + suggested_readings = source_metadata.get("suggested_readings", "N/A") + date = source_metadata.get("date", "N/A") + + source_type = source_metadata.get("source_type", "N/A") + + url_name = 
f"{url}_{page}" + if url_name not in source_dict: + source_dict[url_name] = { + "text": source.page_content, + "url": url, + "score": score, + "page": page, + "lecture_tldr": lecture_tldr, + "lecture_recording": lecture_recording, + "suggested_readings": suggested_readings, + "date": date, + "source_type": source_type, + } + else: + source_dict[url_name]["text"] += f"\n\n{source.page_content}" + + # First, display the answer + full_answer = "**Answer:**\n" + full_answer += answer + + # Then, display the sources + full_answer += "\n\n**Sources:**\n" + for idx, (url_name, source_data) in enumerate(source_dict.items()): + full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n" + + name = f"Source {idx + 1} Text\n" + full_answer += name + source_elements.append( + cl.Text(name=name, content=source_data["text"], display="side") + ) + + # Add a PDF element if the source is a PDF file + if source_data["url"].lower().endswith(".pdf"): + name = f"Source {idx + 1} PDF\n" + full_answer += name + pdf_url = f"{source_data['url']}#page={source_data['page']+1}" + source_elements.append(cl.Pdf(name=name, url=pdf_url, display="side")) + + full_answer += "\n**Metadata:**\n" + for idx, (url_name, source_data) in enumerate(source_dict.items()): + full_answer += f"\nSource {idx + 1} Metadata:\n" + source_elements.append( + cl.Text( + name=f"Source {idx + 1} Metadata", + content=f"Source: {source_data['url']}\n" + f"Page: {source_data['page']}\n" + f"Type: {source_data['source_type']}\n" + f"Date: {source_data['date']}\n" + f"TL;DR: {source_data['lecture_tldr']}\n" + f"Lecture Recording: {source_data['lecture_recording']}\n" + f"Suggested Readings: {source_data['suggested_readings']}\n", + display="side", + ) + ) + + return full_answer, source_elements, source_dict + + +def get_prompt(config): + if config["llm_params"]["use_history"]: + if config["llm_params"]["llm_loader"] == "local_llm": + custom_prompt_template = tinyllama_prompt_template_with_history + elif config["llm_params"]["llm_loader"] == "openai": + custom_prompt_template = openai_prompt_template_with_history + # else: + # custom_prompt_template = tinyllama_prompt_template_with_history # default + prompt = PromptTemplate( + template=custom_prompt_template, + input_variables=["context", "chat_history", "question"], + ) + else: + if config["llm_params"]["llm_loader"] == "local_llm": + custom_prompt_template = tinyllama_prompt_template + elif config["llm_params"]["llm_loader"] == "openai": + custom_prompt_template = openai_prompt_template + # else: + # custom_prompt_template = tinyllama_prompt_template + prompt = PromptTemplate( + template=custom_prompt_template, + input_variables=["context", "question"], + ) + return prompt diff --git a/code/modules/chat/llm_tutor.py b/code/modules/chat/llm_tutor.py new file mode 100644 index 0000000000000000000000000000000000000000..62c862f9fc33fe83950332a01a7659f3c7e76344 --- /dev/null +++ b/code/modules/chat/llm_tutor.py @@ -0,0 +1,211 @@ +from langchain.chains import RetrievalQA, ConversationalRetrievalChain +from langchain.memory import ( + ConversationBufferWindowMemory, + ConversationSummaryBufferMemory, +) +from langchain.chains.conversational_retrieval.prompts import QA_PROMPT +import os +from modules.config.constants import * +from modules.chat.helpers import get_prompt +from modules.chat.chat_model_loader import ChatModelLoader +from modules.vectorstore.store_manager import VectorStoreManager + +from modules.retriever.retriever import Retriever + +from typing import Any, 
Callable, Dict, List, Optional, Tuple, Type, Union +from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun +import inspect +from langchain.chains.conversational_retrieval.base import _get_chat_history +from langchain_core.messages import BaseMessage + +CHAT_TURN_TYPE = Union[Tuple[str, str], BaseMessage] + +from langchain_core.output_parsers import StrOutputParser +from langchain_core.prompts import ChatPromptTemplate +from langchain_community.chat_models import ChatOpenAI + + +class CustomConversationalRetrievalChain(ConversationalRetrievalChain): + + def _get_chat_history(self, chat_history: List[CHAT_TURN_TYPE]) -> str: + _ROLE_MAP = {"human": "Student: ", "ai": "AI Tutor: "} + buffer = "" + for dialogue_turn in chat_history: + if isinstance(dialogue_turn, BaseMessage): + role_prefix = _ROLE_MAP.get( + dialogue_turn.type, f"{dialogue_turn.type}: " + ) + buffer += f"\n{role_prefix}{dialogue_turn.content}" + elif isinstance(dialogue_turn, tuple): + human = "Student: " + dialogue_turn[0] + ai = "AI Tutor: " + dialogue_turn[1] + buffer += "\n" + "\n".join([human, ai]) + else: + raise ValueError( + f"Unsupported chat history format: {type(dialogue_turn)}." + f" Full chat history: {chat_history} " + ) + return buffer + + async def _acall( + self, + inputs: Dict[str, Any], + run_manager: Optional[AsyncCallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager() + question = inputs["question"] + get_chat_history = self._get_chat_history + chat_history_str = get_chat_history(inputs["chat_history"]) + if chat_history_str: + # callbacks = _run_manager.get_child() + # new_question = await self.question_generator.arun( + # question=question, chat_history=chat_history_str, callbacks=callbacks + # ) + system = ( + "You are someone that rephrases statements. Rephrase the student's question to add context from their chat history if relevant, ensuring it remains from the student's point of view. " + "Incorporate relevant details from the chat history to make the question clearer and more specific." + "Do not change the meaning of the original statement, and maintain the student's tone and perspective. " + "If the question is conversational and doesn't require context, do not rephrase it. " + "Example: If the student previously asked about backpropagation in the context of deep learning and now asks 'what is it', rephrase to 'What is backprogatation.'. 
" + "Example: Do not rephrase if the user is asking something specific like 'cool, suggest a project with transformers to use as my final project'" + "Chat history: \n{chat_history_str}\n" + "Rephrase the following question only if necessary: '{question}'" + ) + + prompt = ChatPromptTemplate.from_messages( + [ + ("system", system), + ("human", "{question}, {chat_history_str}"), + ] + ) + llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0) + step_back = prompt | llm | StrOutputParser() + new_question = step_back.invoke( + {"question": question, "chat_history_str": chat_history_str} + ) + else: + new_question = question + accepts_run_manager = ( + "run_manager" in inspect.signature(self._aget_docs).parameters + ) + if accepts_run_manager: + docs = await self._aget_docs(new_question, inputs, run_manager=_run_manager) + else: + docs = await self._aget_docs(new_question, inputs) # type: ignore[call-arg] + + output: Dict[str, Any] = {} + output["original_question"] = question + if self.response_if_no_docs_found is not None and len(docs) == 0: + output[self.output_key] = self.response_if_no_docs_found + else: + new_inputs = inputs.copy() + if self.rephrase_question: + new_inputs["question"] = new_question + new_inputs["chat_history"] = chat_history_str + + # Prepare the final prompt with metadata + context = "\n\n".join( + [ + f"Context {idx+1}: \n(Document content: {doc.page_content}\nMetadata: (source_file: {doc.metadata['source'] if 'source' in doc.metadata else 'unknown'}))" + for idx, doc in enumerate(docs) + ] + ) + final_prompt = ( + "You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Answer the user's question using the provided context. Only use the context if it is relevant. The context is ordered by relevance." + "If you don't know the answer, do your best without making things up. Keep the conversation flowing naturally. " + "Use chat history and context as guides but avoid repeating past responses. Provide links from the source_file metadata. Use the source context that is most relevent." + "Speak in a friendly and engaging manner, like talking to a friend. Avoid sounding repetitive or robotic.\n\n" + f"Chat History:\n{chat_history_str}\n\n" + f"Context:\n{context}\n\n" + "Answer the student's question below in a friendly, concise, and engaging manner. 
Use the context and history only if relevant, otherwise, engage in a free-flowing conversation.\n" + f"Student: {question}\n" + "AI Tutor:" + ) + + # new_inputs["input"] = final_prompt + new_inputs["question"] = final_prompt + # output["final_prompt"] = final_prompt + + answer = await self.combine_docs_chain.arun( + input_documents=docs, callbacks=_run_manager.get_child(), **new_inputs + ) + output[self.output_key] = answer + + if self.return_source_documents: + output["source_documents"] = docs + output["rephrased_question"] = new_question + return output + + +class LLMTutor: + def __init__(self, config, logger=None): + self.config = config + self.llm = self.load_llm() + self.logger = logger + self.vector_db = VectorStoreManager(config, logger=self.logger) + if self.config["vectorstore"]["embedd_files"]: + self.vector_db.create_database() + self.vector_db.save_database() + + def set_custom_prompt(self): + """ + Prompt template for QA retrieval for each vectorstore + """ + prompt = get_prompt(self.config) + # prompt = QA_PROMPT + + return prompt + + # Retrieval QA Chain + def retrieval_qa_chain(self, llm, prompt, db): + + retriever = Retriever(self.config)._return_retriever(db) + + if self.config["llm_params"]["use_history"]: + memory = ConversationBufferWindowMemory( + k=self.config["llm_params"]["memory_window"], + memory_key="chat_history", + return_messages=True, + output_key="answer", + max_token_limit=128, + ) + qa_chain = CustomConversationalRetrievalChain.from_llm( + llm=llm, + chain_type="stuff", + retriever=retriever, + return_source_documents=True, + memory=memory, + combine_docs_chain_kwargs={"prompt": prompt}, + response_if_no_docs_found="No context found", + ) + else: + qa_chain = RetrievalQA.from_chain_type( + llm=llm, + chain_type="stuff", + retriever=retriever, + return_source_documents=True, + chain_type_kwargs={"prompt": prompt}, + ) + return qa_chain + + # Loading the model + def load_llm(self): + chat_model_loader = ChatModelLoader(self.config) + llm = chat_model_loader.load_chat_model() + return llm + + # QA Model Function + def qa_bot(self): + db = self.vector_db.load_database() + qa_prompt = self.set_custom_prompt() + qa = self.retrieval_qa_chain( + self.llm, qa_prompt, db + ) # TODO: PROMPT is overwritten in CustomConversationalRetrievalChain + + return qa + + # output function + def final_result(query): + qa_result = qa_bot() + response = qa_result({"query": query}) + return response diff --git a/code/modules/chat_processor/__init__.py b/code/modules/chat_processor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/modules/chat_processor/base.py b/code/modules/chat_processor/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f245d9c4a04efc8c84982fffee575e4c7d6c558f --- /dev/null +++ b/code/modules/chat_processor/base.py @@ -0,0 +1,12 @@ +# Template for chat processor classes + + +class ChatProcessorBase: + def __init__(self, config): + self.config = config + + def process(self, message): + """ + Processes and Logs the message + """ + raise NotImplementedError("process method not implemented") diff --git a/code/modules/chat_processor/chat_processor.py b/code/modules/chat_processor/chat_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..10061b98c50d9e717c1b9a4b258f1848367e414c --- /dev/null +++ b/code/modules/chat_processor/chat_processor.py @@ -0,0 +1,29 @@ +from modules.chat_processor.literal_ai import 
LiteralaiChatProcessor + + +class ChatProcessor: + def __init__(self, config, tags=None): + self.chat_processor_type = config["chat_logging"]["platform"] + self.logging = config["chat_logging"]["log_chat"] + self.tags = tags + self._init_processor() + + def _init_processor(self): + if self.chat_processor_type == "literalai": + self.processor = LiteralaiChatProcessor(self.tags) + else: + raise ValueError( + f"Chat processor type {self.chat_processor_type} not supported" + ) + + def _process(self, user_message, assistant_message, source_dict): + if self.logging: + return self.processor.process(user_message, assistant_message, source_dict) + else: + pass + + async def rag(self, user_query: str, chain, cb): + if self.logging: + return await self.processor.rag(user_query, chain, cb) + else: + return await chain.acall(user_query, callbacks=[cb]) diff --git a/code/modules/chat_processor/literal_ai.py b/code/modules/chat_processor/literal_ai.py new file mode 100644 index 0000000000000000000000000000000000000000..c5b9c98eb1f8e82f30f583e669cd6b3ff99a89fa --- /dev/null +++ b/code/modules/chat_processor/literal_ai.py @@ -0,0 +1,37 @@ +from literalai import LiteralClient +import os +from .base import ChatProcessorBase + + +class LiteralaiChatProcessor(ChatProcessorBase): + def __init__(self, tags=None): + self.literal_client = LiteralClient(api_key=os.getenv("LITERAL_API_KEY")) + self.literal_client.reset_context() + with self.literal_client.thread(name="TEST") as thread: + self.thread_id = thread.id + self.thread = thread + if tags is not None and type(tags) == list: + self.thread.tags = tags + print(f"Thread ID: {self.thread}") + + def process(self, user_message, assistant_message, source_dict): + with self.literal_client.thread(thread_id=self.thread_id) as thread: + self.literal_client.message( + content=user_message, + type="user_message", + name="User", + ) + self.literal_client.message( + content=assistant_message, + type="assistant_message", + name="AI_Tutor", + ) + + async def rag(self, user_query: str, chain, cb): + with self.literal_client.step( + type="retrieval", name="RAG", thread_id=self.thread_id + ) as step: + step.input = {"question": user_query} + res = await chain.acall(user_query, callbacks=[cb]) + step.output = res + return res diff --git a/code/modules/config/__init__.py b/code/modules/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/config.yml b/code/modules/config/config.yml similarity index 55% rename from code/config.yml rename to code/modules/config/config.yml index e98f64c14ab68faf559710b246b87b98ba72f7cc..45b3f6c97c63ffc51b4338a201b6a4ca34dd3d09 100644 --- a/code/config.yml +++ b/code/modules/config/config.yml @@ -1,13 +1,28 @@ -embedding_options: +log_dir: '../storage/logs' # str +log_chunk_dir: '../storage/logs/chunks' # str +device: 'cpu' # str [cuda, cpu] + +vectorstore: embedd_files: False # bool - data_path: 'storage/data' # str - url_file_path: 'storage/data/urls.txt' # str + data_path: '../storage/data' # str + url_file_path: '../storage/data/urls.txt' # str expand_urls: True # bool - db_option : 'RAGatouille' # str [FAISS, Chroma, RAGatouille] - db_path : 'vectorstores' # str + db_option : 'FAISS' # str [FAISS, Chroma, RAGatouille, RAPTOR] + db_path : '../vectorstores' # str model : 'sentence-transformers/all-MiniLM-L6-v2' # str [sentence-transformers/all-MiniLM-L6-v2, text-embedding-ada-002'] search_top_k : 3 # int score_threshold : 0.2 # float + + faiss_params: # Not 
used as of now + index_path: '../vectorstores/faiss.index' # str + index_type: 'Flat' # str [Flat, HNSW, IVF] + index_dimension: 384 # int + index_nlist: 100 # int + index_nprobe: 10 # int + + colbert_params: + index_name: "new_idx" # str + llm_params: use_history: True # bool memory_window: 3 # int @@ -15,9 +30,13 @@ llm_params: openai_params: model: 'gpt-3.5-turbo-1106' # str [gpt-3.5-turbo-1106, gpt-4] local_llm_params: - model: "storage/models/llama-2-7b-chat.Q4_0.gguf" - model_type: "llama" - temperature: 0.2 + model: 'tiny-llama' + temperature: 0.7 + +chat_logging: + log_chat: False # bool + platform: 'literalai' + splitter_options: use_splitter: True # bool split_by_token : True # bool diff --git a/code/modules/constants.py b/code/modules/config/constants.py similarity index 95% rename from code/modules/constants.py rename to code/modules/config/constants.py index a7fa7d8b4f516d34b354a890f20e33af4f912d20..d336d10a400e1aba795c62df48fb7ec701a7caee 100644 --- a/code/modules/constants.py +++ b/code/modules/config/constants.py @@ -7,6 +7,7 @@ load_dotenv() OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN") +LITERAL_API_KEY = os.getenv("LITERAL_API_KEY") opening_message = f"Hey, What Can I Help You With?\n\nYou can me ask me questions about the course logistics, course content, about the final project, or anything else!" @@ -77,5 +78,5 @@ Question: {question} # Model Paths -LLAMA_PATH = "storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf" +LLAMA_PATH = "../storage/models/tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf" MISTRAL_PATH = "storage/models/mistral-7b-v0.1.Q4_K_M.gguf" diff --git a/code/modules/data_loader.py b/code/modules/data_loader.py deleted file mode 100644 index 92732cf1e49de0376438979b822810ebafba121e..0000000000000000000000000000000000000000 --- a/code/modules/data_loader.py +++ /dev/null @@ -1,293 +0,0 @@ -import os -import re -import requests -import pysrt -from langchain_community.document_loaders import ( - PyMuPDFLoader, - Docx2txtLoader, - YoutubeLoader, - WebBaseLoader, - TextLoader, -) -from langchain_community.document_loaders import UnstructuredMarkdownLoader -from llama_parse import LlamaParse -from langchain.schema import Document -import logging -from langchain.text_splitter import RecursiveCharacterTextSplitter -from ragatouille import RAGPretrainedModel -from langchain.chains import LLMChain -from langchain.llms import OpenAI -from langchain import PromptTemplate - -try: - from modules.helpers import get_metadata -except: - from helpers import get_metadata - -logger = logging.getLogger(__name__) - - -class PDFReader: - def __init__(self): - pass - - def get_loader(self, pdf_path): - loader = PyMuPDFLoader(pdf_path) - return loader - - def get_documents(self, loader): - return loader.load() - - -class FileReader: - def __init__(self): - self.pdf_reader = PDFReader() - - def extract_text_from_pdf(self, pdf_path): - text = "" - with open(pdf_path, "rb") as file: - reader = PyPDF2.PdfReader(file) - num_pages = len(reader.pages) - for page_num in range(num_pages): - page = reader.pages[page_num] - text += page.extract_text() - return text - - def download_pdf_from_url(self, pdf_url): - response = requests.get(pdf_url) - if response.status_code == 200: - with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file: - temp_file.write(response.content) - temp_file_path = temp_file.name - return temp_file_path - else: - print("Failed to download PDF from URL:", pdf_url) - return None - - def read_pdf(self, 
temp_file_path: str): - loader = self.pdf_reader.get_loader(temp_file_path) - documents = self.pdf_reader.get_documents(loader) - return documents - - def read_txt(self, temp_file_path: str): - loader = TextLoader(temp_file_path, autodetect_encoding=True) - return loader.load() - - def read_docx(self, temp_file_path: str): - loader = Docx2txtLoader(temp_file_path) - return loader.load() - - def read_srt(self, temp_file_path: str): - subs = pysrt.open(temp_file_path) - text = "" - for sub in subs: - text += sub.text - return [Document(page_content=text)] - - def read_youtube_transcript(self, url: str): - loader = YoutubeLoader.from_youtube_url( - url, add_video_info=True, language=["en"], translation="en" - ) - return loader.load() - - def read_html(self, url: str): - loader = WebBaseLoader(url) - return loader.load() - - def read_tex_from_url(self, tex_url): - response = requests.get(tex_url) - if response.status_code == 200: - return [Document(page_content=response.text)] - else: - print("Failed to fetch .tex file from URL:", tex_url) - return None - - -class ChunkProcessor: - def __init__(self, config): - self.config = config - - if config["splitter_options"]["use_splitter"]: - if config["splitter_options"]["split_by_token"]: - self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - chunk_size=config["splitter_options"]["chunk_size"], - chunk_overlap=config["splitter_options"]["chunk_overlap"], - separators=config["splitter_options"]["chunk_separators"], - disallowed_special=(), - ) - else: - self.splitter = RecursiveCharacterTextSplitter( - chunk_size=config["splitter_options"]["chunk_size"], - chunk_overlap=config["splitter_options"]["chunk_overlap"], - separators=config["splitter_options"]["chunk_separators"], - disallowed_special=(), - ) - else: - self.splitter = None - logger.info("ChunkProcessor instance created") - - def remove_delimiters(self, document_chunks: list): - for chunk in document_chunks: - for delimiter in self.config["splitter_options"]["delimiters_to_remove"]: - chunk.page_content = re.sub(delimiter, " ", chunk.page_content) - return document_chunks - - def remove_chunks(self, document_chunks: list): - front = self.config["splitter_options"]["front_chunk_to_remove"] - end = self.config["splitter_options"]["last_chunks_to_remove"] - for _ in range(front): - del document_chunks[0] - for _ in range(end): - document_chunks.pop() - logger.info(f"\tNumber of pages after skipping: {len(document_chunks)}") - return document_chunks - - def process_chunks( - self, documents, file_type="txt", source="", page=0, metadata={} - ): - documents = [Document(page_content=documents, source=source, page=page)] - if ( - file_type == "txt" - or file_type == "docx" - or file_type == "srt" - or file_type == "tex" - ): - document_chunks = self.splitter.split_documents(documents) - elif file_type == "pdf": - document_chunks = documents # Full page for now - - # add the source and page number back to the metadata - for chunk in document_chunks: - chunk.metadata["source"] = source - chunk.metadata["page"] = page - - # add the metadata extracted from the document - for key, value in metadata.items(): - chunk.metadata[key] = value - - if self.config["splitter_options"]["remove_leftover_delimiters"]: - document_chunks = self.remove_delimiters(document_chunks) - if self.config["splitter_options"]["remove_chunks"]: - document_chunks = self.remove_chunks(document_chunks) - - return document_chunks - - def get_chunks(self, file_reader, uploaded_files, weblinks): - 
self.document_chunks_full = [] - self.parent_document_names = [] - self.child_document_names = [] - self.documents = [] - self.document_metadata = [] - - addl_metadata = get_metadata( - "https://dl4ds.github.io/sp2024/lectures/", - "https://dl4ds.github.io/sp2024/schedule/", - ) # For any additional metadata - - for file_index, file_path in enumerate(uploaded_files): - file_name = os.path.basename(file_path) - if file_name not in self.parent_document_names: - file_type = file_name.split(".")[-1].lower() - - # try: - if file_type == "pdf": - documents = file_reader.read_pdf(file_path) - elif file_type == "txt": - documents = file_reader.read_txt(file_path) - elif file_type == "docx": - documents = file_reader.read_docx(file_path) - elif file_type == "srt": - documents = file_reader.read_srt(file_path) - elif file_type == "tex": - documents = file_reader.read_tex_from_url(file_path) - else: - logger.warning(f"Unsupported file type: {file_type}") - continue - - for doc in documents: - page_num = doc.metadata.get("page", 0) - self.documents.append(doc.page_content) - self.document_metadata.append( - {"source": file_path, "page": page_num} - ) - metadata = addl_metadata.get(file_path, {}) - self.document_metadata[-1].update(metadata) - - self.child_document_names.append(f"{file_name}_{page_num}") - - self.parent_document_names.append(file_name) - if self.config["embedding_options"]["db_option"] not in [ - "RAGatouille" - ]: - document_chunks = self.process_chunks( - self.documents[-1], - file_type, - source=file_path, - page=page_num, - metadata=metadata, - ) - self.document_chunks_full.extend(document_chunks) - - # except Exception as e: - # logger.error(f"Error processing file {file_name}: {str(e)}") - - self.process_weblinks(file_reader, weblinks) - - logger.info( - f"Total document chunks extracted: {len(self.document_chunks_full)}" - ) - return ( - self.document_chunks_full, - self.child_document_names, - self.documents, - self.document_metadata, - ) - - def process_weblinks(self, file_reader, weblinks): - if weblinks[0] != "": - logger.info(f"Splitting weblinks: total of {len(weblinks)}") - - for link_index, link in enumerate(weblinks): - if link not in self.parent_document_names: - try: - logger.info(f"\tSplitting link {link_index+1} : {link}") - if "youtube" in link: - documents = file_reader.read_youtube_transcript(link) - else: - documents = file_reader.read_html(link) - - for doc in documents: - page_num = doc.metadata.get("page", 0) - self.documents.append(doc.page_content) - self.document_metadata.append( - {"source": link, "page": page_num} - ) - self.child_document_names.append(f"{link}") - - self.parent_document_names.append(link) - if self.config["embedding_options"]["db_option"] not in [ - "RAGatouille" - ]: - document_chunks = self.process_chunks( - self.documents[-1], - "txt", - source=link, - page=0, - metadata={"source_type": "webpage"}, - ) - self.document_chunks_full.extend(document_chunks) - except Exception as e: - logger.error( - f"Error splitting link {link_index+1} : {link}: {str(e)}" - ) - - -class DataLoader: - def __init__(self, config): - self.file_reader = FileReader() - self.chunk_processor = ChunkProcessor(config) - - def get_chunks(self, uploaded_files, weblinks): - return self.chunk_processor.get_chunks( - self.file_reader, uploaded_files, weblinks - ) diff --git a/code/modules/dataloader/__init__.py b/code/modules/dataloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/code/modules/dataloader/data_loader.py b/code/modules/dataloader/data_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..81da7f3aa188e2d52bbb8697953d63639ed051f3 --- /dev/null +++ b/code/modules/dataloader/data_loader.py @@ -0,0 +1,360 @@ +import os +import re +import requests +import pysrt +from langchain_community.document_loaders import ( + PyMuPDFLoader, + Docx2txtLoader, + YoutubeLoader, + WebBaseLoader, + TextLoader, +) +from langchain_community.document_loaders import UnstructuredMarkdownLoader +from llama_parse import LlamaParse +from langchain.schema import Document +import logging +from langchain.text_splitter import RecursiveCharacterTextSplitter +from ragatouille import RAGPretrainedModel +from langchain.chains import LLMChain +from langchain_community.llms import OpenAI +from langchain import PromptTemplate +import json +from concurrent.futures import ThreadPoolExecutor + +from modules.dataloader.helpers import get_metadata + + +class PDFReader: + def __init__(self): + pass + + def get_loader(self, pdf_path): + loader = PyMuPDFLoader(pdf_path) + return loader + + def get_documents(self, loader): + return loader.load() + + +class FileReader: + def __init__(self, logger): + self.pdf_reader = PDFReader() + self.logger = logger + + def extract_text_from_pdf(self, pdf_path): + text = "" + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + num_pages = len(reader.pages) + for page_num in range(num_pages): + page = reader.pages[page_num] + text += page.extract_text() + return text + + def download_pdf_from_url(self, pdf_url): + response = requests.get(pdf_url) + if response.status_code == 200: + with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file: + temp_file.write(response.content) + temp_file_path = temp_file.name + return temp_file_path + else: + self.logger.error(f"Failed to download PDF from URL: {pdf_url}") + return None + + def read_pdf(self, temp_file_path: str): + loader = self.pdf_reader.get_loader(temp_file_path) + documents = self.pdf_reader.get_documents(loader) + return documents + + def read_txt(self, temp_file_path: str): + loader = TextLoader(temp_file_path, autodetect_encoding=True) + return loader.load() + + def read_docx(self, temp_file_path: str): + loader = Docx2txtLoader(temp_file_path) + return loader.load() + + def read_srt(self, temp_file_path: str): + subs = pysrt.open(temp_file_path) + text = "" + for sub in subs: + text += sub.text + return [Document(page_content=text)] + + def read_youtube_transcript(self, url: str): + loader = YoutubeLoader.from_youtube_url( + url, add_video_info=True, language=["en"], translation="en" + ) + return loader.load() + + def read_html(self, url: str): + loader = WebBaseLoader(url) + return loader.load() + + def read_tex_from_url(self, tex_url): + response = requests.get(tex_url) + if response.status_code == 200: + return [Document(page_content=response.text)] + else: + self.logger.error(f"Failed to fetch .tex file from URL: {tex_url}") + return None + + +class ChunkProcessor: + def __init__(self, config, logger): + self.config = config + self.logger = logger + + self.document_data = {} + self.document_metadata = {} + self.document_chunks_full = [] + + if config["splitter_options"]["use_splitter"]: + if config["splitter_options"]["split_by_token"]: + self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( + chunk_size=config["splitter_options"]["chunk_size"], + chunk_overlap=config["splitter_options"]["chunk_overlap"], + 
separators=config["splitter_options"]["chunk_separators"], + disallowed_special=(), + ) + else: + self.splitter = RecursiveCharacterTextSplitter( + chunk_size=config["splitter_options"]["chunk_size"], + chunk_overlap=config["splitter_options"]["chunk_overlap"], + separators=config["splitter_options"]["chunk_separators"], + disallowed_special=(), + ) + else: + self.splitter = None + self.logger.info("ChunkProcessor instance created") + + def remove_delimiters(self, document_chunks: list): + for chunk in document_chunks: + for delimiter in self.config["splitter_options"]["delimiters_to_remove"]: + chunk.page_content = re.sub(delimiter, " ", chunk.page_content) + return document_chunks + + def remove_chunks(self, document_chunks: list): + front = self.config["splitter_options"]["front_chunk_to_remove"] + end = self.config["splitter_options"]["last_chunks_to_remove"] + for _ in range(front): + del document_chunks[0] + for _ in range(end): + document_chunks.pop() + return document_chunks + + def process_chunks( + self, documents, file_type="txt", source="", page=0, metadata={} + ): + documents = [Document(page_content=documents, source=source, page=page)] + if ( + file_type == "txt" + or file_type == "docx" + or file_type == "srt" + or file_type == "tex" + ): + document_chunks = self.splitter.split_documents(documents) + elif file_type == "pdf": + document_chunks = documents # Full page for now + + # add the source and page number back to the metadata + for chunk in document_chunks: + chunk.metadata["source"] = source + chunk.metadata["page"] = page + + # add the metadata extracted from the document + for key, value in metadata.items(): + chunk.metadata[key] = value + + if self.config["splitter_options"]["remove_leftover_delimiters"]: + document_chunks = self.remove_delimiters(document_chunks) + if self.config["splitter_options"]["remove_chunks"]: + document_chunks = self.remove_chunks(document_chunks) + + return document_chunks + + def chunk_docs(self, file_reader, uploaded_files, weblinks): + addl_metadata = get_metadata( + "https://dl4ds.github.io/sp2024/lectures/", + "https://dl4ds.github.io/sp2024/schedule/", + ) # For any additional metadata + + with ThreadPoolExecutor() as executor: + executor.map( + self.process_file, + uploaded_files, + range(len(uploaded_files)), + [file_reader] * len(uploaded_files), + [addl_metadata] * len(uploaded_files), + ) + executor.map( + self.process_weblink, + weblinks, + range(len(weblinks)), + [file_reader] * len(weblinks), + [addl_metadata] * len(weblinks), + ) + + document_names = [ + f"{file_name}_{page_num}" + for file_name, pages in self.document_data.items() + for page_num in pages.keys() + ] + documents = [ + page for doc in self.document_data.values() for page in doc.values() + ] + document_metadata = [ + page for doc in self.document_metadata.values() for page in doc.values() + ] + + self.save_document_data() + + self.logger.info( + f"Total document chunks extracted: {len(self.document_chunks_full)}" + ) + + return self.document_chunks_full, document_names, documents, document_metadata + + def process_documents( + self, documents, file_path, file_type, metadata_source, addl_metadata + ): + file_data = {} + file_metadata = {} + + for doc in documents: + # if len(doc.page_content) <= 400: # better approach to filter out non-informative documents + # continue + + page_num = doc.metadata.get("page", 0) + file_data[page_num] = doc.page_content + metadata = ( + addl_metadata.get(file_path, {}) + if metadata_source == "file" + else {"source": file_path, 
"page": page_num} + ) + file_metadata[page_num] = metadata + + if self.config["vectorstore"]["db_option"] not in ["RAGatouille"]: + document_chunks = self.process_chunks( + doc.page_content, + file_type, + source=file_path, + page=page_num, + metadata=metadata, + ) + self.document_chunks_full.extend(document_chunks) + + self.document_data[file_path] = file_data + self.document_metadata[file_path] = file_metadata + + def process_file(self, file_path, file_index, file_reader, addl_metadata): + file_name = os.path.basename(file_path) + if file_name in self.document_data: + return + + file_type = file_name.split(".")[-1].lower() + self.logger.info(f"Reading file {file_index + 1}: {file_path}") + + read_methods = { + "pdf": file_reader.read_pdf, + "txt": file_reader.read_txt, + "docx": file_reader.read_docx, + "srt": file_reader.read_srt, + "tex": file_reader.read_tex_from_url, + } + if file_type not in read_methods: + self.logger.warning(f"Unsupported file type: {file_type}") + return + + try: + documents = read_methods[file_type](file_path) + self.process_documents( + documents, file_path, file_type, "file", addl_metadata + ) + except Exception as e: + self.logger.error(f"Error processing file {file_name}: {str(e)}") + + def process_weblink(self, link, link_index, file_reader, addl_metadata): + if link in self.document_data: + return + + self.logger.info(f"Reading link {link_index + 1} : {link}") + + try: + if "youtube" in link: + documents = file_reader.read_youtube_transcript(link) + else: + documents = file_reader.read_html(link) + + self.process_documents(documents, link, "txt", "link", addl_metadata) + except Exception as e: + self.logger.error(f"Error Reading link {link_index + 1} : {link}: {str(e)}") + + def save_document_data(self): + if not os.path.exists(f"{self.config['log_chunk_dir']}/docs"): + os.makedirs(f"{self.config['log_chunk_dir']}/docs") + self.logger.info( + f"Creating directory {self.config['log_chunk_dir']}/docs for document data" + ) + self.logger.info( + f"Saving document content to {self.config['log_chunk_dir']}/docs/doc_content.json" + ) + if not os.path.exists(f"{self.config['log_chunk_dir']}/metadata"): + os.makedirs(f"{self.config['log_chunk_dir']}/metadata") + self.logger.info( + f"Creating directory {self.config['log_chunk_dir']}/metadata for document metadata" + ) + self.logger.info( + f"Saving document metadata to {self.config['log_chunk_dir']}/metadata/doc_metadata.json" + ) + with open( + f"{self.config['log_chunk_dir']}/docs/doc_content.json", "w" + ) as json_file: + json.dump(self.document_data, json_file, indent=4) + with open( + f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "w" + ) as json_file: + json.dump(self.document_metadata, json_file, indent=4) + + def load_document_data(self): + with open( + f"{self.config['log_chunk_dir']}/docs/doc_content.json", "r" + ) as json_file: + self.document_data = json.load(json_file) + with open( + f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r" + ) as json_file: + self.document_metadata = json.load(json_file) + + +class DataLoader: + def __init__(self, config, logger=None): + self.file_reader = FileReader(logger=logger) + self.chunk_processor = ChunkProcessor(config, logger=logger) + + def get_chunks(self, uploaded_files, weblinks): + return self.chunk_processor.chunk_docs( + self.file_reader, uploaded_files, weblinks + ) + + +if __name__ == "__main__": + import yaml + + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + + with 
open("../code/modules/config/config.yml", "r") as f: + config = yaml.safe_load(f) + + data_loader = DataLoader(config, logger=logger) + document_chunks, document_names, documents, document_metadata = ( + data_loader.get_chunks( + [], + ["https://dl4ds.github.io/sp2024/"], + ) + ) + + print(document_names) + print(len(document_chunks)) diff --git a/code/modules/dataloader/helpers.py b/code/modules/dataloader/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..aa61af2d44e92156dd39b74a0224f5cd6797dbdb --- /dev/null +++ b/code/modules/dataloader/helpers.py @@ -0,0 +1,108 @@ +import requests +from bs4 import BeautifulSoup +from tqdm import tqdm + + +def get_urls_from_file(file_path: str): + """ + Function to get urls from a file + """ + with open(file_path, "r") as f: + urls = f.readlines() + urls = [url.strip() for url in urls] + return urls + + +def get_base_url(url): + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/" + return base_url + + +def get_metadata(lectures_url, schedule_url): + """ + Function to get the lecture metadata from the lectures and schedule URLs. + """ + lecture_metadata = {} + + # Get the main lectures page content + r_lectures = requests.get(lectures_url) + soup_lectures = BeautifulSoup(r_lectures.text, "html.parser") + + # Get the main schedule page content + r_schedule = requests.get(schedule_url) + soup_schedule = BeautifulSoup(r_schedule.text, "html.parser") + + # Find all lecture blocks + lecture_blocks = soup_lectures.find_all("div", class_="lecture-container") + + # Create a mapping from slides link to date + date_mapping = {} + schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture") + for row in schedule_rows: + try: + date = ( + row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip() + ) + description_div = row.find("div", {"data-label": "Description"}) + slides_link_tag = description_div.find("a", title="Download slides") + slides_link = slides_link_tag["href"].strip() if slides_link_tag else None + slides_link = ( + f"https://dl4ds.github.io{slides_link}" if slides_link else None + ) + if slides_link: + date_mapping[slides_link] = date + except Exception as e: + print(f"Error processing schedule row: {e}") + continue + + for block in lecture_blocks: + try: + # Extract the lecture title + title = block.find("span", style="font-weight: bold;").text.strip() + + # Extract the TL;DR + tldr = block.find("strong", text="tl;dr:").next_sibling.strip() + + # Extract the link to the slides + slides_link_tag = block.find("a", title="Download slides") + slides_link = slides_link_tag["href"].strip() if slides_link_tag else None + slides_link = ( + f"https://dl4ds.github.io{slides_link}" if slides_link else None + ) + + # Extract the link to the lecture recording + recording_link_tag = block.find("a", title="Download lecture recording") + recording_link = ( + recording_link_tag["href"].strip() if recording_link_tag else None + ) + + # Extract suggested readings or summary if available + suggested_readings_tag = block.find("p", text="Suggested Readings:") + if suggested_readings_tag: + suggested_readings = suggested_readings_tag.find_next_sibling("ul") + if suggested_readings: + suggested_readings = suggested_readings.get_text( + separator="\n" + ).strip() + else: + suggested_readings = "No specific readings provided." + else: + suggested_readings = "No specific readings provided." 
+ + # Get the date from the schedule + date = date_mapping.get(slides_link, "No date available") + + # Add to the dictionary + lecture_metadata[slides_link] = { + "date": date, + "tldr": tldr, + "title": title, + "lecture_recording": recording_link, + "suggested_readings": suggested_readings, + } + except Exception as e: + print(f"Error processing block: {e}") + continue + + return lecture_metadata diff --git a/code/modules/dataloader/webpage_crawler.py b/code/modules/dataloader/webpage_crawler.py new file mode 100644 index 0000000000000000000000000000000000000000..fb02b4c34e3e252940d0d86a137c00563d46eba3 --- /dev/null +++ b/code/modules/dataloader/webpage_crawler.py @@ -0,0 +1,115 @@ +import aiohttp +from aiohttp import ClientSession +import asyncio +import requests +from bs4 import BeautifulSoup +from urllib.parse import urlparse, urljoin, urldefrag + +class WebpageCrawler: + def __init__(self): + self.dict_href_links = {} + + async def fetch(self, session: ClientSession, url: str) -> str: + async with session.get(url) as response: + try: + return await response.text() + except UnicodeDecodeError: + return await response.text(encoding="latin1") + + def url_exists(self, url: str) -> bool: + try: + response = requests.head(url) + return response.status_code == 200 + except requests.ConnectionError: + return False + + async def get_links(self, session: ClientSession, website_link: str, base_url: str): + html_data = await self.fetch(session, website_link) + soup = BeautifulSoup(html_data, "html.parser") + list_links = [] + for link in soup.find_all("a", href=True): + href = link["href"].strip() + full_url = urljoin(base_url, href) + normalized_url = self.normalize_url(full_url) # sections removed + if ( + normalized_url not in self.dict_href_links + and self.is_child_url(normalized_url, base_url) + and self.url_exists(normalized_url) + ): + self.dict_href_links[normalized_url] = None + list_links.append(normalized_url) + + return list_links + + async def get_subpage_links( + self, session: ClientSession, urls: list, base_url: str + ): + tasks = [self.get_links(session, url, base_url) for url in urls] + results = await asyncio.gather(*tasks) + all_links = [link for sublist in results for link in sublist] + return all_links + + async def get_all_pages(self, url: str, base_url: str): + async with aiohttp.ClientSession() as session: + dict_links = {url: "Not-checked"} + counter = None + while counter != 0: + unchecked_links = [ + link + for link, status in dict_links.items() + if status == "Not-checked" + ] + if not unchecked_links: + break + new_links = await self.get_subpage_links( + session, unchecked_links, base_url + ) + for link in unchecked_links: + dict_links[link] = "Checked" + print(f"Checked: {link}") + dict_links.update( + { + link: "Not-checked" + for link in new_links + if link not in dict_links + } + ) + counter = len( + [ + status + for status in dict_links.values() + if status == "Not-checked" + ] + ) + + checked_urls = [ + url for url, status in dict_links.items() if status == "Checked" + ] + return checked_urls + + def is_webpage(self, url: str) -> bool: + try: + response = requests.head(url, allow_redirects=True) + content_type = response.headers.get("Content-Type", "").lower() + return "text/html" in content_type + except requests.RequestException: + return False + + def clean_url_list(self, urls): + files, webpages = [], [] + + for url in urls: + if self.is_webpage(url): + webpages.append(url) + else: + files.append(url) + + return files, webpages + + def is_child_url(self, 
url, base_url): + return url.startswith(base_url) + + def normalize_url(self, url: str): + # Strip the fragment identifier + defragged_url, _ = urldefrag(url) + return defragged_url diff --git a/code/modules/helpers.py b/code/modules/helpers.py deleted file mode 100644 index f3c9822b72e93033e1f6b39a5fa77e4800acae1b..0000000000000000000000000000000000000000 --- a/code/modules/helpers.py +++ /dev/null @@ -1,337 +0,0 @@ -import requests -from bs4 import BeautifulSoup -from tqdm import tqdm -import chainlit as cl -from langchain import PromptTemplate -import requests -from bs4 import BeautifulSoup -from urllib.parse import urlparse, urljoin, urldefrag -import asyncio -import aiohttp -from aiohttp import ClientSession -from typing import Dict, Any, List - -try: - from modules.constants import * -except: - from constants import * - -""" -Ref: https://python.plainenglish.io/scraping-the-subpages-on-a-website-ea2d4e3db113 -""" - - -class WebpageCrawler: - def __init__(self): - self.dict_href_links = {} - - async def fetch(self, session: ClientSession, url: str) -> str: - async with session.get(url) as response: - try: - return await response.text() - except UnicodeDecodeError: - return await response.text(encoding="latin1") - - def url_exists(self, url: str) -> bool: - try: - response = requests.head(url) - return response.status_code == 200 - except requests.ConnectionError: - return False - - async def get_links(self, session: ClientSession, website_link: str, base_url: str): - html_data = await self.fetch(session, website_link) - soup = BeautifulSoup(html_data, "html.parser") - list_links = [] - for link in soup.find_all("a", href=True): - href = link["href"].strip() - full_url = urljoin(base_url, href) - normalized_url = self.normalize_url(full_url) # sections removed - if ( - normalized_url not in self.dict_href_links - and self.is_child_url(normalized_url, base_url) - and self.url_exists(normalized_url) - ): - self.dict_href_links[normalized_url] = None - list_links.append(normalized_url) - - return list_links - - async def get_subpage_links( - self, session: ClientSession, urls: list, base_url: str - ): - tasks = [self.get_links(session, url, base_url) for url in urls] - results = await asyncio.gather(*tasks) - all_links = [link for sublist in results for link in sublist] - return all_links - - async def get_all_pages(self, url: str, base_url: str): - async with aiohttp.ClientSession() as session: - dict_links = {url: "Not-checked"} - counter = None - while counter != 0: - unchecked_links = [ - link - for link, status in dict_links.items() - if status == "Not-checked" - ] - if not unchecked_links: - break - new_links = await self.get_subpage_links( - session, unchecked_links, base_url - ) - for link in unchecked_links: - dict_links[link] = "Checked" - print(f"Checked: {link}") - dict_links.update( - { - link: "Not-checked" - for link in new_links - if link not in dict_links - } - ) - counter = len( - [ - status - for status in dict_links.values() - if status == "Not-checked" - ] - ) - - checked_urls = [ - url for url, status in dict_links.items() if status == "Checked" - ] - return checked_urls - - def is_webpage(self, url: str) -> bool: - try: - response = requests.head(url, allow_redirects=True) - content_type = response.headers.get("Content-Type", "").lower() - return "text/html" in content_type - except requests.RequestException: - return False - - def clean_url_list(self, urls): - files, webpages = [], [] - - for url in urls: - if self.is_webpage(url): - webpages.append(url) - else: - 
files.append(url) - - return files, webpages - - def is_child_url(self, url, base_url): - return url.startswith(base_url) - - def normalize_url(self, url: str): - # Strip the fragment identifier - defragged_url, _ = urldefrag(url) - return defragged_url - - -def get_urls_from_file(file_path: str): - """ - Function to get urls from a file - """ - with open(file_path, "r") as f: - urls = f.readlines() - urls = [url.strip() for url in urls] - return urls - - -def get_base_url(url): - parsed_url = urlparse(url) - base_url = f"{parsed_url.scheme}://{parsed_url.netloc}/" - return base_url - - -def get_prompt(config): - if config["llm_params"]["use_history"]: - if config["llm_params"]["llm_loader"] == "local_llm": - custom_prompt_template = tinyllama_prompt_template_with_history - elif config["llm_params"]["llm_loader"] == "openai": - custom_prompt_template = openai_prompt_template_with_history - # else: - # custom_prompt_template = tinyllama_prompt_template_with_history # default - prompt = PromptTemplate( - template=custom_prompt_template, - input_variables=["context", "chat_history", "question"], - ) - else: - if config["llm_params"]["llm_loader"] == "local_llm": - custom_prompt_template = tinyllama_prompt_template - elif config["llm_params"]["llm_loader"] == "openai": - custom_prompt_template = openai_prompt_template - # else: - # custom_prompt_template = tinyllama_prompt_template - prompt = PromptTemplate( - template=custom_prompt_template, - input_variables=["context", "question"], - ) - return prompt - - -def get_sources(res, answer): - source_elements = [] - source_dict = {} # Dictionary to store URL elements - - for idx, source in enumerate(res["source_documents"]): - source_metadata = source.metadata - url = source_metadata["source"] - score = source_metadata.get("score", "N/A") - page = source_metadata.get("page", 1) - - lecture_tldr = source_metadata.get("tldr", "N/A") - lecture_recording = source_metadata.get("lecture_recording", "N/A") - suggested_readings = source_metadata.get("suggested_readings", "N/A") - date = source_metadata.get("date", "N/A") - - source_type = source_metadata.get("source_type", "N/A") - - url_name = f"{url}_{page}" - if url_name not in source_dict: - source_dict[url_name] = { - "text": source.page_content, - "url": url, - "score": score, - "page": page, - "lecture_tldr": lecture_tldr, - "lecture_recording": lecture_recording, - "suggested_readings": suggested_readings, - "date": date, - "source_type": source_type, - } - else: - source_dict[url_name]["text"] += f"\n\n{source.page_content}" - - # First, display the answer - full_answer = "**Answer:**\n" - full_answer += answer - - # Then, display the sources - full_answer += "\n\n**Sources:**\n" - for idx, (url_name, source_data) in enumerate(source_dict.items()): - full_answer += f"\nSource {idx + 1} (Score: {source_data['score']}): {source_data['url']}\n" - - name = f"Source {idx + 1} Text\n" - full_answer += name - source_elements.append( - cl.Text(name=name, content=source_data["text"], display="side") - ) - - # Add a PDF element if the source is a PDF file - if source_data["url"].lower().endswith(".pdf"): - name = f"Source {idx + 1} PDF\n" - full_answer += name - pdf_url = f"{source_data['url']}#page={source_data['page']+1}" - source_elements.append(cl.Pdf(name=name, url=pdf_url, display="side")) - - full_answer += "\n**Metadata:**\n" - for idx, (url_name, source_data) in enumerate(source_dict.items()): - full_answer += f"\nSource {idx + 1} Metadata:\n" - source_elements.append( - cl.Text( - name=f"Source 
{idx + 1} Metadata", - content=f"Source: {source_data['url']}\n" - f"Page: {source_data['page']}\n" - f"Type: {source_data['source_type']}\n" - f"Date: {source_data['date']}\n" - f"TL;DR: {source_data['lecture_tldr']}\n" - f"Lecture Recording: {source_data['lecture_recording']}\n" - f"Suggested Readings: {source_data['suggested_readings']}\n", - display="side", - ) - ) - - return full_answer, source_elements - - -def get_metadata(lectures_url, schedule_url): - """ - Function to get the lecture metadata from the lectures and schedule URLs. - """ - lecture_metadata = {} - - # Get the main lectures page content - r_lectures = requests.get(lectures_url) - soup_lectures = BeautifulSoup(r_lectures.text, "html.parser") - - # Get the main schedule page content - r_schedule = requests.get(schedule_url) - soup_schedule = BeautifulSoup(r_schedule.text, "html.parser") - - # Find all lecture blocks - lecture_blocks = soup_lectures.find_all("div", class_="lecture-container") - - # Create a mapping from slides link to date - date_mapping = {} - schedule_rows = soup_schedule.find_all("li", class_="table-row-lecture") - for row in schedule_rows: - try: - date = ( - row.find("div", {"data-label": "Date"}).get_text(separator=" ").strip() - ) - description_div = row.find("div", {"data-label": "Description"}) - slides_link_tag = description_div.find("a", title="Download slides") - slides_link = slides_link_tag["href"].strip() if slides_link_tag else None - slides_link = ( - f"https://dl4ds.github.io{slides_link}" if slides_link else None - ) - if slides_link: - date_mapping[slides_link] = date - except Exception as e: - print(f"Error processing schedule row: {e}") - continue - - for block in lecture_blocks: - try: - # Extract the lecture title - title = block.find("span", style="font-weight: bold;").text.strip() - - # Extract the TL;DR - tldr = block.find("strong", text="tl;dr:").next_sibling.strip() - - # Extract the link to the slides - slides_link_tag = block.find("a", title="Download slides") - slides_link = slides_link_tag["href"].strip() if slides_link_tag else None - slides_link = ( - f"https://dl4ds.github.io{slides_link}" if slides_link else None - ) - - # Extract the link to the lecture recording - recording_link_tag = block.find("a", title="Download lecture recording") - recording_link = ( - recording_link_tag["href"].strip() if recording_link_tag else None - ) - - # Extract suggested readings or summary if available - suggested_readings_tag = block.find("p", text="Suggested Readings:") - if suggested_readings_tag: - suggested_readings = suggested_readings_tag.find_next_sibling("ul") - if suggested_readings: - suggested_readings = suggested_readings.get_text( - separator="\n" - ).strip() - else: - suggested_readings = "No specific readings provided." - else: - suggested_readings = "No specific readings provided." 
- - # Get the date from the schedule - date = date_mapping.get(slides_link, "No date available") - - # Add to the dictionary - lecture_metadata[slides_link] = { - "date": date, - "tldr": tldr, - "title": title, - "lecture_recording": recording_link, - "suggested_readings": suggested_readings, - } - except Exception as e: - print(f"Error processing block: {e}") - continue - - return lecture_metadata diff --git a/code/modules/llm_tutor.py b/code/modules/llm_tutor.py deleted file mode 100644 index 1ebf20d37b08373ffec9356d0898828a9a57fe5a..0000000000000000000000000000000000000000 --- a/code/modules/llm_tutor.py +++ /dev/null @@ -1,177 +0,0 @@ -from langchain import PromptTemplate -from langchain.embeddings import HuggingFaceEmbeddings -from langchain_community.chat_models import ChatOpenAI -from langchain_community.embeddings import OpenAIEmbeddings -from langchain.vectorstores import FAISS -from langchain.chains import RetrievalQA, ConversationalRetrievalChain -from langchain.llms import CTransformers -from langchain.memory import ConversationBufferWindowMemory, ConversationSummaryBufferMemory -from langchain.chains.conversational_retrieval.prompts import QA_PROMPT -import os -from modules.constants import * -from modules.helpers import get_prompt -from modules.chat_model_loader import ChatModelLoader -from modules.vector_db import VectorDB, VectorDBScore -from typing import Dict, Any, Optional -from langchain_core.callbacks.manager import AsyncCallbackManagerForChainRun -import inspect -from langchain.chains.conversational_retrieval.base import _get_chat_history - - -class CustomConversationalRetrievalChain(ConversationalRetrievalChain): - async def _acall( - self, - inputs: Dict[str, Any], - run_manager: Optional[AsyncCallbackManagerForChainRun] = None, - ) -> Dict[str, Any]: - _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager() - question = inputs["question"] - get_chat_history = self.get_chat_history or _get_chat_history - chat_history_str = get_chat_history(inputs["chat_history"]) - print(f"chat_history_str: {chat_history_str}") - if chat_history_str: - callbacks = _run_manager.get_child() - new_question = await self.question_generator.arun( - question=question, chat_history=chat_history_str, callbacks=callbacks - ) - else: - new_question = question - accepts_run_manager = ( - "run_manager" in inspect.signature(self._aget_docs).parameters - ) - if accepts_run_manager: - docs = await self._aget_docs(new_question, inputs, run_manager=_run_manager) - else: - docs = await self._aget_docs(new_question, inputs) # type: ignore[call-arg] - - output: Dict[str, Any] = {} - if self.response_if_no_docs_found is not None and len(docs) == 0: - output[self.output_key] = self.response_if_no_docs_found - else: - new_inputs = inputs.copy() - if self.rephrase_question: - new_inputs["question"] = new_question - new_inputs["chat_history"] = chat_history_str - - # Prepare the final prompt with metadata - context = "\n\n".join( - [ - f"Document content: {doc.page_content}\nMetadata: {doc.metadata}" - for doc in docs - ] - ) - final_prompt = f""" - You are an AI Tutor for the course DS598, taught by Prof. Thomas Gardos. Use the following pieces of information to answer the user's question. - If you don't know the answer, just say that you don't knowβ€”don't try to make up an answer. - Use the chat history to answer the question only if it's relevant; otherwise, ignore it. The context for the answer will be under "Document context:". 
- Use the metadata from each document to guide the user to the correct sources. - The context is ordered by relevance to the question. Give more weight to the most relevant documents. - Talk in a friendly and personalized manner, similar to how you would speak to a friend who needs help. Make the conversation engaging and avoid sounding repetitive or robotic. - - Chat History: - {chat_history_str} - - Context: - {context} - - Question: {new_question} - AI Tutor: - """ - - new_inputs["input"] = final_prompt - new_inputs["question"] = final_prompt - output["final_prompt"] = final_prompt - - answer = await self.combine_docs_chain.arun( - input_documents=docs, callbacks=_run_manager.get_child(), **new_inputs - ) - output[self.output_key] = answer - - if self.return_source_documents: - output["source_documents"] = docs - if self.return_generated_question: - output["generated_question"] = new_question - return output - - -class LLMTutor: - def __init__(self, config, logger=None): - self.config = config - self.llm = self.load_llm() - self.vector_db = VectorDB(config, logger=logger) - if self.config["embedding_options"]["embedd_files"]: - self.vector_db.create_database() - self.vector_db.save_database() - - def set_custom_prompt(self): - """ - Prompt template for QA retrieval for each vectorstore - """ - prompt = get_prompt(self.config) - # prompt = QA_PROMPT - - return prompt - - # Retrieval QA Chain - def retrieval_qa_chain(self, llm, prompt, db): - if self.config["embedding_options"]["db_option"] in ["FAISS", "Chroma"]: - retriever = VectorDBScore( - vectorstore=db, - # search_type="similarity_score_threshold", - # search_kwargs={ - # "score_threshold": self.config["embedding_options"][ - # "score_threshold" - # ], - # "k": self.config["embedding_options"]["search_top_k"], - # }, - ) - elif self.config["embedding_options"]["db_option"] == "RAGatouille": - retriever = db.as_langchain_retriever( - k=self.config["embedding_options"]["search_top_k"] - ) - if self.config["llm_params"]["use_history"]: - memory = ConversationSummaryBufferMemory( - llm = llm, - k=self.config["llm_params"]["memory_window"], - memory_key="chat_history", - return_messages=True, - output_key="answer", - max_token_limit=128, - ) - qa_chain = CustomConversationalRetrievalChain.from_llm( - llm=llm, - chain_type="stuff", - retriever=retriever, - return_source_documents=True, - memory=memory, - combine_docs_chain_kwargs={"prompt": prompt}, - ) - else: - qa_chain = RetrievalQA.from_chain_type( - llm=llm, - chain_type="stuff", - retriever=retriever, - return_source_documents=True, - chain_type_kwargs={"prompt": prompt}, - ) - return qa_chain - - # Loading the model - def load_llm(self): - chat_model_loader = ChatModelLoader(self.config) - llm = chat_model_loader.load_chat_model() - return llm - - # QA Model Function - def qa_bot(self): - db = self.vector_db.load_database() - qa_prompt = self.set_custom_prompt() - qa = self.retrieval_qa_chain(self.llm, qa_prompt, db) - - return qa - - # output function - def final_result(query): - qa_result = qa_bot() - response = qa_result({"query": query}) - return response diff --git a/code/modules/retriever/__init__.py b/code/modules/retriever/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/modules/retriever/base.py b/code/modules/retriever/base.py new file mode 100644 index 0000000000000000000000000000000000000000..0d4a5640a6de4ed321ad4f37df904faca1db958e --- /dev/null +++ 
b/code/modules/retriever/base.py @@ -0,0 +1,12 @@ +# template for retriever classes + + +class BaseRetriever: + def __init__(self, config): + self.config = config + + def return_retriever(self): + """ + Returns the retriever object + """ + raise NotImplementedError diff --git a/code/modules/retriever/chroma_retriever.py b/code/modules/retriever/chroma_retriever.py new file mode 100644 index 0000000000000000000000000000000000000000..f7bbbc729ca048507333628af9635c24e6a70c7e --- /dev/null +++ b/code/modules/retriever/chroma_retriever.py @@ -0,0 +1,24 @@ +from .helpers import VectorStoreRetrieverScore +from .base import BaseRetriever + + +class ChromaRetriever(BaseRetriever): + def __init__(self): + pass + + def return_retriever(self, db, config): + retriever = VectorStoreRetrieverScore( + vectorstore=db, + # search_type="similarity_score_threshold", + # search_kwargs={ + # "score_threshold": self.config["vectorstore"][ + # "score_threshold" + # ], + # "k": self.config["vectorstore"]["search_top_k"], + # }, + search_kwargs={ + "k": config["vectorstore"]["search_top_k"], + }, + ) + + return retriever diff --git a/code/modules/retriever/colbert_retriever.py b/code/modules/retriever/colbert_retriever.py new file mode 100644 index 0000000000000000000000000000000000000000..ad66f6dd99987eec95435255b952e94d435c2814 --- /dev/null +++ b/code/modules/retriever/colbert_retriever.py @@ -0,0 +1,10 @@ +from .base import BaseRetriever + + +class ColbertRetriever(BaseRetriever): + def __init__(self): + pass + + def return_retriever(self, db, config): + retriever = db.as_langchain_retriever(k=config["vectorstore"]["search_top_k"]) + return retriever diff --git a/code/modules/retriever/faiss_retriever.py b/code/modules/retriever/faiss_retriever.py new file mode 100644 index 0000000000000000000000000000000000000000..c87be305522f99c03ed5602993da82e6726b077f --- /dev/null +++ b/code/modules/retriever/faiss_retriever.py @@ -0,0 +1,23 @@ +from .helpers import VectorStoreRetrieverScore +from .base import BaseRetriever + + +class FaissRetriever(BaseRetriever): + def __init__(self): + pass + + def return_retriever(self, db, config): + retriever = VectorStoreRetrieverScore( + vectorstore=db, + # search_type="similarity_score_threshold", + # search_kwargs={ + # "score_threshold": self.config["vectorstore"][ + # "score_threshold" + # ], + # "k": self.config["vectorstore"]["search_top_k"], + # }, + search_kwargs={ + "k": config["vectorstore"]["search_top_k"], + }, + ) + return retriever diff --git a/code/modules/retriever/helpers.py b/code/modules/retriever/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..f42d23f671d120eaaf54ed20a11ad47603366c94 --- /dev/null +++ b/code/modules/retriever/helpers.py @@ -0,0 +1,39 @@ +from langchain.schema.vectorstore import VectorStoreRetriever +from langchain.callbacks.manager import CallbackManagerForRetrieverRun +from langchain.schema.document import Document +from langchain_core.callbacks import AsyncCallbackManagerForRetrieverRun +from typing import List + + +class VectorStoreRetrieverScore(VectorStoreRetriever): + + # See https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500 + def _get_relevant_documents( + self, query: str, *, run_manager: CallbackManagerForRetrieverRun + ) -> List[Document]: + docs_and_similarities = ( + self.vectorstore.similarity_search_with_relevance_scores( + query, **self.search_kwargs + ) + ) + # Make the score part of the document metadata + for 
doc, similarity in docs_and_similarities: + doc.metadata["score"] = similarity + + docs = [doc for doc, _ in docs_and_similarities] + return docs + + async def _aget_relevant_documents( + self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun + ) -> List[Document]: + docs_and_similarities = ( + self.vectorstore.similarity_search_with_relevance_scores( + query, **self.search_kwargs + ) + ) + # Make the score part of the document metadata + for doc, similarity in docs_and_similarities: + doc.metadata["score"] = similarity + + docs = [doc for doc, _ in docs_and_similarities] + return docs diff --git a/code/modules/retriever/raptor_retriever.py b/code/modules/retriever/raptor_retriever.py new file mode 100644 index 0000000000000000000000000000000000000000..550e9f89520da5e4753ea3820049d258be286bd8 --- /dev/null +++ b/code/modules/retriever/raptor_retriever.py @@ -0,0 +1,16 @@ +from .helpers import VectorStoreRetrieverScore +from .base import BaseRetriever + + +class RaptorRetriever(BaseRetriever): + def __init__(self): + pass + + def return_retriever(self, db, config): + retriever = VectorStoreRetrieverScore( + vectorstore=db, + search_kwargs={ + "k": config["vectorstore"]["search_top_k"], + }, + ) + return retriever diff --git a/code/modules/retriever/retriever.py b/code/modules/retriever/retriever.py new file mode 100644 index 0000000000000000000000000000000000000000..1ae6370a513e3f478ca37e1e326178189e21bc49 --- /dev/null +++ b/code/modules/retriever/retriever.py @@ -0,0 +1,26 @@ +from modules.retriever.faiss_retriever import FaissRetriever +from modules.retriever.chroma_retriever import ChromaRetriever +from modules.retriever.colbert_retriever import ColbertRetriever +from modules.retriever.raptor_retriever import RaptorRetriever + + +class Retriever: + def __init__(self, config): + self.config = config + self.retriever_classes = { + "FAISS": FaissRetriever, + "Chroma": ChromaRetriever, + "RAGatouille": ColbertRetriever, + "RAPTOR": RaptorRetriever, + } + self._create_retriever() + + def _create_retriever(self): + db_option = self.config["vectorstore"]["db_option"] + retriever_class = self.retriever_classes.get(db_option) + if not retriever_class: + raise ValueError(f"Invalid db_option: {db_option}") + self.retriever = retriever_class() + + def _return_retriever(self, db): + return self.retriever.return_retriever(db, self.config) diff --git a/code/modules/vector_db.py b/code/modules/vector_db.py deleted file mode 100644 index 14ff56fa86392111a64fcce9dbe3353a28999b7f..0000000000000000000000000000000000000000 --- a/code/modules/vector_db.py +++ /dev/null @@ -1,226 +0,0 @@ -import logging -import os -import yaml -from langchain_community.vectorstores import FAISS, Chroma -from langchain.schema.vectorstore import VectorStoreRetriever -from langchain.callbacks.manager import CallbackManagerForRetrieverRun -from langchain.schema.document import Document -from langchain_core.callbacks import AsyncCallbackManagerForRetrieverRun -from ragatouille import RAGPretrainedModel - -try: - from modules.embedding_model_loader import EmbeddingModelLoader - from modules.data_loader import DataLoader - from modules.constants import * - from modules.helpers import * -except: - from embedding_model_loader import EmbeddingModelLoader - from data_loader import DataLoader - from constants import * - from helpers import * - -from typing import List - - -class VectorDBScore(VectorStoreRetriever): - - # See 
https://github.com/langchain-ai/langchain/blob/61dd92f8215daef3d9cf1734b0d1f8c70c1571c3/libs/langchain/langchain/vectorstores/base.py#L500 - def _get_relevant_documents( - self, query: str, *, run_manager: CallbackManagerForRetrieverRun - ) -> List[Document]: - docs_and_similarities = ( - self.vectorstore.similarity_search_with_relevance_scores( - query, **self.search_kwargs - ) - ) - # Make the score part of the document metadata - for doc, similarity in docs_and_similarities: - doc.metadata["score"] = similarity - - docs = [doc for doc, _ in docs_and_similarities] - return docs - - async def _aget_relevant_documents( - self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun - ) -> List[Document]: - docs_and_similarities = ( - self.vectorstore.similarity_search_with_relevance_scores( - query, **self.search_kwargs - ) - ) - # Make the score part of the document metadata - for doc, similarity in docs_and_similarities: - doc.metadata["score"] = similarity - - docs = [doc for doc, _ in docs_and_similarities] - return docs - - -class VectorDB: - def __init__(self, config, logger=None): - self.config = config - self.db_option = config["embedding_options"]["db_option"] - self.document_names = None - self.webpage_crawler = WebpageCrawler() - - # Set up logging to both console and a file - if logger is None: - self.logger = logging.getLogger(__name__) - self.logger.setLevel(logging.INFO) - - # Console Handler - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) - formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") - console_handler.setFormatter(formatter) - self.logger.addHandler(console_handler) - - # File Handler - log_file_path = "vector_db.log" # Change this to your desired log file path - file_handler = logging.FileHandler(log_file_path, mode="w") - file_handler.setLevel(logging.INFO) - file_handler.setFormatter(formatter) - self.logger.addHandler(file_handler) - else: - self.logger = logger - - self.logger.info("VectorDB instance instantiated") - - def load_files(self): - files = os.listdir(self.config["embedding_options"]["data_path"]) - files = [ - os.path.join(self.config["embedding_options"]["data_path"], file) - for file in files - ] - urls = get_urls_from_file(self.config["embedding_options"]["url_file_path"]) - if self.config["embedding_options"]["expand_urls"]: - all_urls = [] - for url in urls: - loop = asyncio.get_event_loop() - all_urls.extend( - loop.run_until_complete( - self.webpage_crawler.get_all_pages( - url, url - ) # only get child urls, if you want to get all urls, replace the second argument with the base url - ) - ) - urls = all_urls - return files, urls - - def create_embedding_model(self): - self.logger.info("Creating embedding function") - self.embedding_model_loader = EmbeddingModelLoader(self.config) - self.embedding_model = self.embedding_model_loader.load_embedding_model() - - def initialize_database( - self, - document_chunks: list, - document_names: list, - documents: list, - document_metadata: list, - ): - if self.db_option in ["FAISS", "Chroma"]: - self.create_embedding_model() - # Track token usage - self.logger.info("Initializing vector_db") - self.logger.info("\tUsing {} as db_option".format(self.db_option)) - if self.db_option == "FAISS": - self.vector_db = FAISS.from_documents( - documents=document_chunks, embedding=self.embedding_model - ) - elif self.db_option == "Chroma": - self.vector_db = Chroma.from_documents( - documents=document_chunks, - embedding=self.embedding_model, - 
persist_directory=os.path.join( - self.config["embedding_options"]["db_path"], - "db_" - + self.config["embedding_options"]["db_option"] - + "_" - + self.config["embedding_options"]["model"], - ), - ) - elif self.db_option == "RAGatouille": - self.RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0") - index_path = self.RAG.index( - index_name="new_idx", - collection=documents, - document_ids=document_names, - document_metadatas=document_metadata, - ) - self.logger.info("Completed initializing vector_db") - - def create_database(self): - data_loader = DataLoader(self.config) - self.logger.info("Loading data") - files, urls = self.load_files() - files, webpages = self.webpage_crawler.clean_url_list(urls) - if "storage/data/urls.txt" in files: - files.remove("storage/data/urls.txt") - document_chunks, document_names, documents, document_metadata = ( - data_loader.get_chunks(files, webpages) - ) - self.logger.info("Completed loading data") - self.initialize_database( - document_chunks, document_names, documents, document_metadata - ) - - def save_database(self): - if self.db_option == "FAISS": - self.vector_db.save_local( - os.path.join( - self.config["embedding_options"]["db_path"], - "db_" - + self.config["embedding_options"]["db_option"] - + "_" - + self.config["embedding_options"]["model"], - ) - ) - elif self.db_option == "Chroma": - # db is saved in the persist directory during initialization - pass - elif self.db_option == "RAGatouille": - # index is saved during initialization - pass - self.logger.info("Saved database") - - def load_database(self): - self.create_embedding_model() - if self.db_option == "FAISS": - self.vector_db = FAISS.load_local( - os.path.join( - self.config["embedding_options"]["db_path"], - "db_" - + self.config["embedding_options"]["db_option"] - + "_" - + self.config["embedding_options"]["model"], - ), - self.embedding_model, - allow_dangerous_deserialization=True, - ) - elif self.db_option == "Chroma": - self.vector_db = Chroma( - persist_directory=os.path.join( - self.config["embedding_options"]["db_path"], - "db_" - + self.config["embedding_options"]["db_option"] - + "_" - + self.config["embedding_options"]["model"], - ), - embedding_function=self.embedding_model, - ) - elif self.db_option == "RAGatouille": - self.vector_db = RAGPretrainedModel.from_index( - ".ragatouille/colbert/indexes/new_idx" - ) - self.logger.info("Loaded database") - return self.vector_db - - -if __name__ == "__main__": - with open("code/config.yml", "r") as f: - config = yaml.safe_load(f) - print(config) - vector_db = VectorDB(config) - vector_db.create_database() - vector_db.save_database() diff --git a/code/modules/vectorstore/__init__.py b/code/modules/vectorstore/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/modules/vectorstore/base.py b/code/modules/vectorstore/base.py new file mode 100644 index 0000000000000000000000000000000000000000..95f84d9a1a46ce5f387c5c11b91fcc273344fd22 --- /dev/null +++ b/code/modules/vectorstore/base.py @@ -0,0 +1,33 @@ +# template for vector store classes + + +class VectorStoreBase: + def __init__(self, config): + self.config = config + + def _init_vector_db(self): + """ + Creates a vector store object + """ + raise NotImplementedError + + def create_database(self): + """ + Populates the vector store with documents + """ + raise NotImplementedError + + def load_database(self): + """ + Loads the vector store from disk + """ + raise 
NotImplementedError + + def as_retriever(self): + """ + Returns the vector store as a retriever + """ + raise NotImplementedError + + def __str__(self): + return self.__class__.__name__ diff --git a/code/modules/vectorstore/chroma.py b/code/modules/vectorstore/chroma.py new file mode 100644 index 0000000000000000000000000000000000000000..3c8da4e9227a9e0685276f3fc0a110d7bf74125f --- /dev/null +++ b/code/modules/vectorstore/chroma.py @@ -0,0 +1,41 @@ +from langchain_community.vectorstores import Chroma +from modules.vectorstore.base import VectorStoreBase +import os + + +class ChromaVectorStore(VectorStoreBase): + def __init__(self, config): + self.config = config + self._init_vector_db() + + def _init_vector_db(self): + self.chroma = Chroma() + + def create_database(self, document_chunks, embedding_model): + self.vectorstore = self.chroma.from_documents( + documents=document_chunks, + embedding=embedding_model, + persist_directory=os.path.join( + self.config["vectorstore"]["db_path"], + "db_" + + self.config["vectorstore"]["db_option"] + + "_" + + self.config["vectorstore"]["model"], + ), + ) + + def load_database(self, embedding_model): + self.vectorstore = Chroma( + persist_directory=os.path.join( + self.config["vectorstore"]["db_path"], + "db_" + + self.config["vectorstore"]["db_option"] + + "_" + + self.config["vectorstore"]["model"], + ), + embedding_function=embedding_model, + ) + return self.vectorstore + + def as_retriever(self): + return self.vectorstore.as_retriever() diff --git a/code/modules/vectorstore/colbert.py b/code/modules/vectorstore/colbert.py new file mode 100644 index 0000000000000000000000000000000000000000..712a73b0cb589c1f8db57f2853a777402f7b55e8 --- /dev/null +++ b/code/modules/vectorstore/colbert.py @@ -0,0 +1,39 @@ +from ragatouille import RAGPretrainedModel +from modules.vectorstore.base import VectorStoreBase +import os + + +class ColbertVectorStore(VectorStoreBase): + def __init__(self, config): + self.config = config + self._init_vector_db() + + def _init_vector_db(self): + self.colbert = RAGPretrainedModel.from_pretrained( + "colbert-ir/colbertv2.0", + index_root=os.path.join( + self.config["vectorstore"]["db_path"], + "db_" + self.config["vectorstore"]["db_option"], + ), + ) + + def create_database(self, documents, document_names, document_metadata): + index_path = self.colbert.index( + index_name="new_idx", + collection=documents, + document_ids=document_names, + document_metadatas=document_metadata, + ) + + def load_database(self): + path = os.path.join( + self.config["vectorstore"]["db_path"], + "db_" + self.config["vectorstore"]["db_option"], + ) + self.vectorstore = RAGPretrainedModel.from_index( + f"{path}/colbert/indexes/new_idx" + ) + return self.vectorstore + + def as_retriever(self): + return self.vectorstore.as_retriever() diff --git a/code/modules/embedding_model_loader.py b/code/modules/vectorstore/embedding_model_loader.py similarity index 74% rename from code/modules/embedding_model_loader.py rename to code/modules/vectorstore/embedding_model_loader.py index e8bc2f44f15fc78f115a137382823370570a46e0..0cdc23d75cfb07b4e8f1164bd9d5d7b7a5f9769e 100644 --- a/code/modules/embedding_model_loader.py +++ b/code/modules/vectorstore/embedding_model_loader.py @@ -2,10 +2,7 @@ from langchain_community.embeddings import OpenAIEmbeddings from langchain_community.embeddings import HuggingFaceEmbeddings from langchain_community.embeddings import LlamaCppEmbeddings -try: - from modules.constants import * -except: - from constants import * +from 
modules.config.constants import * import os @@ -14,19 +11,19 @@ class EmbeddingModelLoader: self.config = config def load_embedding_model(self): - if self.config["embedding_options"]["model"] in ["text-embedding-ada-002"]: + if self.config["vectorstore"]["model"] in ["text-embedding-ada-002"]: embedding_model = OpenAIEmbeddings( deployment="SL-document_embedder", - model=self.config["embedding_options"]["model"], + model=self.config["vectorestore"]["model"], show_progress_bar=True, openai_api_key=OPENAI_API_KEY, disallowed_special=(), ) else: embedding_model = HuggingFaceEmbeddings( - model_name=self.config["embedding_options"]["model"], + model_name=self.config["vectorstore"]["model"], model_kwargs={ - "device": "cpu", + "device": f"{self.config['device']}", "token": f"{HUGGINGFACE_TOKEN}", "trust_remote_code": True, }, diff --git a/code/modules/vectorstore/faiss.py b/code/modules/vectorstore/faiss.py new file mode 100644 index 0000000000000000000000000000000000000000..c130e12ca2fe4a873ed3c2b378cf902cbcb07db1 --- /dev/null +++ b/code/modules/vectorstore/faiss.py @@ -0,0 +1,45 @@ +from langchain_community.vectorstores import FAISS +from modules.vectorstore.base import VectorStoreBase +import os + + +class FaissVectorStore(VectorStoreBase): + def __init__(self, config): + self.config = config + self._init_vector_db() + + def _init_vector_db(self): + self.faiss = FAISS( + embedding_function=None, index=0, index_to_docstore_id={}, docstore={} + ) + + def create_database(self, document_chunks, embedding_model): + self.vectorstore = self.faiss.from_documents( + documents=document_chunks, embedding=embedding_model + ) + self.vectorstore.save_local( + os.path.join( + self.config["vectorstore"]["db_path"], + "db_" + + self.config["vectorstore"]["db_option"] + + "_" + + self.config["vectorstore"]["model"], + ) + ) + + def load_database(self, embedding_model): + self.vectorstore = self.faiss.load_local( + os.path.join( + self.config["vectorstore"]["db_path"], + "db_" + + self.config["vectorstore"]["db_option"] + + "_" + + self.config["vectorstore"]["model"], + ), + embedding_model, + allow_dangerous_deserialization=True, + ) + return self.vectorstore + + def as_retriever(self): + return self.vectorstore.as_retriever() diff --git a/code/modules/vectorstore/helpers.py b/code/modules/vectorstore/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/modules/vectorstore/raptor.py b/code/modules/vectorstore/raptor.py new file mode 100644 index 0000000000000000000000000000000000000000..d01a25a01ab84a3d92a15d8aaab087f6f5d1a490 --- /dev/null +++ b/code/modules/vectorstore/raptor.py @@ -0,0 +1,438 @@ +# code modified from https://github.com/langchain-ai/langchain/blob/master/cookbook/RAPTOR.ipynb + +from typing import Dict, List, Optional, Tuple +import os +import numpy as np +import pandas as pd +import umap +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.output_parsers import StrOutputParser +from sklearn.mixture import GaussianMixture +from langchain_community.chat_models import ChatOpenAI +from langchain_community.vectorstores import FAISS +from langchain.text_splitter import RecursiveCharacterTextSplitter +from modules.vectorstore.base import VectorStoreBase + +RANDOM_SEED = 42 + + +class RAPTORVectoreStore(VectorStoreBase): + def __init__(self, config, documents=[], text_splitter=None, embedding_model=None): + self.documents = documents + self.config = config + self.text_splitter = 
RecursiveCharacterTextSplitter.from_tiktoken_encoder( + chunk_size=self.config["splitter_options"]["chunk_size"], + chunk_overlap=self.config["splitter_options"]["chunk_overlap"], + separators=self.config["splitter_options"]["chunk_separators"], + disallowed_special=(), + ) + self.embd = embedding_model + self.model = ChatOpenAI( + model="gpt-3.5-turbo", + ) + + def concat_documents(self, documents): + d_sorted = sorted(documents, key=lambda x: x.metadata["source"]) + d_reversed = list(reversed(d_sorted)) + concatenated_content = "\n\n\n --- \n\n\n".join( + [doc.page_content for doc in d_reversed] + ) + return concatenated_content + + def split_documents(self, documents): + concatenated_content = self.concat_documents(documents) + texts_split = self.text_splitter.split_text(concatenated_content) + return texts_split + + def add_documents(self, documents): + self.documents.extend(documents) + + def global_cluster_embeddings( + self, + embeddings: np.ndarray, + dim: int, + n_neighbors: Optional[int] = None, + metric: str = "cosine", + ) -> np.ndarray: + """ + Perform global dimensionality reduction on the embeddings using UMAP. + + Parameters: + - embeddings: The input embeddings as a numpy array. + - dim: The target dimensionality for the reduced space. + - n_neighbors: Optional; the number of neighbors to consider for each point. + If not provided, it defaults to the square root of the number of embeddings. + - metric: The distance metric to use for UMAP. + + Returns: + - A numpy array of the embeddings reduced to the specified dimensionality. + """ + if n_neighbors is None: + n_neighbors = int((len(embeddings) - 1) ** 0.5) + return umap.UMAP( + n_neighbors=n_neighbors, n_components=dim, metric=metric + ).fit_transform(embeddings) + + def local_cluster_embeddings( + self, + embeddings: np.ndarray, + dim: int, + num_neighbors: int = 10, + metric: str = "cosine", + ) -> np.ndarray: + """ + Perform local dimensionality reduction on the embeddings using UMAP, typically after global clustering. + + Parameters: + - embeddings: The input embeddings as a numpy array. + - dim: The target dimensionality for the reduced space. + - num_neighbors: The number of neighbors to consider for each point. + - metric: The distance metric to use for UMAP. + + Returns: + - A numpy array of the embeddings reduced to the specified dimensionality. + """ + return umap.UMAP( + n_neighbors=num_neighbors, n_components=dim, metric=metric + ).fit_transform(embeddings) + + def get_optimal_clusters( + self, + embeddings: np.ndarray, + max_clusters: int = 50, + random_state: int = RANDOM_SEED, + ) -> int: + """ + Determine the optimal number of clusters using the Bayesian Information Criterion (BIC) with a Gaussian Mixture Model. + + Parameters: + - embeddings: The input embeddings as a numpy array. + - max_clusters: The maximum number of clusters to consider. + - random_state: Seed for reproducibility. + + Returns: + - An integer representing the optimal number of clusters found. + """ + max_clusters = min(max_clusters, len(embeddings)) + n_clusters = np.arange(1, max_clusters) + bics = [] + for n in n_clusters: + gm = GaussianMixture(n_components=n, random_state=random_state) + gm.fit(embeddings) + bics.append(gm.bic(embeddings)) + return n_clusters[np.argmin(bics)] + + def GMM_cluster( + self, embeddings: np.ndarray, threshold: float, random_state: int = 0 + ): + """ + Cluster embeddings using a Gaussian Mixture Model (GMM) based on a probability threshold. 
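The BIC search performed by get_optimal_clusters can be exercised on its own; a minimal, self-contained sketch on toy data with two well-separated groups (values are illustrative):

import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(42)
X = np.vstack([rng.normal(0.0, 0.5, (40, 2)), rng.normal(5.0, 0.5, (40, 2))])

bics = []
for n in range(1, 6):
    gm = GaussianMixture(n_components=n, random_state=42).fit(X)
    bics.append(gm.bic(X))          # lower BIC = better fit/complexity trade-off

optimal_clusters = 1 + int(np.argmin(bics))   # expected to be 2 for this data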
+ + Parameters: + - embeddings: The input embeddings as a numpy array. + - threshold: The probability threshold for assigning an embedding to a cluster. + - random_state: Seed for reproducibility. + + Returns: + - A tuple containing the cluster labels and the number of clusters determined. + """ + n_clusters = self.get_optimal_clusters(embeddings) + gm = GaussianMixture(n_components=n_clusters, random_state=random_state) + gm.fit(embeddings) + probs = gm.predict_proba(embeddings) + labels = [np.where(prob > threshold)[0] for prob in probs] + return labels, n_clusters + + def perform_clustering( + self, + embeddings: np.ndarray, + dim: int, + threshold: float, + ) -> List[np.ndarray]: + """ + Perform clustering on the embeddings by first reducing their dimensionality globally, then clustering + using a Gaussian Mixture Model, and finally performing local clustering within each global cluster. + + Parameters: + - embeddings: The input embeddings as a numpy array. + - dim: The target dimensionality for UMAP reduction. + - threshold: The probability threshold for assigning an embedding to a cluster in GMM. + + Returns: + - A list of numpy arrays, where each array contains the cluster IDs for each embedding. + """ + if len(embeddings) <= dim + 1: + # Avoid clustering when there's insufficient data + return [np.array([0]) for _ in range(len(embeddings))] + + # Global dimensionality reduction + reduced_embeddings_global = self.global_cluster_embeddings(embeddings, dim) + # Global clustering + global_clusters, n_global_clusters = self.GMM_cluster( + reduced_embeddings_global, threshold + ) + + all_local_clusters = [np.array([]) for _ in range(len(embeddings))] + total_clusters = 0 + + # Iterate through each global cluster to perform local clustering + for i in range(n_global_clusters): + # Extract embeddings belonging to the current global cluster + global_cluster_embeddings_ = embeddings[ + np.array([i in gc for gc in global_clusters]) + ] + + if len(global_cluster_embeddings_) == 0: + continue + if len(global_cluster_embeddings_) <= dim + 1: + # Handle small clusters with direct assignment + local_clusters = [np.array([0]) for _ in global_cluster_embeddings_] + n_local_clusters = 1 + else: + # Local dimensionality reduction and clustering + reduced_embeddings_local = self.local_cluster_embeddings( + global_cluster_embeddings_, dim + ) + local_clusters, n_local_clusters = self.GMM_cluster( + reduced_embeddings_local, threshold + ) + + # Assign local cluster IDs, adjusting for total clusters already processed + for j in range(n_local_clusters): + local_cluster_embeddings_ = global_cluster_embeddings_[ + np.array([j in lc for lc in local_clusters]) + ] + indices = np.where( + (embeddings == local_cluster_embeddings_[:, None]).all(-1) + )[1] + for idx in indices: + all_local_clusters[idx] = np.append( + all_local_clusters[idx], j + total_clusters + ) + + total_clusters += n_local_clusters + + return all_local_clusters + + def embed(self, texts): + """ + Generate embeddings for a list of text documents. + + This function assumes the existence of an `embd` object with a method `embed_documents` + that takes a list of texts and returns their embeddings. + + Parameters: + - texts: List[str], a list of text documents to be embedded. + + Returns: + - numpy.ndarray: An array of embeddings for the given text documents. 
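embed() simply delegates to the LangChain embeddings interface (embed_documents); a minimal sketch with a HuggingFace sentence-transformer, where the model name is only an example rather than the one configured for this project:

import numpy as np
from langchain_community.embeddings import HuggingFaceEmbeddings

embd = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectors = np.array(embd.embed_documents(["gradient descent", "course schedule"]))
# vectors.shape -> (2, 384) for this particular model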
+ """ + text_embeddings = self.embd.embed_documents(texts) + text_embeddings_np = np.array(text_embeddings) + return text_embeddings_np + + def embed_cluster_texts(self, texts): + """ + Embeds a list of texts and clusters them, returning a DataFrame with texts, their embeddings, and cluster labels. + + This function combines embedding generation and clustering into a single step. It assumes the existence + of a previously defined `perform_clustering` function that performs clustering on the embeddings. + + Parameters: + - texts: List[str], a list of text documents to be processed. + + Returns: + - pandas.DataFrame: A DataFrame containing the original texts, their embeddings, and the assigned cluster labels. + """ + text_embeddings_np = self.embed(texts) # Generate embeddings + cluster_labels = self.perform_clustering( + text_embeddings_np, 10, 0.1 + ) # Perform clustering on the embeddings + df = pd.DataFrame() # Initialize a DataFrame to store the results + df["text"] = texts # Store original texts + df["embd"] = list( + text_embeddings_np + ) # Store embeddings as a list in the DataFrame + df["cluster"] = cluster_labels # Store cluster labels + return df + + def fmt_txt(self, df: pd.DataFrame) -> str: + """ + Formats the text documents in a DataFrame into a single string. + + Parameters: + - df: DataFrame containing the 'text' column with text documents to format. + + Returns: + - A single string where all text documents are joined by a specific delimiter. + """ + unique_txt = df["text"].tolist() + return "--- --- \n --- --- ".join(unique_txt) + + def embed_cluster_summarize_texts( + self, texts: List[str], level: int + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Embeds, clusters, and summarizes a list of texts. This function first generates embeddings for the texts, + clusters them based on similarity, expands the cluster assignments for easier processing, and then summarizes + the content within each cluster. + + Parameters: + - texts: A list of text documents to be processed. + - level: An integer parameter that could define the depth or detail of processing. + + Returns: + - Tuple containing two DataFrames: + 1. The first DataFrame (`df_clusters`) includes the original texts, their embeddings, and cluster assignments. + 2. The second DataFrame (`df_summary`) contains summaries for each cluster, the specified level of detail, + and the cluster identifiers. + """ + + # Embed and cluster the texts, resulting in a DataFrame with 'text', 'embd', and 'cluster' columns + df_clusters = self.embed_cluster_texts(texts) + + # Prepare to expand the DataFrame for easier manipulation of clusters + expanded_list = [] + + # Expand DataFrame entries to document-cluster pairings for straightforward processing + for index, row in df_clusters.iterrows(): + for cluster in row["cluster"]: + expanded_list.append( + {"text": row["text"], "embd": row["embd"], "cluster": cluster} + ) + + # Create a new DataFrame from the expanded list + expanded_df = pd.DataFrame(expanded_list) + + # Retrieve unique cluster identifiers for processing + all_clusters = expanded_df["cluster"].unique() + + print(f"--Generated {len(all_clusters)} clusters--") + + # Summarization + template = """Here is content from the course DS598: Deep Learning for Data Science. + + The content may be form webapge about the course, or lecture content, or any other relevant information. + If the content is in bullet points (from pdf lectre slides), you can summarize the bullet points. + + Give a detailed summary of the content below. 
+ + Documentation: + {context} + """ + prompt = ChatPromptTemplate.from_template(template) + chain = prompt | self.model | StrOutputParser() + + # Format text within each cluster for summarization + summaries = [] + for i in all_clusters: + df_cluster = expanded_df[expanded_df["cluster"] == i] + formatted_txt = self.fmt_txt(df_cluster) + summaries.append(chain.invoke({"context": formatted_txt})) + + # Create a DataFrame to store summaries with their corresponding cluster and level + df_summary = pd.DataFrame( + { + "summaries": summaries, + "level": [level] * len(summaries), + "cluster": list(all_clusters), + } + ) + + return df_clusters, df_summary + + def recursive_embed_cluster_summarize( + self, texts: List[str], level: int = 1, n_levels: int = 3 + ) -> Dict[int, Tuple[pd.DataFrame, pd.DataFrame]]: + """ + Recursively embeds, clusters, and summarizes texts up to a specified level or until + the number of unique clusters becomes 1, storing the results at each level. + + Parameters: + - texts: List[str], texts to be processed. + - level: int, current recursion level (starts at 1). + - n_levels: int, maximum depth of recursion. + + Returns: + - Dict[int, Tuple[pd.DataFrame, pd.DataFrame]], a dictionary where keys are the recursion + levels and values are tuples containing the clusters DataFrame and summaries DataFrame at that level. + """ + results = {} # Dictionary to store results at each level + + # Perform embedding, clustering, and summarization for the current level + df_clusters, df_summary = self.embed_cluster_summarize_texts(texts, level) + + # Store the results of the current level + results[level] = (df_clusters, df_summary) + + # Determine if further recursion is possible and meaningful + unique_clusters = df_summary["cluster"].nunique() + if level < n_levels and unique_clusters > 1: + # Use summaries as the input texts for the next level of recursion + new_texts = df_summary["summaries"].tolist() + next_level_results = self.recursive_embed_cluster_summarize( + new_texts, level + 1, n_levels + ) + + # Merge the results from the next level into the current results dictionary + results.update(next_level_results) + + return results + + def get_vector_db(self): + """ + Generate a retriever object from a list of documents. + + Parameters: + - documents: List of document objects. + + Returns: + - A retriever object. 
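+ + Hypothetical usage sketch (assuming the constructor takes the config dict and an embedding model, as suggested by the attributes set in __init__): + >>> raptor = RAPTORVectoreStore(config, embedding_model) + >>> raptor.add_documents(documents) + >>> vectorstore = raptor.get_vector_db() # builds the RAPTOR tree and indexes leaves plus summaries in FAISS + >>> retriever = vectorstore.as_retriever()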
+ """ + leaf_texts = self.split_documents(self.documents) + results = self.recursive_embed_cluster_summarize( + leaf_texts, level=1, n_levels=10 + ) + + all_texts = leaf_texts.copy() + # Iterate through the results to extract summaries from each level and add them to all_texts + for level in sorted(results.keys()): + # Extract summaries from the current level's DataFrame + summaries = results[level][1]["summaries"].tolist() + # Extend all_texts with the summaries from the current level + all_texts.extend(summaries) + + # Now, use all_texts to build the vectorstore + vectorstore = FAISS.from_texts(texts=all_texts, embedding=self.embd) + return vectorstore + + def create_database(self, documents, embedding_model): + self.documents = documents + self.embd = embedding_model + self.vectorstore = self.get_vector_db() + self.vectorstore.save_local( + os.path.join( + self.config["vectorstore"]["db_path"], + "db_" + + self.config["vectorstore"]["db_option"] + + "_" + + self.config["vectorstore"]["model"], + ) + ) + + def load_database(self, embedding_model): + self.vectorstore = FAISS.load_local( + os.path.join( + self.config["vectorstore"]["db_path"], + "db_" + + self.config["vectorstore"]["db_option"] + + "_" + + self.config["vectorstore"]["model"], + ), + embedding_model, + allow_dangerous_deserialization=True, + ) + return self.vectorstore + + def as_retriever(self): + return self.vectorstore.as_retriever() diff --git a/code/modules/vectorstore/store_manager.py b/code/modules/vectorstore/store_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..cc4a0ee78423756c0e9ab7300b131849d9fe9f51 --- /dev/null +++ b/code/modules/vectorstore/store_manager.py @@ -0,0 +1,163 @@ +from modules.vectorstore.vectorstore import VectorStore +from modules.vectorstore.helpers import * +from modules.dataloader.webpage_crawler import WebpageCrawler +from modules.dataloader.data_loader import DataLoader +from modules.dataloader.helpers import * +from modules.vectorstore.embedding_model_loader import EmbeddingModelLoader +import logging +import os +import time +import asyncio + + +class VectorStoreManager: + def __init__(self, config, logger=None): + self.config = config + self.document_names = None + + # Set up logging to both console and a file + self.logger = logger or self._setup_logging() + self.webpage_crawler = WebpageCrawler() + self.vector_db = VectorStore(self.config) + + self.logger.info("VectorDB instance instantiated") + + def _setup_logging(self): + logger = logging.getLogger(__name__) + if not logger.hasHandlers(): + logger.setLevel(logging.INFO) + formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + + # Console Handler + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + # Ensure log directory exists + log_directory = self.config["log_dir"] + os.makedirs(log_directory, exist_ok=True) + + # File Handler + log_file_path = os.path.join(log_directory, "vector_db.log") + file_handler = logging.FileHandler(log_file_path, mode="w") + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + return logger + + def load_files(self): + + files = os.listdir(self.config["vectorstore"]["data_path"]) + files = [ + os.path.join(self.config["vectorstore"]["data_path"], file) + for file in files + ] + urls = get_urls_from_file(self.config["vectorstore"]["url_file_path"]) + if 
self.config["vectorstore"]["expand_urls"]: + all_urls = [] + for url in urls: + loop = asyncio.get_event_loop() + all_urls.extend( + loop.run_until_complete( + self.webpage_crawler.get_all_pages( + url, url + ) # only get child urls, if you want to get all urls, replace the second argument with the base url + ) + ) + urls = all_urls + return files, urls + + def create_embedding_model(self): + + self.logger.info("Creating embedding function") + embedding_model_loader = EmbeddingModelLoader(self.config) + embedding_model = embedding_model_loader.load_embedding_model() + return embedding_model + + def initialize_database( + self, + document_chunks: list, + document_names: list, + documents: list, + document_metadata: list, + ): + if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]: + self.embedding_model = self.create_embedding_model() + else: + self.embedding_model = None + + self.logger.info("Initializing vector_db") + self.logger.info( + "\tUsing {} as db_option".format(self.config["vectorstore"]["db_option"]) + ) + self.vector_db._create_database( + document_chunks, + document_names, + documents, + document_metadata, + self.embedding_model, + ) + + def create_database(self): + + start_time = time.time() # Start time for creating database + data_loader = DataLoader(self.config, self.logger) + self.logger.info("Loading data") + files, urls = self.load_files() + files, webpages = self.webpage_crawler.clean_url_list(urls) + self.logger.info(f"Number of files: {len(files)}") + self.logger.info(f"Number of webpages: {len(webpages)}") + if f"{self.config['vectorstore']['url_file_path']}" in files: + files.remove(f"{self.config['vectorstores']['url_file_path']}") # cleanup + document_chunks, document_names, documents, document_metadata = ( + data_loader.get_chunks(files, webpages) + ) + num_documents = len(document_chunks) + self.logger.info(f"Number of documents in the DB: {num_documents}") + metadata_keys = list(document_metadata[0].keys()) + self.logger.info(f"Metadata keys: {metadata_keys}") + self.logger.info("Completed loading data") + self.initialize_database( + document_chunks, document_names, documents, document_metadata + ) + end_time = time.time() # End time for creating database + self.logger.info("Created database") + self.logger.info( + f"Time taken to create database: {end_time - start_time} seconds" + ) + + def load_database(self): + + start_time = time.time() # Start time for loading database + if self.config["vectorstore"]["db_option"] in ["FAISS", "Chroma", "RAPTOR"]: + self.embedding_model = self.create_embedding_model() + else: + self.embedding_model = None + self.loaded_vector_db = self.vector_db._load_database(self.embedding_model) + end_time = time.time() # End time for loading database + self.logger.info( + f"Time taken to load database: {end_time - start_time} seconds" + ) + self.logger.info("Loaded database") + return self.loaded_vector_db + + +if __name__ == "__main__": + import yaml + + with open("modules/config/config.yml", "r") as f: + config = yaml.safe_load(f) + print(config) + print(f"Trying to create database with config: {config}") + vector_db = VectorStoreManager(config) + vector_db.create_database() + print("Created database") + + print(f"Trying to load the database") + vector_db = VectorStoreManager(config) + vector_db.load_database() + print("Loaded database") + + print(f"View the logs at {config['log_dir']}/vector_db.log") diff --git a/code/modules/vectorstore/vectorstore.py b/code/modules/vectorstore/vectorstore.py new file mode 
100644 index 0000000000000000000000000000000000000000..5ac5da1415a20686e1fbeea1a145b9d7e96f51c0 --- /dev/null +++ b/code/modules/vectorstore/vectorstore.py @@ -0,0 +1,57 @@ +from modules.vectorstore.faiss import FaissVectorStore +from modules.vectorstore.chroma import ChromaVectorStore +from modules.vectorstore.colbert import ColbertVectorStore +from modules.vectorstore.raptor import RAPTORVectoreStore + + +class VectorStore: + def __init__(self, config): + self.config = config + self.vectorstore = None + self.vectorstore_classes = { + "FAISS": FaissVectorStore, + "Chroma": ChromaVectorStore, + "RAGatouille": ColbertVectorStore, + "RAPTOR": RAPTORVectoreStore, + } + + def _create_database( + self, + document_chunks, + document_names, + documents, + document_metadata, + embedding_model, + ): + db_option = self.config["vectorstore"]["db_option"] + vectorstore_class = self.vectorstore_classes.get(db_option) + if not vectorstore_class: + raise ValueError(f"Invalid db_option: {db_option}") + + self.vectorstore = vectorstore_class(self.config) + + if db_option == "RAGatouille": + self.vectorstore.create_database( + documents, document_names, document_metadata + ) + else: + self.vectorstore.create_database(document_chunks, embedding_model) + + def _load_database(self, embedding_model): + db_option = self.config["vectorstore"]["db_option"] + vectorstore_class = self.vectorstore_classes.get(db_option) + if not vectorstore_class: + raise ValueError(f"Invalid db_option: {db_option}") + + self.vectorstore = vectorstore_class(self.config) + + if db_option == "RAGatouille": + return self.vectorstore.load_database() + else: + return self.vectorstore.load_database(embedding_model) + + def _as_retriever(self): + return self.vectorstore.as_retriever() + + def _get_vectorstore(self): + return self.vectorstore diff --git a/code/public/acastusphoton-svgrepo-com.svg b/code/public/acastusphoton-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..e8edaf55d695486fbf134388dbfb84f7dbb7ca8c --- /dev/null +++ b/code/public/acastusphoton-svgrepo-com.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/code/public/adv-screen-recorder-svgrepo-com.svg b/code/public/adv-screen-recorder-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..439e638eaf976a2278a2aa043de2feb2d65c3862 --- /dev/null +++ b/code/public/adv-screen-recorder-svgrepo-com.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/code/public/alarmy-svgrepo-com.svg b/code/public/alarmy-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..d2dee1b557beab61067452d4700fa1d62ba0b0e8 --- /dev/null +++ b/code/public/alarmy-svgrepo-com.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/public/logo_dark.png b/code/public/avatars/ai-tutor.png similarity index 100% rename from public/logo_dark.png rename to code/public/avatars/ai-tutor.png diff --git a/code/public/calendar-samsung-17-svgrepo-com.svg b/code/public/calendar-samsung-17-svgrepo-com.svg new file mode 100644 index 0000000000000000000000000000000000000000..8f9caa7ac74d4ea04369169e830e4042b267de89 --- /dev/null +++ b/code/public/calendar-samsung-17-svgrepo-com.svg @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/public/logo_light.png b/code/public/logo_dark.png similarity index 100% rename from public/logo_light.png rename to code/public/logo_dark.png diff --git a/code/public/logo_light.png 
b/code/public/logo_light.png new file mode 100644 index 0000000000000000000000000000000000000000..1b9e56d72a9738e8cd4838bfb7dce31fc92cd9ee Binary files /dev/null and b/code/public/logo_light.png differ diff --git a/code/public/test.css b/code/public/test.css new file mode 100644 index 0000000000000000000000000000000000000000..fd69a17d90db3c746349647d0c64dbba7ce9005c --- /dev/null +++ b/code/public/test.css @@ -0,0 +1,33 @@ +a[href*='https://github.com/Chainlit/chainlit'] { + visibility: hidden; +} + + +/* Target the container of the image and set a custom background image */ +.MuiAvatar-root.MuiAvatar-circular.css-m2icte { + background-image: url('/public/avatars/ai-tutor.png'); /* Replace with your custom image URL */ + background-size: cover; /* Ensure the image covers the entire container */ + background-position: center; /* Center the image */ + width: 100px; /* Ensure the dimensions match the original */ + height: 100px; /* Ensure the dimensions match the original */ + border-radius: 50%; /* Maintain circular shape */ +} + +/* Hide the default image */ +.MuiAvatar-root.MuiAvatar-circular.css-m2icte .MuiAvatar-img.css-1hy9t21 { + display: none; +} + +.MuiAvatar-root.MuiAvatar-circular.css-v72an7 { + background-image: url('/public/avatars/ai-tutor.png'); /* Replace with your custom image URL */ + background-size: cover; /* Ensure the image covers the entire container */ + background-position: center; /* Center the image */ + width: 40px; /* Ensure the dimensions match the original */ + height: 40px; /* Ensure the dimensions match the original */ + border-radius: 50%; /* Maintain circular shape */ +} + +/* Hide the default image */ +.MuiAvatar-root.MuiAvatar-circular.css-v72an7 .MuiAvatar-img.css-1hy9t21 { + display: none; +} diff --git a/eval_code/true_lens.ipynb b/eval_code/true_lens.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6b0cc7150cd610adedd23893f1d4765da5f905a1 --- /dev/null +++ b/eval_code/true_lens.ipynb @@ -0,0 +1,897 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"\"\n", + "s" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{\n", + " \"createdAt\": \"2024-06-19T09:44:35.578Z\",\n", + " \"id\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"metadata\": {},\n", + " \"name\": \"TEST\",\n", + " \"participant\": {},\n", + " \"steps\": [\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:44:48.221\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"1ab30fe6-99cb-4327-bd7d-e3128b2c1a62\",\n", + " \"input\": {\n", + " \"question\": \"What is inductive bias in CNNs?\"\n", + " },\n", + " \"metadata\": {},\n", + " \"name\": \"RAG\",\n", + " \"output\": {\n", + " \"answer\": \"Inductive bias in CNNs refers to the inherent preference for prioritizing information from neighboring nodes when updating each node in the network. This bias helps the network effectively aggregate information from nearby nodes, leading to more efficient and accurate node embeddings. 
You can find more information about this topic in the lecture slides on Graph Neural Networks and Convolutional Networks.\",\n", + " \"chat_history\": [],\n", + " \"original_question\": \"What is inductive bias in CNNs?\",\n", + " \"question\": \"What is inductive bias in CNNs?\",\n", + " \"rephrased_question\": \"What is inductive bias in CNNs?\",\n", + " \"source_documents\": [\n", + " \"page_content='Graph convolutional network These models are convolutional in that they update each node by aggregating information from nearby nodes. As such, they induce a relational inductive bias (i.e., a bias toward prioritizing information from neighbors). 39 A function \\ud835\\udc39[\\u22c5] with parameters \\ud835\\udf19\\\" that takes the node embeddings and adjacency matrix and outputs new node embeddings ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf', 'page': 38, 'date': '04/16/2024\\\\n Tuesday', 'tldr': 'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.', 'title': '20 -- Graph Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_bt7qtymk', 'suggested_readings': 'UDL Chapter 13', 'score': 0.4147438882496012}\",\n", + " \"page_content='https://poloclub.github.io/cnn-explainer/ 92 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/10_convolutional_networks.pdf', 'page': 91, 'date': '02/27/2024\\\\n Tuesday', 'tldr': 'We cover 1D and 2D convolutional neural networks along with subsampling and upsampling operations.', 'title': '10 - Convolutional Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_8ejvyib4', 'suggested_readings': 'UDL Chapter 10', 'score': 0.30807206912986296}\",\n", + " \"page_content='Convolutional networks \\u2022 Networks for images \\u2022 Invariance and equivariance \\u2022 1D convolution \\u2022 Convolutional layers \\u2022 Channels \\u2022 Receptive fields \\u2022 Convolutional network for MNIST 1D 44 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/10_convolutional_networks.pdf', 'page': 43, 'date': '02/27/2024\\\\n Tuesday', 'tldr': 'We cover 1D and 2D convolutional neural networks along with subsampling and upsampling operations.', 'title': '10 - Convolutional Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_8ejvyib4', 'suggested_readings': 'UDL Chapter 10', 'score': 0.26420832659802906}\"\n", + " ]\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:44:46.84\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"retrieval\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:44:48.223\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"18af26e1-e738-4304-b484-1fd7bd5a26e6\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"User\",\n", + " \"output\": {\n", + " \"content\": \"What is inductive bias in CNNs?\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:44:48.223\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"user_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:44:48.225\",\n", + " \"error\": null,\n", + " 
\"generation\": null,\n", + " \"id\": \"067c2783-38ff-43e4-866a-abf957415359\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"AI_Tutor\",\n", + " \"output\": {\n", + " \"content\": \"Inductive bias in CNNs refers to the inherent preference for prioritizing information from neighboring nodes when updating each node in the network. This bias helps the network effectively aggregate information from nearby nodes, leading to more efficient and accurate node embeddings. You can find more information about this topic in the lecture slides on Graph Neural Networks and Convolutional Networks.\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:44:48.225\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"assistant_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:44:58.148\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"d2b74775-41fc-4ebc-bc40-d8d7cf67015d\",\n", + " \"input\": {\n", + " \"question\": \"Do transformers have this?\"\n", + " },\n", + " \"metadata\": {},\n", + " \"name\": \"RAG\",\n", + " \"output\": {\n", + " \"answer\": \"I'm not sure, I don't have that information.\",\n", + " \"chat_history\": [\n", + " \"content='What is inductive bias in CNNs?'\",\n", + " \"content='Inductive bias in CNNs refers to the inherent preference for prioritizing information from neighboring nodes when updating each node in the network. This bias helps the network effectively aggregate information from nearby nodes, leading to more efficient and accurate node embeddings. You can find more information about this topic in the lecture slides on Graph Neural Networks and Convolutional Networks.'\"\n", + " ],\n", + " \"original_question\": \"Do transformers have this?\",\n", + " \"question\": \"Do transformers have this?\",\n", + " \"rephrased_question\": \"Do transformers exhibit a similar inductive bias as CNNs?\",\n", + " \"source_documents\": [\n", + " \"page_content='Transformers \\u2022 Multi-headed Self Attention is just one component of the transformer architecture \\u2022 Let\\u2019s look at a transformer block (or layer) from the encoder 49 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 48, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2451855994037282}\",\n", + " \"page_content='Transformers \\u2022 Motivation \\u2022 Dot-product self-attention \\u2022 Applying Self-Attention \\u2022 The Transformer Architecture \\u2022 Three Types of NLP Transformer Models \\u2022 Encoder \\u2022 Decoder \\u2022 Encoder-Decoder 55 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 54, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then 
the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.24314965373956088}\",\n", + " \"page_content='Transformers \\u2022 Motivation \\u2022 Dot-product self-attention \\u2022 Applying Self-Attention \\u2022 The Transformer Architecture \\u2022 Three Types of NLP Transformer Models \\u2022 Encoder \\u2022 Decoder \\u2022 Encoder-Decoder 10 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 9, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2367633946657962}\"\n", + " ]\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:44:56.486\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"retrieval\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:44:58.15\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"2ec55d74-cce5-42d4-827e-375d597cf91c\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"User\",\n", + " \"output\": {\n", + " \"content\": \"Do transformers have this?\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:44:58.15\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"user_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:44:58.151\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"703534b4-981a-43a3-8571-773367be2fa5\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"AI_Tutor\",\n", + " \"output\": {\n", + " \"content\": \"I'm not sure, I don't have that information.\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:44:58.151\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"assistant_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:13.1\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"fd58e85c-dbaa-4d06-8a75-a3839e37856d\",\n", + " \"input\": {\n", + " \"question\": \"cool, suggest a project with transformers to use as my final project\"\n", + " },\n", + " \"metadata\": {},\n", + " \"name\": \"RAG\",\n", + " \"output\": {\n", + " \"answer\": \"You could consider working on a project that explores the use of transformers in natural language processing tasks, such as sentiment analysis, language translation, or text generation. This could be a great opportunity to apply what you've learned about transformers in a real-world context. 
Good luck with your project!\",\n", + " \"chat_history\": [\n", + " \"content='What is inductive bias in CNNs?'\",\n", + " \"content='Inductive bias in CNNs refers to the inherent preference for prioritizing information from neighboring nodes when updating each node in the network. This bias helps the network effectively aggregate information from nearby nodes, leading to more efficient and accurate node embeddings. You can find more information about this topic in the lecture slides on Graph Neural Networks and Convolutional Networks.'\",\n", + " \"content='Do transformers have this?'\",\n", + " \"content=\\\"I'm not sure, I don't have that information.\\\"\"\n", + " ],\n", + " \"original_question\": \"cool, suggest a project with transformers to use as my final project\",\n", + " \"question\": \"cool, suggest a project with transformers to use as my final project\",\n", + " \"rephrased_question\": \"What is the concept of inductive bias in transformers, and how does it relate to the prioritization of information from neighboring nodes like in CNNs?\",\n", + " \"source_documents\": [\n", + " \"page_content='Graph convolutional network These models are convolutional in that they update each node by aggregating information from nearby nodes. As such, they induce a relational inductive bias (i.e., a bias toward prioritizing information from neighbors). 39 A function \\ud835\\udc39[\\u22c5] with parameters \\ud835\\udf19\\\" that takes the node embeddings and adjacency matrix and outputs new node embeddings ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf', 'page': 38, 'date': '04/16/2024\\\\n Tuesday', 'tldr': 'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.', 'title': '20 -- Graph Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_bt7qtymk', 'suggested_readings': 'UDL Chapter 13', 'score': 0.3613770850497736}\",\n", + " \"page_content='Transformers \\u2022 Multi-headed Self Attention is just one component of the transformer architecture \\u2022 Let\\u2019s look at a transformer block (or layer) from the encoder 49 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 48, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2733153333305661}\",\n", + " \"page_content='Transformers \\u2022 Multi-headed Self Attention is just one component of the transformer architecture 48 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 47, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 
'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.22982265165302218}\"\n", + " ]\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:10.667\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"retrieval\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:13.102\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"c86489d2-cbf3-475e-867e-d1b20addedf0\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"User\",\n", + " \"output\": {\n", + " \"content\": \"cool, suggest a project with transformers to use as my final project\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:13.102\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"user_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:13.103\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"75eb694e-4b1b-4161-96ea-9ecec4092f53\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"AI_Tutor\",\n", + " \"output\": {\n", + " \"content\": \"You could consider working on a project that explores the use of transformers in natural language processing tasks, such as sentiment analysis, language translation, or text generation. This could be a great opportunity to apply what you've learned about transformers in a real-world context. Good luck with your project!\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:13.103\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"assistant_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:26.404\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"4d02c5ad-1bd9-4767-beab-4c724062093e\",\n", + " \"input\": {\n", + " \"question\": \"Could you give me a quick rundown on what image captioning is? and how transformers are useful for this?\"\n", + " },\n", + " \"metadata\": {},\n", + " \"name\": \"RAG\",\n", + " \"output\": {\n", + " \"answer\": \"Image captioning is the process of generating textual descriptions for images. Transformers can be useful for this task because they are effective at processing sequential data and can learn to effectively capture the relationships between the visual and textual information in images and their corresponding captions. You can find more information about this in the lecture slides on Image Transformers and Multimodal Transformers. Good luck with your project!\",\n", + " \"chat_history\": [\n", + " \"content='What is inductive bias in CNNs?'\",\n", + " \"content='Inductive bias in CNNs refers to the inherent preference for prioritizing information from neighboring nodes when updating each node in the network. This bias helps the network effectively aggregate information from nearby nodes, leading to more efficient and accurate node embeddings. 
You can find more information about this topic in the lecture slides on Graph Neural Networks and Convolutional Networks.'\",\n", + " \"content='Do transformers have this?'\",\n", + " \"content=\\\"I'm not sure, I don't have that information.\\\"\",\n", + " \"content='cool, suggest a project with transformers to use as my final project'\",\n", + " \"content=\\\"You could consider working on a project that explores the use of transformers in natural language processing tasks, such as sentiment analysis, language translation, or text generation. This could be a great opportunity to apply what you've learned about transformers in a real-world context. Good luck with your project!\\\"\"\n", + " ],\n", + " \"original_question\": \"Could you give me a quick rundown on what image captioning is? and how transformers are useful for this?\",\n", + " \"question\": \"Could you give me a quick rundown on what image captioning is? and how transformers are useful for this?\",\n", + " \"rephrased_question\": \"What is image captioning and how are transformers beneficial for this task?\",\n", + " \"source_documents\": [\n", + " \"page_content='Image captioning 24 UDL \\u2022 E.g. CNN-RNN, LSTM, Transformers ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/presentations/01_sp2024_v02.pdf', 'page': 23, 'date': '01/18/2024\\\\n Thursday', 'tldr': \\\"We will introduce the topic of deep learning, a bit about it's history, and what impact it has had. Then we'll go over the course logistics, the lecture topics, problem sets and the mid-term and final projects.\\\", 'title': '01 - Intro to Deep Learning and Course Logistics', 'lecture_recording': None, 'suggested_readings': 'UDL Chapter 1', 'score': 0.552598095386973}\",\n", + " \"page_content='Next Time \\u2022 Tokenization and Learned Embeddings \\u2022 Training and Fine-Tuning Transformers \\u2022 Image Transformers \\u2022 Multimodal Transformers \\u2022 \\u2026 76 Link Feedback ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 75, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.34481147697680925}\",\n", + " \"page_content='Encoder-Decoder Model \\u2022 Used for machine translation, which is a sequence-to-sequence task 22 https://jalammar.github.io/illustrated-transformer/ ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 21, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. 
We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.33002046196039614}\"\n", + " ]\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:23.305\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"retrieval\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:26.405\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"3cce51a5-4bcc-42f0-b186-063cb713856a\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"User\",\n", + " \"output\": {\n", + " \"content\": \"Could you give me a quick rundown on what image captioning is? and how transformers are useful for this?\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:26.405\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"user_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:26.407\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"36ae0e9d-540f-45e9-9adf-a596d0886304\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"AI_Tutor\",\n", + " \"output\": {\n", + " \"content\": \"Image captioning is the process of generating textual descriptions for images. Transformers can be useful for this task because they are effective at processing sequential data and can learn to effectively capture the relationships between the visual and textual information in images and their corresponding captions. You can find more information about this in the lecture slides on Image Transformers and Multimodal Transformers. Good luck with your project!\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:26.407\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"assistant_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:48.729\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"9e061c1e-ad50-400f-983f-813587508223\",\n", + " \"input\": {\n", + " \"question\": \"How do I learn the math behind transformers, what are the most important bits?\"\n", + " },\n", + " \"metadata\": {},\n", + " \"name\": \"RAG\",\n", + " \"output\": {\n", + " \"answer\": \"You can start by diving into the lecture slides and suggested readings on transformers, particularly Chapter 12 and the Illustrated Transformer. These resources should cover the important mathematical concepts behind transformers and help you understand the key components. 
Good luck with your studies!\",\n", + " \"chat_history\": [\n", + " \"content='Do transformers have this?'\",\n", + " \"content=\\\"I'm not sure, I don't have that information.\\\"\",\n", + " \"content='cool, suggest a project with transformers to use as my final project'\",\n", + " \"content=\\\"You could consider working on a project that explores the use of transformers in natural language processing tasks, such as sentiment analysis, language translation, or text generation. This could be a great opportunity to apply what you've learned about transformers in a real-world context. Good luck with your project!\\\"\",\n", + " \"content='Could you give me a quick rundown on what image captioning is? and how transformers are useful for this?'\",\n", + " \"content='Image captioning is the process of generating textual descriptions for images. Transformers can be useful for this task because they are effective at processing sequential data and can learn to effectively capture the relationships between the visual and textual information in images and their corresponding captions. You can find more information about this in the lecture slides on Image Transformers and Multimodal Transformers. Good luck with your project!'\"\n", + " ],\n", + " \"original_question\": \"How do I learn the math behind transformers, what are the most important bits?\",\n", + " \"question\": \"How do I learn the math behind transformers, what are the most important bits?\",\n", + " \"rephrased_question\": \"What are the key components I should focus on when learning the mathematics behind transformers?\",\n", + " \"source_documents\": [\n", + " \"page_content='After the break \\u2022 Image Transformers \\u2022 Multimodal Transformers \\u2022 RAG pattern \\u2022 Training and Fine-Tuning Transformers \\u2022 \\u2026 68 Feedback ChatGPT ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 67, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.3366359998900601}\",\n", + " \"page_content='UDL Chapter 11 Lecture 02/29/2024 Thursday 11a - Recurrent Neural Networks [slides] [lecture recording] Suggested Readings: UDL Chapter 11 Lecture 03/05/2024 Tuesday 12 - Transformers [slides] [lecture recording] Suggested Readings: UDL Chapter 12 Optional The Illustrated Transformer Discussion 03/06/2024 Wednesday Discussion_05 - Deep Dive 2: Deep Learning Modules in Pytorch (CNN, RNN/LSTM, Transformer) Github Link: disc5 Lecture' metadata={'source': 'https://dl4ds.github.io/sp2024/schedule/', 'page': 0, 'score': 0.3100015518546917}\",\n", + " \"page_content=\\\"Suggested Readings: UDL Chapter 12 Optional The Illustrated Transformer 14 -- Vision & Multimodal Transformers tl;dr: In this lecture we'll cover vision and multimodal transformers as a survey of three papers. [slides] [lecture recording] Suggested Readings: See slides for references 15 -- Improving LLM Perf tl;dr: In this lecture we talk about ways to improve LLM performance short of retraining or finetuning. 
We cover more sophisticated prompt strategies, retrieval augmentation and cognitive architectures building systems and agents based on LLMs. [slides] [lecture recording] Suggested Readings: See slides for references\\\" metadata={'source': 'https://dl4ds.github.io/sp2024/lectures/', 'page': 0, 'score': 0.3078678255019791}\"\n", + " ]\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:46.877\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"retrieval\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:48.731\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"9cd0e108-9097-4372-97d5-bb2624a03548\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"User\",\n", + " \"output\": {\n", + " \"content\": \"How do I learn the math behind transformers, what are the most important bits?\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:48.731\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"user_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:45:48.733\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"8c9cf79f-767d-4184-9eeb-baf6e867653e\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"AI_Tutor\",\n", + " \"output\": {\n", + " \"content\": \"You can start by diving into the lecture slides and suggested readings on transformers, particularly Chapter 12 and the Illustrated Transformer. These resources should cover the important mathematical concepts behind transformers and help you understand the key components. Good luck with your studies!\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:45:48.733\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"assistant_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:46:02.781\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"74ed6d18-6c08-4d65-b47a-4d8efbcdfc20\",\n", + " \"input\": {\n", + " \"question\": \"What is attention?\"\n", + " },\n", + " \"metadata\": {},\n", + " \"name\": \"RAG\",\n", + " \"output\": {\n", + " \"answer\": \"Attention is a mechanism used in transformers to focus on different parts of the input when processing information. It allows the model to weigh the importance of different elements in the input sequence, enabling it to effectively capture relationships and dependencies. You can find more information about attention in the lecture slides on transformers. Good luck with your studies!\",\n", + " \"chat_history\": [\n", + " \"content='cool, suggest a project with transformers to use as my final project'\",\n", + " \"content=\\\"You could consider working on a project that explores the use of transformers in natural language processing tasks, such as sentiment analysis, language translation, or text generation. This could be a great opportunity to apply what you've learned about transformers in a real-world context. Good luck with your project!\\\"\",\n", + " \"content='Could you give me a quick rundown on what image captioning is? 
and how transformers are useful for this?'\",\n", + " \"content='Image captioning is the process of generating textual descriptions for images. Transformers can be useful for this task because they are effective at processing sequential data and can learn to effectively capture the relationships between the visual and textual information in images and their corresponding captions. You can find more information about this in the lecture slides on Image Transformers and Multimodal Transformers. Good luck with your project!'\",\n", + " \"content='How do I learn the math behind transformers, what are the most important bits?'\",\n", + " \"content='You can start by diving into the lecture slides and suggested readings on transformers, particularly Chapter 12 and the Illustrated Transformer. These resources should cover the important mathematical concepts behind transformers and help you understand the key components. Good luck with your studies!'\"\n", + " ],\n", + " \"original_question\": \"What is attention?\",\n", + " \"question\": \"What is attention?\",\n", + " \"rephrased_question\": \"What is attention?\",\n", + " \"source_documents\": [\n", + " \"page_content='Transformers, GPTs and Assistants A. Vaswani et al., \\u201cAttention is All you Need,\\u201d presented at the 31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA, 2017, p. 11. [Online]. Available: https://arxiv.org/abs/1706.03762 State of GPT, Andrej Karpathy, MS Build Keynote 38 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/presentations/01_sp2024_v02.pdf', 'page': 37, 'date': '01/18/2024\\\\n Thursday', 'tldr': \\\"We will introduce the topic of deep learning, a bit about it's history, and what impact it has had. Then we'll go over the course logistics, the lecture topics, problem sets and the mid-term and final projects.\\\", 'title': '01 - Intro to Deep Learning and Course Logistics', 'lecture_recording': None, 'suggested_readings': 'UDL Chapter 1', 'score': 0.40361429640387503}\",\n", + " \"page_content='Motivation \\u2022 Need to efficiently process large strings of text \\u2022 Need to relate words across fairly long context lengths Self-Attention addresses these problems ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 16, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2435349602286475}\",\n", + " \"page_content='Graph Attention 59 Regular graph convolution Graph attention ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf', 'page': 58, 'date': '04/16/2024\\\\n Tuesday', 'tldr': 'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.', 'title': '20 -- Graph Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_bt7qtymk', 'suggested_readings': 'UDL Chapter 13', 'score': 0.22901789972755515}\"\n", + " ]\n", + " },\n", + " \"parentId\": 
null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:46:00.606\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"retrieval\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:46:02.783\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"718cdca2-b453-4b3a-9e0b-fbcdd38c54c7\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"User\",\n", + " \"output\": {\n", + " \"content\": \"What is attention?\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:46:02.783\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"user_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:46:02.785\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"ae559dc2-c069-419e-b0e5-f117861292f8\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"AI_Tutor\",\n", + " \"output\": {\n", + " \"content\": \"Attention is a mechanism used in transformers to focus on different parts of the input when processing information. It allows the model to weigh the importance of different elements in the input sequence, enabling it to effectively capture relationships and dependencies. You can find more information about attention in the lecture slides on transformers. Good luck with your studies!\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:46:02.785\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"assistant_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:46:10.547\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"4ac119fb-86f5-4528-9edc-44cc616de390\",\n", + " \"input\": {\n", + " \"question\": \"cool thanks, see ya\"\n", + " },\n", + " \"metadata\": {},\n", + " \"name\": \"RAG\",\n", + " \"output\": {\n", + " \"answer\": \"You're welcome! If you have any more questions in the future, feel free to ask. Goodbye!\",\n", + " \"chat_history\": [\n", + " \"content='Could you give me a quick rundown on what image captioning is? and how transformers are useful for this?'\",\n", + " \"content='Image captioning is the process of generating textual descriptions for images. Transformers can be useful for this task because they are effective at processing sequential data and can learn to effectively capture the relationships between the visual and textual information in images and their corresponding captions. You can find more information about this in the lecture slides on Image Transformers and Multimodal Transformers. Good luck with your project!'\",\n", + " \"content='How do I learn the math behind transformers, what are the most important bits?'\",\n", + " \"content='You can start by diving into the lecture slides and suggested readings on transformers, particularly Chapter 12 and the Illustrated Transformer. These resources should cover the important mathematical concepts behind transformers and help you understand the key components. Good luck with your studies!'\",\n", + " \"content='What is attention?'\",\n", + " \"content='Attention is a mechanism used in transformers to focus on different parts of the input when processing information. 
It allows the model to weigh the importance of different elements in the input sequence, enabling it to effectively capture relationships and dependencies. You can find more information about attention in the lecture slides on transformers. Good luck with your studies!'\"\n", + " ],\n", + " \"original_question\": \"cool thanks, see ya\",\n", + " \"question\": \"cool thanks, see ya\",\n", + " \"rephrased_question\": \"No rephrasing needed.\",\n", + " \"source_documents\": [\n", + " \"page_content='Motivation \\u2022 Need to efficiently process large strings of text \\u2022 Need to relate words across fairly long context lengths Self-Attention addresses these problems ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 16, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': -0.015666472239410423}\",\n", + " \"page_content='Example 1: univariate regression 61 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf', 'page': 59, 'date': '02/01/2024\\\\n Thursday', 'tldr': 'We reconsider loss functions as a measure of how well the data fits to parametric probability distribution. We show that for univariate gaussian distributions we arrive back at least squares loss. We then introduce the notion of maximum likelihood and see how we can use that to define loss functions for many types data distributions. We cover some examples and then show how to generalize. 
This is a key topic to aid you in applying deep learning models to new types of data.', 'title': '05 - Loss Functions', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_k2t4xjkd', 'suggested_readings': 'UDL Chapter 5', 'score': -0.03567729015020071}\",\n", + " \"page_content='Example 1: univariate regression AXBXiclZjZbtw2FEDHXdN0c1 rELwVaoUaApHAM20iXlwKJHWezU0/iN bEcg9JQGsYUJUuUPRN1not+TN+KvY7 +hX9hV5K8jC6l37oAPEw9xul5TEUZB JUeilpX9m3n3vfc/+PDKR1c/uTz 6fvfbFXpGWech3w1Sm+UHACi6F4rta MkPspyzJB8PzhZM3z/jOeFSNWOHmf8 KGxEpEImYbQ8ezv/fzm+Fc/CdJRFU0 O/SAaLcCfbCiOFvxCxAl7tXL+9nzo5 yF1fKk8ovTXFcrfiZaXK1MJhOfjzJf8 kgf3m7Mm+PbzkZv1T7Ut5X9XMRDfXQ8 O7+0uFR/PFpYbgvzvfbTP752feAP0rB MuNKhZEVxuLyU6aOK5VqEk+u+mXBMx aesJgfQlGxhBdHVZ2ziXcDIgMvSnP4p 7RXR9+uUbGkKMZJAGbC9LDAzARd7LDU 0U9HlVBZqbkKm46iUno69cwCeAOR81D LMRYmAsYqxcOGeRMwzJd9RU/D9MkYW pQ+avrzyDhAY+FqvhpWS/ZNJ1muHQ /EyY/XxzrQVoXki3nDSK2YRi4ReDyp Kr4YL2IgOACxyAlIFS+gTZOfIPKWEYU tKgFXzR6BveE9n5CmleYx5KSjvSQaFD LJRx1rjViwlElH2QbF8254BnCdwyrAU OGLozXYzpiaXNTfKTzpCpMDPeQMxXz uguYcsikmVHXUKWUDXsWL9g6zlTJ23 i0qweam4iyNrJu47OaV7UoOvUEWTBJo y7Vh1BloQbyoAlDLclo9hwolnIm5VK KwKsjH7eRp0+85MBO/NUQbXS9dbr0j6 zxjKiAnA1We+BVMh7+pr6dT2LpJzVvu mwEfeEBarW4XlcTOti05gVm1sQs06V8 ik2YJQnp53TMah8oz0Z2gCeCLrsyFi t7SFuoSbFkT9hdgqnkp+eHtxe/56Kha MpeN+UOyCQ0VZeZqyIT/R0MDeITh/QU RvHipRIsHgXrxUgn3d7R0LMcb20TqtY OCUEwKPUaXv4hVt04dwYNEzRWCJh24 ZsJhRY5irqyCRgZvuFh7NhAIZpk2Mwx lGlR5pzc/NB+hkitm9tiLszDqntDlUb o3je4nNaCMjwczvgl1QOU0aDJZ5CWas BylMyRWdLRK7/QcIm5rv56yZui04r56 UbH4wLVqcMQ356vIHXIyYWdSRqC04/ zrYksRz9QVvT7fr2yKqNV9+RrR07XLc pSbvtKN2w71kBPx0zHaTeIRizoStd WOkHrEcvQHbnzuOmahcN1m5K0e5FHp +1wpyba/tHOkGtmjkmpHJhjXyr9JoRF TUXtFNOEx0hsQlhMyq4F/8fKtjksd60 mhMV+IbqaCWBpwCWeQhPCYnMJd802ht VNh7rpVpnMhshsQlh8yBI86yaExZiKs VM8YVmGxCZE8jEeRzSPGZYylwSXpHM sSJkS7k2VD5Mu5IJYGmEehs5OoMRyFS hDtsglgu68wrnzlNoFyu6i3dHe9e0r FmqETwNIWucY8f8t5kQU4xXDMciU5E 8jKaAL72OlT5+L0F0QVOckF0djSMaXn lp5Tum/pPqW5peQXQRA9t5T8OgmiM0v PKN2zdI/S0tKS0l1LdymNLI0ofWDpA0 pDS0NK1yxdo1RbSk6k8ESwdIfSoaVDS g8sPaD0haUvKH1k6SNKX1r6ktI3lr6h 9J6l9yhljJK1y1dp5RbSl4dBNGqpau UBpaS35wrVnapzSzNKP0vqX3KR1YSn 4Vw/PMUnK8gQejpZLSx5Y+plRYSn6/B dFTS59SmliaUPrE0ieUvrb0NaUPLX1I aWwpeTcApxNLtym1b4GqgtJnlj6j9NT SU/d7AT5dxsC1MbdsA1uUpamlG5YSn 4pwFHC0hNynoxUe1e7eNtE7muRmnIHa zN+UZvkPFJT7mDt3emiNrk/RWrKh2To 63vTFymQUrjTH8/OL+O3sLSwt7K4/MP inWd35u+utm9or/S+6n3bu9lb7v3Yu9 t71Ov3dnth79+Z6zNfz3wz9vcH3N/z v3VqO/MtHW+7HU+c3/B3l+BUI= Pr(y|f[x, \\u03c6], \\u03c32) = 1 p 2\\u21e1\\u03c32 exp \\\\uf8ff \\u2212(y \\u2212f[x, \\u03c6])2 2\\u03c32 \\\" AW4XiclZjLbtw2FECVPtP05b SoN90INQIkRWLYRvrYFEjsOC879Th+J pZjUBpKw5iZImyZ6LOB3RXdNtP6hf0 M7ptN73UaIbRvfSiAzjD3HPExyUpcRT mUpR6aemvK+8+97H3x49aNrH3/y6W efz13/Yr/MqiLie1Ems+IwZCWXQvE9L bTkh3nBWRpKfhCerhl+cM6LUmRqV49y fpyRIlYRExD6GSO94qbo1+CtLodlCJ J2auVW/5PfhAXLKqXx3VQnhW6Xgly0e J6ZTweB3yYB5LH+ujOxLw5ugNV3Go2 FYNCpEM9PHJ3MLS4lLz8WlhuS0seO2n d3L9q37Qz6Iq5UpHkpXl0fJSro9rVmg RST6+FlQlz1l0yhJ+BEXFUl4e10+xv 4NiPT9OCvgT2m/ib59Rc3SshylIZgp0 4MSMxN0saNKxz8e10LleYqmjQUV9LX mW+S6/dFwSMtR1BgUSGgr340YJAhDVN wLVD8IsrSlKl+Hayub0N6Q54IVfOzqp mO8bjrDcOh+JlxuqT3VktQvNUvOGk kYxlVwi8GRc13wxWcRAcABikROQKV5C nSY/YewvIwrLTwIGHmZD6FzsPx+TqpX mCeSko70kGhRyYcda41YMJVpR9kBxf dv+AZwXcAsQFfhi6M52MmZGk+v03yoi 7QuTQy3UDCV8KYJGHLEpBlR1CVlHBp 1LF+xtZzpk7bxGV509XCRJC1W3QdXdC 8qH7XaSLIgkWYdK0mgiwJN4s+SxlkuS 2fwIBT30TcqlBYFWRh9os7Ladmwhem 8Mc9kvXW69J+s8ZyogJwO4z34KpiHf1 tWxm+9PknDe+KfChP4DJ6l7CimQyrGk jMKo2NqZmkytk0mxBqMguqbpjUPlue gO0ATwpqsKoeK3tNtNCZasCQe3YahFJ fnRncXv+PC4XjLbxvxDsgkVlVXuqsiE /0dFfXg84fUFETx5mUSTB4Fm8jIJ93c 
0dazAC9tEmrmDglBMCj1C218kqntNE8 GdzVLUVwiYeuGbCYUmOY67sgkYGb7hQ etYQBEaZDQZYySzsio4ufmh9QyRje3 xUKYh1X3hiqN0L1vcDm7CsrwcDjnl1w eoyGk3yGWaX6rEDJHJopHb4KSg1bzL X7mymfFJ1Wws82vagXzA7VRTxs5MNP B8JsagjUV1wsnHWJYnlaA/qmi3Xt3tW b7z6liztxOG6TUnqbXvpth3uJT3gZ5u O3m4Sj1jUkaiutofUI5ajPajLncdN1y gcrtuUpN5pHp2w52ZaPnHuwOumTkmZ bJvjn2ZDCYhLGoqaqeYpTxB4iSExbTq WvB/rOyYw3LXmoSw2CtFVzMBLPW5xEO YhLA42cJds41hdOhbrpVJvMBMichLD 5iKR71JITFhIqJUzxleY7ESYjkcYDzO KB5zLGUuyQ8I7ljRsiSci2oYpB1JRPA 0hC1NnQ0Bj2QmUINtkEsl3Tlc6Vp9A qVnQV7ka3rukYc1QhSaApS2yx/xgy7 nJQpxiOGa5kpwLZOU0gT3s9KgzPf2Fc U1OcmE8snRE6YWlF5QeWHpAaWEp+UQ xs8tJb9Owvjc0nNK9y3dp7SytKJ0z9I 9SmNLY0ofWvqQ0sjSiNI1S9co1ZaSEy k8ESzdpXRg6YDSQ0sPKX1h6QtKH1v6m NKXlr6k9I2lbyi9b+l9SpmljNJ1S9cp 5ZaSVwdhvGrpKqWhpeS3H+w1S3uU5pb mlD6w9AGlfUvJr2J4nlKjfwYLRUv rE0ieUCkvJ7cwfmbpM0pTS1NKn1r6l NLXlr6m9JGljyhNLCXvBuB0YukOpfYt UF1Sum3pNqVnlp653wvw2TSGroW5ZSv YojSzNKN0w1LySwGOEpaekvNkrNq72v RtE7mvxWrGHazN+PRqkvNYzbiDtXen6 dXk/hSrGR+Qrq/vz16kQErhTn8yt7CM 38LSwv7K4vL3i3e37y7cW23f0F71va +8W56y94P3j3vsdfz9rzI+9P72/vH+3 c+mv91/rf53yfqO1fa70Op/5P/4D+ G/4mg= Pr(y|\\u00b5, \\u03c32) = 1 p 2\\u21e1\\u03c32 exp \\\\uf8ff \\u2212(y \\u2212\\u00b5)2 2\\u03c32 \\\" In this case, just the mean Just learn the mean, \\ud835\\udf07, and assume the variance is fixed,. 63 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf', 'page': 61, 'date': '02/01/2024\\\\n Thursday', 'tldr': 'We reconsider loss functions as a measure of how well the data fits to parametric probability distribution. We show that for univariate gaussian distributions we arrive back at least squares loss. We then introduce the notion of maximum likelihood and see how we can use that to define loss functions for many types data distributions. We cover some examples and then show how to generalize. This is a key topic to aid you in applying deep learning models to new types of data.', 'title': '05 - Loss Functions', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_k2t4xjkd', 'suggested_readings': 'UDL Chapter 5', 'score': -0.038714982109774976}\"\n", + " ]\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:46:08.896\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"retrieval\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:46:10.549\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"4425cdbd-0da9-4d90-80e2-d909d4390189\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"User\",\n", + " \"output\": {\n", + " \"content\": \"cool thanks, see ya\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:46:10.549\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"user_message\"\n", + " },\n", + " {\n", + " \"attachments\": [],\n", + " \"endTime\": \"2024-06-19T09:46:10.55\",\n", + " \"error\": null,\n", + " \"generation\": null,\n", + " \"id\": \"ae5047bf-cdbd-4490-a5fb-beb9cf2f27e5\",\n", + " \"input\": null,\n", + " \"metadata\": {},\n", + " \"name\": \"AI_Tutor\",\n", + " \"output\": {\n", + " \"content\": \"You're welcome! If you have any more questions in the future, feel free to ask. 
Goodbye!\"\n", + " },\n", + " \"parentId\": null,\n", + " \"scores\": [],\n", + " \"startTime\": \"2024-06-19T09:46:10.55\",\n", + " \"tags\": null,\n", + " \"threadId\": \"7a2c91be-4f47-4f60-8ded-537dd31386df\",\n", + " \"type\": \"assistant_message\"\n", + " }\n", + " ],\n", + " \"tags\": [\n", + " \"FAISS\",\n", + " \"gpt-3.5-turbo-1106\"\n", + " ]\n", + "}]\n", + "{'step_id': ['1ab30fe6-99cb-4327-bd7d-e3128b2c1a62', 'd2b74775-41fc-4ebc-bc40-d8d7cf67015d', 'fd58e85c-dbaa-4d06-8a75-a3839e37856d', '4d02c5ad-1bd9-4767-beab-4c724062093e', '9e061c1e-ad50-400f-983f-813587508223', '74ed6d18-6c08-4d65-b47a-4d8efbcdfc20', '4ac119fb-86f5-4528-9edc-44cc616de390'], 'user_input': ['What is inductive bias in CNNs?', 'Do transformers have this?', 'cool, suggest a project with transformers to use as my final project', 'Could you give me a quick rundown on what image captioning is? and how transformers are useful for this?', 'How do I learn the math behind transformers, what are the most important bits?', 'What is attention?', 'cool thanks, see ya'], 'output': ['Inductive bias in CNNs refers to the inherent preference for prioritizing information from neighboring nodes when updating each node in the network. This bias helps the network effectively aggregate information from nearby nodes, leading to more efficient and accurate node embeddings. You can find more information about this topic in the lecture slides on Graph Neural Networks and Convolutional Networks.', \"I'm not sure, I don't have that information.\", \"You could consider working on a project that explores the use of transformers in natural language processing tasks, such as sentiment analysis, language translation, or text generation. This could be a great opportunity to apply what you've learned about transformers in a real-world context. Good luck with your project!\", 'Image captioning is the process of generating textual descriptions for images. Transformers can be useful for this task because they are effective at processing sequential data and can learn to effectively capture the relationships between the visual and textual information in images and their corresponding captions. You can find more information about this in the lecture slides on Image Transformers and Multimodal Transformers. Good luck with your project!', 'You can start by diving into the lecture slides and suggested readings on transformers, particularly Chapter 12 and the Illustrated Transformer. These resources should cover the important mathematical concepts behind transformers and help you understand the key components. Good luck with your studies!', 'Attention is a mechanism used in transformers to focus on different parts of the input when processing information. It allows the model to weigh the importance of different elements in the input sequence, enabling it to effectively capture relationships and dependencies. You can find more information about attention in the lecture slides on transformers. Good luck with your studies!', \"You're welcome! If you have any more questions in the future, feel free to ask. Goodbye!\"], 'context': [['page_content=\\'Graph convolutional network These models are convolutional in that they update each node by aggregating information from nearby nodes. As such, they induce a relational inductive bias (i.e., a bias toward prioritizing information from neighbors). 
39 A function 𝐹[β‹…] with parameters πœ™\" that takes the node embeddings and adjacency matrix and outputs new node embeddings \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf\\', \\'page\\': 38, \\'date\\': \\'04/16/2024\\\\n Tuesday\\', \\'tldr\\': \\'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.\\', \\'title\\': \\'20 -- Graph Neural Networks\\', \\'lecture_recording\\': \\'https://mymedia.bu.edu/media/t/1_bt7qtymk\\', \\'suggested_readings\\': \\'UDL Chapter 13\\', \\'score\\': 0.4147438882496012}', \"page_content='https://poloclub.github.io/cnn-explainer/ 92 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/10_convolutional_networks.pdf', 'page': 91, 'date': '02/27/2024\\\\n Tuesday', 'tldr': 'We cover 1D and 2D convolutional neural networks along with subsampling and upsampling operations.', 'title': '10 - Convolutional Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_8ejvyib4', 'suggested_readings': 'UDL Chapter 10', 'score': 0.30807206912986296}\", \"page_content='Convolutional networks β€’ Networks for images β€’ Invariance and equivariance β€’ 1D convolution β€’ Convolutional layers β€’ Channels β€’ Receptive fields β€’ Convolutional network for MNIST 1D 44 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/10_convolutional_networks.pdf', 'page': 43, 'date': '02/27/2024\\\\n Tuesday', 'tldr': 'We cover 1D and 2D convolutional neural networks along with subsampling and upsampling operations.', 'title': '10 - Convolutional Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_8ejvyib4', 'suggested_readings': 'UDL Chapter 10', 'score': 0.26420832659802906}\"], [\"page_content='Transformers β€’ Multi-headed Self Attention is just one component of the transformer architecture β€’ Let’s look at a transformer block (or layer) from the encoder 49 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 48, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2451855994037282}\", \"page_content='Transformers β€’ Motivation β€’ Dot-product self-attention β€’ Applying Self-Attention β€’ The Transformer Architecture β€’ Three Types of NLP Transformer Models β€’ Encoder β€’ Decoder β€’ Encoder-Decoder 55 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 54, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 
12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.24314965373956088}\", \"page_content='Transformers β€’ Motivation β€’ Dot-product self-attention β€’ Applying Self-Attention β€’ The Transformer Architecture β€’ Three Types of NLP Transformer Models β€’ Encoder β€’ Decoder β€’ Encoder-Decoder 10 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 9, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2367633946657962}\"], ['page_content=\\'Graph convolutional network These models are convolutional in that they update each node by aggregating information from nearby nodes. As such, they induce a relational inductive bias (i.e., a bias toward prioritizing information from neighbors). 39 A function 𝐹[β‹…] with parameters πœ™\" that takes the node embeddings and adjacency matrix and outputs new node embeddings \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf\\', \\'page\\': 38, \\'date\\': \\'04/16/2024\\\\n Tuesday\\', \\'tldr\\': \\'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.\\', \\'title\\': \\'20 -- Graph Neural Networks\\', \\'lecture_recording\\': \\'https://mymedia.bu.edu/media/t/1_bt7qtymk\\', \\'suggested_readings\\': \\'UDL Chapter 13\\', \\'score\\': 0.3613770850497736}', \"page_content='Transformers β€’ Multi-headed Self Attention is just one component of the transformer architecture β€’ Let’s look at a transformer block (or layer) from the encoder 49 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 48, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2733153333305661}\", \"page_content='Transformers β€’ Multi-headed Self Attention is just one component of the transformer architecture 48 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 47, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 
0.22982265165302218}\"], ['page_content=\\'Image captioning 24 UDL β€’ E.g. CNN-RNN, LSTM, Transformers \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/presentations/01_sp2024_v02.pdf\\', \\'page\\': 23, \\'date\\': \\'01/18/2024\\\\n Thursday\\', \\'tldr\\': \"We will introduce the topic of deep learning, a bit about it\\'s history, and what impact it has had. Then we\\'ll go over the course logistics, the lecture topics, problem sets and the mid-term and final projects.\", \\'title\\': \\'01 - Intro to Deep Learning and Course Logistics\\', \\'lecture_recording\\': None, \\'suggested_readings\\': \\'UDL Chapter 1\\', \\'score\\': 0.552598095386973}', \"page_content='Next Time β€’ Tokenization and Learned Embeddings β€’ Training and Fine-Tuning Transformers β€’ Image Transformers β€’ Multimodal Transformers β€’ … 76 Link Feedback ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 75, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.34481147697680925}\", \"page_content='Encoder-Decoder Model β€’ Used for machine translation, which is a sequence-to-sequence task 22 https://jalammar.github.io/illustrated-transformer/ ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 21, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.33002046196039614}\"], [\"page_content='After the break β€’ Image Transformers β€’ Multimodal Transformers β€’ RAG pattern β€’ Training and Fine-Tuning Transformers β€’ … 68 Feedback ChatGPT ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 67, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. 
We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.3366359998900601}\", \"page_content='UDL Chapter 11 Lecture 02/29/2024 Thursday 11a - Recurrent Neural Networks [slides] [lecture recording] Suggested Readings: UDL Chapter 11 Lecture 03/05/2024 Tuesday 12 - Transformers [slides] [lecture recording] Suggested Readings: UDL Chapter 12 Optional The Illustrated Transformer Discussion 03/06/2024 Wednesday Discussion_05 - Deep Dive 2: Deep Learning Modules in Pytorch (CNN, RNN/LSTM, Transformer) Github Link: disc5 Lecture' metadata={'source': 'https://dl4ds.github.io/sp2024/schedule/', 'page': 0, 'score': 0.3100015518546917}\", 'page_content=\"Suggested Readings: UDL Chapter 12 Optional The Illustrated Transformer 14 -- Vision & Multimodal Transformers tl;dr: In this lecture we\\'ll cover vision and multimodal transformers as a survey of three papers. [slides] [lecture recording] Suggested Readings: See slides for references 15 -- Improving LLM Perf tl;dr: In this lecture we talk about ways to improve LLM performance short of retraining or finetuning. We cover more sophisticated prompt strategies, retrieval augmentation and cognitive architectures building systems and agents based on LLMs. [slides] [lecture recording] Suggested Readings: See slides for references\" metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/lectures/\\', \\'page\\': 0, \\'score\\': 0.3078678255019791}'], ['page_content=\\'Transformers, GPTs and Assistants A. Vaswani et al., β€œAttention is All you Need,” presented at the 31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA, 2017, p. 11. [Online]. Available: https://arxiv.org/abs/1706.03762 State of GPT, Andrej Karpathy, MS Build Keynote 38 \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/presentations/01_sp2024_v02.pdf\\', \\'page\\': 37, \\'date\\': \\'01/18/2024\\\\n Thursday\\', \\'tldr\\': \"We will introduce the topic of deep learning, a bit about it\\'s history, and what impact it has had. 
Then we\\'ll go over the course logistics, the lecture topics, problem sets and the mid-term and final projects.\", \\'title\\': \\'01 - Intro to Deep Learning and Course Logistics\\', \\'lecture_recording\\': None, \\'suggested_readings\\': \\'UDL Chapter 1\\', \\'score\\': 0.40361429640387503}', \"page_content='Motivation β€’ Need to efficiently process large strings of text β€’ Need to relate words across fairly long context lengths Self-Attention addresses these problems ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 16, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2435349602286475}\", \"page_content='Graph Attention 59 Regular graph convolution Graph attention ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf', 'page': 58, 'date': '04/16/2024\\\\n Tuesday', 'tldr': 'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.', 'title': '20 -- Graph Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_bt7qtymk', 'suggested_readings': 'UDL Chapter 13', 'score': 0.22901789972755515}\"], [\"page_content='Motivation β€’ Need to efficiently process large strings of text β€’ Need to relate words across fairly long context lengths Self-Attention addresses these problems ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 16, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': -0.015666472239410423}\", \"page_content='Example 1: univariate regression 61 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf', 'page': 59, 'date': '02/01/2024\\\\n Thursday', 'tldr': 'We reconsider loss functions as a measure of how well the data fits to parametric probability distribution. We show that for univariate gaussian distributions we arrive back at least squares loss. We then introduce the notion of maximum likelihood and see how we can use that to define loss functions for many types data distributions. We cover some examples and then show how to generalize. 
This is a key topic to aid you in applying deep learning models to new types of data.', 'title': '05 - Loss Functions', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_k2t4xjkd', 'suggested_readings': 'UDL Chapter 5', 'score': -0.03567729015020071}\", 'page_content=\\'Example 1: univariate regression AXBXiclZjZbtw2FEDHXdN0c1 rELwVaoUaApHAM20iXlwKJHWezU0/iN bEcg9JQGsYUJUuUPRN1not+TN+KvY7 +hX9hV5K8jC6l37oAPEw9xul5TEUZB JUeilpX9m3n3vfc/+PDKR1c/uTz 6fvfbFXpGWech3w1Sm+UHACi6F4rta MkPspyzJB8PzhZM3z/jOeFSNWOHmf8 KGxEpEImYbQ8ezv/fzm+Fc/CdJRFU0 O/SAaLcCfbCiOFvxCxAl7tXL+9nzo5 yF1fKk8ovTXFcrfiZaXK1MJhOfjzJf8 kgf3m7Mm+PbzkZv1T7Ut5X9XMRDfXQ8 O7+0uFR/PFpYbgvzvfbTP752feAP0rB MuNKhZEVxuLyU6aOK5VqEk+u+mXBMx aesJgfQlGxhBdHVZ2ziXcDIgMvSnP4p 7RXR9+uUbGkKMZJAGbC9LDAzARd7LDU 0U9HlVBZqbkKm46iUno69cwCeAOR81D LMRYmAsYqxcOGeRMwzJd9RU/D9MkYW pQ+avrzyDhAY+FqvhpWS/ZNJ1muHQ /EyY/XxzrQVoXki3nDSK2YRi4ReDyp Kr4YL2IgOACxyAlIFS+gTZOfIPKWEYU tKgFXzR6BveE9n5CmleYx5KSjvSQaFD LJRx1rjViwlElH2QbF8254BnCdwyrAU OGLozXYzpiaXNTfKTzpCpMDPeQMxXz uguYcsikmVHXUKWUDXsWL9g6zlTJ23 i0qweam4iyNrJu47OaV7UoOvUEWTBJo y7Vh1BloQbyoAlDLclo9hwolnIm5VK KwKsjH7eRp0+85MBO/NUQbXS9dbr0j6 zxjKiAnA1We+BVMh7+pr6dT2LpJzVvu mwEfeEBarW4XlcTOti05gVm1sQs06V8 ik2YJQnp53TMah8oz0Z2gCeCLrsyFi t7SFuoSbFkT9hdgqnkp+eHtxe/56Kha MpeN+UOyCQ0VZeZqyIT/R0MDeITh/QU RvHipRIsHgXrxUgn3d7R0LMcb20TqtY OCUEwKPUaXv4hVt04dwYNEzRWCJh24 ZsJhRY5irqyCRgZvuFh7NhAIZpk2Mwx lGlR5pzc/NB+hkitm9tiLszDqntDlUb o3je4nNaCMjwczvgl1QOU0aDJZ5CWas BylMyRWdLRK7/QcIm5rv56yZui04r56 UbH4wLVqcMQ356vIHXIyYWdSRqC04/ zrYksRz9QVvT7fr2yKqNV9+RrR07XLc pSbvtKN2w71kBPx0zHaTeIRizoStd WOkHrEcvQHbnzuOmahcN1m5K0e5FHp +1wpyba/tHOkGtmjkmpHJhjXyr9JoRF TUXtFNOEx0hsQlhMyq4F/8fKtjksd60 mhMV+IbqaCWBpwCWeQhPCYnMJd802ht VNh7rpVpnMhshsQlh8yBI86yaExZiKs VM8YVmGxCZE8jEeRzSPGZYylwSXpHM sSJkS7k2VD5Mu5IJYGmEehs5OoMRyFS hDtsglgu68wrnzlNoFyu6i3dHe9e0r FmqETwNIWucY8f8t5kQU4xXDMciU5E 8jKaAL72OlT5+L0F0QVOckF0djSMaXn lp5Tum/pPqW5peQXQRA9t5T8OgmiM0v PKN2zdI/S0tKS0l1LdymNLI0ofWDpA0 pDS0NK1yxdo1RbSk6k8ESwdIfSoaVDS g8sPaD0haUvKH1k6SNKX1r6ktI3lr6h 9J6l9yhljJK1y1dp5RbSl4dBNGqpau UBpaS35wrVnapzSzNKP0vqX3KR1YSn 4Vw/PMUnK8gQejpZLSx5Y+plRYSn6/B dFTS59SmliaUPrE0ieUvrb0NaUPLX1I aWwpeTcApxNLtym1b4GqgtJnlj6j9NT SU/d7AT5dxsC1MbdsA1uUpamlG5YSn 4pwFHC0hNynoxUe1e7eNtE7muRmnIHa zN+UZvkPFJT7mDt3emiNrk/RWrKh2To 63vTFymQUrjTH8/OL+O3sLSwt7K4/MP inWd35u+utm9or/S+6n3bu9lb7v3Yu9 t71Ov3dnth79+Z6zNfz3wz9vcH3N/z v3VqO/MtHW+7HU+c3/B3l+BUI= Pr(y|f[x, Ο†], Οƒ2) = 1 p 2⇑σ2 exp \\\\uf8ff βˆ’(y βˆ’f[x, Ο†])2 2Οƒ2 \" AW4XiclZjLbtw2FECVPtP05b SoN90INQIkRWLYRvrYFEjsOC879Th+J pZjUBpKw5iZImyZ6LOB3RXdNtP6hf0 M7ptN73UaIbRvfSiAzjD3HPExyUpcRT mUpR6aemvK+8+97H3x49aNrH3/y6W efz13/Yr/MqiLie1Ems+IwZCWXQvE9L bTkh3nBWRpKfhCerhl+cM6LUmRqV49y fpyRIlYRExD6GSO94qbo1+CtLodlCJ J2auVW/5PfhAXLKqXx3VQnhW6Xgly0e J6ZTweB3yYB5LH+ujOxLw5ugNV3Go2 FYNCpEM9PHJ3MLS4lLz8WlhuS0seO2n d3L9q37Qz6Iq5UpHkpXl0fJSro9rVmg RST6+FlQlz1l0yhJ+BEXFUl4e10+xv 4NiPT9OCvgT2m/ib59Rc3SshylIZgp0 4MSMxN0saNKxz8e10LleYqmjQUV9LX mW+S6/dFwSMtR1BgUSGgr340YJAhDVN wLVD8IsrSlKl+Hayub0N6Q54IVfOzqp mO8bjrDcOh+JlxuqT3VktQvNUvOGk kYxlVwi8GRc13wxWcRAcABikROQKV5C nSY/YewvIwrLTwIGHmZD6FzsPx+TqpX mCeSko70kGhRyYcda41YMJVpR9kBxf dv+AZwXcAsQFfhi6M52MmZGk+v03yoi 7QuTQy3UDCV8KYJGHLEpBlR1CVlHBp 1LF+xtZzpk7bxGV509XCRJC1W3QdXdC 8qH7XaSLIgkWYdK0mgiwJN4s+SxlkuS 2fwIBT30TcqlBYFWRh9os7Ladmwhem 8Mc9kvXW69J+s8ZyogJwO4z34KpiHf1 tWxm+9PknDe+KfChP4DJ6l7CimQyrGk jMKo2NqZmkytk0mxBqMguqbpjUPlue gO0ATwpqsKoeK3tNtNCZasCQe3YahFJ fnRncXv+PC4XjLbxvxDsgkVlVXuqsiE /0dFfXg84fUFETx5mUSTB4Fm8jIJ93c 0dazAC9tEmrmDglBMCj1C218kqntNE8 GdzVLUVwiYeuGbCYUmOY67sgkYGb7hQ 
etYQBEaZDQZYySzsio4ufmh9QyRje3 xUKYh1X3hiqN0L1vcDm7CsrwcDjnl1w eoyGk3yGWaX6rEDJHJopHb4KSg1bzL X7mymfFJ1Wws82vagXzA7VRTxs5MNP B8JsagjUV1wsnHWJYnlaA/qmi3Xt3tW b7z6liztxOG6TUnqbXvpth3uJT3gZ5u O3m4Sj1jUkaiutofUI5ajPajLncdN1y gcrtuUpN5pHp2w52ZaPnHuwOumTkmZ bJvjn2ZDCYhLGoqaqeYpTxB4iSExbTq WvB/rOyYw3LXmoSw2CtFVzMBLPW5xEO YhLA42cJds41hdOhbrpVJvMBMichLD 5iKR71JITFhIqJUzxleY7ESYjkcYDzO KB5zLGUuyQ8I7ljRsiSci2oYpB1JRPA 0hC1NnQ0Bj2QmUINtkEsl3Tlc6Vp9A qVnQV7ka3rukYc1QhSaApS2yx/xgy7 nJQpxiOGa5kpwLZOU0gT3s9KgzPf2Fc U1OcmE8snRE6YWlF5QeWHpAaWEp+UQ xs8tJb9Owvjc0nNK9y3dp7SytKJ0z9I 9SmNLY0ofWvqQ0sjSiNI1S9co1ZaSEy k8ESzdpXRg6YDSQ0sPKX1h6QtKH1v6m NKXlr6k9I2lbyi9b+l9SpmljNJ1S9cp 5ZaSVwdhvGrpKqWhpeS3H+w1S3uU5pb mlD6w9AGlfUvJr2J4nlKjfwYLRUv rE0ieUCkvJ7cwfmbpM0pTS1NKn1r6l NLXlr6m9JGljyhNLCXvBuB0YukOpfYt UF1Sum3pNqVnlp653wvw2TSGroW5ZSv YojSzNKN0w1LySwGOEpaekvNkrNq72v RtE7mvxWrGHazN+PRqkvNYzbiDtXen6 dXk/hSrGR+Qrq/vz16kQErhTn8yt7CM 38LSwv7K4vL3i3e37y7cW23f0F71va +8W56y94P3j3vsdfz9rzI+9P72/vH+3 c+mv91/rf53yfqO1fa70Op/5P/4D+ G/4mg= Pr(y|Β΅, Οƒ2) = 1 p 2⇑σ2 exp \\\\uf8ff βˆ’(y βˆ’Β΅)2 2Οƒ2 \" In this case, just the mean Just learn the mean, πœ‡, and assume the variance is fixed,. 63 \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf\\', \\'page\\': 61, \\'date\\': \\'02/01/2024\\\\n Thursday\\', \\'tldr\\': \\'We reconsider loss functions as a measure of how well the data fits to parametric probability distribution. We show that for univariate gaussian distributions we arrive back at least squares loss. We then introduce the notion of maximum likelihood and see how we can use that to define loss functions for many types data distributions. We cover some examples and then show how to generalize. This is a key topic to aid you in applying deep learning models to new types of data.\\', \\'title\\': \\'05 - Loss Functions\\', \\'lecture_recording\\': \\'https://mymedia.bu.edu/media/t/1_k2t4xjkd\\', \\'suggested_readings\\': \\'UDL Chapter 5\\', \\'score\\': -0.038714982109774976}']]}\n", + "Step ID: 1ab30fe6-99cb-4327-bd7d-e3128b2c1a62\n", + "User Input: What is inductive bias in CNNs?\n", + "Output: Inductive bias in CNNs refers to the inherent preference for prioritizing information from neighboring nodes when updating each node in the network. This bias helps the network effectively aggregate information from nearby nodes, leading to more efficient and accurate node embeddings. You can find more information about this topic in the lecture slides on Graph Neural Networks and Convolutional Networks.\n", + "Context: ['page_content=\\'Graph convolutional network These models are convolutional in that they update each node by aggregating information from nearby nodes. As such, they induce a relational inductive bias (i.e., a bias toward prioritizing information from neighbors). 
39 A function 𝐹[β‹…] with parameters πœ™\" that takes the node embeddings and adjacency matrix and outputs new node embeddings \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf\\', \\'page\\': 38, \\'date\\': \\'04/16/2024\\\\n Tuesday\\', \\'tldr\\': \\'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.\\', \\'title\\': \\'20 -- Graph Neural Networks\\', \\'lecture_recording\\': \\'https://mymedia.bu.edu/media/t/1_bt7qtymk\\', \\'suggested_readings\\': \\'UDL Chapter 13\\', \\'score\\': 0.4147438882496012}', \"page_content='https://poloclub.github.io/cnn-explainer/ 92 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/10_convolutional_networks.pdf', 'page': 91, 'date': '02/27/2024\\\\n Tuesday', 'tldr': 'We cover 1D and 2D convolutional neural networks along with subsampling and upsampling operations.', 'title': '10 - Convolutional Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_8ejvyib4', 'suggested_readings': 'UDL Chapter 10', 'score': 0.30807206912986296}\", \"page_content='Convolutional networks β€’ Networks for images β€’ Invariance and equivariance β€’ 1D convolution β€’ Convolutional layers β€’ Channels β€’ Receptive fields β€’ Convolutional network for MNIST 1D 44 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/10_convolutional_networks.pdf', 'page': 43, 'date': '02/27/2024\\\\n Tuesday', 'tldr': 'We cover 1D and 2D convolutional neural networks along with subsampling and upsampling operations.', 'title': '10 - Convolutional Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_8ejvyib4', 'suggested_readings': 'UDL Chapter 10', 'score': 0.26420832659802906}\"]\n", + "Step ID: d2b74775-41fc-4ebc-bc40-d8d7cf67015d\n", + "User Input: Do transformers have this?\n", + "Output: I'm not sure, I don't have that information.\n", + "Context: [\"page_content='Transformers β€’ Multi-headed Self Attention is just one component of the transformer architecture β€’ Let’s look at a transformer block (or layer) from the encoder 49 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 48, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2451855994037282}\", \"page_content='Transformers β€’ Motivation β€’ Dot-product self-attention β€’ Applying Self-Attention β€’ The Transformer Architecture β€’ Three Types of NLP Transformer Models β€’ Encoder β€’ Decoder β€’ Encoder-Decoder 55 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 54, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and 
encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.24314965373956088}\", \"page_content='Transformers β€’ Motivation β€’ Dot-product self-attention β€’ Applying Self-Attention β€’ The Transformer Architecture β€’ Three Types of NLP Transformer Models β€’ Encoder β€’ Decoder β€’ Encoder-Decoder 10 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 9, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2367633946657962}\"]\n", + "Step ID: fd58e85c-dbaa-4d06-8a75-a3839e37856d\n", + "User Input: cool, suggest a project with transformers to use as my final project\n", + "Output: You could consider working on a project that explores the use of transformers in natural language processing tasks, such as sentiment analysis, language translation, or text generation. This could be a great opportunity to apply what you've learned about transformers in a real-world context. Good luck with your project!\n", + "Context: ['page_content=\\'Graph convolutional network These models are convolutional in that they update each node by aggregating information from nearby nodes. As such, they induce a relational inductive bias (i.e., a bias toward prioritizing information from neighbors). 
39 A function 𝐹[β‹…] with parameters πœ™\" that takes the node embeddings and adjacency matrix and outputs new node embeddings \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf\\', \\'page\\': 38, \\'date\\': \\'04/16/2024\\\\n Tuesday\\', \\'tldr\\': \\'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.\\', \\'title\\': \\'20 -- Graph Neural Networks\\', \\'lecture_recording\\': \\'https://mymedia.bu.edu/media/t/1_bt7qtymk\\', \\'suggested_readings\\': \\'UDL Chapter 13\\', \\'score\\': 0.3613770850497736}', \"page_content='Transformers β€’ Multi-headed Self Attention is just one component of the transformer architecture β€’ Let’s look at a transformer block (or layer) from the encoder 49 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 48, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2733153333305661}\", \"page_content='Transformers β€’ Multi-headed Self Attention is just one component of the transformer architecture 48 Encoder Decoder ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 47, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.22982265165302218}\"]\n", + "Step ID: 4d02c5ad-1bd9-4767-beab-4c724062093e\n", + "User Input: Could you give me a quick rundown on what image captioning is? and how transformers are useful for this?\n", + "Output: Image captioning is the process of generating textual descriptions for images. Transformers can be useful for this task because they are effective at processing sequential data and can learn to effectively capture the relationships between the visual and textual information in images and their corresponding captions. You can find more information about this in the lecture slides on Image Transformers and Multimodal Transformers. Good luck with your project!\n", + "Context: ['page_content=\\'Image captioning 24 UDL β€’ E.g. CNN-RNN, LSTM, Transformers \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/presentations/01_sp2024_v02.pdf\\', \\'page\\': 23, \\'date\\': \\'01/18/2024\\\\n Thursday\\', \\'tldr\\': \"We will introduce the topic of deep learning, a bit about it\\'s history, and what impact it has had. 
Then we\\'ll go over the course logistics, the lecture topics, problem sets and the mid-term and final projects.\", \\'title\\': \\'01 - Intro to Deep Learning and Course Logistics\\', \\'lecture_recording\\': None, \\'suggested_readings\\': \\'UDL Chapter 1\\', \\'score\\': 0.552598095386973}', \"page_content='Next Time β€’ Tokenization and Learned Embeddings β€’ Training and Fine-Tuning Transformers β€’ Image Transformers β€’ Multimodal Transformers β€’ … 76 Link Feedback ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 75, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.34481147697680925}\", \"page_content='Encoder-Decoder Model β€’ Used for machine translation, which is a sequence-to-sequence task 22 https://jalammar.github.io/illustrated-transformer/ ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 21, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.33002046196039614}\"]\n", + "Step ID: 9e061c1e-ad50-400f-983f-813587508223\n", + "User Input: How do I learn the math behind transformers, what are the most important bits?\n", + "Output: You can start by diving into the lecture slides and suggested readings on transformers, particularly Chapter 12 and the Illustrated Transformer. These resources should cover the important mathematical concepts behind transformers and help you understand the key components. Good luck with your studies!\n", + "Context: [\"page_content='After the break β€’ Image Transformers β€’ Multimodal Transformers β€’ RAG pattern β€’ Training and Fine-Tuning Transformers β€’ … 68 Feedback ChatGPT ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/13_transformers_part2_v2.pdf', 'page': 67, 'date': '03/07/2024\\\\n Thursday', 'tldr': 'In this lecture we continue to review the transformer architecture. 
We continue the discussion of decoders and encoder-decoder architectures, then discuss scaling to large contexts and then tokenization and embedding.', 'title': '13 - Transformers Part 2', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_m9hhz175', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.3366359998900601}\", \"page_content='UDL Chapter 11 Lecture 02/29/2024 Thursday 11a - Recurrent Neural Networks [slides] [lecture recording] Suggested Readings: UDL Chapter 11 Lecture 03/05/2024 Tuesday 12 - Transformers [slides] [lecture recording] Suggested Readings: UDL Chapter 12 Optional The Illustrated Transformer Discussion 03/06/2024 Wednesday Discussion_05 - Deep Dive 2: Deep Learning Modules in Pytorch (CNN, RNN/LSTM, Transformer) Github Link: disc5 Lecture' metadata={'source': 'https://dl4ds.github.io/sp2024/schedule/', 'page': 0, 'score': 0.3100015518546917}\", 'page_content=\"Suggested Readings: UDL Chapter 12 Optional The Illustrated Transformer 14 -- Vision & Multimodal Transformers tl;dr: In this lecture we\\'ll cover vision and multimodal transformers as a survey of three papers. [slides] [lecture recording] Suggested Readings: See slides for references 15 -- Improving LLM Perf tl;dr: In this lecture we talk about ways to improve LLM performance short of retraining or finetuning. We cover more sophisticated prompt strategies, retrieval augmentation and cognitive architectures building systems and agents based on LLMs. [slides] [lecture recording] Suggested Readings: See slides for references\" metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/lectures/\\', \\'page\\': 0, \\'score\\': 0.3078678255019791}']\n", + "Step ID: 74ed6d18-6c08-4d65-b47a-4d8efbcdfc20\n", + "User Input: What is attention?\n", + "Output: Attention is a mechanism used in transformers to focus on different parts of the input when processing information. It allows the model to weigh the importance of different elements in the input sequence, enabling it to effectively capture relationships and dependencies. You can find more information about attention in the lecture slides on transformers. Good luck with your studies!\n", + "Context: ['page_content=\\'Transformers, GPTs and Assistants A. Vaswani et al., β€œAttention is All you Need,” presented at the 31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA, 2017, p. 11. [Online]. Available: https://arxiv.org/abs/1706.03762 State of GPT, Andrej Karpathy, MS Build Keynote 38 \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/presentations/01_sp2024_v02.pdf\\', \\'page\\': 37, \\'date\\': \\'01/18/2024\\\\n Thursday\\', \\'tldr\\': \"We will introduce the topic of deep learning, a bit about it\\'s history, and what impact it has had. 
Then we\\'ll go over the course logistics, the lecture topics, problem sets and the mid-term and final projects.\", \\'title\\': \\'01 - Intro to Deep Learning and Course Logistics\\', \\'lecture_recording\\': None, \\'suggested_readings\\': \\'UDL Chapter 1\\', \\'score\\': 0.40361429640387503}', \"page_content='Motivation β€’ Need to efficiently process large strings of text β€’ Need to relate words across fairly long context lengths Self-Attention addresses these problems ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 16, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': 0.2435349602286475}\", \"page_content='Graph Attention 59 Regular graph convolution Graph attention ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/20_graph_neural_networks.pdf', 'page': 58, 'date': '04/16/2024\\\\n Tuesday', 'tldr': 'In this lecture we introduce graph neural networks, define matrix representations, how to do graph level classification and regression, and how to define graph convolutional network layers.', 'title': '20 -- Graph Neural Networks', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_bt7qtymk', 'suggested_readings': 'UDL Chapter 13', 'score': 0.22901789972755515}\"]\n", + "Step ID: 4ac119fb-86f5-4528-9edc-44cc616de390\n", + "User Input: cool thanks, see ya\n", + "Output: You're welcome! If you have any more questions in the future, feel free to ask. Goodbye!\n", + "Context: [\"page_content='Motivation β€’ Need to efficiently process large strings of text β€’ Need to relate words across fairly long context lengths Self-Attention addresses these problems ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/12_transformers_v3_export.pdf', 'page': 16, 'date': '03/05/2024\\\\n Tuesday', 'tldr': 'In this lecture we cover the transformer architecture, starting with the motivation that required a new type of model, the concept and implementation of self-attention and then the full transformer architecture for encoder, decoder and encoder-decoder type models.', 'title': '12 - Transformers', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_r59pkzbo', 'suggested_readings': 'UDL Chapter 12\\\\n\\\\n\\\\nOptional \\\\nThe Illustrated Transformer', 'score': -0.015666472239410423}\", \"page_content='Example 1: univariate regression 61 ' metadata={'source': 'https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf', 'page': 59, 'date': '02/01/2024\\\\n Thursday', 'tldr': 'We reconsider loss functions as a measure of how well the data fits to parametric probability distribution. We show that for univariate gaussian distributions we arrive back at least squares loss. We then introduce the notion of maximum likelihood and see how we can use that to define loss functions for many types data distributions. We cover some examples and then show how to generalize. 
This is a key topic to aid you in applying deep learning models to new types of data.', 'title': '05 - Loss Functions', 'lecture_recording': 'https://mymedia.bu.edu/media/t/1_k2t4xjkd', 'suggested_readings': 'UDL Chapter 5', 'score': -0.03567729015020071}\", 'page_content=\\'Example 1: univariate regression AXBXiclZjZbtw2FEDHXdN0c1 rELwVaoUaApHAM20iXlwKJHWezU0/iN bEcg9JQGsYUJUuUPRN1not+TN+KvY7 +hX9hV5K8jC6l37oAPEw9xul5TEUZB JUeilpX9m3n3vfc/+PDKR1c/uTz 6fvfbFXpGWech3w1Sm+UHACi6F4rta MkPspyzJB8PzhZM3z/jOeFSNWOHmf8 KGxEpEImYbQ8ezv/fzm+Fc/CdJRFU0 O/SAaLcCfbCiOFvxCxAl7tXL+9nzo5 yF1fKk8ovTXFcrfiZaXK1MJhOfjzJf8 kgf3m7Mm+PbzkZv1T7Ut5X9XMRDfXQ8 O7+0uFR/PFpYbgvzvfbTP752feAP0rB MuNKhZEVxuLyU6aOK5VqEk+u+mXBMx aesJgfQlGxhBdHVZ2ziXcDIgMvSnP4p 7RXR9+uUbGkKMZJAGbC9LDAzARd7LDU 0U9HlVBZqbkKm46iUno69cwCeAOR81D LMRYmAsYqxcOGeRMwzJd9RU/D9MkYW pQ+avrzyDhAY+FqvhpWS/ZNJ1muHQ /EyY/XxzrQVoXki3nDSK2YRi4ReDyp Kr4YL2IgOACxyAlIFS+gTZOfIPKWEYU tKgFXzR6BveE9n5CmleYx5KSjvSQaFD LJRx1rjViwlElH2QbF8254BnCdwyrAU OGLozXYzpiaXNTfKTzpCpMDPeQMxXz uguYcsikmVHXUKWUDXsWL9g6zlTJ23 i0qweam4iyNrJu47OaV7UoOvUEWTBJo y7Vh1BloQbyoAlDLclo9hwolnIm5VK KwKsjH7eRp0+85MBO/NUQbXS9dbr0j6 zxjKiAnA1We+BVMh7+pr6dT2LpJzVvu mwEfeEBarW4XlcTOti05gVm1sQs06V8 ik2YJQnp53TMah8oz0Z2gCeCLrsyFi t7SFuoSbFkT9hdgqnkp+eHtxe/56Kha MpeN+UOyCQ0VZeZqyIT/R0MDeITh/QU RvHipRIsHgXrxUgn3d7R0LMcb20TqtY OCUEwKPUaXv4hVt04dwYNEzRWCJh24 ZsJhRY5irqyCRgZvuFh7NhAIZpk2Mwx lGlR5pzc/NB+hkitm9tiLszDqntDlUb o3je4nNaCMjwczvgl1QOU0aDJZ5CWas BylMyRWdLRK7/QcIm5rv56yZui04r56 UbH4wLVqcMQ356vIHXIyYWdSRqC04/ zrYksRz9QVvT7fr2yKqNV9+RrR07XLc pSbvtKN2w71kBPx0zHaTeIRizoStd WOkHrEcvQHbnzuOmahcN1m5K0e5FHp +1wpyba/tHOkGtmjkmpHJhjXyr9JoRF TUXtFNOEx0hsQlhMyq4F/8fKtjksd60 mhMV+IbqaCWBpwCWeQhPCYnMJd802ht VNh7rpVpnMhshsQlh8yBI86yaExZiKs VM8YVmGxCZE8jEeRzSPGZYylwSXpHM sSJkS7k2VD5Mu5IJYGmEehs5OoMRyFS hDtsglgu68wrnzlNoFyu6i3dHe9e0r FmqETwNIWucY8f8t5kQU4xXDMciU5E 8jKaAL72OlT5+L0F0QVOckF0djSMaXn lp5Tum/pPqW5peQXQRA9t5T8OgmiM0v PKN2zdI/S0tKS0l1LdymNLI0ofWDpA0 pDS0NK1yxdo1RbSk6k8ESwdIfSoaVDS g8sPaD0haUvKH1k6SNKX1r6ktI3lr6h 9J6l9yhljJK1y1dp5RbSl4dBNGqpau UBpaS35wrVnapzSzNKP0vqX3KR1YSn 4Vw/PMUnK8gQejpZLSx5Y+plRYSn6/B dFTS59SmliaUPrE0ieUvrb0NaUPLX1I aWwpeTcApxNLtym1b4GqgtJnlj6j9NT SU/d7AT5dxsC1MbdsA1uUpamlG5YSn 4pwFHC0hNynoxUe1e7eNtE7muRmnIHa zN+UZvkPFJT7mDt3emiNrk/RWrKh2To 63vTFymQUrjTH8/OL+O3sLSwt7K4/MP inWd35u+utm9or/S+6n3bu9lb7v3Yu9 t71Ov3dnth79+Z6zNfz3wz9vcH3N/z v3VqO/MtHW+7HU+c3/B3l+BUI= Pr(y|f[x, Ο†], Οƒ2) = 1 p 2⇑σ2 exp \\\\uf8ff βˆ’(y βˆ’f[x, Ο†])2 2Οƒ2 \" AW4XiclZjLbtw2FECVPtP05b SoN90INQIkRWLYRvrYFEjsOC879Th+J pZjUBpKw5iZImyZ6LOB3RXdNtP6hf0 M7ptN73UaIbRvfSiAzjD3HPExyUpcRT mUpR6aemvK+8+97H3x49aNrH3/y6W efz13/Yr/MqiLie1Ems+IwZCWXQvE9L bTkh3nBWRpKfhCerhl+cM6LUmRqV49y fpyRIlYRExD6GSO94qbo1+CtLodlCJ J2auVW/5PfhAXLKqXx3VQnhW6Xgly0e J6ZTweB3yYB5LH+ujOxLw5ugNV3Go2 FYNCpEM9PHJ3MLS4lLz8WlhuS0seO2n d3L9q37Qz6Iq5UpHkpXl0fJSro9rVmg RST6+FlQlz1l0yhJ+BEXFUl4e10+xv 4NiPT9OCvgT2m/ib59Rc3SshylIZgp0 4MSMxN0saNKxz8e10LleYqmjQUV9LX mW+S6/dFwSMtR1BgUSGgr340YJAhDVN wLVD8IsrSlKl+Hayub0N6Q54IVfOzqp mO8bjrDcOh+JlxuqT3VktQvNUvOGk kYxlVwi8GRc13wxWcRAcABikROQKV5C nSY/YewvIwrLTwIGHmZD6FzsPx+TqpX mCeSko70kGhRyYcda41YMJVpR9kBxf dv+AZwXcAsQFfhi6M52MmZGk+v03yoi 7QuTQy3UDCV8KYJGHLEpBlR1CVlHBp 1LF+xtZzpk7bxGV509XCRJC1W3QdXdC 8qH7XaSLIgkWYdK0mgiwJN4s+SxlkuS 2fwIBT30TcqlBYFWRh9os7Ladmwhem 8Mc9kvXW69J+s8ZyogJwO4z34KpiHf1 tWxm+9PknDe+KfChP4DJ6l7CimQyrGk jMKo2NqZmkytk0mxBqMguqbpjUPlue gO0ATwpqsKoeK3tNtNCZasCQe3YahFJ fnRncXv+PC4XjLbxvxDsgkVlVXuqsiE /0dFfXg84fUFETx5mUSTB4Fm8jIJ93c 0dazAC9tEmrmDglBMCj1C218kqntNE8 GdzVLUVwiYeuGbCYUmOY67sgkYGb7hQ 
etYQBEaZDQZYySzsio4ufmh9QyRje3 xUKYh1X3hiqN0L1vcDm7CsrwcDjnl1w eoyGk3yGWaX6rEDJHJopHb4KSg1bzL X7mymfFJ1Wws82vagXzA7VRTxs5MNP B8JsagjUV1wsnHWJYnlaA/qmi3Xt3tW b7z6liztxOG6TUnqbXvpth3uJT3gZ5u O3m4Sj1jUkaiutofUI5ajPajLncdN1y gcrtuUpN5pHp2w52ZaPnHuwOumTkmZ bJvjn2ZDCYhLGoqaqeYpTxB4iSExbTq WvB/rOyYw3LXmoSw2CtFVzMBLPW5xEO YhLA42cJds41hdOhbrpVJvMBMichLD 5iKR71JITFhIqJUzxleY7ESYjkcYDzO KB5zLGUuyQ8I7ljRsiSci2oYpB1JRPA 0hC1NnQ0Bj2QmUINtkEsl3Tlc6Vp9A qVnQV7ka3rukYc1QhSaApS2yx/xgy7 nJQpxiOGa5kpwLZOU0gT3s9KgzPf2Fc U1OcmE8snRE6YWlF5QeWHpAaWEp+UQ xs8tJb9Owvjc0nNK9y3dp7SytKJ0z9I 9SmNLY0ofWvqQ0sjSiNI1S9co1ZaSEy k8ESzdpXRg6YDSQ0sPKX1h6QtKH1v6m NKXlr6k9I2lbyi9b+l9SpmljNJ1S9cp 5ZaSVwdhvGrpKqWhpeS3H+w1S3uU5pb mlD6w9AGlfUvJr2J4nlKjfwYLRUv rE0ieUCkvJ7cwfmbpM0pTS1NKn1r6l NLXlr6m9JGljyhNLCXvBuB0YukOpfYt UF1Sum3pNqVnlp653wvw2TSGroW5ZSv YojSzNKN0w1LySwGOEpaekvNkrNq72v RtE7mvxWrGHazN+PRqkvNYzbiDtXen6 dXk/hSrGR+Qrq/vz16kQErhTn8yt7CM 38LSwv7K4vL3i3e37y7cW23f0F71va +8W56y94P3j3vsdfz9rzI+9P72/vH+3 c+mv91/rf53yfqO1fa70Op/5P/4D+ G/4mg= Pr(y|Β΅, Οƒ2) = 1 p 2⇑σ2 exp \\\\uf8ff βˆ’(y βˆ’Β΅)2 2Οƒ2 \" In this case, just the mean Just learn the mean, πœ‡, and assume the variance is fixed,. 63 \\' metadata={\\'source\\': \\'https://dl4ds.github.io/sp2024/static_files/lectures/05_loss_functions_v2.pdf\\', \\'page\\': 61, \\'date\\': \\'02/01/2024\\\\n Thursday\\', \\'tldr\\': \\'We reconsider loss functions as a measure of how well the data fits to parametric probability distribution. We show that for univariate gaussian distributions we arrive back at least squares loss. We then introduce the notion of maximum likelihood and see how we can use that to define loss functions for many types data distributions. We cover some examples and then show how to generalize. This is a key topic to aid you in applying deep learning models to new types of data.\\', \\'title\\': \\'05 - Loss Functions\\', \\'lecture_recording\\': \\'https://mymedia.bu.edu/media/t/1_k2t4xjkd\\', \\'suggested_readings\\': \\'UDL Chapter 5\\', \\'score\\': -0.038714982109774976}']\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "from literalai import LiteralClient\n", + "\n", + "virtual_app = dict(\n", + " llm=dict(modelname=\"AI Tutor - GPT3 - FAISS\"),\n", + " template=\"information about the template I used in my app\",\n", + " debug=\"all of these fields are completely optional\",\n", + ")\n", + "from trulens_eval import Select\n", + "from trulens_eval.tru_virtual import VirtualApp\n", + "\n", + "virtual_app = VirtualApp(virtual_app) # can start with the prior dictionary\n", + "virtual_app[Select.RecordCalls.llm.maxtokens] = 1024\n", + "\n", + "from trulens_eval import Select\n", + "\n", + "retriever = Select.RecordCalls.retriever\n", + "synthesizer = Select.RecordCalls.synthesizer\n", + "\n", + "virtual_app[retriever] = \"retriever\"\n", + "virtual_app[synthesizer] = \"synthesizer\"\n", + "\n", + "\n", + "load_dotenv()\n", + "\n", + "literal_client = LiteralClient()\n", + "\n", + "threads = literal_client.api.get_threads(first=1).data\n", + "print(threads)\n", + "\n", + "rag_steps = []\n", + "for thread in threads:\n", + " rag_steps.extend([step for step in thread.steps if step.name == \"RAG\"])\n", + "\n", + "save_log_dict = {\"step_id\": [], \"user_input\": [], \"output\": [], \"context\": []}\n", + "for step in rag_steps:\n", + " save_log_dict[\"step_id\"].append(step.id)\n", + " save_log_dict[\"user_input\"].append(step.input[\"question\"])\n", + " save_log_dict[\"output\"].append(step.output[\"answer\"])\n", + " 
save_log_dict[\"context\"].append(step.output[\"source_documents\"])\n", + "\n", + "print(save_log_dict)\n", + "\n", + "records = []\n", + "for i in range(len(save_log_dict[\"step_id\"])):\n", + " step_id = save_log_dict[\"step_id\"][i]\n", + " user_input = save_log_dict[\"user_input\"][i]\n", + " output = save_log_dict[\"output\"][i]\n", + " context = save_log_dict[\"context\"][i]\n", + "\n", + " print(f\"Step ID: {step_id}\")\n", + " print(f\"User Input: {user_input}\")\n", + " print(f\"Output: {output}\")\n", + " print(f\"Context: {context}\")\n", + "\n", + " from trulens_eval.tru_virtual import VirtualRecord\n", + "\n", + " # The selector for a presumed context retrieval component's call to\n", + " # `get_context`. The names are arbitrary but may be useful for readability on\n", + " # your end.\n", + " context_call = retriever.get_context\n", + " generation = synthesizer.generate\n", + "\n", + " rec = VirtualRecord(\n", + " main_input=f\"{user_input}\",\n", + " main_output=f\"{output}\",\n", + " calls={\n", + " context_call: dict(args=[f\"{user_input}\"], rets=[f\"{output}\"]),\n", + " generation: dict(\n", + " args=[f\"{context}\"],\n", + " rets=[f\"{output}\"],\n", + " ),\n", + " },\n", + " )\n", + "\n", + " records.append(rec)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "βœ… In context_relevance_with_cot_reasons, input question will be set to __record__.main_input or `Select.RecordInput` .\n", + "βœ… In context_relevance_with_cot_reasons, input context will be set to __record__.app.retriever.get_context.rets[:] .\n", + "βœ… In Groundedness, input source will be set to __record__.app.retriever.get_context.rets[:].collect() .\n", + "βœ… In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .\n", + "βœ… In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .\n", + "βœ… In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .\n" + ] + } + ], + "source": [ + "from trulens_eval.feedback.provider import OpenAI\n", + "from trulens_eval.feedback.feedback import Feedback\n", + "\n", + "# Initialize provider class\n", + "provider = OpenAI()\n", + "\n", + "# Select context to be used in feedback. We select the return values of the\n", + "# virtual `get_context` call in the virtual `retriever` component. Names are\n", + "# arbitrary except for `rets`.\n", + "context = context_call.rets[:]\n", + "\n", + "# Question/statement relevance between question and each context chunk.\n", + "f_context_relevance = (\n", + " Feedback(provider.context_relevance_with_cot_reasons)\n", + " .on_input()\n", + " .on(context)\n", + ")\n", + "\n", + "# Define a groundedness feedback function\n", + "f_groundedness = (\n", + " Feedback(provider.groundedness_measure_with_cot_reasons, name = \"Groundedness\")\n", + " .on(context.collect())\n", + " .on_output()\n", + ")\n", + "\n", + "# Question/answer relevance between overall question and answer.\n", + "f_qa_relevance = (\n", + " Feedback(provider.relevance_with_cot_reasons, name = \"Answer Relevance\")\n", + " .on_input_output()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "πŸ¦‘ Tru initialized with db url sqlite:///default.sqlite .\n", + "πŸ›‘ Secret keys may be written to the database. 
See the `database_redact_keys` option of `Tru` to prevent this.\n" + ] + } + ], + "source": [ + "from trulens_eval.tru_virtual import TruVirtual\n", + "\n", + "virtual_recorder = TruVirtual(\n", + " app_id=\"AI Tutor - GPT3 - FAISS\",\n", + " app=virtual_app,\n", + " feedbacks=[f_context_relevance, f_groundedness, f_qa_relevance],\n", + " feedback_mode = \"deferred\" # optional\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "for record in records:\n", + " virtual_recorder.add_record(record)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Force stopping dashboard ...\n", + "Starting dashboard ...\n", + "Config file already exists. Skipping writing process.\n", + "Credentials file already exists. Skipping writing process.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c3eb563d17e84983ad0d8eca36904ddb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dashboard started at http://10.0.0.226:8501 .\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from trulens_eval import Tru\n", + "tru = Tru()\n", + "\n", + "tru.run_dashboard(force=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will keep max of 32 feedback(s) running.\n", + "Tasks are spread among max of 128 thread(s).\n", + "Will rerun running feedbacks after a minute.\n", + "Will rerun failed feedbacks after 5 minutes.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "df51361ac4664845b804e574497e1b58", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Feedback Status: 80%|######## | 84/105 [00:00