{
 "cells": [
  { "cell_type": "markdown", "id": "8acae3ed-2953-45a3-aba9-0327b6ae3679", "metadata": {}, "source": [
    "### ChromaDB method - create vectorstore based on Chroma\n",
    "\n",
    "Reads the raw documents, splits them into nodes and embeds them into a persistent Chroma collection. The second section of this notebook re-opens that collection for querying."
  ] },
  { "cell_type": "code", "execution_count": null, "id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97", "metadata": {}, "outputs": [], "source": [
    "import sys, os, shutil\n",
    "# put the parent directory on the import path before importing project modules\n",
    "sys.path.insert(0, \"../\")\n",
    "\n",
    "from preprocess_raw_documents import split_content\n",
    "\n",
    "import chromadb\n",
    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
    "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
    "from llama_index.core import StorageContext\n",
    "from llama_index.core import ServiceContext\n",
    "from llama_index.core import Document\n",
    "\n",
    "from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding\n",
    "from llama_index.core import Settings\n",
    "\n",
    "import nest_asyncio\n",
    "nest_asyncio.apply()\n",
    "\n",
    "import time\n",
    "import PyPDF2"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "978152ce-4d87-44b5-b521-dbaff60b32b0", "metadata": {}, "outputs": [], "source": [
    "# NOTE(review): split_content comes from preprocess_raw_documents; presumably it writes one\n",
    "# file per separator-delimited chunk into tmp_folder - confirm against that module.\n",
    "split_content(\n",
    "    filepath=\"../raw_documents/answers.txt\",\n",
    "    separator=\"\\n\\n\",\n",
    "    tmp_folder=\"../raw_documents/answers_temp\",\n",
    ")\n",
    "\n",
    "split_content(\n",
    "    filepath=\"../raw_documents/qna.txt\",\n",
    "    separator=\"\\n\\n\\n\",\n",
    "    tmp_folder=\"../raw_documents/qna_temp\",\n",
    ")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "d925371b-8777-4f5b-a7f2-ec3f228ef266", "metadata": {}, "outputs": [], "source": [
    "def list_files(folder_path):\n",
    "    \"\"\"Return the path of every entry directly inside `folder_path`.\n",
    "\n",
    "    The result is sorted because os.listdir gives no ordering guarantee.\n",
    "    \"\"\"\n",
    "    return sorted(os.path.join(folder_path, name) for name in os.listdir(folder_path))\n",
    "\n",
    "\n",
    "# only consumed by the SimpleDirectoryReader branch of the loading cell below\n",
    "answers_temp_files = list_files(\"../raw_documents/answers_temp\")\n",
    "qna_temp_files = list_files(\"../raw_documents/qna_temp\")"
  ] },
  { "cell_type": "markdown", "id": "a83b4fd8-5075-4c52-820c-a3ac7ee7f0c8", "metadata": {}, "source": [
    "#### Load the raw documents into a single `Document`"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb", "metadata": {}, "outputs": [], "source": [
    "# Two ways of reading the corpus. Both branches must fill `corpus_parts`,\n",
    "# which is the only thing `document` is built from, so the switch can be\n",
    "# flipped without hitting an undefined name.\n",
    "USE_DIRECTORY_READER = False\n",
    "\n",
    "if USE_DIRECTORY_READER:\n",
    "    # let llama-index read every file, including the temporary split files\n",
    "    documents = SimpleDirectoryReader(input_files=[\n",
    "        \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
    "        \"../raw_documents/conversation_examples.txt\",\n",
    "        \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
    "    ] + answers_temp_files + qna_temp_files).load_data()\n",
    "    corpus_parts = [doc.text for doc in documents]\n",
    "else:\n",
    "    # extract the PDF text page by page with PyPDF2\n",
    "    reader_summary = PyPDF2.PdfReader(\"../raw_documents/HI Chapter Summary Version 1.3.pdf\")\n",
    "    documents_summary = [p.extract_text() for p in reader_summary.pages]\n",
    "\n",
    "    reader_base = PyPDF2.PdfReader(\"../raw_documents/HI_Knowledge_Base.pdf\")\n",
    "    documents_base = [p.extract_text() for p in reader_base.pages]\n",
    "\n",
    "    # the plain-text files are read unsplit in this branch\n",
    "    documents_txt = SimpleDirectoryReader(input_files=[\n",
    "        \"../raw_documents/conversation_examples.txt\",\n",
    "        \"../raw_documents/qna.txt\",\n",
    "        \"../raw_documents/answers.txt\",\n",
    "    ]).load_data()\n",
    "    documents_txt = [doc.text for doc in documents_txt]\n",
    "\n",
    "    corpus_parts = documents_summary + documents_base + documents_txt\n",
    "\n",
    "# one big document; it is cut into nodes by the node parser further down\n",
    "document = Document(text=\"\\n\\n\".join(corpus_parts))"
  ] },
  { "cell_type": "markdown", "id": "e485f801-1829-4b50-b6b2-52803203853b", "metadata": {}, "source": [
    "#### Open the persistent Chroma collection"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d", "metadata": {}, "outputs": [], "source": [
    "# initialize client, setting path to save data\n",
    "db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced_corrected\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed", "metadata": {}, "outputs": [], "source": [
    "# create collection (or reuse the one already persisted under the same name)\n",
    "chroma_collection = db.get_or_create_collection(\"quickstart\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672", "metadata": {}, "outputs": [], "source": [
    "# assign chroma as the vector_store to the context\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
  ] },
  { "cell_type": "markdown", "id": "eb5edab2-30db-4bf7-96b5-4005d3161988", "metadata": {}, "source": [
    "#### Configure chunking / embedding and split the document into nodes"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "0946b6ce-96ab-44de-ad75-e424a8429f67", "metadata": {}, "outputs": [], "source": [
    "# global llama-index settings picked up by the node parser and the index below\n",
    "Settings.llm = None\n",
    "Settings.chunk_size = 1024\n",
    "Settings.chunk_overlap = 50\n",
    "Settings.embed_model = \"local:../models/fine-tuned-embeddings-advanced\""
  ] },
  { "cell_type": "code", "execution_count": null, "id": "b8c73a2c-1129-406a-8046-085afcaf9cbb", "metadata": {}, "outputs": [], "source": [
    "nodes = Settings.node_parser.get_nodes_from_documents([document])"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "75f1c76f-d3e5-4b69-818c-98865adb1457", "metadata": {}, "outputs": [], "source": [
    "len(nodes)"
  ] },
  { "cell_type": "markdown", "id": "adfe688f-95c0-477c-a9de-e9e77541a1d7", "metadata": {}, "source": [
    "#### Embed the nodes and build the index"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4", "metadata": {}, "outputs": [], "source": [
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "6a764113-ad7e-4674-aa57-ebbf405902a8", "metadata": {}, "outputs": [], "source": [
    "storage_context.docstore.add_documents(nodes)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05", "metadata": {}, "outputs": [], "source": [
    "start_time = time.time()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb", "metadata": {}, "outputs": [], "source": [
    "# The collection was opened with get_or_create_collection, so on a re-run it may\n",
    "# already contain the embeddings. Embedding the nodes again would store every\n",
    "# chunk a second time, so only index into an empty collection.\n",
    "existing_count = chroma_collection.count()\n",
    "if existing_count == 0:\n",
    "    vector_index = VectorStoreIndex(nodes, storage_context=storage_context)\n",
    "else:\n",
    "    print(\n",
    "        f\"Collection 'quickstart' already holds {existing_count} embeddings - reusing them. \"\n",
    "        \"Delete ../models/chroma_db_advanced_corrected to rebuild from scratch.\"\n",
    "    )\n",
    "    vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3", "metadata": {}, "outputs": [], "source": [
    "# elapsed wall-clock time since the start_time cell, in minutes\n",
    "indexing_cost = time.time() - start_time\n",
    "indexing_cost = indexing_cost / 60\n",
    "print(f\"Indexing time: {indexing_cost:.1f} mins\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "f16cca33-71fb-437d-a033-671b9fd44054", "metadata": {}, "outputs": [], "source": [
    "vector_query_engine = vector_index.as_query_engine()"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e", "metadata": { "scrolled": true }, "outputs": [], "source": [
    "# smoke test of the freshly built index\n",
    "response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
    "response"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "d83e2938-61fa-4d02-920d-0ae88a437abc", "metadata": {}, "outputs": [], "source": [
    "response = vector_query_engine.query(\"what is integrated shield plan\")\n",
    "response"
  ] },
  { "cell_type": "markdown", "id": "aa4b9906-5f75-4003-9f4c-5cfcc7ab1eaf", "metadata": {}, "source": [
    "#### Remove the temporary split folders"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "1bb75b04-6a62-43a4-8728-d2e52e49f1c0", "metadata": {}, "outputs": [], "source": [
    "# the tmp_folder targets passed to split_content at the top of this section\n",
    "for tmp_folder in (\"../raw_documents/answers_temp\", \"../raw_documents/qna_temp\"):\n",
    "    if os.path.exists(tmp_folder):\n",
    "        shutil.rmtree(tmp_folder)"
  ] },
  { "cell_type": "markdown", "id": "a7fc01f6-4738-415b-a96b-afd6cf8d789a", "metadata": {}, "source": [
    "### ChromaDB method - load vectorstore based on Chroma\n",
    "\n",
    "Re-opens the collection persisted by the section above and queries it through an OpenAI LLM."
  ] },
  { "cell_type": "code", "execution_count": null, "id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5", "metadata": {}, "outputs": [], "source": [
    "import sys\n",
    "# Let this section run on a fresh kernel without executing the create section first.\n",
    "# NOTE(review): assumes prompt_engineering sits one directory up, like\n",
    "# preprocess_raw_documents in the first section - confirm.\n",
    "if \"../\" not in sys.path:\n",
    "    sys.path.insert(0, \"../\")\n",
    "\n",
    "import chromadb\n",
    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
    "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
    "from llama_index.core import StorageContext\n",
    "from llama_index.core import ServiceContext\n",
    "from llama_index.core import Document\n",
    "from llama_index.core import Settings\n",
    "\n",
    "from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.core.memory import ChatMemoryBuffer\n",
    "\n",
    "import time\n",
    "\n",
    "from prompt_engineering import (\n",
    "    system_content,\n",
    "    textbook_content,\n",
    "    winnie_the_pooh_prompt,\n",
    "    introduction_line\n",
    ")"
  ] },
  { "cell_type": "markdown", "id": "72dd0ece-c72d-428a-89b4-9494d948c845", "metadata": {}, "source": [
    "#### Configure the LLM and the embedding model"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "d38dc953-b923-4128-86a1-c8c6f69af0ed", "metadata": {}, "outputs": [], "source": [
    "# same embedding model string that the create section assigns to Settings.embed_model\n",
    "fine_tuned_path = \"local:../models/fine-tuned-embeddings-advanced\""
  ] },
  { "cell_type": "code", "execution_count": null, "id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e", "metadata": {}, "outputs": [], "source": [
    "llm = OpenAI(model=\"gpt-4-0125-preview\", temperature=0.0)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "0583e9b0-d977-488c-8331-46dfa749924c", "metadata": {}, "outputs": [], "source": [
    "Settings.llm = llm\n",
    "Settings.embed_model = fine_tuned_path"
  ] },
  { "cell_type": "markdown", "id": "f994f440-f647-48b4-a517-46a79f7561e5", "metadata": {}, "source": [
    "#### Open the persisted index"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba", "metadata": {}, "outputs": [], "source": [
    "db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced_corrected\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "1b385644-b46e-4d13-88fa-9f4af39db405", "metadata": {}, "outputs": [], "source": [
    "# get_collection (not get_or_create_collection): when the index has not been built,\n",
    "# or the path above is wrong, fail here instead of silently querying an empty collection\n",
    "chroma_collection = db.get_collection(\"quickstart\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2", "metadata": {}, "outputs": [], "source": [
    "# assign chroma as the vector_store to the context\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae", "metadata": {}, "outputs": [], "source": [
    "# create your index on top of the vectors already stored in Chroma\n",
    "index = VectorStoreIndex.from_vector_store(\n",
    "    vector_store=vector_store,\n",
    "    storage_context=storage_context\n",
    ")"
  ] },
  { "cell_type": "markdown", "id": "73ba6d06-ba69-4b5e-962a-9cf7d2dc4d94", "metadata": {}, "source": [
    "#### Chat and query engines"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "3f592848-8536-4b4d-b34a-adc32d043432", "metadata": {}, "outputs": [], "source": [
    "memory = ChatMemoryBuffer.from_defaults(token_limit=100_000)"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252", "metadata": {}, "outputs": [], "source": [
    "chat_engine = index.as_chat_engine(\n",
    "    chat_mode=\"context\",\n",
    "    memory=memory,\n",
    "    system_prompt=system_content\n",
    ")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "c3106dff-dd6f-47a9-9454-1e61775e7539", "metadata": {}, "outputs": [], "source": [
    "# NOTE(review): `memory` and `system_prompt` are chat-engine options; as_query_engine\n",
    "# appears to accept and silently ignore them, so this engine probably answers without\n",
    "# the system prompt - confirm, and switch to `chat_engine` above (or a custom\n",
    "# text_qa_template) if the prompt is required.\n",
    "hi_engine = index.as_query_engine(\n",
    "    memory=memory,\n",
    "    system_prompt=system_content,\n",
    "    similarity_top_k=20,\n",
    "    streaming=True\n",
    ")"
  ] },
  { "cell_type": "markdown", "id": "53a38081-4a79-44bc-bfa3-5d8653804328", "metadata": {}, "source": [
    "#### Example queries"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612", "metadata": {}, "outputs": [], "source": [
    "prompt = \"\"\"\n",
    "Question: Which is not a government healthcare philosophy? \n",
    "A. To nurture a healthy nation by promoting good health.\n",
    "B. To rely on competition to improve service and raise efficiency\n",
    "C. To intervene directly whenever necessary\n",
    "D. To provide for the care of employees\n",
    "\"\"\""
  ] },
  { "cell_type": "code", "execution_count": null, "id": "a1c83dff-50d1-47b1-b7e9-4fc5cd08e1e8", "metadata": {}, "outputs": [], "source": [
    "# streaming=True on the engine: print the pieces as the response generator yields them\n",
    "response = hi_engine.query(prompt)\n",
    "for res in response.response_gen:\n",
    "    print(res, end=\"\")"
  ] },
  { "cell_type": "code", "execution_count": null, "id": "91821a22-c1c4-46a6-90f0-c00651afb0f6", "metadata": {}, "outputs": [], "source": [
    "# query_string = \"tell me more about integrated shield plans\"\n",
    "# query_string = \"how to use CPF\"\n",
    "query_string = \"what is MediSave\"\n",
    "\n",
    "response = hi_engine.query(query_string)\n",
    "for res in response.response_gen:\n",
    "    print(res, end=\"\")"
  ] }
 ],
 "metadata": {
  "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" },
  "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}