### ChromaDB method - create vectorstore based on Chroma

In [None]:
import sys, os, shutil
sys.path.insert(0, "../")

from preprocess_raw_documents import split_content

import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma.base import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import ServiceContext
from llama_index.core import Document

from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from llama_index.core import Settings

import nest_asyncio
nest_asyncio.apply()

import time
import PyPDF2

In [None]:
# Split each raw text file on its entry separator; the pieces go to a
# per-source temporary folder that later cells list and finally delete.
split_jobs = [
    ("../raw_documents/answers.txt", "\n\n", "../raw_documents/answers_temp"),
    ("../raw_documents/qna.txt", "\n\n\n", "../raw_documents/qna_temp"),
]
for raw_path, entry_separator, output_folder in split_jobs:
    split_content(
        filepath=raw_path,
        separator=entry_separator,
        tmp_folder=output_folder,
    )

In [None]:
# Collect the path of every entry in the two temporary folders, in the order
# os.listdir returns them.
folder_path = "../raw_documents/answers_temp"
answers_temp_files = [
    os.path.join(folder_path, entry) for entry in os.listdir(folder_path)
]

folder_path = "../raw_documents/qna_temp"
qna_temp_files = [
    os.path.join(folder_path, entry) for entry in os.listdir(folder_path)
]

In [None]:
# Load every raw source and merge it into a single Document.
#
# Two loading strategies are kept side by side. Both now produce the list
# `document_texts`, so the final Document is built the same way whichever
# strategy is selected (previously the reader strategy left the names used by
# the final join undefined).
USE_DIRECTORY_READER = False

if USE_DIRECTORY_READER:
    # Strategy A: SimpleDirectoryReader parses the PDFs, the conversation
    # examples and the split answer / Q&A temp files.
    documents = SimpleDirectoryReader(input_files=[
        "../raw_documents/HI Chapter Summary Version 1.3.pdf",
        "../raw_documents/conversation_examples.txt",
        "../raw_documents/HI_Knowledge_Base.pdf",
    ] + answers_temp_files + qna_temp_files).load_data()
    document_texts = [doc.text for doc in documents]
else:
    # Strategy B (active): extract PDF text page by page with PyPDF2 and read
    # the plain-text sources whole (the split temp files are not used here).
    reader_summary = PyPDF2.PdfReader("../raw_documents/HI Chapter Summary Version 1.3.pdf")
    documents_summary = [page.extract_text() for page in reader_summary.pages]

    reader_base = PyPDF2.PdfReader("../raw_documents/HI_Knowledge_Base.pdf")
    documents_base = [page.extract_text() for page in reader_base.pages]

    documents_txt = SimpleDirectoryReader(input_files=[
        "../raw_documents/conversation_examples.txt",
        "../raw_documents/qna.txt",
        "../raw_documents/answers.txt",
    ]).load_data()
    documents_txt = [doc.text for doc in documents_txt]

    document_texts = documents_summary + documents_base + documents_txt

# One combined document; individual texts are separated by a blank line.
document = Document(text="\n\n".join(document_texts))

In [None]:
# Open a persistent Chroma client; its data is stored on disk under the
# given path.
db = chromadb.PersistentClient(path="../models/chroma_db_advanced_corrected")

In [None]:
# Get the "quickstart" collection from the client, creating it if needed.
# NOTE(review): the client is persistent and the collection is reused, so
# re-running the indexing cells presumably adds the same nodes again — confirm.
chroma_collection = db.get_or_create_collection("quickstart")

In [None]:
# Wrap the Chroma collection in a LlamaIndex vector store so it can back the
# storage context created further down.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [None]:
# Global LlamaIndex settings for the indexing section.
# No LLM is configured while building the index.
Settings.llm = None
# Chunking parameters for the node parser read from Settings below.
# (presumably measured in tokens — confirm against the LlamaIndex docs)
Settings.chunk_size = 1024
Settings.chunk_overlap = 50
# Embedding model given as a "local:" spec pointing at the fine-tuned model
# folder; the loading section of this notebook uses the same value.
Settings.embed_model = "local:../models/fine-tuned-embeddings-advanced"

In [None]:
# Split the single combined document into nodes with the node parser held by
# Settings.
nodes = Settings.node_parser.get_nodes_from_documents([document])

In [None]:
# Display how many nodes the parser produced.
len(nodes)

In [None]:
# Storage context that uses the Chroma-backed vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Register the parsed nodes in the storage context's document store.
storage_context.docstore.add_documents(nodes)

In [None]:
# Wall-clock timestamp taken just before the index is built; a later cell
# subtracts it from time.time() to report the indexing duration.
start_time = time.time()

In [None]:
# Build the vector index over the nodes using the Chroma-backed storage
# context. This is the step whose duration is measured.
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

In [None]:
# Report how long indexing took: seconds elapsed since start_time, converted
# to minutes and printed with one decimal place.
elapsed_seconds = time.time() - start_time
indexing_cost = elapsed_seconds / 60
print(f"Indexing time: {indexing_cost:.1f} mins")

In [None]:
# Query engine over the freshly built index, created with default arguments.
vector_query_engine = vector_index.as_query_engine()

In [None]:
# Sample query against the new index; the trailing bare name displays the
# result as the cell output.
# NOTE(review): Settings.llm was set to None above, so the answer is presumably
# not produced by a real LLM — confirm what is returned here.
response = vector_query_engine.query("Healthcare System in Singapore consists of?")
response

In [None]:
# Second sample query against the new index; the trailing bare name displays
# the result as the cell output.
response = vector_query_engine.query("what is integrated shield plan")
response

In [None]:
# Remove the temporary folder of split answer files if it is still present.
answers_temp_dir = "../raw_documents/answers_temp"
if os.path.exists(answers_temp_dir):
    shutil.rmtree(answers_temp_dir)

In [None]:
# Remove the temporary folder of split Q&A files if it is still present.
qna_temp_dir = "../raw_documents/qna_temp"
if os.path.exists(qna_temp_dir):
    shutil.rmtree(qna_temp_dir)

### ChromaDB method - load vectorstore based on Chroma

In [None]:
import chromadb
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma.base import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import ServiceContext
from llama_index.core import Document
from llama_index.core import Settings

from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.memory import ChatMemoryBuffer

import time

from prompt_engineering import (
 system_content, 
 textbook_content, 
 winnie_the_pooh_prompt, 
 introduction_line
)

In [None]:
# Embedding model spec assigned to Settings.embed_model below; it is the same
# value the indexing section used when the vector store was built.
fine_tuned_path = "local:../models/fine-tuned-embeddings-advanced"

In [None]:
# OpenAI LLM used for answering: model gpt-4-0125-preview at temperature 0.0.
llm = OpenAI(model="gpt-4-0125-preview", temperature=0.0)

In [None]:
# Global LlamaIndex settings for the loading section: the OpenAI LLM and the
# fine-tuned embedding model.
Settings.llm = llm
Settings.embed_model = fine_tuned_path

In [None]:
# Open the persistent Chroma client at the same path the indexing section
# wrote to.
db = chromadb.PersistentClient(path="../models/chroma_db_advanced_corrected")

In [None]:
# Get the "quickstart" collection (created if it does not exist yet).
chroma_collection = db.get_or_create_collection("quickstart")

In [None]:
# Wrap the Chroma collection in a LlamaIndex vector store and build a storage
# context on top of it.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# Load the index directly from the existing vector store; no source documents
# are read in this section.
# NOTE(review): from_vector_store may build its own storage context from
# `vector_store`, which would make the `storage_context` argument redundant —
# confirm against the installed LlamaIndex version.
index = VectorStoreIndex.from_vector_store(
 vector_store=vector_store,
 storage_context=storage_context
)

In [None]:
# Chat memory buffer with a 100,000-token limit; passed to the engines
# created below.
memory = ChatMemoryBuffer.from_defaults(token_limit=100_000)

In [None]:
# Chat engine in "context" mode, using the memory buffer above and the
# system prompt imported from prompt_engineering.
chat_engine = index.as_chat_engine(
 chat_mode="context",
 memory=memory,
 system_prompt=system_content
)

In [None]:
# Streaming query engine that retrieves the 20 most similar nodes per query.
# NOTE(review): `memory` and `system_prompt` are chat-engine style arguments;
# as_query_engine may accept and silently ignore them — confirm whether the
# system prompt actually reaches the LLM on this path.
hi_engine = index.as_query_engine(
 memory=memory,
 system_prompt=system_content,
 similarity_top_k=20,
 streaming=True
)

In [None]:
# Sample multiple-choice question sent to the query engine in the next cell.
prompt = """
Question: Which is not a government healthcare philosophy? 
A. To nurture a healthy nation by promoting good health.
B. To rely on competition to improve service and raise efficiency
C. To intervene directly whenever necessary
D. To provide for the care of employees
"""

In [None]:
# Ask the multiple-choice question and print the streamed answer piece by
# piece, without inserting newlines between pieces.
response = hi_engine.query(prompt)
for res in response.response_gen:
 print(res, end="")

In [None]:
# Free-form query; the commented-out lines are alternative questions to try.
# query_string = "tell me more about integrated shield plans"
# query_string = "how to use CPF"
query_string = "what is MediSave"

# Run the query and print the streamed answer piece by piece.
response = hi_engine.query(query_string)
for res in response.response_gen:
 print(res, end="")