import uuid

import gradio as gr
import chromadb
import torch
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, AutoTokenizer

# load the model
model_name = 'google/flan-t5-base'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
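
# Optional (illustrative, not part of the original flow): move the model to a
# GPU when one is available.
#
#   device = 'cuda' if torch.cuda.is_available() else 'cpu'
#   model = model.to(device)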


# to calculate text embeddings
ST_name = 'sentence-transformers/sentence-t5-base'
st_model = SentenceTransformer(ST_name)

# to store our embeddings and run similarity search
client = chromadb.Client()  # in-memory Chroma instance
collection = client.create_collection("my_db")
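
# Chroma ranks results by L2 distance by default; cosine distance can be
# requested instead when creating the collection (illustrative, optional):
#
#   collection = client.create_collection("my_db", metadata={"hnsw:space": "cosine"})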


def get_context(query_text):
    '''
    Given the raw query text, compute its embedding,
    search the Chroma collection for the most similar chunks,
    and return the closest match as context.
    '''
    query_emb = st_model.encode(query_text)
    query_response = collection.query(query_embeddings=query_emb.tolist(), n_results=4)
    # keep only the single closest chunk; the other results are ignored
    context = query_response['documents'][0][0]
    context = context.replace('\n', ' ').replace('  ', ' ')
    return context
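
# A possible variant (not used above): join all retrieved chunks so the model
# sees more supporting text. This assumes the combined text still fits within
# the model's input window.
def get_context_topk(query_text, k=4):
    query_emb = st_model.encode(query_text)
    query_response = collection.query(query_embeddings=query_emb.tolist(), n_results=k)
    chunks = query_response['documents'][0]
    return ' '.join(c.replace('\n', ' ') for c in chunks)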



def local_query(query, context):
    '''
    Given the user query, build an LLM prompt that includes the retrieved
    context, and return the model's response.
    '''
    t5query = """Please answer the question based on the given context.
    If you are not sure about your response, say I am not sure.
    Context: {}
    Question: {}
    """.format(context, query)

    # tokenize the prompt (truncating in case the context is very long)
    inputs = tokenizer(t5query, return_tensors="pt", truncation=True)

    outputs = model.generate(**inputs, max_new_tokens=20)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)
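
# Illustrative usage of the two helpers above (not executed at import time):
#
#   context = get_context("What is this document about?")
#   answer = local_query("What is this document about?", context)[0]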


def run_query(history, query):
    '''
    Chat handler for the Gradio interface.
    Given the user query, find the most relevant part of the uploaded document
    via Chroma search, add it to the prompt as context, and ask the LLM.
    '''
    context = get_context(query)          # find the related part of the PDF
    result = local_query(query, context)  # query the model with added context

    history.append((query, str(result[0])))  # append the result to the chat history
    return history, ""



def upload_pdf(file):
    '''
    Upload a PDF, split it into chunks, encode each chunk into an embedding,
    assign each chunk a unique ID, and add everything to the global
    Chroma collection.
    '''
    try:
        if file is not None:
            file_name = file.name

            # load the PDF document
            loader = PDFMinerLoader(file_name)
            doc = loader.load()

            # split it into chunks
            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
            texts = text_splitter.split_documents(doc)
            texts = [i.page_content for i in texts]

            # compute an embedding for each chunk
            doc_emb = st_model.encode(texts)
            doc_emb = doc_emb.tolist()

            # assign a unique ID to each chunk
            ids = [str(uuid.uuid1()) for _ in doc_emb]

            # add the chunk embeddings to the Chroma collection
            collection.add(
                embeddings=doc_emb,
                documents=texts,
                ids=ids
            )

            return 'Successfully uploaded!'
        else:
            return "No file uploaded."

    except Exception as e:
        return f"An error occurred: {e}"
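
# A possible variant (not used above): persist embeddings across restarts with
# a persistent client instead of the in-memory one. Assumes chromadb >= 0.4;
# "chroma_store" is an arbitrary local path.
#
#   client = chromadb.PersistentClient(path="chroma_store")
#   collection = client.get_or_create_collection("my_db")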


with gr.Blocks() as demo:
    # Frontend for our tool

    # Upload a PDF document
    btn = gr.UploadButton("Upload a PDF", file_types=[".pdf"])
    output = gr.Textbox(label="Output Box")  # status message for the upload
    chatbot = gr.Chatbot(height=240)  # our chatbot interface

    with gr.Row():
        with gr.Column(scale=0.70):
            txt = gr.Textbox(
                show_label=False,
                placeholder="Type a question",
            )


    # Backend for our tool
    # Event handlers
    btn.upload(fn=upload_pdf, inputs=[btn], outputs=[output])
    txt.submit(run_query, [chatbot, txt], [chatbot, txt])


gr.close_all()
demo.queue().launch()  # queue requests for better performance under concurrent use