owaiskha9654 commited on
Commit
c5634f3
·
verified ·
1 Parent(s): 5f86f35

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +186 -0
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Standard library
import os
import pathlib

# Third-party
import fitz  # PyMuPDF
import gradio as gr
import pandas as pd
import torch  # used to pick an available device for the embedding model
from google.colab import files
from google.colab import userdata
from huggingface_hub import InferenceClient
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
# from PyPDF2 import PdfReader
14
+
15
+
16
+ file_paths = ["docs/MANUU U.G. PROGRAMMES PROSPECTUS 2022-23 Eng 5 April 2022 4 PM.pdf","Prospectus 2023-24 (Eng-Version) (1)_0.pdf"]
17
+
18
+ page_contents = []
19
+ for fname in file_paths:
20
+ with fitz.open(fname) as doc:
21
+ print("Total Pages in {} are {}".format(fname,len(doc)))
22
+ for page in doc:
23
+ text = page.get_text()
24
+ if "............" in text:
25
+ continue
26
+ #print(text)
27
+ page_contents.append(text)
28
+ #break
29
+
30
+
31
+ embedding_model = HuggingFaceInstructEmbeddings(
32
+ #model_name="hkunlp/instructor-large",
33
+ #model_name="jinaai/jina-embedding-b-en-v1",
34
+ model_name="WhereIsAI/UAE-Large-V1",
35
+ model_kwargs={"device": "cuda"}
36
+ #model_kwargs={"device": "cpu"}
37
+ )
38
+
39
+ df_documents_chunks = pd.DataFrame({"doc_pages":page_contents})
40
+ df_documents_chunks["index_id"] = df_documents_chunks.index
41
+ print(df_documents_chunks)
42
+
43
+ def row_to_doc(row):
44
+ return Document(metadata={
45
+ 'id': row['index_id']
46
+ }, page_content=row['doc_pages'])
47
+
48
+
49
+ manuuindex_df_processed_documents = df_documents_chunks.apply(lambda row:row_to_doc(row),axis=1).to_list()
50
+
51
+
52
+ COLLECTION_NAME='Manuu_collection'
53
+ PERSIST_DIR='MANUU_dir4'
54
+
55
+
56
+ if os.path.exists(PERSIST_DIR):
57
+ print('Existing Collection : ', COLLECTION_NAME)
58
+ vectordb = Chroma(persist_directory=PERSIST_DIR, collection_name=COLLECTION_NAME, embedding_function=embedding_model)
59
+ print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")
60
+ else:
61
+ print('New Collection : ', COLLECTION_NAME)
62
+ vectordb = Chroma.from_documents(documents=manuuindex_df_processed_documents,
63
+ embedding=embedding_model,
64
+ collection_name=COLLECTION_NAME,
65
+ persist_directory=PERSIST_DIR,
66
+ collection_metadata=None)
67
+ client = vectordb.persist() # Save vector database as persistent files in the output folder
68
+
69
+ print(f"Collection {vectordb._collection.name} has {vectordb._collection.count()} documents...")
70
+
71
+
72
+ client = InferenceClient(
73
+ model = "mistralai/Mixtral-8x7B-Instruct-v0.1")
74
+
75
+
76
+ def context_fn(question_text,vectordb):
77
+ relevant_chunks = vectordb.similarity_search_with_score(
78
+ query=question_text,
79
+ k=5,)
80
+ context_5 = "\n\n\n".join([i[0].page_content for i in relevant_chunks])
81
+
82
+ return context_5
83
+
84
+
85
+ def format_prompt(message, history, context_prompt):
86
+ prompt = "<s>"
87
+ for user_prompt, bot_response in history:
88
+ prompt += f"[INST] {user_prompt}. Do not Give information from outside the Document Contexts and general Information[/INST]"
89
+ prompt += f" {bot_response}\n"
90
+ prompt += f" CONTEXT:{context_prompt}</s> "
91
+ prompt += f"[INST] {message} [/INST]"
92
+ with open('prompts.txt', 'a') as file:
93
+ print("user_prompt",prompt, file=file)
94
+ file.close()
95
+ return prompt
96
+
97
+
98
+ def generate_fn(
99
+ prompt, history, system_prompt, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,vectordb = vectordb
100
+ ):
101
+ temperature = float(temperature)
102
+ if temperature < 1e-2:
103
+ temperature = 1e-2
104
+ top_p = float(top_p)
105
+
106
+ generate_kwargs = dict(
107
+ temperature=temperature,
108
+ max_new_tokens=max_new_tokens,
109
+ top_p=top_p,
110
+ repetition_penalty=repetition_penalty,
111
+ do_sample=True,
112
+ seed=42,
113
+ )
114
+ context_5 = context_fn(question_text = prompt, vectordb = vectordb)
115
+ formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history, context_5)
116
+ #print("formatted_prompt",formatted_prompt)
117
+ stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
118
+ output = ""
119
+
120
+ for response in stream:
121
+ output += response.token.text
122
+ yield output
123
+ return output
124
+ additional_inputs=[
125
+ gr.Textbox(
126
+ label="System Prompt",
127
+ max_lines=1,
128
+ interactive=True,
129
+ ),
130
+ gr.Slider(
131
+ label="Temperature",
132
+ value=0.7,
133
+ minimum=0.0,
134
+ maximum=1.0,
135
+ step=0.05,
136
+ interactive=True,
137
+ info="Higher values produce more diverse outputs",
138
+ ),
139
+ gr.Slider(
140
+ label="Max new tokens",
141
+ value=256,
142
+ minimum=0,
143
+ maximum=2048,
144
+ step=64,
145
+ interactive=True,
146
+ info="The maximum numbers of new tokens",
147
+ ),
148
+ gr.Slider(
149
+ label="Top-p (nucleus sampling)",
150
+ value=0.90,
151
+ minimum=0.0,
152
+ maximum=1,
153
+ step=0.05,
154
+ interactive=True,
155
+ info="Higher values sample more low-probability tokens",
156
+ ),
157
+ gr.Slider(
158
+ label="Repetition penalty",
159
+ value=1.3,
160
+ minimum=1.0,
161
+ maximum=2.0,
162
+ step=0.05,
163
+ interactive=True,
164
+ info="Penalize repeated tokens",
165
+ )
166
+ ]
167
+
168
+ examples=[["Where is Maulana Azad National Urdu University?", None, None, None, None, None,],
169
+ [ "When was Department of Women Education established?", None, None, None, None, None, ],
170
+ ["Tell me about Department of Public Administration", None, None, None, None, None,],
171
+ ["What are Reservations for SCs/STs/OBCs /Women candidates/EWS Categories?", None, None, None, None, None,],
172
+ ["What is Upper Age Limit limit for Admissions", None, None, None, None, None,],
173
+ ["Fetch Details of Hostel Fee* (2022-23)?", None, None, None, None, None,],
174
+ ["What is Entrance Test Schedule 2023-24?", None, None, None, None, None,],
175
+ ]
176
+
177
+ gr.ChatInterface(
178
+ fn=generate_fn,
179
+ analytics_enabled=True,
180
+
181
+ chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
182
+ additional_inputs=additional_inputs,
183
+ title="Mixtral 46.7B",
184
+ examples=examples,
185
+ concurrency_limit=20,
186
+ ).launch()