Shreyas094 committed on
Commit
a6abb8f
·
verified ·
1 Parent(s): 1f50701

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +222 -69
app.py CHANGED
@@ -14,15 +14,26 @@ from llama_parse import LlamaParse
14
  from langchain_core.documents import Document
15
  from huggingface_hub import InferenceClient
16
  import inspect
 
 
 
 
 
17
 
18
  # Environment variables and configurations
19
  huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
20
  llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
 
 
 
 
 
 
21
 
22
  MODELS = [
23
  "mistralai/Mistral-7B-Instruct-v0.3",
24
  "mistralai/Mixtral-8x7B-Instruct-v0.1",
25
- "microsoft/Phi-3-mini-4k-instruct"
26
  ]
27
 
28
  # Initialize LlamaParse
@@ -79,31 +90,71 @@ def update_vectors(files, parser):
79
 
80
  def generate_chunked_response(prompt, model, max_tokens=1000, num_calls=3, temperature=0.2, should_stop=False):
81
  print(f"Starting generate_chunked_response with {num_calls} calls")
82
- client = InferenceClient(model, token=huggingface_token)
83
  full_response = ""
84
  messages = [{"role": "user", "content": prompt}]
85
 
86
- for i in range(num_calls):
87
- print(f"Starting API call {i+1}")
88
- if should_stop:
89
- print("Stop clicked, breaking loop")
90
- break
91
- try:
92
- for message in client.chat_completion(
93
- messages=messages,
94
- max_tokens=max_tokens,
95
- temperature=temperature,
96
- stream=True,
97
- ):
98
- if should_stop:
99
- print("Stop clicked during streaming, breaking")
100
- break
101
- if message.choices and message.choices[0].delta and message.choices[0].delta.content:
102
- chunk = message.choices[0].delta.content
103
- full_response += chunk
104
- print(f"API call {i+1} completed")
105
- except Exception as e:
106
- print(f"Error in generating response: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
  # Clean up the response
109
  clean_response = re.sub(r'<s>\[INST\].*?\[/INST\]\s*', '', full_response, flags=re.DOTALL)
@@ -144,16 +195,15 @@ def chatbot_interface(message, history, use_web_search, model, temperature, num_
144
  history = history + [(message, "")]
145
 
146
  try:
147
- if use_web_search:
148
- for main_content, sources in get_response_with_search(message, model, num_calls=num_calls, temperature=temperature):
149
- history[-1] = (message, f"{main_content}\n\n{sources}")
150
- yield history
151
- else:
152
- for partial_response in get_response_from_pdf(message, model, num_calls=num_calls, temperature=temperature):
153
- history[-1] = (message, partial_response)
154
- yield history
155
  except gr.CancelledError:
156
  yield history
 
 
 
 
157
 
158
  def retry_last_response(history, use_web_search, model, temperature, num_calls):
159
  if not history:
@@ -165,12 +215,103 @@ def retry_last_response(history, use_web_search, model, temperature, num_calls):
165
  return chatbot_interface(last_user_msg, history, use_web_search, model, temperature, num_calls)
166
 
167
  def respond(message, history, model, temperature, num_calls, use_web_search):
168
- if use_web_search:
169
- for main_content, sources in get_response_with_search(message, model, num_calls=num_calls, temperature=temperature):
170
- yield f"{main_content}\n\n{sources}"
171
- else:
172
- for partial_response in get_response_from_pdf(message, model, num_calls=num_calls, temperature=temperature):
173
- yield partial_response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  def get_response_with_search(query, model, num_calls=3, temperature=0.2):
176
  search_results = duckduckgo_search(query)
@@ -181,21 +322,27 @@ def get_response_with_search(query, model, num_calls=3, temperature=0.2):
181
  {context}
182
  Write a detailed and complete research document that fulfills the following user request: '{query}'
183
  After writing the document, please provide a list of sources used in your response."""
184
-
185
- client = InferenceClient(model, token=huggingface_token)
186
-
187
- main_content = ""
188
- for i in range(num_calls):
189
- for message in client.chat_completion(
190
- messages=[{"role": "user", "content": prompt}],
191
- max_tokens=1000,
192
- temperature=temperature,
193
- stream=True,
194
- ):
195
- if message.choices and message.choices[0].delta and message.choices[0].delta.content:
196
- chunk = message.choices[0].delta.content
197
- main_content += chunk
198
- yield main_content, "" # Yield partial main content without sources
 
 
 
 
 
 
199
 
200
  def get_response_from_pdf(query, model, num_calls=3, temperature=0.2):
201
  embed = get_embeddings()
@@ -209,24 +356,30 @@ def get_response_from_pdf(query, model, num_calls=3, temperature=0.2):
209
  relevant_docs = retriever.get_relevant_documents(query)
210
  context_str = "\n".join([doc.page_content for doc in relevant_docs])
211
 
212
- prompt = f"""Using the following context from the PDF documents:
 
 
 
 
 
 
213
  {context_str}
214
  Write a detailed and complete response that answers the following user question: '{query}'"""
215
-
216
- client = InferenceClient(model, token=huggingface_token)
217
-
218
- response = ""
219
- for i in range(num_calls):
220
- for message in client.chat_completion(
221
- messages=[{"role": "user", "content": prompt}],
222
- max_tokens=1000,
223
- temperature=temperature,
224
- stream=True,
225
- ):
226
- if message.choices and message.choices[0].delta and message.choices[0].delta.content:
227
- chunk = message.choices[0].delta.content
228
- response += chunk
229
- yield response # Yield partial response
230
 
231
  def vote(data: gr.LikeData):
232
  if data.liked:
@@ -299,7 +452,7 @@ with demo:
299
  1. Upload PDF documents using the file input at the top.
300
  2. Select the PDF parser (pypdf or llamaparse) and click "Upload Document" to update the vector store.
301
  3. Ask questions in the chat interface.
302
- 4. Toggle "Use Web Search" to switch between PDF chat and web search, the toggle box is present inside additional inputs dropdown.
303
  5. Adjust Temperature and Number of API Calls to fine-tune the response generation.
304
  6. Use the provided examples or ask your own questions.
305
  """
 
14
  from langchain_core.documents import Document
15
  from huggingface_hub import InferenceClient
16
  import inspect
17
+ import logging
18
+
19
+
# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Environment variables and configurations
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
llama_cloud_api_key = os.environ.get("LLAMA_CLOUD_API_KEY")
# "CLOUDFARE_ACCOUNT_ID" is the original (misspelled) variable name; it is kept
# as a fallback so existing deployments keep working.
ACCOUNT_ID = os.environ.get("CLOUDFLARE_ACCOUNT_ID") or os.environ.get("CLOUDFARE_ACCOUNT_ID")
API_TOKEN = os.environ.get("CLOUDFLARE_AUTH_TOKEN")
# NOTE(review): this URL hard-codes an account id instead of using ACCOUNT_ID;
# confirm the two always refer to the same account.
API_BASE_URL = "https://api.cloudflare.com/client/v4/accounts/a17f03e0f049ccae0c15cdcf3b9737ce/ai/run/"

print(f"ACCOUNT_ID: {ACCOUNT_ID}")
# Report only whether the token is configured; never echo any part of the secret.
print(f"CLOUDFLARE_AUTH_TOKEN: {'set' if API_TOKEN else 'Not set'}")

# Model identifiers offered to the user. The "@cf/..." entry is served through
# the Cloudflare API; the others go through the Hugging Face InferenceClient.
MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.3",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "@cf/meta/llama-3.1-8b-instruct",
]
38
 
39
  # Initialize LlamaParse
 
90
 
91
  def generate_chunked_response(prompt, model, max_tokens=1000, num_calls=3, temperature=0.2, should_stop=False):
92
  print(f"Starting generate_chunked_response with {num_calls} calls")
 
93
  full_response = ""
94
  messages = [{"role": "user", "content": prompt}]
95
 
96
+ if model == "@cf/meta/llama-3.1-8b-instruct":
97
+ # Cloudflare API
98
+ for i in range(num_calls):
99
+ print(f"Starting Cloudflare API call {i+1}")
100
+ if should_stop:
101
+ print("Stop clicked, breaking loop")
102
+ break
103
+ try:
104
+ response = requests.post(
105
+ f"https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai/run/@cf/meta/llama-3.1-8b-instruct",
106
+ headers={"Authorization": f"Bearer {API_TOKEN}"},
107
+ json={
108
+ "stream": true,
109
+ "messages": [
110
+ {"role": "system", "content": "You are a friendly assistant"},
111
+ {"role": "user", "content": prompt}
112
+ ],
113
+ "max_tokens": max_tokens,
114
+ "temperature": temperature
115
+ },
116
+ stream=true
117
+ )
118
+
119
+ for line in response.iter_lines():
120
+ if should_stop:
121
+ print("Stop clicked during streaming, breaking")
122
+ break
123
+ if line:
124
+ try:
125
+ json_data = json.loads(line.decode('utf-8').split('data: ')[1])
126
+ chunk = json_data['response']
127
+ full_response += chunk
128
+ except json.JSONDecodeError:
129
+ continue
130
+ print(f"Cloudflare API call {i+1} completed")
131
+ except Exception as e:
132
+ print(f"Error in generating response from Cloudflare: {str(e)}")
133
+ else:
134
+ # Original Hugging Face API logic
135
+ client = InferenceClient(model, token=huggingface_token)
136
+
137
+ for i in range(num_calls):
138
+ print(f"Starting Hugging Face API call {i+1}")
139
+ if should_stop:
140
+ print("Stop clicked, breaking loop")
141
+ break
142
+ try:
143
+ for message in client.chat_completion(
144
+ messages=messages,
145
+ max_tokens=max_tokens,
146
+ temperature=temperature,
147
+ stream=True,
148
+ ):
149
+ if should_stop:
150
+ print("Stop clicked during streaming, breaking")
151
+ break
152
+ if message.choices and message.choices[0].delta and message.choices[0].delta.content:
153
+ chunk = message.choices[0].delta.content
154
+ full_response += chunk
155
+ print(f"Hugging Face API call {i+1} completed")
156
+ except Exception as e:
157
+ print(f"Error in generating response from Hugging Face: {str(e)}")
158
 
159
  # Clean up the response
160
  clean_response = re.sub(r'<s>\[INST\].*?\[/INST\]\s*', '', full_response, flags=re.DOTALL)
 
195
  history = history + [(message, "")]
196
 
197
  try:
198
+ for response in respond(message, history, model, temperature, num_calls, use_web_search):
199
+ history[-1] = (message, response)
200
+ yield history
 
 
 
 
 
201
  except gr.CancelledError:
202
  yield history
203
+ except Exception as e:
204
+ logging.error(f"Unexpected error in chatbot_interface: {str(e)}")
205
+ history[-1] = (message, f"An unexpected error occurred: {str(e)}")
206
+ yield history
207
 
208
  def retry_last_response(history, use_web_search, model, temperature, num_calls):
209
  if not history:
 
215
  return chatbot_interface(last_user_msg, history, use_web_search, model, temperature, num_calls)
216
 
def _log_first_line_once(partials):
    """Yield each partial response unchanged and log its first line once.

    Every partial is the cumulative text so far, so logging on each chunk
    would emit the same first line once per streamed token. The log entry is
    written a single time when the stream ends (normally, on error, or when
    the consumer closes the generator).
    """
    latest = None
    try:
        for partial in partials:
            latest = partial
            yield partial
    finally:
        if latest is not None:
            first_line = latest.split('\n')[0] if latest else ''
            logging.info("Generated Response (first line): %s", first_line)


def respond(message, history, model, temperature, num_calls, use_web_search):
    """Stream the answer to ``message`` as progressively longer strings.

    Args:
        message: The user's query.
        history: Chat history; only forwarded when retrying with the
            fallback model.
        model: Model identifier. "@cf/meta/llama-3.1-8b-instruct" is answered
            via ``get_response_from_cloudflare`` in PDF mode; other models go
            through ``get_response_from_pdf``.
        temperature: Sampling temperature forwarded to the backend helper.
        num_calls: Number of API calls forwarded to the backend helper.
        use_web_search: True to answer from web search results, False to
            answer from the local "faiss_database" store.

    Yields:
        The partial response text. Errors are not raised: a Phi-3 failure is
        retried with the Mistral 7B model, and any other failure yields a
        single human-readable error message.
    """
    logging.info("User Query: %s", message)
    logging.info("Model Used: %s", model)
    logging.info("Search Type: %s", 'Web Search' if use_web_search else 'PDF Search')

    try:
        if use_web_search:
            # The search helper yields (main_content, sources) pairs; join
            # them into the single string the chat UI displays.
            partials = (
                f"{main_content}\n\n{sources}"
                for main_content, sources in get_response_with_search(
                    message, model, num_calls=num_calls, temperature=temperature
                )
            )
        elif model == "@cf/meta/llama-3.1-8b-instruct":
            # Use Cloudflare API with context retrieved from the vector store
            embed = get_embeddings()
            if os.path.exists("faiss_database"):
                database = FAISS.load_local("faiss_database", embed, allow_dangerous_deserialization=True)
                retriever = database.as_retriever()
                relevant_docs = retriever.get_relevant_documents(message)
                context_str = "\n".join([doc.page_content for doc in relevant_docs])
            else:
                context_str = "No documents available."

            partials = get_response_from_cloudflare(
                prompt="",
                context=context_str,
                query=message,
                num_calls=num_calls,
                temperature=temperature,
                search_type="pdf",
            )
        else:
            # Use Hugging Face API
            partials = get_response_from_pdf(message, model, num_calls=num_calls, temperature=temperature)

        yield from _log_first_line_once(partials)
    except Exception as e:
        logging.error(f"Error with {model}: {str(e)}")
        if "microsoft/Phi-3-mini-4k-instruct" in model:
            logging.info("Falling back to Mistral model due to Phi-3 error")
            fallback_model = "mistralai/Mistral-7B-Instruct-v0.3"
            yield from respond(message, history, fallback_model, temperature, num_calls, use_web_search)
        else:
            yield f"An error occurred with the {model} model: {str(e)}. Please try again or select a different model."
259
+
260
+ logging.basicConfig(level=logging.DEBUG)
261
+
def _parse_cloudflare_sse_line(raw_line):
    """Return the text chunk carried by one streamed line, or None.

    Only lines of the form ``data: {json}`` whose JSON object has a string
    "response" field produce a chunk. Only the leading ``data:`` prefix is
    removed, so generated text that itself contains "data: " is not
    truncated. The terminating ``data: [DONE]`` event is skipped silently.
    """
    text = raw_line.decode('utf-8').strip()
    if not text.startswith("data:"):
        return None
    data = text[len("data:"):].strip()
    if not data or data == "[DONE]":
        return None
    try:
        event = json.loads(data)
    except json.JSONDecodeError as e:
        logging.error(f"Error parsing streaming response: {str(e)}")
        return None
    if not isinstance(event, dict):
        return None
    chunk = event.get('response')
    return chunk if isinstance(chunk, str) and chunk else None


def get_response_from_cloudflare(prompt, context, query, num_calls=3, temperature=0.2, search_type="pdf"):
    """Stream a response from the Cloudflare-hosted Llama 3.1 8B model.

    Args:
        prompt: Unused; kept so existing callers (which pass ``prompt=""``)
            continue to work.
        context: Retrieved text placed in the system instruction.
        query: The user's request, sent as the user message and quoted in the
            system instruction.
        num_calls: How many times the same request is sent; the output of
            every call is appended to the same running response.
        temperature: Sampling temperature sent in the request payload.
        search_type: "pdf" for the plain instruction; any other value adds a
            request to list the sources used.

    Yields:
        The cumulative response text after each received chunk. If a call
        fails before any text has been produced, an apology message is
        yielded instead; once text exists, failures are only logged so the
        text already streamed is not replaced by an error message. If no
        text was produced at all, a final generic apology is yielded.
    """
    headers = {
        "Authorization": f"Bearer {API_TOKEN}",
        "Content-Type": "application/json"
    }
    model = "@cf/meta/llama-3.1-8b-instruct"
    # (connect, read) timeouts in seconds so a stalled connection cannot hang
    # the worker indefinitely.
    request_timeout = (10, 120)

    instruction = f"""Using the following context:
{context}
Write a detailed and complete research document that fulfills the following user request: '{query}'"""
    if search_type != "pdf":  # web search
        instruction += "\nAfter writing the document, please provide a list of sources used in your response."

    payload = {
        "messages": [
            {"role": "system", "content": instruction},
            {"role": "user", "content": query}
        ],
        "stream": True,
        "temperature": temperature
    }

    full_response = ""
    for _ in range(num_calls):
        try:
            with requests.post(
                f"{API_BASE_URL}{model}",
                headers=headers,
                json=payload,
                stream=True,
                timeout=request_timeout,
            ) as response:
                if response.status_code != 200:
                    logging.error(f"HTTP Error: {response.status_code}, Response: {response.text}")
                    if not full_response:
                        yield f"I apologize, but I encountered an HTTP error: {response.status_code}. Please try again later."
                    continue

                for line in response.iter_lines():
                    if not line:
                        continue
                    chunk = _parse_cloudflare_sse_line(line)
                    if chunk:
                        full_response += chunk
                        yield full_response
        except Exception as e:
            logging.error(f"Error in generating response from Cloudflare: {str(e)}")
            if not full_response:
                yield f"I apologize, but an error occurred: {str(e)}. Please try again later."

    if not full_response:
        yield "I apologize, but I couldn't generate a response at this time. Please try again later."
315
 
316
  def get_response_with_search(query, model, num_calls=3, temperature=0.2):
317
  search_results = duckduckgo_search(query)
 
322
  {context}
323
  Write a detailed and complete research document that fulfills the following user request: '{query}'
324
  After writing the document, please provide a list of sources used in your response."""
325
+
326
+ if model == "@cf/meta/llama-3.1-8b-instruct":
327
+ # Use Cloudflare API
328
+ for response in get_response_from_cloudflare(prompt="", context=context, query=query, num_calls=num_calls, temperature=temperature, search_type="web"):
329
+ yield response, "" # Yield streaming response without sources
330
+ else:
331
+ # Use Hugging Face API
332
+ client = InferenceClient(model, token=huggingface_token)
333
+
334
+ main_content = ""
335
+ for i in range(num_calls):
336
+ for message in client.chat_completion(
337
+ messages=[{"role": "user", "content": prompt}],
338
+ max_tokens=1000,
339
+ temperature=temperature,
340
+ stream=True,
341
+ ):
342
+ if message.choices and message.choices[0].delta and message.choices[0].delta.content:
343
+ chunk = message.choices[0].delta.content
344
+ main_content += chunk
345
+ yield main_content, "" # Yield partial main content without sources
346
 
347
  def get_response_from_pdf(query, model, num_calls=3, temperature=0.2):
348
  embed = get_embeddings()
 
356
  relevant_docs = retriever.get_relevant_documents(query)
357
  context_str = "\n".join([doc.page_content for doc in relevant_docs])
358
 
359
+ if model == "@cf/meta/llama-3.1-8b-instruct":
360
+ # Use Cloudflare API with the retrieved context
361
+ for response in get_response_from_cloudflare(prompt="", context=context_str, query=query, num_calls=num_calls, temperature=temperature, search_type="pdf"):
362
+ yield response
363
+ else:
364
+ # Use Hugging Face API
365
+ prompt = f"""Using the following context from the PDF documents:
366
  {context_str}
367
  Write a detailed and complete response that answers the following user question: '{query}'"""
368
+
369
+ client = InferenceClient(model, token=huggingface_token)
370
+
371
+ response = ""
372
+ for i in range(num_calls):
373
+ for message in client.chat_completion(
374
+ messages=[{"role": "user", "content": prompt}],
375
+ max_tokens=1000,
376
+ temperature=temperature,
377
+ stream=True,
378
+ ):
379
+ if message.choices and message.choices[0].delta and message.choices[0].delta.content:
380
+ chunk = message.choices[0].delta.content
381
+ response += chunk
382
+ yield response # Yield partial response
383
 
384
  def vote(data: gr.LikeData):
385
  if data.liked:
 
452
  1. Upload PDF documents using the file input at the top.
453
  2. Select the PDF parser (pypdf or llamaparse) and click "Upload Document" to update the vector store.
454
  3. Ask questions in the chat interface.
455
+ 4. Toggle "Use Web Search" to switch between PDF chat and web search.
456
  5. Adjust Temperature and Number of API Calls to fine-tune the response generation.
457
  6. Use the provided examples or ask your own questions.
458
  """