ppsingh committed on
Commit
a23643d
·
1 Parent(s): 5cd7986

adding comments

Browse files
app.py CHANGED
@@ -1,9 +1,7 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import logging
4
- import numpy as np
5
  import os
6
- import time
7
  import re
8
  import json
9
  from uuid import uuid4
@@ -11,15 +9,11 @@ from datetime import datetime
11
  from pathlib import Path
12
  from huggingface_hub import CommitScheduler
13
  from auditqa.sample_questions import QUESTIONS
14
- from auditqa.engine.prompts import audience_prompts
15
  from auditqa.reports import files, report_list
16
- from auditqa.doc_process import process_pdf, get_local_qdrant
17
  from langchain.schema import (
18
  HumanMessage,
19
  SystemMessage,
20
  )
21
- from langchain_core.output_parsers import StrOutputParser
22
- from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
23
  from langchain_community.llms import HuggingFaceEndpoint
24
  from auditqa.process_chunks import load_chunks, getconfig
25
  from langchain_community.chat_models.huggingface import ChatHuggingFace
@@ -27,15 +21,19 @@ from langchain.retrievers import ContextualCompressionRetriever
27
  from langchain.retrievers.document_compressors import CrossEncoderReranker
28
  from langchain_community.cross_encoders import HuggingFaceCrossEncoder
29
  from qdrant_client.http import models as rest
30
- #from qdrant_client import QdrantClient
31
  from dotenv import load_dotenv
32
- import pkg_resources
33
  load_dotenv()
 
 
34
  HF_token = os.environ["HF_TOKEN"]
 
 
35
  JSON_DATASET_DIR = Path("json_dataset")
36
  JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
37
  JSON_DATASET_PATH = JSON_DATASET_DIR / f"logs-{uuid4()}.json"
38
 
 
 
39
  scheduler = CommitScheduler(
40
  repo_id="GIZ/spaces_logs",
41
  repo_type="dataset",
@@ -44,38 +42,36 @@ scheduler = CommitScheduler(
44
  )
45
 
46
  model_config = getconfig("model_params.cfg")
47
- #installed_packages = pkg_resources.working_set
48
- #package_list_ = ""
49
- #for package in installed_packages:
50
- # package_list_ = package_list_ + f"{package.key}=={package.version}\n"
51
- #print(package_list_)
52
 
53
 
54
- ######## Vector Store #######
55
- # process all files and get the vectorstores collections
56
- # vectorestore colection are stored on persistent storage so this needs to be run only once
57
- # hence, comment out line below when creating for first time
58
  vectorstores = load_chunks()
59
- # once the vectore embeddings are created we will qdrant client to access these
60
- #vectorstores = get_local_qdrant()
61
 
62
- # -------------------------------------------------------------
63
- # Functions
64
- # -------------------------------------------------------------
 
 
65
 
66
  def save_logs(logs) -> None:
 
 
 
67
  with scheduler.lock:
68
  with JSON_DATASET_PATH.open("a") as f:
69
  json.dump(logs, f)
70
  f.write("\n")
71
  logging.info("logging done")
72
-
 
73
  def make_html_source(source,i):
74
  """
75
  takes the text and converts it into html format for display in "source" side tab
76
  """
77
  meta = source.metadata
78
- # content = source.page_content.split(":",1)[1].strip()
79
  content = source.page_content.strip()
80
 
81
  name = meta['filename']
@@ -120,8 +116,9 @@ def finish_chat():
120
  return (gr.update(interactive = True,value = ""))
121
 
122
  async def chat(query,history,sources,reports,subtype,year):
123
- """taking a query and a message history, use a pipeline (reformulation, retriever, answering) to yield a tuple of:
124
- (messages in gradio format, messages in langchain format, source documents)"""
 
125
 
126
  logging.info(f">> NEW QUESTION : {query}")
127
  logging.info(f"history:{history}")
@@ -133,13 +130,9 @@ async def chat(query,history,sources,reports,subtype,year):
133
  docs_html = ""
134
  output_query = ""
135
 
136
- ##------------------------decide which collection to fetch------------------------------
137
- if len(reports) == 0:
138
- vectorstore = vectorstores["allreports"]
139
- else:
140
- vectorstore = vectorstores["allreports"]
141
-
142
- ###-------------------------------------Construct Filter------------------------------------
143
  if len(reports) == 0:
144
  ("defining filter for:{}:{}:{}".format(sources,subtype,year))
145
  filter=rest.Filter(
@@ -165,12 +158,18 @@ async def chat(query,history,sources,reports,subtype,year):
165
  )])
166
 
167
 
168
- ##------------------------------get context----------------------------------------------------
169
  context_retrieved_lst = []
170
  question_lst= [query]
 
171
  for question in question_lst:
 
 
 
172
  retriever = vectorstore.as_retriever(
173
- search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.6, "k": int(model_config.get('retriever','TOP_K')), "filter":filter})
 
 
174
  model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
175
  compressor = CrossEncoderReranker(model=model, top_n=3)
176
  compression_retriever = ContextualCompressionRetriever(
@@ -187,7 +186,7 @@ async def chat(query,history,sources,reports,subtype,year):
187
  context_retrieved_formatted = format_docs(context_retrieved)
188
  context_retrieved_lst.append(context_retrieved_formatted)
189
 
190
- ##-------------------Prompt---------------------------------------------------------------
191
  SYSTEM_PROMPT = """
192
  You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages/context provided and the guidelines.
193
  Guidelines:
@@ -209,15 +208,13 @@ async def chat(query,history,sources,reports,subtype,year):
209
  """.format(context = context_retrieved_lst, question=query)
210
 
211
  messages = [
212
- SystemMessage(content=SYSTEM_PROMPT),
213
- HumanMessage(
214
- content=USER_PROMPT
215
- ),]
216
 
217
- ###-----------------getting inference endpoints------------------------------
218
 
219
- # llama-3_1 endpoint = https://howaqfw0lpap12sg.us-east-1.aws.endpoints.huggingface.cloud
220
- # llama-3 endpoint = https://nhe9phsr2zhs0e36.eu-west-1.aws.endpoints.huggingface.cloud
221
  #callbacks = [StreamingStdOutCallbackHandler()]
222
  llm_qa = HuggingFaceEndpoint(
223
  endpoint_url= model_config.get('reader','ENDPOINT'),
@@ -226,10 +223,10 @@ async def chat(query,history,sources,reports,subtype,year):
226
  timeout=70,
227
  huggingfacehub_api_token=HF_token,)
228
 
229
- # create rag chain
230
  chat_model = ChatHuggingFace(llm=llm_qa)
231
 
232
- ###-------------------------- get answers ---------------------------------------
233
  answer_lst = []
234
  for question, context in zip(question_lst , context_retrieved_lst):
235
  answer = chat_model.invoke(messages)
@@ -249,10 +246,9 @@ async def chat(query,history,sources,reports,subtype,year):
249
 
250
  yield history,docs_html
251
 
252
-
253
  try:
254
  timestamp = str(datetime.now().timestamp())
255
- #file_store = "/data/logs/" + timestamp + ".json"
256
  logs = {
257
  "system_prompt": SYSTEM_PROMPT,
258
  "sources":sources,
@@ -271,12 +267,10 @@ async def chat(query,history,sources,reports,subtype,year):
271
  except Exception as e:
272
  logging.error(e)
273
 
274
- #process_pdf()
275
 
276
 
277
- # --------------------------------------------------------------------
278
- # Gradio
279
- # --------------------------------------------------------------------
280
 
281
  # Set up Gradio Theme
282
  theme = gr.themes.Base(
@@ -323,13 +317,13 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
323
  with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
324
  # creating tabs on right panel
325
  with gr.Tabs() as tabs:
326
- ################## tab for REPORTS SELECTION ##########
327
 
328
  with gr.Tab("Reports",elem_id = "tab-config",id = 2):
329
  gr.Markdown("Reminder: To get better results select the specific report/reports")
330
 
331
 
332
- #### First level filter for selecting Report source/category
333
  dropdown_sources = gr.Radio(
334
  ["Consolidated", "District","Ministry"],
335
  label="Select Report Category",
@@ -337,19 +331,19 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
337
  interactive=True,
338
  )
339
 
340
- #### second level filter for selecting subtype within the report category selected above
341
  dropdown_category = gr.Dropdown(
342
  list(files["Consolidated"].keys()),
343
  value = list(files["Consolidated"].keys())[0],
344
  label = "Filter for Sub-Type",
345
  interactive=True)
346
 
347
- #### update the secodn level filter abse don values from first level
348
  def rs_change(rs):
349
  return gr.update(choices=files[rs], value=list(files[rs].keys())[0])
350
  dropdown_sources.change(fn=rs_change, inputs=[dropdown_sources], outputs=[dropdown_category])
351
 
352
- #### Select the years for reports
353
  dropdown_year = gr.Dropdown(
354
  ['2018','2019','2020','2021','2022'],
355
  label="Filter for year",
@@ -358,7 +352,7 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
358
  interactive=True,
359
  )
360
  gr.Markdown("-------------------------------------------------------------------------")
361
- ##### Another way to select reports across category and sub-type
362
  dropdown_reports = gr.Dropdown(
363
  report_list,
364
  label="Or select specific reports",
@@ -396,7 +390,7 @@ with gr.Blocks(title="Audit Q&A", css= "style.css", theme=theme,elem_id = "main-
396
  )
397
 
398
  samples.append(group_examples)
399
- ########## tab for Sources reporting #################
400
  with gr.Tab("Sources",elem_id = "tab-citations",id = 1):
401
  sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
402
  docs_textbox = gr.State("")
 
1
  import gradio as gr
2
  import pandas as pd
3
  import logging
 
4
  import os
 
5
  import re
6
  import json
7
  from uuid import uuid4
 
9
  from pathlib import Path
10
  from huggingface_hub import CommitScheduler
11
  from auditqa.sample_questions import QUESTIONS
 
12
  from auditqa.reports import files, report_list
 
13
  from langchain.schema import (
14
  HumanMessage,
15
  SystemMessage,
16
  )
 
 
17
  from langchain_community.llms import HuggingFaceEndpoint
18
  from auditqa.process_chunks import load_chunks, getconfig
19
  from langchain_community.chat_models.huggingface import ChatHuggingFace
 
21
  from langchain.retrievers.document_compressors import CrossEncoderReranker
22
  from langchain_community.cross_encoders import HuggingFaceCrossEncoder
23
  from qdrant_client.http import models as rest
 
24
  from dotenv import load_dotenv
 
25
  load_dotenv()
26
+ # token to allow access to the Hub. This token should also be
27
+ # valid for calls made to Inference endpoints
28
  HF_token = os.environ["HF_TOKEN"]
29
+
30
+ # create the local logs repo
31
  JSON_DATASET_DIR = Path("json_dataset")
32
  JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
33
  JSON_DATASET_PATH = JSON_DATASET_DIR / f"logs-{uuid4()}.json"
34
 
35
+ # the logs are written to dataset repo
36
+ # https://huggingface.co/spaces/Wauplin/space_to_dataset_saver
37
  scheduler = CommitScheduler(
38
  repo_id="GIZ/spaces_logs",
39
  repo_type="dataset",
 
42
  )
43
 
44
  model_config = getconfig("model_params.cfg")
 
 
 
 
 
45
 
46
 
47
+
48
+ #### VECTOR STORE ####
49
+ # reports contain the already created chunks from Markdown version of pdf reports
50
+ # document processing was done using : https://github.com/axa-group/Parsr
51
  vectorstores = load_chunks()
 
 
52
 
53
+
54
+ #### FUNCTIONS ####
55
+ # App UI and its functionality are inspired by and adapted from
56
+ # https://huggingface.co/spaces/Ekimetrics/climate-question-answering
57
+
58
 
59
  def save_logs(logs) -> None:
60
+ """ Every interaction with the app saves a log of the question and answer;
61
+ this is used to get usage statistics of the app and to evaluate model performance
62
+ """
63
  with scheduler.lock:
64
  with JSON_DATASET_PATH.open("a") as f:
65
  json.dump(logs, f)
66
  f.write("\n")
67
  logging.info("logging done")
68
+
69
+
70
  def make_html_source(source,i):
71
  """
72
  takes the text and converts it into html format for display in "source" side tab
73
  """
74
  meta = source.metadata
 
75
  content = source.page_content.strip()
76
 
77
  name = meta['filename']
 
116
  return (gr.update(interactive = True,value = ""))
117
 
118
  async def chat(query,history,sources,reports,subtype,year):
119
+ """taking a query and a message history, use a pipeline (reformulation, retriever, answering)
120
+ to yield a tuple of:(messages in gradio format/messages in langchain format, source documents)
121
+ """
122
 
123
  logging.info(f">> NEW QUESTION : {query}")
124
  logging.info(f"history:{history}")
 
130
  docs_html = ""
131
  output_query = ""
132
 
133
+ ##------------------------fetch collection from vectorstore------------------------------
134
+ vectorstore = vectorstores["allreports"]
135
+ ##---------------------construct filter for metadata filtering---------------------------
 
 
 
 
136
  if len(reports) == 0:
137
  ("defining filter for:{}:{}:{}".format(sources,subtype,year))
138
  filter=rest.Filter(
 
158
  )])
159
 
160
 
161
+ ##------------------------------get context----------------------------------------------
162
  context_retrieved_lst = []
163
  question_lst= [query]
164
+
165
  for question in question_lst:
166
+ # the similarity score threshold can be tuned to trade off quality against quantity of the Retriever's results;
167
+ # it needs balancing, however, because the retrieved results are passed on to the Ranker, which picks the best among
168
+ # the retrieved results
169
  retriever = vectorstore.as_retriever(
170
+ search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.6,
171
+ "k": int(model_config.get('retriever','TOP_K')),
172
+ "filter":filter})
173
  model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
174
  compressor = CrossEncoderReranker(model=model, top_n=3)
175
  compression_retriever = ContextualCompressionRetriever(
 
186
  context_retrieved_formatted = format_docs(context_retrieved)
187
  context_retrieved_lst.append(context_retrieved_formatted)
188
 
189
+ ##--------------------------------Prompt--------------------------------------------------
190
  SYSTEM_PROMPT = """
191
  You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages/context provided and the guidelines.
192
  Guidelines:
 
208
  """.format(context = context_retrieved_lst, question=query)
209
 
210
  messages = [
211
+ SystemMessage(content=SYSTEM_PROMPT),
212
+ HumanMessage(
213
+ content=USER_PROMPT
214
+ ),]
215
 
216
+ ##-----------------------getting inference endpoints------------------------------
217
 
 
 
218
  #callbacks = [StreamingStdOutCallbackHandler()]
219
  llm_qa = HuggingFaceEndpoint(
220
  endpoint_url= model_config.get('reader','ENDPOINT'),
 
223
  timeout=70,
224
  huggingfacehub_api_token=HF_token,)
225
 
226
+ # wrap the endpoint LLM in a chat model used to generate the RAG answer
227
  chat_model = ChatHuggingFace(llm=llm_qa)
228
 
229
+ ##-------------------------- get answers ---------------------------------------
230
  answer_lst = []
231
  for question, context in zip(question_lst , context_retrieved_lst):
232
  answer = chat_model.invoke(messages)
 
246
 
247
  yield history,docs_html
248
 
249
+ # logging the event
250
  try:
251
  timestamp = str(datetime.now().timestamp())
 
252
  logs = {
253
  "system_prompt": SYSTEM_PROMPT,
254
  "sources":sources,
 
267
  except Exception as e:
268
  logging.error(e)
269
 
 
270
 
271
 
272
+
273
+ #### Gradio App ####
 
274
 
275
  # Set up Gradio Theme
276
  theme = gr.themes.Base(
 
317
  with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
318
  # creating tabs on right panel
319
  with gr.Tabs() as tabs:
320
+ #---------------- tab for REPORTS SELECTION ----------------------
321
 
322
  with gr.Tab("Reports",elem_id = "tab-config",id = 2):
323
  gr.Markdown("Reminder: To get better results select the specific report/reports")
324
 
325
 
326
+ #----- First level filter for selecting Report source/category ----------
327
  dropdown_sources = gr.Radio(
328
  ["Consolidated", "District","Ministry"],
329
  label="Select Report Category",
 
331
  interactive=True,
332
  )
333
 
334
+ #------ second level filter for selecting subtype within the report category selected above
335
  dropdown_category = gr.Dropdown(
336
  list(files["Consolidated"].keys()),
337
  value = list(files["Consolidated"].keys())[0],
338
  label = "Filter for Sub-Type",
339
  interactive=True)
340
 
341
+ #----------- update the second level filter based on values from first level ----------------
342
  def rs_change(rs):
343
  return gr.update(choices=files[rs], value=list(files[rs].keys())[0])
344
  dropdown_sources.change(fn=rs_change, inputs=[dropdown_sources], outputs=[dropdown_category])
345
 
346
+ #--------- Select the years for reports -------------------------------------
347
  dropdown_year = gr.Dropdown(
348
  ['2018','2019','2020','2021','2022'],
349
  label="Filter for year",
 
352
  interactive=True,
353
  )
354
  gr.Markdown("-------------------------------------------------------------------------")
355
+ #---------------- Another way to select reports across category and sub-type ------------
356
  dropdown_reports = gr.Dropdown(
357
  report_list,
358
  label="Or select specific reports",
 
390
  )
391
 
392
  samples.append(group_examples)
393
+ ##------------------- tab for Sources reporting ##------------------
394
  with gr.Tab("Sources",elem_id = "tab-citations",id = 1):
395
  sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
396
  docs_textbox = gr.State("")
auditqa/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (127 Bytes). View file
 
auditqa/__pycache__/doc_process.cpython-310.pyc ADDED
Binary file (3.22 kB). View file
 
auditqa/__pycache__/process_chunks.cpython-310.pyc ADDED
Binary file (3.29 kB). View file
 
auditqa/__pycache__/reports.cpython-310.pyc ADDED
Binary file (1.68 kB). View file
 
auditqa/__pycache__/sample_questions.cpython-310.pyc ADDED
Binary file (3.65 kB). View file
 
auditqa/doc_process.py CHANGED
@@ -10,6 +10,9 @@ from qdrant_client import QdrantClient
10
  from auditqa.reports import files, report_list
11
  device = 'cuda' if cuda.is_available() else 'cpu'
12
 
 
 
 
13
  # path to the pdf files
14
  path_to_data = "./data/pdf/"
15
 
 
10
  from auditqa.reports import files, report_list
11
  device = 'cuda' if cuda.is_available() else 'cpu'
12
 
13
+ ### This script is NO LONGER IN USE #####
14
+ # Preprocessed report pdf is brought along with chunks and added to existing reports database
15
+
16
  # path to the pdf files
17
  path_to_data = "./data/pdf/"
18
 
auditqa/engine/prompts.py DELETED
@@ -1,68 +0,0 @@
1
- llama_propmt = """<|begin_of_text|>
2
- <|start_header_id|>system<|end_header_id|>
3
- You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
4
- Guidelines:
5
- - If the passages have useful facts or numbers, use them in your answer.
6
- - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
7
- - Do not use the sentence 'Doc i says ...' to say where information came from.
8
- - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
9
- - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
10
- - If it makes sense, use bullet points and lists to make your answers easier to understand.
11
- - You do not need to use every passage. Only use the ones that help answer the question.
12
- - If the documents do not have the information needed to answer the question, just say you do not have enough information.
13
- <|eot_id|>
14
- <|start_header_id|>user<|end_header_id|>
15
- Passages:
16
- {context}
17
- -----------------------
18
- Question: {question} - Explained to {audience}
19
- Answer in {language} with the passages citations:
20
- <|eot_id|>
21
- <|start_header_id|>assistant<|end_header_id|>
22
- """
23
- system_propmt = """
24
- You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
25
- Guidelines:
26
- - If the passages have useful facts or numbers, use them in your answer.
27
- - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
28
- - Do not use the sentence 'Doc i says ...' to say where information came from.
29
- - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
30
- - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
31
- - If it makes sense, use bullet points and lists to make your answers easier to understand.
32
- - You do not need to use every passage. Only use the ones that help answer the question.
33
- - If the documents do not have the information needed to answer the question, just say you do not have enough information.
34
- """
35
- user_propmt = """
36
- Passages:
37
- {context}
38
- -----------------------
39
- Question: {question} - Explained to {audience}
40
- Answer in {language} with the passages citations:
41
- """
42
-
43
- answer_prompt_template = """
44
- You are AuditQ&A, an AI Assistant created by Auditors and Data Scientist. You are given a question and extracted passages of the consolidated/departmental/thematic focus audit reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
45
- Guidelines:
46
- - If the passages have useful facts or numbers, use them in your answer.
47
- - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
48
- - Do not use the sentence 'Doc i says ...' to say where information came from.
49
- - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
50
- - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
51
- - If it makes sense, use bullet points and lists to make your answers easier to understand.
52
- - You do not need to use every passage. Only use the ones that help answer the question.
53
- - If the documents do not have the information needed to answer the question, just say you do not have enough information.
54
- - Consider by default that the question is about the past century unless it is specified otherwise.
55
- - If the passage is the caption of a picture, you can still use it as part of your answer as any other document.
56
- -----------------------
57
- Passages:
58
- {context}
59
- -----------------------
60
- Question: {question} - Explained to {audience}
61
- Answer in {language} with the passages citations:
62
- """
63
-
64
- audience_prompts = {
65
- "children": "6 year old children that don't know anything about audit and governance and need metaphors to learn",
66
- "general": "the general public who know the basics in audit and governance and want to learn more about it without technical terms. Still use references to passages.",
67
- "experts": "expert and climate scientists that are not afraid of technical terms",
68
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
auditqa/reports.py CHANGED
@@ -1,9 +1,3 @@
1
- POSSIBLE_REPORTS = [
2
- "Consolidated2021",
3
- "MWTS2021",
4
- "MWTS2022"
5
- ]
6
-
7
  report_list = ['Annual Consolidated OAG audit reports 2018',
8
  'Annual Consolidated OAG audit reports 2019',
9
  'Annual Consolidated OAG audit reports 2020',
 
 
 
 
 
 
 
1
  report_list = ['Annual Consolidated OAG audit reports 2018',
2
  'Annual Consolidated OAG audit reports 2019',
3
  'Annual Consolidated OAG audit reports 2020',
requirements.txt CHANGED
@@ -1,6 +1,5 @@
1
  langchain~=0.1.0
2
  langchain-huggingface==0.0.3
3
- #langchainhub~=0.1.14
4
  python-dotenv
5
  transformers>=4.35.2
6
  huggingface_hub==0.23.5
@@ -8,4 +7,4 @@ sentence_transformers~=3.0.1
8
  langchain-qdrant==0.1.3
9
  qdrant-client~=1.10.1
10
  PyMuPDF~=1.23.7
11
- sentencepiece
 
1
  langchain~=0.1.0
2
  langchain-huggingface==0.0.3
 
3
  python-dotenv
4
  transformers>=4.35.2
5
  huggingface_hub==0.23.5
 
7
  langchain-qdrant==0.1.3
8
  qdrant-client~=1.10.1
9
  PyMuPDF~=1.23.7
10
+ sentencepiece==0.2.0