Mary12 committed on
Commit
cd60fbf
·
1 Parent(s): 24cbb94

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -7
app.py CHANGED
@@ -13,13 +13,23 @@ def remove_references(text):
13
  return text
14
 
15
 
 
 
 
 
 
 
 
 
 
16
  def extract_text_from_pdf(file_path):
17
  text = ""
18
- pdf_reader = PdfReader(file_path)
19
- for page in pdf_reader.pages:
20
- text += page.extract_text() + "\n"
 
 
21
  return text
22
-
23
 
24
 
25
  def model(model_name):
@@ -32,10 +42,11 @@ def model(model_name):
32
  )
33
 
34
  return model_pipeline
 
 
 
35
 
36
- def qa_result(context, question, file):
37
- model_name = "timpal0l/mdeberta-v3-base-squad2"
38
- pipe = model(model_name)
39
  if file is not None:
40
  allowed_types = [".pdf", ".csv", ".doc"]
41
  extension = "." + file.name.split(".")[-1].lower()
 
13
  return text
14
 
15
 
16
+ # def extract_text_from_pdf(file_path):
17
+ # text = ""
18
+ # pdf_reader = PdfReader(file_path)
19
+ # for page in pdf_reader.pages:
20
+ # text += page.extract_text() + "\n"
21
+ # return text
22
+
23
+ import fitz # PyMuPDF
24
+
25
  def extract_text_from_pdf(file_path):
26
  text = ""
27
+ pdf_document = fitz.open(file_path)
28
+ for page_num in range(pdf_document.page_count):
29
+ page = pdf_document[page_num]
30
+ text += page.get_text("text") + "\n"
31
+ pdf_document.close()
32
  return text
 
33
 
34
 
35
  def model(model_name):
 
42
  )
43
 
44
  return model_pipeline
45
+
46
+ model_name = "timpal0l/mdeberta-v3-base-squad2"
47
+ pipe = model(model_name)
48
 
49
+ def qa_result(pipe = pipe, context, question, file):
 
 
50
  if file is not None:
51
  allowed_types = [".pdf", ".csv", ".doc"]
52
  extension = "." + file.name.split(".")[-1].lower()