srijaydeshpande committed on
Commit
cec9932
·
verified ·
1 Parent(s): d3c7d8d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -13
app.py CHANGED
@@ -31,16 +31,13 @@ hf_hub_download(
31
  # local_dir = "./models"
32
  # )
33
 
34
- def process_document(pdf_path, page_ids=None):
35
- extracted_pages = extract_pages(pdf_path, page_numbers=page_ids)
36
-
37
  page2content = {}
38
-
39
  for extracted_page in tqdm(extracted_pages):
40
  page_id = extracted_page.pageid
41
  content = process_page(extracted_page)
42
  page2content[page_id] = content
43
-
44
  return page2content
45
 
46
 
@@ -196,12 +193,12 @@ def pdf_to_text(files, maxtokens=2048, temperature=0, top_probability=0.95):
196
  file_name = os.path.basename(file)
197
  file_name_splt = file_name.split('.')
198
  if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
199
- page2content = process_document(file, page_ids=[0])
200
- pdftext = page2content[1]
201
- if (pdftext): #shift this if block to right later
202
- anonymized_text = deidentify_doc(pdftext, maxtokens, temperature, top_probability)
203
- else:
204
- anonymized_text = 'PDF file appears to be corrupted.'
205
  return anonymized_text
206
 
207
  css = ".gradio-container {background: 'logo.png'}"
@@ -215,8 +212,7 @@ output_path_component = gr.File(label="Select Output Path")
215
  iface = gr.Interface(
216
  fn=pdf_to_text,
217
  inputs=['file'],
218
- # css = css,
219
- outputs=output_text,
220
  title='DeID: Endoscopy Report De-Identification',
221
  description="This application assists to remove personal information from the uploaded clinical report",
222
  theme=gr.themes.Soft(),
 
31
  # local_dir = "./models"
32
  # )
33
 
34
+ def process_document(pdf_path):
35
+ extracted_pages = extract_pages(pdf_path)
 
36
  page2content = {}
 
37
  for extracted_page in tqdm(extracted_pages):
38
  page_id = extracted_page.pageid
39
  content = process_page(extracted_page)
40
  page2content[page_id] = content
 
41
  return page2content
42
 
43
 
 
193
  file_name = os.path.basename(file)
194
  file_name_splt = file_name.split('.')
195
  if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
196
+ page2content = process_document(file)
197
+ anonymized_text = ''
198
+ for page_id in page2content:
199
+ pdftext = page2content[page_id]
200
+ anonymized_text += deidentify_doc(pdftext, maxtokens, temperature, top_probability)
201
+ anonymized_text += '\n\n\n'
202
  return anonymized_text
203
 
204
  css = ".gradio-container {background: 'logo.png'}"
 
212
  iface = gr.Interface(
213
  fn=pdf_to_text,
214
  inputs=['file'],
215
+ outputs="text",
 
216
  title='DeID: Endoscopy Report De-Identification',
217
  description="This application assists to remove personal information from the uploaded clinical report",
218
  theme=gr.themes.Soft(),