Spaces:
Running
on
Zero
Running
on
Zero
srijaydeshpande
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -31,16 +31,13 @@ hf_hub_download(
|
|
31 |
# local_dir = "./models"
|
32 |
# )
|
33 |
|
34 |
-
def process_document(pdf_path
|
35 |
-
extracted_pages = extract_pages(pdf_path
|
36 |
-
|
37 |
page2content = {}
|
38 |
-
|
39 |
for extracted_page in tqdm(extracted_pages):
|
40 |
page_id = extracted_page.pageid
|
41 |
content = process_page(extracted_page)
|
42 |
page2content[page_id] = content
|
43 |
-
|
44 |
return page2content
|
45 |
|
46 |
|
@@ -196,12 +193,12 @@ def pdf_to_text(files, maxtokens=2048, temperature=0, top_probability=0.95):
|
|
196 |
file_name = os.path.basename(file)
|
197 |
file_name_splt = file_name.split('.')
|
198 |
if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
|
199 |
-
page2content = process_document(file
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
anonymized_text
|
205 |
return anonymized_text
|
206 |
|
207 |
css = ".gradio-container {background: 'logo.png'}"
|
@@ -215,8 +212,7 @@ output_path_component = gr.File(label="Select Output Path")
|
|
215 |
iface = gr.Interface(
|
216 |
fn=pdf_to_text,
|
217 |
inputs=['file'],
|
218 |
-
|
219 |
-
outputs=output_text,
|
220 |
title='DeID: Endoscopy Report De-Identification',
|
221 |
description="This application assists to remove personal information from the uploaded clinical report",
|
222 |
theme=gr.themes.Soft(),
|
|
|
31 |
# local_dir = "./models"
|
32 |
# )
|
33 |
|
34 |
+
def process_document(pdf_path):
|
35 |
+
extracted_pages = extract_pages(pdf_path)
|
|
|
36 |
page2content = {}
|
|
|
37 |
for extracted_page in tqdm(extracted_pages):
|
38 |
page_id = extracted_page.pageid
|
39 |
content = process_page(extracted_page)
|
40 |
page2content[page_id] = content
|
|
|
41 |
return page2content
|
42 |
|
43 |
|
|
|
193 |
file_name = os.path.basename(file)
|
194 |
file_name_splt = file_name.split('.')
|
195 |
if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
|
196 |
+
page2content = process_document(file)
|
197 |
+
anonymized_text = ''
|
198 |
+
for page_id in page2content:
|
199 |
+
pdftext = page2content[page_id]
|
200 |
+
anonymized_text += deidentify_doc(pdftext, maxtokens, temperature, top_probability)
|
201 |
+
anonymized_text += '\n\n\n'
|
202 |
return anonymized_text
|
203 |
|
204 |
css = ".gradio-container {background: 'logo.png'}"
|
|
|
212 |
iface = gr.Interface(
|
213 |
fn=pdf_to_text,
|
214 |
inputs=['file'],
|
215 |
+
outputs="text",
|
|
|
216 |
title='DeID: Endoscopy Report De-Identification',
|
217 |
description="This application assists to remove personal information from the uploaded clinical report",
|
218 |
theme=gr.themes.Soft(),
|