srijaydeshpande
committed
Update app.py
app.py CHANGED
@@ -65,7 +65,8 @@ def txt_to_html(text):
     return html_content
 
 
-def deidentify_doc(pdftext="",
+def deidentify_doc(pdftext="", maxtokens=600, temperature=1.2, top_probability=0.95):
+    prompt = "Perform the following actions on given text: 1. Replace any person names with term [redacted] 2. Replace any person addresses with term [redacted]"
     output = model.create_chat_completion(
         messages=[
             {"role": "assistant", "content": prompt},
@@ -78,31 +79,45 @@ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_pr
         temperature=temperature
     )
     output = output['choices'][0]['message']['content']
+
+    prompt = "Perform the following actions on given text: 1. Replace NHS number and Case note number and Date of Birth with term [redacted] 2. DO NOT REPLACE ANY MEDICAL MEASUREMENTS 3. Replace only the CALENDAR DATES of format 'day/month/year' with term [redacted]"
+    output = model.create_chat_completion(
+        messages=[
+            {"role": "assistant", "content": prompt},
+            {
+                "role": "user",
+                "content": output
+            }
+        ],
+        max_tokens=maxtokens,
+        temperature=temperature
+    )
+    output = output['choices'][0]['message']['content']
+
     return output
 
 @spaces.GPU(duration=120)
-def pdf_to_text(files, output_folder,
-    output_folder = output_folder.replace('\\', '/')
+def pdf_to_text(files, output_folder, maxtokens=2048, temperature=0, top_probability=0.95):
     files=[files]#remove later
     for file in files:
-
-
-
-
-
-
-
-        pdftext = file # remove later
-
-
+        file_name = os.path.basename(file)
+        file_name_splt = file_name.split('.')
+        print('File name is ', file_name)
+        print('output folder is ', output_folder)
+        if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
+            page2content = process_document(file, page_ids=[0])
+            pdftext = page2content[1]
+        # pdftext = file # remove later
+        if (pdftext): #shift this if block to right later
+            anonymized_text = deidentify_doc(pdftext, maxtokens, temperature, top_probability)
     return anonymized_text
 
 
 model_id = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
-model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers
+model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=-1, n_batch=128)
 
 css = ".gradio-container {background: 'logo.png'}"
-temp_slider = gr.Slider(minimum=0, maximum=2, value=0.
+temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
 prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
 max_tokens = gr.Number(value=600, label="Max Tokens")
 input_folder = gr.File(file_count='multiple')
@@ -111,8 +126,8 @@ output_text = gr.Textbox()
 output_path_component = gr.File(label="Select Output Path")
 iface = gr.Interface(
     fn=pdf_to_text,
-
-    inputs=["textbox", input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
+    inputs=['file', input_folder_text],
+    # inputs=["textbox", input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
     outputs=output_text,
     title='COBIx Endoscopy Report De-Identification',
     description="This application assists to remove personal information from the uploaded clinical report",