Spaces:
Running
on
Zero
Running
on
Zero
srijaydeshpande
committed on
Update
Browse files
app.py
CHANGED
@@ -4,14 +4,15 @@ from tqdm import tqdm
|
|
4 |
import re
|
5 |
import gradio as gr
|
6 |
import os
|
7 |
-
|
8 |
-
|
9 |
import transformers
|
10 |
# from transformers import GemmaTokenizer, AutoModelForCausalLM
|
11 |
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
12 |
import accelerate
|
13 |
import torch
|
14 |
|
|
|
15 |
|
16 |
def process_document(pdf_path, page_ids=None):
|
17 |
|
@@ -68,12 +69,9 @@ def txt_to_html(text):
|
|
68 |
html_content += "</body></html>"
|
69 |
return html_content
|
70 |
|
71 |
-
def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
print('Max Tokens is ',maxtokens)
|
76 |
-
print('Temperature is ',temperature)
|
77 |
|
78 |
# output = model.create_chat_completion(
|
79 |
# messages = [
|
@@ -118,19 +116,35 @@ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_pr
|
|
118 |
|
119 |
return output
|
120 |
|
121 |
-
def
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
# model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
|
133 |
-
# model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers
|
134 |
|
135 |
# model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
|
136 |
# model.chat_session()
|
@@ -140,17 +154,19 @@ model = transformers.pipeline(
|
|
140 |
"text-generation",
|
141 |
model=model_id,
|
142 |
model_kwargs={"torch_dtype": torch.bfloat16},
|
143 |
-
device="cuda"
|
144 |
)
|
145 |
|
146 |
-
css = ".gradio-container {background: 'logo.png'}"
|
147 |
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
|
148 |
prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
|
149 |
max_tokens = gr.Number(value=600, label="Max Tokens")
|
|
|
|
|
|
|
150 |
iface = gr.Interface(
|
151 |
fn = pdf_to_text,
|
152 |
-
inputs = ['
|
153 |
-
outputs=
|
154 |
title='COBIx Endoscopy Report De-Identification',
|
155 |
description="This application assists to remove personal information from the uploaded clinical report",
|
156 |
theme=gr.themes.Soft(),
|
|
|
4 |
import re
|
5 |
import gradio as gr
|
6 |
import os
|
7 |
+
from llama_cpp import Llama
|
8 |
+
from gpt4all import GPT4All
|
9 |
import transformers
|
10 |
# from transformers import GemmaTokenizer, AutoModelForCausalLM
|
11 |
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
12 |
import accelerate
|
13 |
import torch
|
14 |
|
15 |
+
# HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
16 |
|
17 |
def process_document(pdf_path, page_ids=None):
|
18 |
|
|
|
69 |
html_content += "</body></html>"
|
70 |
return html_content
|
71 |
|
72 |
+
def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=0.0001, top_probability=0.95):
|
73 |
|
74 |
+
prompt = "Task: Please anonymize the following clinical note. Specific Rules: Replace all the following information with the term \"[redacted]\": 1. Redact any strings that is person name 2. Redact any medical staff names 3. Redact any strings that is location or address, such as \"3970 Longview Drive\" 4. Redact any strings that is age of person 5. Redact any dates and IDs 6. Redact clinic and hospital names 7. Redact professions such as \"manager\" 8. Redact any contact information"
|
|
|
|
|
|
|
75 |
|
76 |
# output = model.create_chat_completion(
|
77 |
# messages = [
|
|
|
116 |
|
117 |
return output
|
118 |
|
119 |
+
def mkdir(dir):
|
120 |
+
if not os.path.exists(dir):
|
121 |
+
os.makedirs(dir)
|
122 |
+
|
123 |
+
def pdf_to_text(files, output_folder, prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
|
124 |
+
if not output_folder:
|
125 |
+
output_folder='C:/'
|
126 |
+
output_folder = output_folder.replace('\\','/')
|
127 |
+
for file in files:
|
128 |
+
file_name = os.path.basename(file)
|
129 |
+
file_name_splt = file_name.split('.')
|
130 |
+
print('File name is ', file_name)
|
131 |
+
print('output folder is ', output_folder)
|
132 |
+
if(len(file_name_splt)>1 and file_name_splt[1]=='pdf'):
|
133 |
+
page2content = process_document(file, page_ids=[0])
|
134 |
+
pdftext = page2content[1]
|
135 |
+
if(pdftext):
|
136 |
+
anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
|
137 |
+
# html = txt_to_html(display_text)
|
138 |
+
# with open('out.html', "w", encoding="utf-8") as file:
|
139 |
+
# file.write(html)
|
140 |
+
with open(os.path.join(output_folder, file_name_splt[0]+'.txt'), 'w') as file:
|
141 |
+
# Write some text to the file
|
142 |
+
file.write(anonymized_text)
|
143 |
+
display_text = "All selected reports are anonymized and results are saved in "
|
144 |
+
return display_text
|
145 |
|
146 |
# model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
|
147 |
+
# model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
|
148 |
|
149 |
# model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
|
150 |
# model.chat_session()
|
|
|
154 |
"text-generation",
|
155 |
model=model_id,
|
156 |
model_kwargs={"torch_dtype": torch.bfloat16},
|
157 |
+
device="cuda",
|
158 |
)
|
159 |
|
|
|
160 |
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
|
161 |
prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
|
162 |
max_tokens = gr.Number(value=600, label="Max Tokens")
|
163 |
+
input_folder = gr.File(file_count='multiple')
|
164 |
+
output_text = gr.Textbox()
|
165 |
+
output_path_component = gr.File(label="Select Output Path")
|
166 |
iface = gr.Interface(
|
167 |
fn = pdf_to_text,
|
168 |
+
inputs = ['files', gr.Textbox(label='Enter output folder path')],
|
169 |
+
outputs=output_text,
|
170 |
title='COBIx Endoscopy Report De-Identification',
|
171 |
description="This application assists to remove personal information from the uploaded clinical report",
|
172 |
theme=gr.themes.Soft(),
|