srijaydeshpande committed on
Commit
d92d21a
·
verified ·
1 Parent(s): 35e2666
Files changed (1) hide show
  1. app.py +38 -22
app.py CHANGED
@@ -4,14 +4,15 @@ from tqdm import tqdm
4
  import re
5
  import gradio as gr
6
  import os
7
- # from llama_cpp import Llama
8
- # from gpt4all import GPT4All
9
  import transformers
10
  # from transformers import GemmaTokenizer, AutoModelForCausalLM
11
  # from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
12
  import accelerate
13
  import torch
14
 
 
15
 
16
  def process_document(pdf_path, page_ids=None):
17
 
@@ -68,12 +69,9 @@ def txt_to_html(text):
68
  html_content += "</body></html>"
69
  return html_content
70
 
71
- def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
72
 
73
- # prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."
74
-
75
- print('Max Tokens is ',maxtokens)
76
- print('Temperature is ',temperature)
77
 
78
  # output = model.create_chat_completion(
79
  # messages = [
@@ -118,19 +116,35 @@ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_pr
118
 
119
  return output
120
 
121
- def pdf_to_text(file, prompt, maxtokens=600, temperature=1.2, top_probability=0.95):
122
- pdftext=""
123
- if(file):
124
- page2content = process_document(file, page_ids=[0])
125
- pdftext = page2content[1]
126
- display_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
127
- html = txt_to_html(display_text)
128
- with open('out.html', "w", encoding="utf-8") as file:
129
- file.write(html)
130
- return html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  # model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
133
- # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=-1, n_batch=64)
134
 
135
  # model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
136
  # model.chat_session()
@@ -140,17 +154,19 @@ model = transformers.pipeline(
140
  "text-generation",
141
  model=model_id,
142
  model_kwargs={"torch_dtype": torch.bfloat16},
143
- device="cuda"
144
  )
145
 
146
- css = ".gradio-container {background: 'logo.png'}"
147
  temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
148
  prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
149
  max_tokens = gr.Number(value=600, label="Max Tokens")
 
 
 
150
  iface = gr.Interface(
151
  fn = pdf_to_text,
152
- inputs = ['file', "textbox", max_tokens, temp_slider, prob_slider],
153
- outputs="html",
154
  title='COBIx Endoscopy Report De-Identification',
155
  description="This application assists to remove personal information from the uploaded clinical report",
156
  theme=gr.themes.Soft(),
 
4
  import re
5
  import gradio as gr
6
  import os
7
+ from llama_cpp import Llama
8
+ from gpt4all import GPT4All
9
  import transformers
10
  # from transformers import GemmaTokenizer, AutoModelForCausalLM
11
  # from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
12
  import accelerate
13
  import torch
14
 
15
+ # HF_TOKEN = os.environ.get("HF_TOKEN", None)
16
 
17
  def process_document(pdf_path, page_ids=None):
18
 
 
69
  html_content += "</body></html>"
70
  return html_content
71
 
72
+ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=0.0001, top_probability=0.95):
73
 
74
+ prompt = "Task: Please anonymize the following clinical note. Specific Rules: Replace all the following information with the term \"[redacted]\": 1. Redact any strings that is person name 2. Redact any medical staff names 3. Redact any strings that is location or address, such as \"3970 Longview Drive\" 4. Redact any strings that is age of person 5. Redact any dates and IDs 6. Redact clinic and hospital names 7. Redact professions such as \"manager\" 8. Redact any contact information"
 
 
 
75
 
76
  # output = model.create_chat_completion(
77
  # messages = [
 
116
 
117
  return output
118
 
119
def mkdir(dir):
    """Create directory *dir* (including any missing parents) if absent.

    NOTE: the parameter name shadows the builtin ``dir()``; it is kept
    unchanged for backward compatibility with existing keyword callers.
    """
    # exist_ok=True makes creation idempotent and removes the TOCTOU race
    # between an explicit os.path.exists() check and os.makedirs().
    os.makedirs(dir, exist_ok=True)
122
+
123
def pdf_to_text(files, output_folder, prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
    """De-identify each uploaded PDF report and save the result as a .txt file.

    Parameters:
        files: iterable of file paths (as supplied by the Gradio File input).
        output_folder: destination directory for the anonymized .txt files;
            falls back to 'C:/' when empty.
        prompt: extra instruction text forwarded to deidentify_doc.
        maxtokens, temperature, top_probability: generation settings
            forwarded to deidentify_doc.

    Returns:
        A status message string for display in the UI.
    """
    if not output_folder:
        output_folder = 'C:/'
    # Normalize Windows-style separators so os.path.join behaves consistently.
    output_folder = output_folder.replace('\\', '/')
    for file_path in files:
        file_name = os.path.basename(file_path)
        # splitext copes with names containing extra dots (e.g. 'scan.v2.pdf'),
        # unlike split('.')[1]; lower() accepts '.PDF' as well.
        stem, ext = os.path.splitext(file_name)
        print('File name is ', file_name)
        print('output folder is ', output_folder)
        if ext.lower() == '.pdf':
            # Only the first page is processed — presumably the report fits on
            # one page; TODO confirm with process_document's callers.
            page2content = process_document(file_path, page_ids=[0])
            pdftext = page2content[1]
            if pdftext:
                anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
                out_path = os.path.join(output_folder, stem + '.txt')
                # Explicit encoding avoids UnicodeEncodeError on Windows
                # default code pages; a distinct handle name avoids shadowing
                # the loop variable.
                with open(out_path, 'w', encoding='utf-8') as out_file:
                    out_file.write(anonymized_text)
    # Append the destination so the user knows where the results landed
    # (the original message stopped at "saved in ").
    display_text = "All selected reports are anonymized and results are saved in " + output_folder
    return display_text
145
 
146
  # model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
147
+ # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
148
 
149
  # model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
150
  # model.chat_session()
 
154
  "text-generation",
155
  model=model_id,
156
  model_kwargs={"torch_dtype": torch.bfloat16},
157
+ device="cuda",
158
  )
159
 
 
160
  temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
161
  prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
162
  max_tokens = gr.Number(value=600, label="Max Tokens")
163
+ input_folder = gr.File(file_count='multiple')
164
+ output_text = gr.Textbox()
165
+ output_path_component = gr.File(label="Select Output Path")
166
  iface = gr.Interface(
167
  fn = pdf_to_text,
168
+ inputs = ['files', gr.Textbox(label='Enter output folder path')],
169
+ outputs=output_text,
170
  title='COBIx Endoscopy Report De-Identification',
171
  description="This application assists to remove personal information from the uploaded clinical report",
172
  theme=gr.themes.Soft(),