srijaydeshpande committed on
Commit
9c4df5b
·
verified ·
1 Parent(s): 40b59dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -17
app.py CHANGED
@@ -65,7 +65,8 @@ def txt_to_html(text):
65
  return html_content
66
 
67
 
68
- def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
 
69
  output = model.create_chat_completion(
70
  messages=[
71
  {"role": "assistant", "content": prompt},
@@ -78,31 +79,45 @@ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_pr
78
  temperature=temperature
79
  )
80
  output = output['choices'][0]['message']['content']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  return output
82
 
83
  @spaces.GPU(duration=120)
84
- def pdf_to_text(files, output_folder, prompt, maxtokens=600, temperature=1.2, top_probability=0.95):
85
- output_folder = output_folder.replace('\\', '/')
86
  files=[files]#remove later
87
  for file in files:
88
- # file_name = os.path.basename(file)
89
- # file_name_splt = file_name.split('.')
90
- # print('File name is ', file_name)
91
- # print('output folder is ', output_folder)
92
- # if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
93
- # page2content = process_document(file, page_ids=[0])
94
- # pdftext = page2content[1]
95
- pdftext = file # remove later
96
- if (pdftext): #shift this if block to right later
97
- anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
98
  return anonymized_text
99
 
100
 
101
  model_id = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
102
- model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=81, n_batch=64)
103
 
104
  css = ".gradio-container {background: 'logo.png'}"
105
- temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
106
  prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
107
  max_tokens = gr.Number(value=600, label="Max Tokens")
108
  input_folder = gr.File(file_count='multiple')
@@ -111,8 +126,8 @@ output_text = gr.Textbox()
111
  output_path_component = gr.File(label="Select Output Path")
112
  iface = gr.Interface(
113
  fn=pdf_to_text,
114
- # inputs=['files', input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
115
- inputs=["textbox", input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
116
  outputs=output_text,
117
  title='COBIx Endoscopy Report De-Identification',
118
  description="This application assists to remove personal information from the uploaded clinical report",
 
65
  return html_content
66
 
67
 
68
+ def deidentify_doc(pdftext="", maxtokens=600, temperature=1.2, top_probability=0.95):
69
+ prompt = "Perform the following actions on given text: 1. Replace any person names with term [redacted] 2. Replace any person addresses with term [redacted]"
70
  output = model.create_chat_completion(
71
  messages=[
72
  {"role": "assistant", "content": prompt},
 
79
  temperature=temperature
80
  )
81
  output = output['choices'][0]['message']['content']
82
+
83
+ prompt = "Perform the following actions on given text: 1. Replace NHS number and Case note number and Date of Birth with term [redacted] 2. DO NOT REPLACE ANY MEDICAL MEASUREMENTS 3. Replace only the CALENDAR DATES of format 'day/month/year' with term [redacted]"
84
+ output = model.create_chat_completion(
85
+ messages=[
86
+ {"role": "assistant", "content": prompt},
87
+ {
88
+ "role": "user",
89
+ "content": output
90
+ }
91
+ ],
92
+ max_tokens=maxtokens,
93
+ temperature=temperature
94
+ )
95
+ output = output['choices'][0]['message']['content']
96
+
97
  return output
98
 
99
  @spaces.GPU(duration=120)
100
+ def pdf_to_text(files, output_folder, maxtokens=2048, temperature=0, top_probability=0.95):
 
101
  files=[files]#remove later
102
  for file in files:
103
+ file_name = os.path.basename(file)
104
+ file_name_splt = file_name.split('.')
105
+ print('File name is ', file_name)
106
+ print('output folder is ', output_folder)
107
+ if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
108
+ page2content = process_document(file, page_ids=[0])
109
+ pdftext = page2content[1]
110
+ # pdftext = file # remove later
111
+ if (pdftext): #shift this if block to right later
112
+ anonymized_text = deidentify_doc(pdftext, maxtokens, temperature, top_probability)
113
  return anonymized_text
114
 
115
 
116
  model_id = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
117
+ model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=-1, n_batch=128)
118
 
119
  css = ".gradio-container {background: 'logo.png'}"
120
+ temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
121
  prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
122
  max_tokens = gr.Number(value=600, label="Max Tokens")
123
  input_folder = gr.File(file_count='multiple')
 
126
  output_path_component = gr.File(label="Select Output Path")
127
  iface = gr.Interface(
128
  fn=pdf_to_text,
129
+ inputs=['file', input_folder_text],
130
+ # inputs=["textbox", input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
131
  outputs=output_text,
132
  title='COBIx Endoscopy Report De-Identification',
133
  description="This application assists to remove personal information from the uploaded clinical report",