srijaydeshpande committed
Commit e5ca27d · verified · 1 Parent(s): b8b829b

Update main.py

Files changed (1): app.py (+56 −57)
app.py CHANGED
@@ -4,15 +4,14 @@ from tqdm import tqdm
import re
import gradio as gr
import os
- from llama_cpp import Llama
- from gpt4all import GPT4All
+ # from llama_cpp import Llama
+ # from gpt4all import GPT4All
import transformers
# from transformers import GemmaTokenizer, AutoModelForCausalLM
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import accelerate
import torch

- # HF_TOKEN = os.environ.get("HF_TOKEN", None)

def process_document(pdf_path, page_ids=None):

@@ -69,57 +68,57 @@ def txt_to_html(text):
    html_content += "</body></html>"
    return html_content

- def deidentify_doc(pdftext=""):
+ def deidentify_doc(pdftext="", maxtokens=600, temperature=1.2, top_probability=0.95):

    prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."

-     print('Input prompt is ',prompt)
-     print('Input pdf text is ',pdftext)
-
-     output = model.create_chat_completion(
-         messages = [
-             {"role": "assistant", "content": prompt},
-             {
-                 "role": "user",
-                 "content": pdftext
-             }
-         ],
-         max_tokens=600,
-         temperature=0
-     )
-     output = output['choices'][0]['message']['content']
+     print('Max Tokens is ',maxtokens)
+     print('Temperature is ',temperature)
+
+     # output = model.create_chat_completion(
+     #     messages = [
+     #         {"role": "assistant", "content": prompt},
+     #         {
+     #             "role": "user",
+     #             "content": pdftext
+     #         }
+     #     ],
+     #     max_tokens=800,
+     #     temperature=0
+     # )
+     # output = output['choices'][0]['message']['content']

    # if (pdftext):
    #     prompt = prompt + ': ' + pdftext
    #     output = model.generate(prompt=prompt, max_tokens=1024, n_batch=128)

-     # messages = [
-     #     {"role": "assistant",
-     #      "content": prompt},
-     #     {"role": "user",
-     #      "content": pdftext}, ]
-     # prompt = model.tokenizer.apply_chat_template(
-     #     messages,
-     #     tokenize=False,
-     #     add_generation_prompt=True
-     # )
-     # terminators = [
-     #     model.tokenizer.eos_token_id,
-     #     model.tokenizer.convert_tokens_to_ids("<|eot_id|>")
-     # ]
-     # outputs = model(
-     #     prompt,
-     #     max_new_tokens=1024,
-     #     eos_token_id=terminators,
-     #     do_sample=True,
-     #     temperature=0.3,
-     #     top_p=0.95,
-     # )
-     # output = outputs[0]["generated_text"][len(prompt):]
+     messages = [
+         {"role": "assistant",
+          "content": prompt},
+         {"role": "user",
+          "content": pdftext}, ]
+     prompt = model.tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+     terminators = [
+         model.tokenizer.eos_token_id,
+         model.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+     ]
+     outputs = model(
+         prompt,
+         max_new_tokens=maxtokens,
+         eos_token_id=terminators,
+         do_sample=True,
+         temperature=temperature,
+         top_p=top_probability,
+     )
+     output = outputs[0]["generated_text"][len(prompt):]

    return output

- def pdf_to_text(file):
+ def pdf_to_text(file, maxtokens=600, temperature=1.2, top_probability=0.95):
    pdftext=""
    if(file):
        page2content = process_document(file, page_ids=[0])
@@ -130,27 +129,27 @@ def pdf_to_text(file):
    file.write(html)
    return html

- model_id = "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
- model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
- print('Model loaded')
+ # model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
+ # model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=-1, n_batch=64)

- # print('list of available devices is ',GPT4All.list_gpus())
- # model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='auto')
+ # model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
# model.chat_session()

- # model_id = "Meta-Llama-3-8B-Instruct"
- # model = transformers.pipeline(
- #     "text-generation",
- #     model=model_id,
- #     model_kwargs={"torch_dtype": torch.bfloat16},
- #     device="cpu",
- # )
+ model_id = "Meta-Llama-3-8B-Instruct"
+ model = transformers.pipeline(
+     "text-generation",
+     model=model_id,
+     model_kwargs={"torch_dtype": torch.bfloat16},
+     device="gpu"
+ )

css = ".gradio-container {background: 'logo.png'}"
-
+ temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
+ prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
+ max_tokens = gr.Number(value=600, label="Max Tokens")
iface = gr.Interface(
    fn = pdf_to_text,
-     inputs = ['file'],
+     inputs = ['file', max_tokens, temp_slider, prob_slider],
    outputs="html",
    title='COBIx Endoscopy Report De-Identification',
    description="This application assists to remove personal information from the uploaded clinical report",
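
For reference, a minimal sketch of how the transformers pipeline path introduced in this commit is typically wired up end to end. It is not part of the commit: it assumes a CUDA-capable GPU and the gated Hub checkpoint meta-llama/Meta-Llama-3-8B-Instruct (the commit loads a local "Meta-Llama-3-8B-Instruct"), it uses device="cuda" rather than the string "gpu" (which transformers does not accept as a torch device), it shortens the redaction prompt, and it passes the instruction with a "system" role where the commit uses "assistant".

# Sketch only: assumes GPU access and permission to download the gated Llama 3 weights.
import torch
import transformers

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # Hub id used for illustration
pipe = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device="cuda",  # "gpu" is not a valid torch device string; use "cuda", "cpu", or an index
)

def deidentify_doc(pdftext, maxtokens=600, temperature=1.2, top_probability=0.95):
    # Shortened instruction; the commit carries the full redaction prompt.
    instruction = ("Please anonymize the following clinical note. Replace names, locations, "
                   "ages, dates, registration numbers, professions and contact details with '[redacted]'.")
    messages = [
        {"role": "system", "content": instruction},  # the commit sends this with role "assistant"
        {"role": "user", "content": pdftext},
    ]
    # Render the chat messages into a single prompt string via the model's chat template.
    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Stop on either the regular EOS token or Llama 3's end-of-turn token.
    terminators = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    outputs = pipe(
        prompt,
        max_new_tokens=maxtokens,
        eos_token_id=terminators,
        do_sample=True,
        temperature=temperature,
        top_p=top_probability,
    )
    # The pipeline echoes the prompt, so return only the generated completion.
    return outputs[0]["generated_text"][len(prompt):]

With a function like this in place, the Gradio Interface in the diff can pass the Max Tokens number and the two slider values straight through pdf_to_text into deidentify_doc.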