Spaces:

srijaydeshpande
/

DeID

Running on Zero

App Files Community

srijaydeshpande committed on May 13, 2024

Commit

e5ca27d

verified ·

1 Parent(s): b8b829b

Update main.py

Browse files

Files changed (1) hide show

app.py +56 -57

app.py CHANGED Viewed

@@ -4,15 +4,14 @@ from tqdm import tqdm
 import re
 import gradio as gr
 import os
-from llama_cpp import Llama
-from gpt4all import GPT4All
 import transformers
 # from transformers import GemmaTokenizer, AutoModelForCausalLM
 # from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import accelerate
 import torch
-# HF_TOKEN = os.environ.get("HF_TOKEN", None)
 def process_document(pdf_path, page_ids=None):
@@ -69,57 +68,57 @@ def txt_to_html(text):
     html_content += "</body></html>"
     return html_content
-def deidentify_doc(pdftext=""):
     prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."
-    print('Input prompt is ',prompt)
-    print('Input pdf text is ',pdftext)
-    output = model.create_chat_completion(
-                    messages = [
-                        {"role": "assistant", "content": prompt},
-                        {
-                            "role": "user",
-                            "content": pdftext
-                        }
-                    ],
-                    max_tokens=600,
-                    temperature=0
-                )
-    output = output['choices'][0]['message']['content']
     # if (pdftext):
     #     prompt = prompt + ': ' + pdftext
     # output = model.generate(prompt=prompt, max_tokens=1024, n_batch=128)
-    # messages = [
-    #     {"role": "assistant",
-    #      "content": prompt},
-    #     {"role": "user",
-    #      "content": pdftext}, ]
-    # prompt = model.tokenizer.apply_chat_template(
-    #     messages,
-    #     tokenize=False,
-    #     add_generation_prompt=True
-    # )
-    # terminators = [
-    #     model.tokenizer.eos_token_id,
-    #     model.tokenizer.convert_tokens_to_ids("<|eot_id|>")
-    # ]
-    # outputs = model(
-    #     prompt,
-    #     max_new_tokens=1024,
-    #     eos_token_id=terminators,
-    #     do_sample=True,
-    #     temperature=0.3,
-    #     top_p=0.95,
-    # )
-    # output = outputs[0]["generated_text"][len(prompt):]
     return output
-def pdf_to_text(file):
     pdftext=""
     if(file):
        page2content = process_document(file, page_ids=[0])
@@ -130,27 +129,27 @@ def pdf_to_text(file):
         file.write(html)
     return html
-model_id = "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
-model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=64)
-print('Model loaded')
-# print('list of available devices is ',GPT4All.list_gpus())
-# model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='auto')
 # model.chat_session()
-# model_id = "Meta-Llama-3-8B-Instruct"
-# model = transformers.pipeline(
-#     "text-generation",
-#     model=model_id,
-#     model_kwargs={"torch_dtype": torch.bfloat16},
-#     device="cpu",
-# )
 css = ".gradio-container {background: 'logo.png'}"
 iface = gr.Interface(
     fn = pdf_to_text,
-    inputs = ['file'],
     outputs="html",
     title='COBIx Endoscopy Report De-Identification',
     description="This application assists to remove personal information from the uploaded clinical report",

 import re
 import gradio as gr
 import os
+# from llama_cpp import Llama
+# from gpt4all import GPT4All
 import transformers
 # from transformers import GemmaTokenizer, AutoModelForCausalLM
 # from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import accelerate
 import torch
 def process_document(pdf_path, page_ids=None):
     html_content += "</body></html>"
     return html_content
+def deidentify_doc(pdftext="", maxtokens=600, temperature=1.2, top_probability=0.95):
     prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."
+    print('Max Tokens is  ',maxtokens)
+    print('Temperature is  ',temperature)
+    # output = model.create_chat_completion(
+    #                 messages = [
+    #                     {"role": "assistant", "content": prompt},
+    #                     {
+    #                         "role": "user",
+    #                         "content": pdftext
+    #                     }
+    #                 ],
+    #                 max_tokens=800,
+    #                 temperature=0
+    #             )
+    # output = output['choices'][0]['message']['content']
     # if (pdftext):
     #     prompt = prompt + ': ' + pdftext
     # output = model.generate(prompt=prompt, max_tokens=1024, n_batch=128)
+    messages = [
+        {"role": "assistant",
+         "content": prompt},
+        {"role": "user",
+         "content": pdftext}, ]
+    prompt = model.tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    terminators = [
+        model.tokenizer.eos_token_id,
+        model.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+    ]
+    outputs = model(
+        prompt,
+        max_new_tokens=maxtokens,
+        eos_token_id=terminators,
+        do_sample=True,
+        temperature=temperature,
+        top_p=top_probability,
+    )
+    output = outputs[0]["generated_text"][len(prompt):]
     return output
+def pdf_to_text(file, maxtokens=600, temperature=1.2, top_probability=0.95):
     pdftext=""
     if(file):
        page2content = process_document(file, page_ids=[0])
         file.write(html)
     return html
+# model_id = "D:/llama/meta-llama/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
+# model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=-1, n_batch=64)
+# model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf", n_threads=8, device='gpu')
 # model.chat_session()
+model_id = "Meta-Llama-3-8B-Instruct"
+model = transformers.pipeline(
+    "text-generation",
+    model=model_id,
+    model_kwargs={"torch_dtype": torch.bfloat16},
+    device="gpu"
+)
 css = ".gradio-container {background: 'logo.png'}"
+temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
+prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
+max_tokens = gr.Number(value=600, label="Max Tokens")
 iface = gr.Interface(
     fn = pdf_to_text,
+    inputs = ['file', max_tokens, temp_slider, prob_slider],
     outputs="html",
     title='COBIx Endoscopy Report De-Identification',
     description="This application assists to remove personal information from the uploaded clinical report",