Spaces:

srijaydeshpande
/

DeID

Sleeping

App Files Files Community

srijaydeshpande committed on May 26, 2024

Commit

9c4df5b

verified ·

1 Parent(s): 40b59dc

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -17

app.py CHANGED Viewed

@@ -65,7 +65,8 @@ def txt_to_html(text):
     return html_content
-def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
     output = model.create_chat_completion(
         messages=[
             {"role": "assistant", "content": prompt},
@@ -78,31 +79,45 @@ def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_pr
         temperature=temperature
     )
     output = output['choices'][0]['message']['content']
     return output
 @spaces.GPU(duration=120)
-def pdf_to_text(files, output_folder, prompt, maxtokens=600, temperature=1.2, top_probability=0.95):
-    output_folder = output_folder.replace('\\', '/')
     files=[files]#remove later
     for file in files:
-        # file_name = os.path.basename(file)
-        # file_name_splt = file_name.split('.')
-        # print('File name is ', file_name)
-        # print('output folder is ', output_folder)
-        # if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
-            # page2content = process_document(file, page_ids=[0])
-            # pdftext = page2content[1]
-        pdftext = file # remove later
-        if (pdftext): #shift this if block to right later
-            anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
     return anonymized_text
 model_id = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
-model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=81, n_batch=64)
 css = ".gradio-container {background: 'logo.png'}"
-temp_slider = gr.Slider(minimum=0, maximum=2, value=0.2, label="Temperature Value")
 prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
 max_tokens = gr.Number(value=600, label="Max Tokens")
 input_folder = gr.File(file_count='multiple')
@@ -111,8 +126,8 @@ output_text = gr.Textbox()
 output_path_component = gr.File(label="Select Output Path")
 iface = gr.Interface(
     fn=pdf_to_text,
-    # inputs=['files', input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
-    inputs=["textbox", input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
     outputs=output_text,
     title='COBIx Endoscopy Report De-Identification',
     description="This application assists to remove personal information from the uploaded clinical report",

     return html_content
+def deidentify_doc(pdftext="", maxtokens=600, temperature=1.2, top_probability=0.95):
+    prompt = "Perform the following actions on given text:  1. Replace any person names with term [redacted]  2. Replace any person addresses with term [redacted]"
     output = model.create_chat_completion(
         messages=[
             {"role": "assistant", "content": prompt},
         temperature=temperature
     )
     output = output['choices'][0]['message']['content']
+    prompt = "Perform the following actions on given text:  1. Replace NHS number and Case note number and Date of Birth with term [redacted]  2. DO NOT REPLACE ANY MEDICAL MEASUREMENTS  3. Replace only the CALENDAR DATES of format 'day/month/year' with term [redacted]"
+    output = model.create_chat_completion(
+        messages=[
+            {"role": "assistant", "content": prompt},
+            {
+                "role": "user",
+                "content": output
+            }
+        ],
+        max_tokens=maxtokens,
+        temperature=temperature
+    )
+    output = output['choices'][0]['message']['content']
     return output
 @spaces.GPU(duration=120)
+def pdf_to_text(files, output_folder, maxtokens=2048, temperature=0, top_probability=0.95):
     files=[files]#remove later
     for file in files:
+        file_name = os.path.basename(file)
+        file_name_splt = file_name.split('.')
+        print('File name is ', file_name)
+        print('output folder is ', output_folder)
+        if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
+            page2content = process_document(file, page_ids=[0])
+            pdftext = page2content[1]
+        # pdftext = file # remove later
+            if (pdftext): #shift this if block to right later
+                anonymized_text = deidentify_doc(pdftext, maxtokens, temperature, top_probability)
     return anonymized_text
 model_id = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
+model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=-1, n_batch=128)
 css = ".gradio-container {background: 'logo.png'}"
+temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
 prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
 max_tokens = gr.Number(value=600, label="Max Tokens")
 input_folder = gr.File(file_count='multiple')
 output_path_component = gr.File(label="Select Output Path")
 iface = gr.Interface(
     fn=pdf_to_text,
+    inputs=['file', input_folder_text],
+    # inputs=["textbox", input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
     outputs=output_text,
     title='COBIx Endoscopy Report De-Identification',
     description="This application assists to remove personal information from the uploaded clinical report",