File size: 4,532 Bytes
4c94a49
 
 
 
b35ac00
4c94a49
 
b35ac00
4c94a49
b35ac00
4c94a49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from tqdm import tqdm
import re
import gradio as gr
from llama_cpp import Llama
import os

def process_document(pdf_path, page_ids=None):
    """Extract normalized text for each page of a PDF.

    Parameters
    ----------
    pdf_path : str or file-like
        Path (or open file) of the PDF to process.
    page_ids : list[int] | None
        Zero-based page numbers to extract; ``None`` means all pages.

    Returns
    -------
    dict
        Maps pdfminer's ``pageid`` of each extracted page to its text.
    """
    pages = extract_pages(pdf_path, page_numbers=page_ids)
    # tqdm shows progress while pages stream out of pdfminer's generator
    return {page.pageid: process_page(page) for page in tqdm(pages)}

def process_page(extracted_page):
    """Concatenate the text of one pdfminer page, top-to-bottom.

    Parameters
    ----------
    extracted_page : pdfminer LTPage
        A page object as yielded by ``extract_pages``.

    Returns
    -------
    str
        The page's text with runs of newlines collapsed to single spaces.
    """
    # Sort by the element's top edge (y1) so reading order is top-to-bottom.
    elements = sorted(extracted_page._objs, key=lambda el: el.y1, reverse=True)
    # Only text containers contribute; images, lines, etc. are skipped.
    content = [
        extract_text_and_normalize(el)
        for el in elements
        if isinstance(el, LTTextContainer)
    ]
    return re.sub(r'\n+', ' ', ''.join(content))

def extract_text_and_normalize(element):
    """Normalize the text of one pdfminer text element.

    Each physical line is stripped and its internal whitespace collapsed.
    A line that ends mid-sentence (letter, digit, comma, or hyphen) is
    joined to the next with a space; otherwise a newline is kept. Blank
    lines become bare newlines.

    Parameters
    ----------
    element : object with ``get_text() -> str``
        A pdfminer text container (any object exposing ``get_text``).

    Returns
    -------
    str
        The normalized text.
    """
    norm_text = ''
    for raw_line in element.get_text().split('\n'):
        line = raw_line.strip()
        if not line:
            # Empty lines mark paragraph breaks
            norm_text += '\n'
            continue
        line = re.sub(r'\s+', ' ', line)
        # \w already covers digits; ',' and '-' signal a continuation
        if re.search(r'[\w,\-]', line[-1]):
            norm_text += line + ' '
        else:
            norm_text += line + '\n'
    return norm_text

def txt_to_html(text):
    """Wrap each line of *text* in a ``<p>`` tag inside a minimal HTML page.

    Parameters
    ----------
    text : str
        Plain text; split on newlines, one paragraph per line.

    Returns
    -------
    str
        A complete ``<html><body>...</body></html>`` document.
    """
    # str.join avoids the quadratic cost of repeated string concatenation
    paragraphs = ''.join(
        "<p>{}</p>".format(line.strip()) for line in text.split('\n')
    )
    return "<html><body>" + paragraphs + "</body></html>"

def deidentify_doc(pdftext=""):
    """Ask the LLM to redact personal information from clinical-note text.

    Parameters
    ----------
    pdftext : str
        Plain text extracted from the clinical report.

    Returns
    -------
    str
        The model's anonymized version of the input text.
    """
    prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."

    print('Input prompt is ', prompt)
    print('Input pdf text is ', pdftext)

    # FIX: the instruction prompt belongs in the "system" role. The original
    # code sent it as an "assistant" message, which chat templates treat as
    # prior model output rather than an instruction to follow.
    output = model.create_chat_completion(
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": pdftext},
        ],
        max_tokens=600,
        temperature=0,  # deterministic decoding for reproducible redaction
    )
    return output['choices'][0]['message']['content']

def pdf_to_text(file):
    """Gradio handler: extract the first PDF page, de-identify it, return HTML.

    Also writes the generated HTML to ``out.html`` as a side effect.

    Parameters
    ----------
    file : str | None
        Path of the uploaded PDF as provided by Gradio's file input;
        falsy when nothing was uploaded.

    Returns
    -------
    str
        The de-identified report as an HTML document.
    """
    pdftext = ""
    if file:
        page2content = process_document(file, page_ids=[0])
        # pdfminer assigns pageid 1 to the first page requested via index 0
        pdftext = page2content[1]
    display_text = deidentify_doc(pdftext)
    html = txt_to_html(display_text)
    # FIX: the original re-bound the name `file` (the parameter, and a
    # former builtin) to the output handle; use a distinct name instead.
    with open('out.html', "w", encoding="utf-8") as out_file:
        out_file.write(html)
    return html

# Path to the quantized Llama-3 8B Instruct model in GGUF format.
model_id = "srijaydeshpande/llama3-8B-Instruct-Q5KM/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
# NOTE(review): n_gpu_layers=32 offloads most layers to GPU — confirm a GPU
# is available in the deployment environment, otherwise loading may be slow.
model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=32, n_batch=128)

# NOTE(review): `css` is defined but never passed to gr.Interface below,
# so it currently has no effect on the UI.
css = ".gradio-container {background: 'logo.png'}"

# Gradio UI: upload a PDF, receive the de-identified report rendered as HTML.
iface = gr.Interface(
    fn = pdf_to_text,
    inputs = ['file'],
    outputs="html",
    title='COBIx Endoscopy Report De-Identification',
    description="This application assists to remove personal information from the uploaded clinical report",
    theme=gr.themes.Soft(),
    )
iface.launch()