File size: 5,197 Bytes
4c94a49
 
 
 
b35ac00
4c94a49
e852fd8
ab4dd7f
ca3e828
 
 
 
e852fd8
ab4dd7f
 
b35ac00
4c94a49
40e5815
b35ac00
40e5815
4c94a49
40e5815
 
 
 
4c94a49
40e5815
4c94a49
 
 
40e5815
 
 
 
 
 
 
 
 
 
4c94a49
 
40e5815
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c94a49
 
 
 
 
 
 
 
e5ca27d
9c4df5b
 
0d8ba24
 
 
 
 
 
 
 
 
 
 
 
9c4df5b
5d5aecb
 
 
 
897b5dd
9c4df5b
 
 
 
 
 
 
 
 
 
 
 
 
5d5aecb
 
 
 
4c94a49
 
ab4dd7f
9c4df5b
50dfc74
e852fd8
9c4df5b
 
 
 
 
 
 
 
 
 
e852fd8
b0abf08
40e5815
40b59dc
9c4df5b
d7d2792
e852fd8
9c4df5b
e5ca27d
 
d92d21a
e852fd8
d92d21a
 
4c94a49
40e5815
9c4df5b
 
e852fd8
4c94a49
 
 
40e5815
4c94a49
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from tqdm import tqdm
import re
import gradio as gr
import os
import accelerate
import spaces
import subprocess

# NOTE(review): installing wheels at import time is a Hugging Face Spaces
# workaround for CUDA-specific builds (cu124 index); it runs on every app
# startup. shell=True is used with fixed command strings only (no user input),
# so injection is not a concern here, but pinning happens at runtime rather
# than in requirements — confirm this is intentional for this deployment.
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)

# Imported only after the pip install above has made the package available.
from llama_cpp import Llama


def process_document(pdf_path, page_ids=None):
    """Extract normalized text from a PDF, one entry per page.

    Args:
        pdf_path: Path to the PDF file to read.
        page_ids: Optional iterable of zero-based page numbers to restrict
            extraction to; ``None`` processes every page.

    Returns:
        dict mapping pdfminer's page id (1-based) to that page's text.
    """
    # tqdm wraps pdfminer's lazy page iterator to show extraction progress.
    pages = extract_pages(pdf_path, page_numbers=page_ids)
    return {page.pageid: process_page(page) for page in tqdm(pages)}


def process_page(extracted_page):
    """Flatten one pdfminer page into a single normalized text string.

    Layout elements are visited top-to-bottom (sorted by their top ``y1``
    coordinate, descending), non-text elements are skipped, and any runs of
    newlines in the joined result are collapsed to single spaces.

    Args:
        extracted_page: A pdfminer ``LTPage`` (or any layout container whose
            children expose ``y1`` coordinates).

    Returns:
        The page's text as one space-separated string.
    """
    # NOTE(review): _objs is a private pdfminer attribute; kept because the
    # public iterator does not make coordinate-based re-sorting as direct.
    elements = sorted(extracted_page._objs, key=lambda el: el.y1, reverse=True)
    content = [
        extract_text_and_normalize(element)
        for element in elements
        if isinstance(element, LTTextContainer)
    ]
    return re.sub(r'\n+', ' ', ''.join(content))


def extract_text_and_normalize(element):
    """Normalize the text of a pdfminer text container.

    Each physical line is stripped and has internal whitespace runs collapsed
    to single spaces. Lines whose last character is a word character, digit,
    comma, or hyphen are treated as continuing onto the next line (joined with
    a space); any other ending (e.g. '.' or ':') keeps a newline. Blank lines
    become bare newlines.

    Args:
        element: A pdfminer layout object exposing ``get_text()``.

    Returns:
        The normalized text as a single string.
    """
    parts = []
    for line_text in element.get_text().split('\n'):
        line_text = line_text.strip()
        if not line_text:
            parts.append('\n')
            continue
        # Collapse internal whitespace runs to single spaces (raw strings so
        # regex escapes are not mangled by Python string escaping).
        line_text = re.sub(r'\s+', ' ', line_text)
        # A trailing word char / digit / comma / hyphen suggests the sentence
        # continues; otherwise treat it as a hard line end.
        if re.search(r'[\w\d,\-]', line_text[-1]):
            parts.append(line_text + ' ')
        else:
            parts.append(line_text + '\n')
    # join instead of repeated += — linear instead of quadratic string build.
    return ''.join(parts)


def txt_to_html(text):
    """Wrap each line of *text* in a ``<p>`` tag inside a minimal HTML page.

    Args:
        text: Plain text; lines are split on newlines and stripped.

    Returns:
        An HTML string of the form
        ``<html><body><p>line</p>...</body></html>``.
    """
    # join over a generator instead of += in a loop — linear string build.
    paragraphs = ''.join(
        "<p>{}</p>".format(line.strip()) for line in text.split('\n')
    )
    return "<html><body>" + paragraphs + "</body></html>"


def deidentify_doc(pdftext="", maxtokens=600, temperature=1.2, top_probability=0.95):
    """Run a two-pass LLM de-identification over the supplied report text.

    Pass one redacts person names and addresses; pass two (fed the output of
    pass one) redacts NHS/case-note numbers, dates of birth and calendar
    dates while instructing the model to preserve medical measurements.

    Args:
        pdftext: The report text to de-identify.
        maxtokens: Token budget for each model completion.
        temperature: Sampling temperature for each completion.
        top_probability: Accepted for interface compatibility.
            NOTE(review): never forwarded to the model — confirm intent.

    Returns:
        The model output of the second redaction pass.
    """
    def _redact(instruction, text):
        # NOTE(review): the instruction is sent with role 'assistant' rather
        # than 'system' — confirm this matches the chat template's intent.
        response = model.create_chat_completion(
            messages=[
                {"role": "assistant", "content": instruction},
                {"role": "user", "content": text},
            ],
            max_tokens=maxtokens,
            temperature=temperature
        )
        redacted = response['choices'][0]['message']['content']
        # Debug trace of each pass, matching the original console output.
        print(instruction)
        print(redacted)
        print('-------------------------------------------------------')
        return redacted

    first_pass = _redact(
        "Perform the following actions on given text:  1. Replace any person names with term [redacted]  2. Replace any person addresses with term [redacted]",
        pdftext,
    )
    return _redact(
        "Perform the following actions on given text:  1. Replace NHS number and Case note number and Date of Birth with term [redacted]  2. Replace only the CALENDAR DATES of format 'day/month/year' with term [redacted]  3. DO NOT REPLACE ANY MEDICAL MEASUREMENTS",
        first_pass,
    )

@spaces.GPU(duration=120)
def pdf_to_text(files, output_folder, maxtokens=2048, temperature=0, top_probability=0.95):
    """De-identify the first page of each uploaded PDF.

    Args:
        files: A single file path (wrapped into a list below — the Gradio
            interface currently passes one file).
        output_folder: Output folder path from the UI (only printed; not
            otherwise used by this function).
        maxtokens: Token budget forwarded to deidentify_doc.
        temperature: Sampling temperature forwarded to deidentify_doc.
        top_probability: Forwarded to deidentify_doc (unused there).

    Returns:
        The de-identified text of the last processed PDF, or '' when no PDF
        was processed.
    """
    # Fix: initialize so we never hit UnboundLocalError when the input is not
    # a PDF or its first page has no extractable text.
    anonymized_text = ''
    files = [files]  # TODO: remove wrapping once multi-file input is restored
    for file in files:
        file_name = os.path.basename(file)
        print('File name is ', file_name)
        print('output folder is ', output_folder)
        # splitext handles multi-dot names ('report.v2.pdf') and .PDF, unlike
        # the previous split('.')[1] == 'pdf' check.
        if os.path.splitext(file_name)[1].lower() == '.pdf':
            page2content = process_document(file, page_ids=[0])
            # pdfminer page ids are 1-based, so requested page 0 lands at key 1;
            # .get avoids a KeyError if extraction produced nothing.
            pdftext = page2content.get(1, '')
            if pdftext:
                anonymized_text = deidentify_doc(pdftext, maxtokens, temperature, top_probability)
    return anonymized_text


# Quantized Llama-3-8B-Instruct weights in GGUF format; n_gpu_layers=-1
# offloads every layer to the GPU, n_ctx=2048 matches the pdf_to_text default
# token budget. The file is expected alongside this script.
model_id = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=-1, n_batch=128)

# Gradio widgets. NOTE(review): several of these (css, sliders, max_tokens,
# input_folder, output_path_component) are constructed but never wired into
# the Interface below — see the commented-out `inputs` line. Confirm whether
# they should be removed or reconnected.
css = ".gradio-container {background: 'logo.png'}"
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
max_tokens = gr.Number(value=600, label="Max Tokens")
input_folder = gr.File(file_count='multiple')
input_folder_text = gr.Textbox(label='Enter output folder path')
output_text = gr.Textbox()
output_path_component = gr.File(label="Select Output Path")
# Interface currently takes only a file upload and the output-folder textbox;
# generation parameters fall back to pdf_to_text's defaults.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=['file', input_folder_text],
    # inputs=["textbox", input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
    outputs=output_text,
    title='COBIx Endoscopy Report De-Identification',
    description="This application assists to remove personal information from the uploaded clinical report",
    theme=gr.themes.Soft(),
)
iface.launch()