from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from tqdm import tqdm
import re
import gradio as gr
import os
import accelerate
import spaces
import subprocess
# llama-cpp-python is installed at startup so the prebuilt CUDA 12.4 wheel matches
# the GPU allocated to the Space (a common pattern for Hugging Face Spaces apps)
subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
from llama_cpp import Llama
def process_document(pdf_path, page_ids=None):
    extracted_pages = extract_pages(pdf_path, page_numbers=page_ids)
    page2content = {}
    for extracted_page in tqdm(extracted_pages):
        page_id = extracted_page.pageid
        content = process_page(extracted_page)
        page2content[page_id] = content
    return page2content
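# Example (hypothetical file name): extract_pages takes 0-based page numbers while
# LTPage.pageid is 1-based, so the first page of a document comes back under key 1:
#   first_page_text = process_document('report.pdf', page_ids=[0])[1]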
def process_page(extracted_page):
    content = []
    # Sort layout elements top-to-bottom (pdfminer's y-axis increases upwards)
    elements = [element for element in extracted_page._objs]
    elements.sort(key=lambda a: a.y1, reverse=True)
    for i, element in enumerate(elements):
        if isinstance(element, LTTextContainer):
            line_text = extract_text_and_normalize(element)
            content.append(line_text)
    content = re.sub(r'\n+', ' ', ''.join(content))
    return content
def extract_text_and_normalize(element):
    # Extract text from the element and split it into lines
    line_texts = element.get_text().split('\n')
    norm_text = ''
    for line_text in line_texts:
        line_text = line_text.strip()
        if not line_text:
            line_text = '\n'
        else:
            line_text = re.sub(r'\s+', ' ', line_text)
            if not re.search(r'[\w\d,\-]', line_text[-1]):
                line_text += '\n'
            else:
                line_text += ' '
        norm_text += line_text
    return norm_text
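# The check above is a line-joining heuristic: a stripped line whose last character
# is a word character, digit, comma or hyphen is assumed to wrap onto the next line
# (joined with a space); any other ending (e.g. a full stop) keeps its newline.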
def txt_to_html(text):
    html_content = "<html><body>"
    for line in text.split('\n'):
        html_content += "<p>{}</p>".format(line.strip())
    html_content += "</body></html>"
    return html_content
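# Note: txt_to_html is an optional HTML rendering helper; the interface below
# returns plain text, so nothing calls it yet.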
def deidentify_doc(pdftext="", maxtokens=600, temperature=1.2, top_probability=0.95):
    # Pass 1: redact person names and addresses
    prompt = "Perform the following actions on the given text: 1. Replace any person names with the term [redacted] 2. Replace any person addresses with the term [redacted]"
    output = model.create_chat_completion(
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": pdftext}
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability
    )
    output = output['choices'][0]['message']['content']
    print(prompt)
    print(output)
    print('-------------------------------------------------------')

    # Pass 2: redact identifiers and calendar dates, leaving medical measurements intact
    prompt = "Perform the following actions on the given text: 1. Replace NHS number and Case note number and Date of Birth with the term [redacted] 2. Replace only the CALENDAR DATES of format 'day/month/year' with the term [redacted] 3. DO NOT REPLACE ANY MEDICAL MEASUREMENTS"
    output = model.create_chat_completion(
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": output}
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability
    )
    output = output['choices'][0]['message']['content']
    print(prompt)
    print(output)
    print('-------------------------------------------------------')
    return output
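# Hypothetical usage sketch (not part of the app flow): running the two-pass
# redaction on an inline snippet with deterministic sampling.
#   sample = "Patient John Smith, NHS 123 456 7890, seen 01/02/2023, BP 120/80"
#   cleaned = deidentify_doc(sample, maxtokens=600, temperature=0)
#   # Names, the NHS number and the calendar date should come back as [redacted];
#   # the blood-pressure measurement 120/80 should be left intact by pass 2.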
@spaces.GPU(duration=120)
def pdf_to_text(files, output_folder, maxtokens=2048, temperature=0, top_probability=0.95):
    # The 'file' input delivers a single file object; wrap it so the loop handles
    # the multi-file case uniformly
    files = [files]
    anonymized_text = ''
    for file in files:
        file_name = os.path.basename(file)
        print('File name is', file_name)
        print('Output folder is', output_folder)
        if file_name.lower().endswith('.pdf'):
            # De-identify only the first page for now (pdfminer page ids are 1-based)
            page2content = process_document(file, page_ids=[0])
            pdftext = page2content[1]
            if pdftext:
                anonymized_text = deidentify_doc(pdftext, maxtokens, temperature, top_probability)
    return anonymized_text
model_id = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=-1, n_batch=128)
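# n_gpu_layers=-1 offloads all model layers to the GPU; n_ctx=2048 caps the
# combined prompt + completion length, so only text that fits this window
# (roughly one report page) can be de-identified in a single call.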
css = ".gradio-container {background: 'logo.png'}"
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Top Probability Value")
max_tokens = gr.Number(value=600, label="Max Tokens")
input_folder = gr.File(file_count='multiple')
input_folder_text = gr.Textbox(label='Enter output folder path')
output_text = gr.Textbox()
output_path_component = gr.File(label="Select Output Path")
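# Only the file input and the output-folder textbox are wired into the interface
# below; the sliders, max-token field and output path picker are kept for the
# commented-out full input set.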
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=['file', input_folder_text],
    # inputs=["textbox", input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
    outputs=output_text,
    title='COBIx Endoscopy Report De-Identification',
    description="This application helps remove personal information from an uploaded clinical report",
    theme=gr.themes.Soft(),
)
iface.launch()