"""Gradio app that de-identifies the first page of an uploaded clinical PDF.

Text is extracted from the PDF with pdfminer, then passed twice through a
local Llama model (via llama-cpp-python): once to redact names and addresses,
and once to redact identifiers and calendar dates.
"""
import os
import re
import subprocess

from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from tqdm import tqdm
import gradio as gr
import accelerate
import spaces

# The llama-cpp wheels are installed at start-up, before `llama_cpp` is
# imported below, so the order of these statements matters. Argument lists
# are used instead of shell strings so no shell parsing is involved.
subprocess.run([
    'pip', 'install', 'llama-cpp-python==0.2.75',
    '--extra-index-url',
    'https://abetlen.github.io/llama-cpp-python/whl/cu124',
])
subprocess.run(['pip', 'install', 'llama-cpp-agent==0.2.10'])

from llama_cpp import Llama  # noqa: E402  (must follow the pip install above)

# Patterns used by the text normalisation helpers, compiled once.
_NEWLINE_RUN = re.compile(r'\n+')
_WHITESPACE_RUN = re.compile(r'\s+')
# A line whose last character matches this is treated as continuing onto the
# next line (it gets a trailing space instead of a newline).
_CONTINUATION_CHAR = re.compile(r'[\w\d\,\-]')

_NAME_ADDRESS_PROMPT = (
    "Perform the following actions on given text: "
    "1. Replace any person names with term [redacted] "
    "2. Replace any person addresses with term [redacted]"
)
_IDENTIFIER_DATE_PROMPT = (
    "Perform the following actions on given text: "
    "1. Replace NHS number and Case note number and Date of Birth with term [redacted] "
    "2. Replace only the CALENDAR DATES of format 'day/month/year' with term [redacted] "
    "3. DO NOT REPLACE ANY MEDICAL MEASUREMENTS"
)


def process_document(pdf_path, page_ids=None):
    """Extract the text of selected pages of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to pdfminer's ``extract_pages``.
        page_ids: Page numbers forwarded as ``page_numbers`` to
            ``extract_pages``; ``None`` is forwarded unchanged.

    Returns:
        dict mapping each extracted page's ``pageid`` to its text as
        produced by ``process_page``.
    """
    extracted_pages = extract_pages(pdf_path, page_numbers=page_ids)
    return {
        extracted_page.pageid: process_page(extracted_page)
        for extracted_page in tqdm(extracted_pages)
    }


def process_page(extracted_page):
    """Return the text of one pdfminer page layout as a single line.

    Layout elements are visited from the highest ``y1`` coordinate to the
    lowest; only ``LTTextContainer`` elements contribute text. Every run of
    newlines in the combined text is replaced by one space.
    """
    # Iterating the page yields its layout objects; `sorted` is stable, so
    # elements with equal `y1` keep their original relative order.
    elements = sorted(extracted_page, key=lambda item: item.y1, reverse=True)
    content = [
        extract_text_and_normalize(element)
        for element in elements
        if isinstance(element, LTTextContainer)
    ]
    return _NEWLINE_RUN.sub(' ', ''.join(content))


def extract_text_and_normalize(element):
    """Normalise the text of one layout element line by line.

    Each line is stripped and its internal whitespace runs collapsed to one
    space. A blank line becomes ``'\\n'``. A non-blank line is followed by a
    space when its last character is a word character, digit, comma or
    hyphen, and by ``'\\n'`` otherwise.

    Args:
        element: Object exposing ``get_text()`` that returns a string.

    Returns:
        The concatenation of the normalised lines.
    """
    pieces = []
    for raw_line in element.get_text().split('\n'):
        line_text = raw_line.strip()
        if not line_text:
            pieces.append('\n')
            continue
        line_text = _WHITESPACE_RUN.sub(' ', line_text)
        terminator = ' ' if _CONTINUATION_CHAR.search(line_text[-1]) else '\n'
        pieces.append(line_text + terminator)
    return ''.join(pieces)


def txt_to_html(text):
    """Join the stripped lines of ``text`` using the wrapper strings below.

    NOTE(review): as the strings appear in this file they contain only a
    newline and no markup, so the result is not HTML. Presumably opening and
    closing tags were lost at some point -- confirm against the intended
    output before relying on this function.
    """
    parts = ["\n"]
    parts.extend("{}\n".format(line.strip()) for line in text.split('\n'))
    parts.append("")
    return "".join(parts)


def _run_redaction_pass(prompt, text, maxtokens, temperature, top_probability):
    """Run one chat completion with ``prompt`` as instructions for ``text``."""
    response = model.create_chat_completion(
        messages=[
            # NOTE(review): the instructions are sent with the "assistant"
            # role rather than "system"; kept as is -- confirm this is
            # intentional for this model.
            {"role": "assistant", "content": prompt},
            {"role": "user", "content": text},
        ],
        max_tokens=maxtokens,
        temperature=temperature,
        top_p=top_probability,
    )
    result = response['choices'][0]['message']['content']
    # NOTE(review): this writes report text to stdout, including the output
    # of the first pass, which is only partially redacted.
    print(prompt)
    print(result)
    print('-------------------------------------------------------')
    return result


def deidentify_doc(pdftext="", maxtokens=600, temperature=1.2, top_probability=0.95):
    """Redact personal information from ``pdftext`` in two model passes.

    The first pass asks the model to redact person names and addresses; the
    second pass takes the first pass's output and asks it to redact NHS
    number, case note number, date of birth and calendar dates.

    Args:
        pdftext: Text to de-identify.
        maxtokens: ``max_tokens`` for each completion.
        temperature: Sampling temperature for each completion.
        top_probability: Forwarded to the model as ``top_p``.

    Returns:
        The text returned by the second pass, or ``""`` when ``pdftext`` is
        empty (there is nothing to redact, so the model is not called).
    """
    if not pdftext:
        return ""
    first_pass = _run_redaction_pass(
        _NAME_ADDRESS_PROMPT, pdftext, maxtokens, temperature, top_probability
    )
    return _run_redaction_pass(
        _IDENTIFIER_DATE_PROMPT, first_pass, maxtokens, temperature, top_probability
    )


@spaces.GPU(duration=120)
def pdf_to_text(files, output_folder, maxtokens=2048, temperature=0, top_probability=0.95):
    """De-identify the first page of the uploaded PDF file(s).

    Args:
        files: A single file path, or a list of file paths.
        output_folder: Only printed; nothing is written to it here.
        maxtokens: Forwarded to ``deidentify_doc``.
        temperature: Forwarded to ``deidentify_doc``.
        top_probability: Forwarded to ``deidentify_doc``.

    Returns:
        The de-identified text. When several files are given, their results
        are joined with a blank line between them.

    Raises:
        gr.Error: If nothing was uploaded, or if no input yielded text
            (not a PDF, or an empty first page).
    """
    if not files:
        raise gr.Error("Please upload a PDF file.")
    paths = [files] if isinstance(files, (str, os.PathLike)) else list(files)

    results = []
    for path in paths:
        file_name = os.path.basename(path)
        print('File name is ', file_name)
        print('output folder is ', output_folder)
        # Use the real extension so names such as "report.v2.pdf" or
        # "REPORT.PDF" are accepted.
        if os.path.splitext(file_name)[1].lower() != '.pdf':
            continue
        page2content = process_document(path, page_ids=[0])
        # Only one page is requested; take it without assuming its id.
        pdftext = next(iter(page2content.values()), "")
        if pdftext:
            results.append(
                deidentify_doc(pdftext, maxtokens, temperature, top_probability)
            )

    if not results:
        raise gr.Error(
            "No text could be extracted. Please upload a PDF report whose "
            "first page contains text."
        )
    return "\n\n".join(results)


# The model is loaded once at start-up and shared by all requests.
model_id = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
model = Llama(
    model_path=model_id,
    n_ctx=2048,
    n_threads=8,
    n_gpu_layers=-1,
    n_batch=128,
)

# The css string and most of the components below are defined but not passed
# to the Interface; only `input_folder_text` and `output_text` are used.
css = ".gradio-container {background: 'logo.png'}"
temp_slider = gr.Slider(minimum=0, maximum=2, value=0.9, label="Temperature Value")
prob_slider = gr.Slider(minimum=0, maximum=1, value=0.95, label="Max Probability Value")
max_tokens = gr.Number(value=600, label="Max Tokens")
input_folder = gr.File(file_count='multiple')
input_folder_text = gr.Textbox(label='Enter output folder path')
output_text = gr.Textbox()
output_path_component = gr.File(label="Select Output Path")

iface = gr.Interface(
    fn=pdf_to_text,
    inputs=['file', input_folder_text],
    outputs=output_text,
    title='COBIx Endoscopy Report De-Identification',
    description="This application assists to remove personal information from the uploaded clinical report",
    theme=gr.themes.Soft(),
)
iface.launch()