srijaydeshpande committed on
Commit
4c94a49
·
verified ·
1 Parent(s): 5ec760f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -4
app.py CHANGED
@@ -1,7 +1,111 @@
 
 
 
 
1
  import gradio as gr
 
 
2
 
3
- def greet(name):
4
- return "Hello Srijay " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pdfminer.high_level import extract_pages
2
+ from pdfminer.layout import LTTextContainer
3
+ from tqdm import tqdm
4
+ import re
5
  import gradio as gr
6
+ from llama_cpp import Llama
7
+ import os
8
 
9
def process_document(pdf_path, page_ids=None):
    """Extract the normalized text of each requested page of a PDF.

    Args:
        pdf_path: PDF to read; handed unchanged to ``extract_pages``.
        page_ids: Forwarded to ``extract_pages`` as ``page_numbers``.
            Defaults to ``None``.

    Returns:
        A dict mapping each extracted page's ``pageid`` to the string
        produced by ``process_page`` for that page.
    """
    pages = extract_pages(pdf_path, page_numbers=page_ids)
    # tqdm wraps the page iterator so progress is shown as pages are consumed.
    return {page.pageid: process_page(page) for page in tqdm(pages)}
22
+
23
def process_page(extracted_page):
    """Return the text of one extracted page as a single string.

    The page's layout elements are ordered by descending ``y1`` and the
    text of every ``LTTextContainer`` among them is normalized with
    ``extract_text_and_normalize``. The fragments are concatenated and each
    run of newlines is replaced by one space.

    Args:
        extracted_page: Iterable layout container whose elements expose a
            ``y1`` attribute (the pages yielded by ``extract_pages``).

    Returns:
        The page text with no newline characters.
    """
    # Iterate the container itself instead of reaching into its private
    # ``_objs`` list; sorted() builds the ordered copy in one step.
    ordered = sorted(extracted_page, key=lambda item: item.y1, reverse=True)
    fragments = [
        extract_text_and_normalize(item)
        for item in ordered
        # Non-text elements carry no text to extract and are skipped.
        if isinstance(item, LTTextContainer)
    ]
    return re.sub('\n+', ' ', ''.join(fragments))
35
+
36
def extract_text_and_normalize(element):
    """Return the text of a layout element with its whitespace normalized.

    The element's text is split on newlines and every piece is handled
    independently:

    * a piece that is empty after stripping becomes a single newline;
    * otherwise runs of whitespace collapse to one space, and the piece is
      terminated with a space when its last character is a word character,
      a comma or a hyphen, or with a newline for any other final character.

    Args:
        element: Object exposing ``get_text()`` that returns a string.

    Returns:
        The normalized pieces concatenated into one string.
    """
    pieces = []
    for raw_line in element.get_text().split('\n'):
        line = raw_line.strip()
        if not line:
            # Blank lines are kept as explicit line breaks.
            pieces.append('\n')
            continue
        # Raw-string patterns: the previous non-raw literals relied on
        # invalid escape sequences (\s, \w, \d, \,, \-), which newer Python
        # versions warn about. The patterns themselves are unchanged.
        line = re.sub(r'\s+', ' ', line)
        ends_mid_sentence = re.search(r'[\w\d\,\-]', line[-1])
        pieces.append(line + (' ' if ends_mid_sentence else '\n'))
    # Join once instead of growing a string with += inside the loop.
    return ''.join(pieces)
56
+
57
def txt_to_html(text):
    """Render plain text as a minimal HTML document, one paragraph per line.

    Each newline-separated line is stripped, HTML-escaped and wrapped in a
    ``<p>`` element. Escaping matters because the text comes from model
    output derived from an uploaded document: markup characters in it must
    be displayed literally rather than interpreted by the browser.

    Args:
        text: The text to render.

    Returns:
        A string of the form ``<html><body>...</body></html>``.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from html import escape

    paragraphs = "".join(
        # quote=False: the text lands in element content, not in an attribute.
        "<p>{}</p>".format(escape(line.strip(), quote=False))
        for line in text.split('\n')
    )
    return "<html><body>" + paragraphs + "</body></html>"
63
+
64
def deidentify_doc(pdftext=""):
    """Ask the language model to redact personal information from a note.

    The redaction instructions are sent as the system message and the note
    as the user message to the module-level ``model``. Generation is capped
    at 600 tokens with temperature 0.

    Args:
        pdftext: Text of the clinical note to anonymize.

    Returns:
        The content of the model's first completion choice, or an empty
        string when ``pdftext`` is empty or only whitespace.
    """
    # Nothing to redact: skip inference rather than sending an empty user
    # turn for the model to answer.
    if not pdftext or not pdftext.strip():
        return ""

    prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."

    print('Input prompt is ',prompt)
    # Only the size is logged: the note is the un-redacted clinical text and
    # must not be copied into stdout/logs by a de-identification tool.
    print('Input pdf text length is ', len(pdftext))

    output = model.create_chat_completion(
        messages=[
            # Instructions belong in the system role; sent as "assistant"
            # they are presented to the model as its own earlier reply.
            {"role": "system", "content": prompt},
            {"role": "user", "content": pdftext},
        ],
        max_tokens=600,
        temperature=0,
    )
    return output['choices'][0]['message']['content']
86
+
87
def pdf_to_text(file):
    """De-identify the first page of an uploaded PDF and return it as HTML.

    The first page (page index 0) is extracted with ``process_document``,
    passed through ``deidentify_doc``, and rendered with ``txt_to_html``.
    The resulting HTML is also written to ``out.html`` in the current
    working directory.

    Args:
        file: The uploaded PDF as supplied by the Gradio file input; a falsy
            value means nothing was uploaded and empty text is used.

    Returns:
        The HTML string produced by ``txt_to_html``.
    """
    pdftext = ""
    if file:
        page2content = process_document(file, page_ids=[0])
        # Exactly one page was requested, so take whichever single entry
        # came back instead of assuming its pageid is 1. A PDF that yields
        # no page now falls back to empty text rather than raising KeyError.
        pdftext = next(iter(page2content.values()), "")
    display_text = deidentify_doc(pdftext)
    html = txt_to_html(display_text)
    # Separate handle name: reusing ``file`` here shadowed the argument.
    with open('out.html', "w", encoding="utf-8") as out_file:
        out_file.write(html)
    return html
97
+
98
# Path of the GGUF weights file handed to llama.cpp.
# NOTE(review): this looks like "<hub repo>/<filename>" rather than a local
# path; Llama(model_path=...) opens a file on disk — confirm the file exists
# at this relative location in the deployment.
model_id = "srijaydeshpande/llama3-8B-Instruct-Q5KM/Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"

# The model is loaded once at import time and shared by deidentify_doc().
model = Llama(
    model_path=model_id,
    n_ctx=2048,
    n_threads=8,
    n_gpu_layers=32,
    n_batch=128,
)

# NOTE(review): ``css`` is defined but never passed to gr.Interface below,
# so it currently has no effect — confirm whether it should be wired in.
css = ".gradio-container {background: 'logo.png'}"

# Web UI: one file upload in, rendered HTML out.
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=['file'],
    outputs="html",
    title='COBIx Endoscopy Report De-Identification',
    description="This application assists to remove personal information from the uploaded clinical report",
    theme=gr.themes.Soft(),
)
iface.launch()