srijaydeshpande committed
Commit 40e5815 · verified · Parent: f41ea40

Update app.py

Files changed (1):
  1. app.py (+72 -56)
app.py CHANGED
@@ -13,48 +13,49 @@ subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
 
 from llama_cpp import Llama
 
-# HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
 def process_document(pdf_path, page_ids=None):
-
-    extracted_pages = extract_pages(pdf_path, page_numbers=page_ids)
-
-    page2content = {}
-
-    for extracted_page in tqdm(extracted_pages):
-        page_id = extracted_page.pageid
-        content = process_page(extracted_page)
-        page2content[page_id] = content
-
-    return page2content
+    extracted_pages = extract_pages(pdf_path, page_numbers=page_ids)
+
+    page2content = {}
+
+    for extracted_page in tqdm(extracted_pages):
+        page_id = extracted_page.pageid
+        content = process_page(extracted_page)
+        page2content[page_id] = content
+
+    return page2content
 
 def process_page(extracted_page):
-    content = []
-    elements = [element for element in extracted_page._objs]
-    elements.sort(key=lambda a: a.y1, reverse=True)
-    for i, element in enumerate(elements):
-        if isinstance(element, LTTextContainer):
-            line_text = extract_text_and_normalize(element)
-            content.append(line_text)
-    content = re.sub('\n+', ' ', ''.join(content))
-    return content
+    content = []
+    elements = [element for element in extracted_page._objs]
+    elements.sort(key=lambda a: a.y1, reverse=True)
+    for i, element in enumerate(elements):
+        if isinstance(element, LTTextContainer):
+            line_text = extract_text_and_normalize(element)
+            content.append(line_text)
+    content = re.sub('\n+', ' ', ''.join(content))
+    return content
+
 
 def extract_text_and_normalize(element):
-    # Extract text from line and split it with new lines
-    line_texts = element.get_text().split('\n')
-    norm_text = ''
-    for line_text in line_texts:
-        line_text=line_text.strip()
-        if not line_text:
-            line_text = '\n'
-        else:
-            line_text = re.sub('\s+', ' ', line_text)
-            if not re.search('[\w\d\,\-]', line_text[-1]):
-                line_text+='\n'
-            else:
-                line_text+=' '
-        norm_text+=line_text
-    return norm_text
+    # Extract text from line and split it with new lines
+    line_texts = element.get_text().split('\n')
+    norm_text = ''
+    for line_text in line_texts:
+        line_text = line_text.strip()
+        if not line_text:
+            line_text = '\n'
+        else:
+            line_text = re.sub('\s+', ' ', line_text)
+            if not re.search('[\w\d\,\-]', line_text[-1]):
+                line_text += '\n'
+            else:
+                line_text += ' '
+        norm_text += line_text
+    return norm_text
+
 
 def txt_to_html(text):
     html_content = "<html><body>"
@@ -63,23 +64,37 @@ def txt_to_html(text):
     html_content += "</body></html>"
     return html_content
 
-def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
-
-    # prompt = "Please anonymize the following clinical note. Replace all the following information with the term '[redacted]': Redact any strings that might be a name or initials, patients’ names, doctors’ names, the names Dr., redact any medical staff names, redact any strings that might be a location or address, such as '3970 Longview Drive', redact any strings that look like 'age 37', redact any dates and registration numbers, redact professions such as 'manager', redact any contact information."
-
-    output = model.create_chat_completion(
-        messages = [
-            {"role": "assistant", "content": prompt},
-            {
-                "role": "user",
-                "content": pdftext
-            }
-        ],
-        max_tokens=maxtokens,
-        temperature=temperature
-    )
-    output = output['choices'][0]['message']['content']
+
+def deidentify_doc(pdftext="", prompt="", maxtokens=600, temperature=1.2, top_probability=0.95):
+
+    def replace_words_with_asterisk(big_string, words_to_replace):
+        for word in words_to_replace:
+            big_string = big_string.replace(word, '*')
+        return big_string
+
+    def get_output(pdfcontent):
+        output = model.create_chat_completion(
+            messages=[
+                {"role": "assistant", "content": prompt},
+                {
+                    "role": "user",
+                    "content": pdfcontent
+                }
+            ],
+            max_tokens=maxtokens,
+            temperature=temperature
+        )
+        wordstoremove = output['choices'][0]['message']['content']
+        position = wordstoremove.find("STARTTOKEN,")
+        if position != -1:
+            wordstoremove = wordstoremove[position + len("STARTTOKEN,"):].strip()
+        output = replace_words_with_asterisk(pdftext, wordstoremove.split(','))
+        return output
+
+    iterations=2
+    output = pdftext
+    for _ in iterations:
+        output = get_output(output)
     return output
 
 def mkdir(dir):
@@ -88,19 +103,20 @@ def mkdir(dir):
 
 @spaces.GPU(duration=120)
 def pdf_to_text(files, output_folder, prompt, maxtokens=600, temperature=1.2, top_probability=0.95):
-    output_folder = output_folder.replace('\\','/')
+    output_folder = output_folder.replace('\\', '/')
     for file in files:
         file_name = os.path.basename(file)
         file_name_splt = file_name.split('.')
         print('File name is ', file_name)
         print('output folder is ', output_folder)
-        if(len(file_name_splt)>1 and file_name_splt[1]=='pdf'):
-            page2content = process_document(file, page_ids=[0])
-            pdftext = page2content[1]
-            if(pdftext):
-                anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
+        if (len(file_name_splt) > 1 and file_name_splt[1] == 'pdf'):
+            page2content = process_document(file, page_ids=[0])
+            pdftext = page2content[1]
+            if (pdftext):
+                anonymized_text = deidentify_doc(pdftext, prompt, maxtokens, temperature, top_probability)
     return anonymized_text
 
+
 model_id = "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
 model = Llama(model_path=model_id, n_ctx=2048, n_threads=8, n_gpu_layers=81, n_batch=64)
 
@@ -113,11 +129,11 @@ input_folder_text = gr.Textbox(label='Enter output folder path')
 output_text = gr.Textbox()
 output_path_component = gr.File(label="Select Output Path")
 iface = gr.Interface(
-    fn = pdf_to_text,
-    inputs = ['files', input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
+    fn=pdf_to_text,
+    inputs=['files', input_folder_text, "textbox", max_tokens, temp_slider, prob_slider],
     outputs=output_text,
     title='COBIx Endoscopy Report De-Identification',
     description="This application assists to remove personal information from the uploaded clinical report",
     theme=gr.themes.Soft(),
-    )
+)
 iface.launch()
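
For context, a minimal sketch of the redaction flow the rewritten deidentify_doc introduces: the model reply is expected to contain a "STARTTOKEN," marker followed by a comma-separated list of strings to remove, and each listed string is replaced with '*' in the report text over a fixed number of passes. This is an illustration, not the app's code: fake_model_reply, redact, and the sample note are hypothetical stand-ins for the llama-cpp-python call, each list fragment is stripped of surrounding spaces here, and the pass loop uses range(iterations) so it actually iterates.

def replace_words_with_asterisk(big_string, words_to_replace):
    # Star out every listed term; stripping each fragment lets "John Doe" and
    # " John Doe" (the space left by splitting on ',') both match.
    for word in words_to_replace:
        word = word.strip()
        if word:
            big_string = big_string.replace(word, '*')
    return big_string


def fake_model_reply(text):
    # Hypothetical stand-in for model.create_chat_completion(); the real reply
    # is assumed to contain "STARTTOKEN," plus a comma-separated list of identifiers.
    return "Here are the identifiers. STARTTOKEN, John Doe, 3970 Longview Drive, age 37"


def redact(pdftext, iterations=2):
    output = pdftext
    for _ in range(iterations):  # fixed number of passes over the text
        reply = fake_model_reply(output)
        position = reply.find("STARTTOKEN,")
        if position != -1:
            reply = reply[position + len("STARTTOKEN,"):].strip()
        output = replace_words_with_asterisk(output, reply.split(','))
    return output


if __name__ == "__main__":
    note = "Patient John Doe, age 37, lives at 3970 Longview Drive."
    print(redact(note))  # Patient *, *, lives at *.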