srijaydeshpande committed on
Commit
25632d4
·
verified ·
1 Parent(s): 5958396

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -78
app.py CHANGED
@@ -18,13 +18,21 @@ from llama_cpp_agent.chat_history.messages import Roles
18
  # subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
19
  # subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
20
 
 
 
21
 
22
  hf_hub_download(
23
- repo_id="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
24
- filename="Meta-Llama-3-8B-Instruct.Q8_0.gguf",
25
  local_dir = "./models"
26
  )
27
 
 
 
 
 
 
 
28
  # hf_hub_download(
29
  # repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
30
  # filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
@@ -79,95 +87,107 @@ def txt_to_html(text):
79
  return html_content
80
 
81
  def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
82
-
83
- #### Remove Dates ###
84
- prompt = "In the following text replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)'"
85
- output = llm.create_chat_completion(
86
- messages=[
87
- {"role": "assistant", "content": prompt},
88
- {
89
- "role": "user",
90
- "content": pdftext
91
- }
92
- ],
93
- max_tokens=maxtokens,
94
- temperature=temperature
95
- )
96
- output = output['choices'][0]['message']['content']
97
-
98
- # Remove starting header string in output
99
- find_index = output.find(' '.join(pdftext.split()[:3]))
100
- if find_index != -1:
101
- output = output[find_index:].strip()
102
-
103
 
104
-
105
- # #### Remove Locations and Addresses ###
106
- prompt = "In the following text replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address, such as 'Phanton Medical Centre, Birmingham, CV36HE' with term [address]. It is important that all addresses are completely replaced with [address]."
107
  output = llm.create_chat_completion(
108
  messages=[
109
- {"role": "assistant", "content": prompt},
110
- {
111
- "role": "user",
112
- "content": output
113
- }
114
  ],
115
  max_tokens=maxtokens,
116
  temperature=temperature
117
  )
118
  output = output['choices'][0]['message']['content']
119
-
120
 
121
  # Remove starting header string in output
122
  find_index = output.find(' '.join(pdftext.split()[:3]))
123
  if find_index != -1:
124
  output = output[find_index:].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
-
127
-
128
- #### Remove Names ###
129
- prompt = "In the following text replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists."
130
- output = llm.create_chat_completion(
131
- messages=[
132
- {"role": "assistant", "content": prompt},
133
- {
134
- "role": "user",
135
- "content": output
136
- }
137
- ],
138
- max_tokens=maxtokens,
139
- temperature=temperature
140
- )
141
- output = output['choices'][0]['message']['content']
142
-
143
- # Remove starting header string in output
144
- find_index = output.find(' '.join(pdftext.split()[:3]))
145
- if find_index != -1:
146
- output = output[find_index:].strip()
147
-
148
-
149
-
150
-
151
- ### Remove Registration Numbers ###
152
-
153
- prompt = "In the following text replace the nhs number and the case note number with term [ID]. Replace Hospital number with [ID]."
154
- output = llm.create_chat_completion(
155
- messages=[
156
- {"role": "assistant", "content": prompt},
157
- {
158
- "role": "user",
159
- "content": output
160
- }
161
- ],
162
- max_tokens=maxtokens,
163
- temperature=temperature
164
- )
165
- output = output['choices'][0]['message']['content']
166
-
167
- # Remove starting header string in output
168
- find_index = output.find(' '.join(pdftext.split()[:3]))
169
- if find_index != -1:
170
- output = output[find_index:].strip()
 
 
 
171
 
172
  return output
173
 
@@ -175,7 +195,7 @@ def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
175
  def pdf_to_text(files, maxtokens=2048, temperature=0, top_probability=0.95):
176
  files=[files]
177
  llm = Llama(
178
- model_path="models/Meta-Llama-3-8B-Instruct.Q8_0.gguf",
179
  flash_attn=True,
180
  n_gpu_layers=81,
181
  n_batch=1024,
 
18
  # subprocess.run('pip install llama-cpp-python==0.2.75 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124', shell=True)
19
  # subprocess.run('pip install llama-cpp-agent==0.2.10', shell=True)
20
 
21
+ repo_id = "srijaydeshpande/Deid-Fine-Tuned"
22
+ model_id = "deid_finetuned.Q4_K_M.gguf"
23
 
24
  hf_hub_download(
25
+ repo_id=repo_id,
26
+ filename=model_id,
27
  local_dir = "./models"
28
  )
29
 
30
+ # hf_hub_download(
31
+ # repo_id="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF",
32
+ # filename="Meta-Llama-3-8B-Instruct.Q8_0.gguf",
33
+ # local_dir = "./models"
34
+ # )
35
+
36
  # hf_hub_download(
37
  # repo_id="bartowski/Meta-Llama-3-70B-Instruct-GGUF",
38
  # filename="Meta-Llama-3-70B-Instruct-Q3_K_M.gguf",
 
87
  return html_content
88
 
89
  def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ prompt = "In the following text, perform the following actions: 1. Replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' 2. Replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address, such as 'Phanton Medical Centre, Birmingham, CV36HE' with term [address]. It is important that all addresses are completely replaced with [address]. 3. Replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists. 4. Replace the nhs number and the case note number with term [ID]. Replace Hospital number with [ID]."
 
 
92
  output = llm.create_chat_completion(
93
  messages=[
94
+ {"from": "user", "value": prompt + ' Text: ' + pdftext},
 
 
 
 
95
  ],
96
  max_tokens=maxtokens,
97
  temperature=temperature
98
  )
99
  output = output['choices'][0]['message']['content']
 
100
 
101
  # Remove starting header string in output
102
  find_index = output.find(' '.join(pdftext.split()[:3]))
103
  if find_index != -1:
104
  output = output[find_index:].strip()
105
+
106
+ # #### Remove Dates ###
107
+ # prompt = "In the following text replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)'"
108
+ # output = llm.create_chat_completion(
109
+ # messages=[
110
+ # {"role": "assistant", "content": prompt},
111
+ # {
112
+ # "role": "user",
113
+ # "content": pdftext
114
+ # }
115
+ # ],
116
+ # max_tokens=maxtokens,
117
+ # temperature=temperature
118
+ # )
119
+ # output = output['choices'][0]['message']['content']
120
+
121
+ # # Remove starting header string in output
122
+ # find_index = output.find(' '.join(pdftext.split()[:3]))
123
+ # if find_index != -1:
124
+ # output = output[find_index:].strip()
125
+
126
+
127
+
128
+ # # #### Remove Locations and Addresses ###
129
+ # prompt = "In the following text replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address, such as 'Phanton Medical Centre, Birmingham, CV36HE' with term [address]. It is important that all addresses are completely replaced with [address]."
130
+ # output = llm.create_chat_completion(
131
+ # messages=[
132
+ # {"role": "assistant", "content": prompt},
133
+ # {
134
+ # "role": "user",
135
+ # "content": output
136
+ # }
137
+ # ],
138
+ # max_tokens=maxtokens,
139
+ # temperature=temperature
140
+ # )
141
+ # output = output['choices'][0]['message']['content']
142
 
143
+
144
+ # # Remove starting header string in output
145
+ # find_index = output.find(' '.join(pdftext.split()[:3]))
146
+ # if find_index != -1:
147
+ # output = output[find_index:].strip()
148
+
149
+
150
+
151
+ # #### Remove Names ###
152
+ # prompt = "In the following text replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists."
153
+ # output = llm.create_chat_completion(
154
+ # messages=[
155
+ # {"role": "assistant", "content": prompt},
156
+ # {
157
+ # "role": "user",
158
+ # "content": output
159
+ # }
160
+ # ],
161
+ # max_tokens=maxtokens,
162
+ # temperature=temperature
163
+ # )
164
+ # output = output['choices'][0]['message']['content']
165
+
166
+ # # Remove starting header string in output
167
+ # find_index = output.find(' '.join(pdftext.split()[:3]))
168
+ # if find_index != -1:
169
+ # output = output[find_index:].strip()
170
+
171
+ # ### Remove Registration Numbers ###
172
+
173
+ # prompt = "In the following text replace the nhs number and the case note number with term [ID]. Replace Hospital number with [ID]."
174
+ # output = llm.create_chat_completion(
175
+ # messages=[
176
+ # {"role": "assistant", "content": prompt},
177
+ # {
178
+ # "role": "user",
179
+ # "content": output
180
+ # }
181
+ # ],
182
+ # max_tokens=maxtokens,
183
+ # temperature=temperature
184
+ # )
185
+ # output = output['choices'][0]['message']['content']
186
+
187
+ # # Remove starting header string in output
188
+ # find_index = output.find(' '.join(pdftext.split()[:3]))
189
+ # if find_index != -1:
190
+ # output = output[find_index:].strip()
191
 
192
  return output
193
 
 
195
  def pdf_to_text(files, maxtokens=2048, temperature=0, top_probability=0.95):
196
  files=[files]
197
  llm = Llama(
198
+ model_path="models/" + model_id,
199
  flash_attn=True,
200
  n_gpu_layers=81,
201
  n_batch=1024,