srijaydeshpande committed on
Commit
e289e8d
·
verified ·
1 Parent(s): 11f2903

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -11
app.py CHANGED
@@ -79,36 +79,37 @@ def txt_to_html(text):
79
  return html_content
80
 
81
  def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
82
-
83
- # #### Remove Locations and Addresses ###
84
- prompt = "In the following text replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address with term [address]. It is important that all addresses are fully replaced with [address]."
85
  output = llm.create_chat_completion(
86
  messages=[
87
  {"role": "assistant", "content": prompt},
88
  {
89
  "role": "user",
90
- "content": pdftext
91
  }
92
  ],
93
  max_tokens=maxtokens,
94
  temperature=temperature
95
  )
96
  output = output['choices'][0]['message']['content']
97
-
98
 
99
  # Remove starting header string in output
100
  find_index = output.find(' '.join(pdftext.split()[:3]))
101
  if find_index != -1:
102
  output = output[find_index:].strip()
103
-
104
- #### Remove Dates ###
105
- prompt = "In the following text replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)'"
 
 
106
  output = llm.create_chat_completion(
107
  messages=[
108
  {"role": "assistant", "content": prompt},
109
  {
110
  "role": "user",
111
- "content": output
112
  }
113
  ],
114
  max_tokens=maxtokens,
@@ -123,6 +124,7 @@ def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
123
  output = output[find_index:].strip()
124
 
125
 
 
126
  #### Remove Names ###
127
  prompt = "In the following text replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists."
128
  output = llm.create_chat_completion(
@@ -143,8 +145,7 @@ def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
143
  if find_index != -1:
144
  output = output[find_index:].strip()
145
 
146
- # print('---------------Remove Names-----------------------')
147
- # print(output)
148
 
149
 
150
  ### Remove Registration Numbers ###
 
79
  return html_content
80
 
81
  def deidentify_doc(llm, pdftext, maxtokens, temperature, top_probability):
82
+
83
+ #### Remove Dates ###
84
+ prompt = "In the following text replace only the calendar dates with term [date]. Example: if input is 'Date of birth: 15/5/1959 calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)' output should be 'Date of birth: [date] calculated BP (Systolic 158.00 mm, Diastolic 124.95 mm)'"
85
  output = llm.create_chat_completion(
86
  messages=[
87
  {"role": "assistant", "content": prompt},
88
  {
89
  "role": "user",
90
+ "content": output
91
  }
92
  ],
93
  max_tokens=maxtokens,
94
  temperature=temperature
95
  )
96
  output = output['choices'][0]['message']['content']
 
97
 
98
  # Remove starting header string in output
99
  find_index = output.find(' '.join(pdftext.split()[:3]))
100
  if find_index != -1:
101
  output = output[find_index:].strip()
102
+
103
+
104
+
105
+ # #### Remove Locations and Addresses ###
106
+ prompt = "In the following text replace location or address, such as '3970 Longview Drive, CV36HE' with term [address]. Replace complete GP address with term [address]. It is important that all addresses are fully replaced with [address]."
107
  output = llm.create_chat_completion(
108
  messages=[
109
  {"role": "assistant", "content": prompt},
110
  {
111
  "role": "user",
112
+ "content": pdftext
113
  }
114
  ],
115
  max_tokens=maxtokens,
 
124
  output = output[find_index:].strip()
125
 
126
 
127
+
128
  #### Remove Names ###
129
  prompt = "In the following text replace any person name with term [name]. It is important that all person names are replaced with term [name]. Remove any gender terms 'male' or 'female' if exists."
130
  output = llm.create_chat_completion(
 
145
  if find_index != -1:
146
  output = output[find_index:].strip()
147
 
148
+
 
149
 
150
 
151
  ### Remove Registration Numbers ###