Spaces:
Running
Running
danielhajialigol
committed on
Commit
·
94c41db
1
Parent(s):
a8118cc
removed uncleaned summaries
Browse files
- all_summaries.csv +2 -2
- related_summaries.py +13 -2
- utils.py +45 -0
all_summaries.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ee0536b4e7e2297521a1f11b6f18e788d77f70129c8651fbbb3e7044782e3675
|
3 |
+
size 28540700
|
related_summaries.py
CHANGED
@@ -2,6 +2,7 @@ import pandas as pd
|
|
2 |
import torch
|
3 |
from transformers import AutoTokenizer, AutoModel, set_seed
|
4 |
from tqdm import tqdm
|
|
|
5 |
|
6 |
from model import MimicTransformer
|
7 |
set_seed(42)
|
@@ -17,7 +18,7 @@ mimic.eval()
|
|
17 |
mimic.cuda()
|
18 |
tokenizer = mimic.tokenizer
|
19 |
|
20 |
-
summaries = pd.read_csv('
|
21 |
|
22 |
def mean_pooling(model_output, attention_mask):
|
23 |
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
@@ -34,10 +35,20 @@ def get_model_outputs(text):
|
|
34 |
|
35 |
return_tensors = torch.zeros(size=(10000, 738))
|
36 |
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
res = get_model_outputs(text=summary)
|
39 |
return_tensors[i, :] = res.detach().cpu()
|
40 |
# sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
41 |
# sentence_embeddings = sentence_embeddings/sentence_embeddings.norm(dim=1)[:,None]
|
42 |
|
|
|
43 |
torch.save(return_tensors, f='discharge_embeddings.pt')
|
|
|
2 |
import torch
|
3 |
from transformers import AutoTokenizer, AutoModel, set_seed
|
4 |
from tqdm import tqdm
|
5 |
+
from utils import clean_text
|
6 |
|
7 |
from model import MimicTransformer
|
8 |
set_seed(42)
|
|
|
18 |
mimic.cuda()
|
19 |
tokenizer = mimic.tokenizer
|
20 |
|
21 |
+
summaries = pd.read_csv('all_summaries_backup.csv')['SUMMARIES']
|
22 |
|
23 |
def mean_pooling(model_output, attention_mask):
|
24 |
token_embeddings = model_output[0] #First element of model_output contains all token embeddings
|
|
|
35 |
|
36 |
return_tensors = torch.zeros(size=(10000, 738))
|
37 |
|
38 |
+
# Cleaned summaries that pass the length filter, in the order encountered.
non_defunct_summaries = []

# The embedding buffer allocated above fixes how many summaries can be kept.
max_summaries = return_tensors.shape[0]

# Wrapping the sliced Series directly (rather than an enumerate object) lets
# tqdm know the total and render a real progress bar.
for summary in tqdm(summaries[:50000]):
    # Stop before cleaning another summary once the buffer is full.
    if len(non_defunct_summaries) == max_summaries:
        break
    cleaned = clean_text(summary)
    # NOTE(review): clean_text appears to blank out removed spans with spaces
    # and pad the tail, so len(cleaned) equals the raw length; this threshold
    # may therefore not measure the remaining content -- confirm intent.
    if len(cleaned) > 100:
        non_defunct_summaries.append(cleaned)

# Row i of return_tensors holds the model output for non_defunct_summaries[i].
# If fewer than max_summaries were collected, the remaining rows stay zero.
for i, summary in enumerate(tqdm(non_defunct_summaries)):
    res = get_model_outputs(text=summary)
    return_tensors[i, :] = res.detach().cpu()
    # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    # sentence_embeddings = sentence_embeddings/sentence_embeddings.norm(dim=1)[:,None]

# Persist the kept summaries and their embeddings; row order matches.
pd.DataFrame(data={'SUMMARIES':non_defunct_summaries}).to_csv('all_summaries.csv', index=False)
torch.save(return_tensors, f='discharge_embeddings.pt')
|
utils.py
CHANGED
@@ -3,6 +3,7 @@ import json
|
|
3 |
import pandas as pd
|
4 |
import ssl
|
5 |
import torch
|
|
|
6 |
from pprint import pprint
|
7 |
from captum.attr import visualization
|
8 |
|
@@ -20,6 +21,50 @@ class PyTMinMaxScalerVectorized(object):
|
|
20 |
tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
|
21 |
return tensor
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def get_drg_link(drg_code):
    """Return the aapc.com code-lookup URL ending in *drg_code*."""
    url = f'https://www.aapc.com/codes/icd9-codes/{drg_code}'
    return url
|
25 |
|
|
|
3 |
import pandas as pd
|
4 |
import ssl
|
5 |
import torch
|
6 |
+
import re
|
7 |
from pprint import pprint
|
8 |
from captum.attr import visualization
|
9 |
|
|
|
21 |
tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
|
22 |
return tensor
|
23 |
|
24 |
+
# Patterns whose first match marks the start of trailing boilerplate.
# Compiled once at import time instead of on every call.
_END_PATTERNS = (
    re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
    # The dot is escaped so only a literal "DR." ends the report; an unescaped
    # dot would also cut at indented lines such as "   Dressing ..." or
    # "   Drug ...".
    re.compile(r'\n {3,}DR\.', re.I),
    re.compile(r'[ ]{1,}RADLINE ', re.I),
    # The leading ``.*`` moves the cut to the start of the signature line.
    re.compile(r'.*electronically signed on', re.I),
    re.compile(r'M\[0KM\[0KM'),
)


def find_end(text):
    """Find the end of the report.

    Searches *text* for each known end-of-report marker and returns the
    smallest start offset among the markers that are present. If no marker
    matches, returns ``len(text)``.
    """
    found = (pattern.search(text) for pattern in _END_PATTERNS)
    # len(text) is always a candidate, so min() never sees an empty list.
    return min([len(text)] + [m.start() for m in found if m])
|
39 |
+
|
40 |
+
def pattern_repl(matchobj):
    """
    Build the blank filler substituted for a regex match.

    The result is a run of spaces as long as the matched text, so replacing
    the match keeps every later character at its original offset. A
    zero-length match still yields one space.
    """
    matched = matchobj.group(0)
    return ' ' * max(len(matched), 1)
|
45 |
+
|
46 |
+
def clean_text(text):
    """
    Blank out de-identification markers, underscores and the report footer.

    ``[**...**]`` spans are replaced via ``pattern_repl``, each ``_`` becomes
    a space, and everything from the offset returned by ``find_end`` onward
    is overwritten with spaces, so the result has the same length as the
    substituted text.
    """
    # Replace [**Patterns**] with spaces.
    blanked = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Replace `_` with spaces.
    blanked = re.sub(r'_', ' ', blanked)

    cutoff = find_end(blanked)
    # Keep the body and pad the removed tail so the length is unchanged.
    tail_padding = ' ' * (len(blanked) - cutoff)
    return blanked[:cutoff] + tail_padding
|
67 |
+
|
68 |
def get_drg_link(drg_code):
    """Return the aapc.com code-lookup URL ending in *drg_code*."""
    url = f'https://www.aapc.com/codes/icd9-codes/{drg_code}'
    return url
|
70 |
|