danielhajialigol committed on
Commit
94c41db
·
1 Parent(s): a8118cc

removed uncleaned summaries

Browse files
Files changed (3) hide show
  1. all_summaries.csv +2 -2
  2. related_summaries.py +13 -2
  3. utils.py +45 -0
all_summaries.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57a94e016723f692a3f4b4ad2b61f509cb77d0b3f14a2d2da9287d9d24e2dd42
3
- size 26408635
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee0536b4e7e2297521a1f11b6f18e788d77f70129c8651fbbb3e7044782e3675
3
+ size 28540700
related_summaries.py CHANGED
@@ -2,6 +2,7 @@ import pandas as pd
2
  import torch
3
  from transformers import AutoTokenizer, AutoModel, set_seed
4
  from tqdm import tqdm
 
5
 
6
  from model import MimicTransformer
7
  set_seed(42)
@@ -17,7 +18,7 @@ mimic.eval()
17
  mimic.cuda()
18
  tokenizer = mimic.tokenizer
19
 
20
- summaries = pd.read_csv('all_summaries.csv')['SUMMARIES']
21
 
22
  def mean_pooling(model_output, attention_mask):
23
  token_embeddings = model_output[0] #First element of model_output contains all token embeddings
@@ -34,10 +35,20 @@ def get_model_outputs(text):
34
 
35
  return_tensors = torch.zeros(size=(10000, 738))
36
 
37
- for i, summary in tqdm(enumerate(summaries[:10000])):
 
 
 
 
 
 
 
 
 
38
  res = get_model_outputs(text=summary)
39
  return_tensors[i, :] = res.detach().cpu()
40
  # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
41
  # sentence_embeddings = sentence_embeddings/sentence_embeddings.norm(dim=1)[:,None]
42
 
 
43
  torch.save(return_tensors, f='discharge_embeddings.pt')
 
2
  import torch
3
  from transformers import AutoTokenizer, AutoModel, set_seed
4
  from tqdm import tqdm
5
+ from utils import clean_text
6
 
7
  from model import MimicTransformer
8
  set_seed(42)
 
18
  mimic.cuda()
19
  tokenizer = mimic.tokenizer
20
 
21
+ summaries = pd.read_csv('all_summaries_backup.csv')['SUMMARIES']
22
 
23
  def mean_pooling(model_output, attention_mask):
24
  token_embeddings = model_output[0] #First element of model_output contains all token embeddings
 
35
 
36
  return_tensors = torch.zeros(size=(10000, 738))
37
 
38
+ non_defunct_summaries = []
39
+
40
+ for i, summary in tqdm(enumerate(summaries[:50000])):
41
+ cleaned = clean_text(summary)
42
+ if len(non_defunct_summaries) == 10000:
43
+ break
44
+ if len(cleaned) > 100:
45
+ non_defunct_summaries.append(cleaned)
46
+
47
+ for i, summary in tqdm(enumerate(non_defunct_summaries)):
48
  res = get_model_outputs(text=summary)
49
  return_tensors[i, :] = res.detach().cpu()
50
  # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
51
  # sentence_embeddings = sentence_embeddings/sentence_embeddings.norm(dim=1)[:,None]
52
 
53
+ pd.DataFrame(data={'SUMMARIES':non_defunct_summaries}).to_csv('all_summaries.csv', index=False)
54
  torch.save(return_tensors, f='discharge_embeddings.pt')
utils.py CHANGED
@@ -3,6 +3,7 @@ import json
3
  import pandas as pd
4
  import ssl
5
  import torch
 
6
  from pprint import pprint
7
  from captum.attr import visualization
8
 
@@ -20,6 +21,50 @@ class PyTMinMaxScalerVectorized(object):
20
  tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
21
  return tensor
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def get_drg_link(drg_code):
24
  return f'https://www.aapc.com/codes/icd9-codes/{drg_code}'
25
 
 
3
  import pandas as pd
4
  import ssl
5
  import torch
6
+ import re
7
  from pprint import pprint
8
  from captum.attr import visualization
9
 
 
21
  tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
22
  return tensor
23
 
24
# Markers whose earliest occurrence is taken as the end of the report body.
# Compiled once at import time instead of on every call.
_END_PATTERNS = (
    re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
    # The dot is escaped so that only a literal "DR." on an indented line
    # matches; unescaped (and case-insensitive) it also matched ordinary
    # indented words such as "Dressing" or "Drainage" and cut reports short.
    re.compile(r'\n {3,}DR\.', re.I),
    re.compile(r'[ ]{1,}RADLINE ', re.I),
    # Leading `.*` moves the match start to the beginning of that line, so
    # the whole signature line is excluded.
    re.compile(r'.*electronically signed on', re.I),
    re.compile(r'M\[0KM\[0KM'),
)


def find_end(text):
    """Find the end of the report.

    Searches ``text`` for each known end-of-report marker and returns the
    smallest start offset among the markers that occur.

    Args:
        text: The report text to scan.

    Returns:
        The index at which the report body ends; ``len(text)`` when no
        marker is present (so ``text[:find_end(text)]`` is always valid).
    """
    end = len(text)
    for pattern in _END_PATTERNS:
        matchobj = pattern.search(text)
        if matchobj:
            end = min(end, matchobj.start())
    return end
39
+
40
def pattern_repl(matchobj):
    """
    Build the blank replacement for a regex match.

    Returns a run of spaces as long as the matched text, so substituting it
    keeps every later character at its original offset. The result is never
    shorter than a single space.
    """
    matched_length = len(matchobj.group(0))
    return ' ' * max(matched_length, 1)
45
+
46
def clean_text(text):
    """
    Blank out bracketed ``[**...**]`` tags, underscores and the report trailer.

    Every removed span is overwritten with spaces rather than deleted, so the
    returned string has the same length as ``text`` and character offsets are
    preserved.

    NOTE: because of that padding, ``len()`` of the result does not reflect
    how much content is left; strip the result first if that is what you
    need to measure.

    Args:
        text: The raw report text.

    Returns:
        The cleaned text, space-padded to the length of the input.
    """
    # Replace [**Patterns**] with spaces.
    text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Replace `_` with spaces.
    text = text.replace('_', ' ')

    end = find_end(text)
    # Keep the body up to the end marker and overwrite the rest with spaces
    # so the result has the same length as the old text.
    return text[:end] + ' ' * (len(text) - end)
67
+
68
  def get_drg_link(drg_code):
69
  return f'https://www.aapc.com/codes/icd9-codes/{drg_code}'
70