EmicoBinsfinder committed on
Commit
af3419a
·
1 Parent(s): a00f952

Update tridentmodel/classification.py

Browse files
Files changed (1) hide show
  1. tridentmodel/classification.py +98 -98
tridentmodel/classification.py CHANGED
@@ -126,101 +126,101 @@ def mean_pooling(model_output, attention_mask):
126
  return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
127
 
128
  ### Sentence Embedder
129
- def sentence_embedder(sentences, model_path):
130
- """
131
- Calling the sentence similarity model to generate embeddings on input text.
132
- :param sentences: takes input text in the form of a string
133
- :param model_path: path to the text similarity model
134
- :return: a (1, 384) embedding of the input text
135
- """
136
- tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
137
- model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
138
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
139
- # Compute token embeddings
140
- with torch.no_grad():
141
- model_output = model(**encoded_input)
142
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
143
- return sentence_embeddings
144
-
145
- ### Sentence Embedding Preparation Function
146
- def convert_saved_embeddings(embedding_string):
147
- """
148
- Preparing pre-computed embeddings for comparison with new abstract embeddings.
149
- Pre-computed embeddings are saved as tensors in string format so need to be converted back to numpy arrays in order to calculate cosine similarity.
150
- :param embedding_string:
151
- :return: the embedding as a torch tensor with a leading axis added (the input should be a single tensor with dims (,384) in string format)
152
- """
153
- embedding = embedding_string.replace('(', '')
154
- embedding = embedding.replace(')', '')
155
- embedding = embedding.replace('[', '')
156
- embedding = embedding.replace(']', '')
157
- embedding = embedding.replace('tensor', '')
158
- embedding = embedding.replace(' ', '')
159
- embedding = embedding.split(',')
160
- embedding = [float(x) for x in embedding]
161
- embedding = np.array(embedding)
162
- embedding = np.expand_dims(embedding, axis=0)
163
- embedding = torch.from_numpy(embedding)
164
- return embedding
165
-
166
-
167
- ### Generating Class Embeddings
168
-
169
- Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
170
- def class_embbedding_generator(classes):
171
- """
172
- This function is to be used to generate and save class embeddings
173
- Takes an input of 'cleaned' classes, generated by clean_data function, and computes vector representations of these classes (the embeddings) and saves them to csv
174
- :param classes: a dataframe including all of the broad scope classes that are intended to be used to make comparisons with
175
- """
176
- class_embeddings = pd.DataFrame(columns=['Class', 'Description', 'Embedding'])
177
- for i in range(len(classes)):
178
- class_name = classes.iloc[i, 0]
179
- print(class_name)
180
- class_description = classes.iloc[i, 1]
181
- class_description_embedding = sentence_embedder(class_description, Model_Path)
182
- class_description_embedding = class_description_embedding.numpy()
183
- class_description_embedding = torch.from_numpy(class_description_embedding)
184
- embedding_entry = [class_name, class_description, class_description_embedding]
185
- class_embeddings.loc[len(class_embeddings)] = embedding_entry
186
-
187
- ### Broad Scope Classifier
188
- Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
189
- def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
190
- """
191
- Takes in pre-computed class embeddings and a single abstract embedding, and scores the abstract against each class by cosine similarity
192
- :param class_embeddings: dataframe of class embeddings
193
- :param abstract_embedding: a single abstract embedding
194
- :param N: N highest matching classes to return, from highest to lowest, default is 5
195
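- :return: predictions: a full dataframe of all the predictions on the 9500+ classes, HighestSimilarity: Dataframe of the N most similar classes, GreenLikelihood: the string 'True' if any of the last 52 class scores meets the Sensitivity threshold, otherwise 'False'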
196
- """
197
- predictions = pd.DataFrame(columns=['Class Name', 'Score'])
198
- for i in range(len(class_embeddings)):
199
- class_name = class_embeddings.iloc[i, 0]
200
- embedding = class_embeddings.iloc[i, 2]
201
- embedding = convert_saved_embeddings(embedding)
202
- abstract_embedding = abstract_embedding.numpy()
203
- abstract_embedding = torch.from_numpy(abstract_embedding)
204
- cos = torch.nn.CosineSimilarity(dim=1)
205
- score = cos(abstract_embedding, embedding).numpy().tolist()
206
- result = [class_name, score[0]]
207
- predictions.loc[len(predictions)] = result
208
- greenpredictions = predictions.tail(52)
209
- if Sensitivity == 'High':
210
- Threshold = 0.5
211
- elif Sensitivity == 'Medium':
212
- Threshold = 0.40
213
- elif Sensitivity == 'Low':
214
- Threshold = 0.35
215
- GreenLikelihood = 'False'
216
- for i in range(len(greenpredictions)):
217
- score = greenpredictions.iloc[i, 1]
218
- if float(score) >= Threshold:
219
- GreenLikelihood = 'True'
220
- break
221
- else:
222
- continue
223
- HighestSimilarity = predictions.nlargest(N, ['Score'])
224
- print(HighestSimilarity)
225
- print(GreenLikelihood)
226
- return predictions, HighestSimilarity, GreenLikelihood
 
126
  return tf.reduce_sum(token_embeddings * input_mask_expanded, 1) / tf.clip_by_value(input_mask_expanded.sum(1), clip_value_min=1e-9, clip_value_max=math.inf)
127
 
128
  ### Sentence Embedder
129
+ # def sentence_embedder(sentences, model_path):
130
+ # """
131
+ # Calling the sentence similarity model to generate embeddings on input text.
132
+ # :param sentences: takes input text in the form of a string
133
+ # :param model_path: path to the text similarity model
134
+ # :return: a (1, 384) embedding of the input text
135
+ # """
136
+ # tokenizer = AutoTokenizer.from_pretrained(model_path) #instantiating the sentence embedder using HuggingFace library
137
+ # model = AutoModel.from_pretrained(model_path, from_tf=True) #making a model instance
138
+ # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
139
+ # # Compute token embeddings
140
+ # with torch.no_grad():
141
+ # model_output = model(**encoded_input)
142
+ # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) #outputs a (1, 384) tensor representation of input text
143
+ # return sentence_embeddings
144
+
145
+ # ### Sentence Embedding Preparation Function
146
+ # def convert_saved_embeddings(embedding_string):
147
+ # """
148
+ # Preparing pre-computed embeddings for comparison with new abstract embeddings.
149
+ # Pre-computed embeddings are saved as tensors in string format so need to be converted back to numpy arrays in order to calculate cosine similarity.
150
+ # :param embedding_string:
151
+ # :return: the embedding as a torch tensor with a leading axis added (the input should be a single tensor with dims (,384) in string format)
152
+ # """
153
+ # embedding = embedding_string.replace('(', '')
154
+ # embedding = embedding.replace(')', '')
155
+ # embedding = embedding.replace('[', '')
156
+ # embedding = embedding.replace(']', '')
157
+ # embedding = embedding.replace('tensor', '')
158
+ # embedding = embedding.replace(' ', '')
159
+ # embedding = embedding.split(',')
160
+ # embedding = [float(x) for x in embedding]
161
+ # embedding = np.array(embedding)
162
+ # embedding = np.expand_dims(embedding, axis=0)
163
+ # embedding = torch.from_numpy(embedding)
164
+ # return embedding
165
+
166
+
167
+ # ### Generating Class Embeddings
168
+
169
+ # Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
170
+ # def class_embbedding_generator(classes):
171
+ # """
172
+ # This function is to be used to generate and save class embeddings
173
+ # Takes an input of 'cleaned' classes, generated by clean_data function, and computes vector representations of these classes (the embeddings) and saves them to csv
174
+ # :param classes: a dataframe including all of the broad scope classes that are intended to be used to make comparisons with
175
+ # """
176
+ # class_embeddings = pd.DataFrame(columns=['Class', 'Description', 'Embedding'])
177
+ # for i in range(len(classes)):
178
+ # class_name = classes.iloc[i, 0]
179
+ # print(class_name)
180
+ # class_description = classes.iloc[i, 1]
181
+ # class_description_embedding = sentence_embedder(class_description, Model_Path)
182
+ # class_description_embedding = class_description_embedding.numpy()
183
+ # class_description_embedding = torch.from_numpy(class_description_embedding)
184
+ # embedding_entry = [class_name, class_description, class_description_embedding]
185
+ # class_embeddings.loc[len(class_embeddings)] = embedding_entry
186
+
187
+ # ### Broad Scope Classifier
188
+ # Model_Path = 'Model_bert' ### Insert Path to MODEL DIRECTORY here
189
+ # def broad_scope_class_predictor(class_embeddings, abstract_embedding, N=5, Sensitivity='Medium'):
190
+ # """
191
+ # Takes in pre-computed class embeddings and a single abstract embedding, and scores the abstract against each class by cosine similarity
192
+ # :param class_embeddings: dataframe of class embeddings
193
+ # :param abstract_embedding: a single abstract embedding
194
+ # :param N: N highest matching classes to return, from highest to lowest, default is 5
195
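+ # :return: predictions: a full dataframe of all the predictions on the 9500+ classes, HighestSimilarity: Dataframe of the N most similar classes, GreenLikelihood: the string 'True' if any of the last 52 class scores meets the Sensitivity threshold, otherwise 'False'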
196
+ # """
197
+ # predictions = pd.DataFrame(columns=['Class Name', 'Score'])
198
+ # for i in range(len(class_embeddings)):
199
+ # class_name = class_embeddings.iloc[i, 0]
200
+ # embedding = class_embeddings.iloc[i, 2]
201
+ # embedding = convert_saved_embeddings(embedding)
202
+ # abstract_embedding = abstract_embedding.numpy()
203
+ # abstract_embedding = torch.from_numpy(abstract_embedding)
204
+ # cos = torch.nn.CosineSimilarity(dim=1)
205
+ # score = cos(abstract_embedding, embedding).numpy().tolist()
206
+ # result = [class_name, score[0]]
207
+ # predictions.loc[len(predictions)] = result
208
+ # greenpredictions = predictions.tail(52)
209
+ # if Sensitivity == 'High':
210
+ # Threshold = 0.5
211
+ # elif Sensitivity == 'Medium':
212
+ # Threshold = 0.40
213
+ # elif Sensitivity == 'Low':
214
+ # Threshold = 0.35
215
+ # GreenLikelihood = 'False'
216
+ # for i in range(len(greenpredictions)):
217
+ # score = greenpredictions.iloc[i, 1]
218
+ # if float(score) >= Threshold:
219
+ # GreenLikelihood = 'True'
220
+ # break
221
+ # else:
222
+ # continue
223
+ # HighestSimilarity = predictions.nlargest(N, ['Score'])
224
+ # print(HighestSimilarity)
225
+ # print(GreenLikelihood)
226
+ # return predictions, HighestSimilarity, GreenLikelihood