|
""" |
|
|
|
Determine if maintaining the same predicted label |
|
--------------------------------------------------------------------- |
|
""" |
|
|
|
|
|
from .classification_goal_function import ClassificationGoalFunction |
|
|
|
|
|
class InputReduction(ClassificationGoalFunction): |
|
"""Attempts to reduce the input down to as few words as possible while |
|
maintaining the same predicted label. |
|
|
|
From Feng, Wallace, Grissom, Iyyer, Rodriguez, Boyd-Graber. (2018). |
|
Pathologies of Neural Models Make Interpretations Difficult. |
|
https://arxiv.org/abs/1804.07781 |
|
""" |
|
|
|
def __init__(self, *args, target_num_words=1, **kwargs): |
|
self.target_num_words = target_num_words |
|
super().__init__(*args, **kwargs) |
|
|
|
def _is_goal_complete(self, model_output, attacked_text): |
|
return ( |
|
self.ground_truth_output == model_output.argmax() |
|
and attacked_text.num_words <= self.target_num_words |
|
) |
|
|
|
def _should_skip(self, model_output, attacked_text): |
|
return self.ground_truth_output != model_output.argmax() |
|
|
|
def _get_score(self, model_output, attacked_text): |
|
|
|
if self.ground_truth_output != model_output.argmax(): |
|
return 0 |
|
|
|
cur_num_words = attacked_text.num_words |
|
initial_num_words = self.initial_attacked_text.num_words |
|
|
|
|
|
|
|
num_words_score = max( |
|
(initial_num_words - cur_num_words) / initial_num_words, 0 |
|
) |
|
model_score = model_output[self.ground_truth_output] |
|
return min(num_words_score + model_score / initial_num_words, 1) |
|
|
|
def extra_repr_keys(self): |
|
if self.maximizable: |
|
return ["maximizable"] |
|
else: |
|
return ["maximizable", "target_num_words"] |
|
|