# GLUE / AdvGLUE evaluation suite definition.
from intel_evaluate_extension.evaluation_suite.model_card_suite import ModelCardSuiteResults
from evaluate.evaluation_suite import SubTask
from evaluate.visualization import radar_plot
_HEADER = "GLUE/AdvGlue Evaluation Results" | |
_DESCRIPTION = """ | |
The suite compares the GLUE results with Adversarial GLUE (AdvGLUE), a | |
multi-task benchmark that tests the vulnerability of modern large-scale | |
language models againstvarious adversarial attacks.""" | |
class Suite(ModelCardSuiteResults):
    """Paired GLUE / AdvGLUE evaluation suite.

    Runs each GLUE task (sst2, qqp, qnli, rte, mnli) back-to-back with its
    AdvGLUE counterpart so the result list alternates GLUE/AdvGLUE and the
    two series can be compared on a radar chart.
    """

    def __init__(self, name):
        """Build the ordered list of (GLUE, AdvGLUE) sub-task pairs.

        Args:
            name: Suite name, forwarded to ModelCardSuiteResults.
        """
        super().__init__(name)
        self.result_keys = ["accuracy", "f1"]
        # NOTE(review): the sub-tasks below read columns such as "sentence"
        # and "question1", not "text" — confirm how the suite runner applies
        # this preprocessor before relying on it.
        self.preprocessor = lambda x: {"text": x["text"].lower()}

        binary = {"LABEL_0": 0, "LABEL_1": 1}
        ternary = {"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2}
        # (config_name, GLUE split, first input column, second input column,
        #  label mapping). The AdvGLUE twin always uses "validation[:5]".
        task_specs = [
            # sst2 used float label values in the original suite; kept as-is.
            ("sst2", "validation[:5]", "sentence", None, {"LABEL_0": 0.0, "LABEL_1": 1.0}),
            ("qqp", "validation[:5]", "question1", "question2", binary),
            ("qnli", "validation[:5]", "question", "sentence", binary),
            ("rte", "validation[:5]", "sentence1", "sentence2", binary),
            # mnli evaluates the mismatched validation split on the GLUE side.
            ("mnli", "validation_mismatched[:5]", "premise", "hypothesis", ternary),
        ]
        self.suite = []
        for config, glue_split, first_col, second_col, mapping in task_specs:
            # Each GLUE task is immediately followed by its adversarial twin,
            # so results[::2] / results[1::2] line up in _radar_data.
            self.suite.append(
                self._subtask("glue", config, glue_split, config, first_col, second_col, mapping)
            )
            self.suite.append(
                self._subtask(
                    "adv_glue", "adv_" + config, "validation[:5]", config, first_col, second_col, mapping
                )
            )

    @staticmethod
    def _subtask(data, subset, split, config_name, input_column, second_input_column, label_mapping):
        """Create one text-classification SubTask scored with the GLUE metric."""
        args = {
            "metric": "glue",
            "input_column": input_column,
            # Bug fix: the original glue/mnli task omitted "label_column",
            # unlike every sibling task; it is now always set.
            "label_column": "label",
            "config_name": config_name,
            "label_mapping": label_mapping,
        }
        # Single-sentence tasks (sst2) have no second input column.
        if second_input_column is not None:
            args["second_input_column"] = second_input_column
        return SubTask(
            task_type="text-classification",
            data=data,
            subset=subset,
            split=split,
            args_for_task=args,
        )

    def _radar_data(self, results):
        """Pair GLUE (even indices) and AdvGLUE (odd indices) accuracies.

        Returns a two-element list of dicts keyed by "accuracy <task>"; the
        "adv_" prefix is stripped so both dicts share the same chart axes.
        """
        return [
            {"accuracy " + r["task_name"].split("/")[-1]: r["accuracy"] for r in results[::2]},
            {
                "accuracy " + r["task_name"].replace("adv_", "").split("/")[-1]: r["accuracy"]
                for r in results[1::2]
            },
        ]

    def process_results(self, results):
        """Render the GLUE-vs-AdvGLUE radar chart for raw suite results."""
        return radar_plot(self._radar_data(results), ["GLUE", "AdvGLUE"])

    def plot_results(self, results, model_or_pipeline):
        """Radar chart with series labelled by the evaluated model/pipeline.

        Bug fix: the original passed the figure returned by process_results
        back into radar_plot as if it were data; it now plots the raw radar
        data directly.
        """
        return radar_plot(
            self._radar_data(results),
            ["GLUE " + model_or_pipeline, "AdvGLUE " + model_or_pipeline],
        )