ronald cardenas acosta committed on
Commit
141eb78
·
1 Parent(s): 40166c5
Files changed (2) hide show
  1. app.py +2 -2
  2. nwentfaithfulness.py +70 -21
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import evaluate
2
  from evaluate.utils import launch_gradio_widget
3
 
4
-
5
- module = evaluate.load("ronaldahmed/nwentfaithfulness")
6
  launch_gradio_widget(module)
 
1
  import evaluate
2
  from evaluate.utils import launch_gradio_widget
3
 
4
+ METRICS_CACHE_DIR="/gfs/team/nlp/users/rcardena/tools/huggingface/evaluate"
5
+ module = evaluate.load("nwentfaithfulness",module_type="metric",cache_dir=METRICS_CACHE_DIR)
6
  launch_gradio_widget(module)
nwentfaithfulness.py CHANGED
@@ -15,20 +15,25 @@
15
 
16
  import evaluate
17
  import datasets
 
 
 
 
 
18
 
19
 
20
  # TODO: Add BibTeX citation
21
  _CITATION = """\
22
- @InProceedings{huggingface:module,
23
- title = {A great new module},
24
- authors={huggingface, Inc.},
25
- year={2020}
26
  }
27
  """
28
 
29
  # TODO: Add description of the module here
30
  _DESCRIPTION = """\
31
- This new module is designed to solve this great ML task and is crafted with a lot of care.
 
 
 
32
  """
33
 
34
 
@@ -36,13 +41,12 @@ This new module is designed to solve this great ML task and is crafted with a lo
36
  _KWARGS_DESCRIPTION = """
37
  Calculates how good are predictions given some references, using certain scores
38
  Args:
39
- predictions: list of predictions to score. Each predictions
40
- should be a string with tokens separated by spaces.
41
- references: list of reference for each prediction. Each
42
- reference should be a string with tokens separated by spaces.
43
  Returns:
44
- accuracy: description of the first score,
45
- another_score: description of the second score,
46
  Examples:
47
  Examples should be written in doctest format, and should illustrate how
48
  to use the function.
@@ -50,11 +54,11 @@ Examples:
50
  >>> my_new_module = evaluate.load("my_new_module")
51
  >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
52
  >>> print(results)
53
- {'accuracy': 1.0}
54
  """
55
 
56
  # TODO: Define external resources urls if needed
57
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
58
 
59
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
@@ -75,7 +79,7 @@ class NwEntFaithfulness(evaluate.Metric):
75
  'references': datasets.Value('int64'),
76
  }),
77
  # Homepage of the module for documentation
78
- homepage="http://module.homepage",
79
  # Additional links to the codebase or references
80
  codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
81
  reference_urls=["http://path.to.reference.url/new_module"]
@@ -86,10 +90,55 @@ class NwEntFaithfulness(evaluate.Metric):
86
  # TODO: Download external resources if needed
87
  pass
88
 
89
- def _compute(self, predictions, references):
90
- """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
93
- return {
94
- "accuracy": accuracy,
95
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  import evaluate
17
  import datasets
18
+ import numpy as np
19
+ import torch
20
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
21
+ import evaluate
22
+ from evaluate import logging
23
 
24
 
25
  # TODO: Add BibTeX citation
26
  _CITATION = """\
27
+
 
 
 
28
  }
29
  """
30
 
31
  # TODO: Add description of the module here
32
  _DESCRIPTION = """\
33
+ This metric quantifies the faithfulness of a summary with respect to a source document,
34
+ as given by the probability that the document is entailed by the summary.
35
+ This metric uses pretrained models apt for the Newswire domain (see ScEntFaithfulness
36
+ for a version in scientific domain).
37
  """
38
 
39
 
 
41
  _KWARGS_DESCRIPTION = """
42
  Calculates how good are predictions given some references, using certain scores
43
  Args:
44
+ predictions: list of predictions to score. Each prediction represents a summary and
45
+ should be a string with tokens separated by spaces
46
+ references: list of references for each prediction. Each
47
+ reference represents the input document and should be a string with tokens separated by spaces.
48
  Returns:
49
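+ ent-faith: list with one score per prediction (softmax probability of class index 0 of the NLI model); their average is also returned as mean_ent-faith,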
 
50
  Examples:
51
  Examples should be written in doctest format, and should illustrate how
52
  to use the function.
 
54
  >>> my_new_module = evaluate.load("my_new_module")
55
  >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
56
  >>> print(results)
57
+ {'ent-faith': 1.0}
58
  """
59
 
60
  # TODO: Define external resources urls if needed
61
+ # BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
62
 
63
 
64
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 
79
  'references': datasets.Value('int64'),
80
  }),
81
  # Homepage of the module for documentation
82
+ homepage="https://huggingface.co/spaces/ronaldahmed/nwentfaithfulness",
83
  # Additional links to the codebase or references
84
  codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
85
  reference_urls=["http://path.to.reference.url/new_module"]
 
90
  # TODO: Download external resources if needed
91
  pass
92
 
93
+ # original: references
94
+ def _compute(self, predictions, documents,
95
+ batch_size: int = 16, device=None):
96
+
97
+ MODEL_CACHE_DIR="/gfs/team/nlp/users/rcardena/tools/huggingface"
98
+ if device is not None:
99
+ assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
100
+ if device == "gpu":
101
+ device = "cuda"
102
+ else:
103
+ device = "cuda" if torch.cuda.is_available() else "cpu"
104
+ model = AutoModelForSequenceClassification.from_pretrained(
105
+ "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
106
+ cache_dir=MODEL_CACHE_DIR)
107
+ model = model.to(device)
108
+
109
+ tokenizer = AutoTokenizer.from_pretrained(
110
+ "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli",
111
+ cache_dir=MODEL_CACHE_DIR)
112
+ max_tokenized_len = model.config.max_length | 256
113
+
114
+ encoded_texts = []
115
+ attn_masks = []
116
+ tok_types = []
117
+ for pred,doc in zip(predictions,documents):
118
+ enc = tokenizer.encode_plus(pred, doc,
119
+ max_length=max_tokenized_len,
120
+ padding=True,
121
+ truncation=True,
122
+ return_token_type_ids=True,
123
+ return_attention_mask=True)
124
+ encoded_texts.append(enc["input_ids"])
125
+ attn_masks.append(enc["attention_mask"])
126
+ tok_types.append(enc["token_type_ids"])
127
+
128
+
129
+ enf_fs = []
130
+ for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
131
+ end_index = min(start_index + batch_size, len(encoded_texts))
132
+ encoded_batch = torch.Long(encoded_texts[start_index:end_index]).to(device)
133
+ attn_mask = torch.Long(attn_masks[start_index:end_index]).to(device)
134
+ token_type = torch.Long(tok_types[start_index:end_index]).to(device)
135
+
136
+ with torch.no_grad():
137
+ outputs = model(encoded_batch,
138
+ attention_mask=attn_mask,
139
+ token_type_ids=token_type,
140
+ labels=None)[0]
141
+ probs = torch.softmax(outputs,dim=1)[:,0].tolist()
142
+ enf_fs += probs
143
+
144
+ return {"ent-faith": enf_fs, "mean_ent-faith": np.mean(enf_fs)}