Delete lib
- lib/.DS_Store +0 -0
- lib/.ipynb_checkpoints/utils-checkpoint.py +0 -206
- lib/__init__.py +0 -0
- lib/__pycache__/__init__.cpython-310.pyc +0 -0
- lib/__pycache__/utils.cpython-310.pyc +0 -0
- lib/utils.py +0 -210
lib/.DS_Store
DELETED
Binary file (6.15 kB)
lib/.ipynb_checkpoints/utils-checkpoint.py
DELETED
@@ -1,206 +0,0 @@
(Jupyter autosave copy of lib/utils.py, reproduced in full below. The checkpoint differed from the final file only in how make_predictions chose its answer: it took max(answers, key=lambda x: x['logit_score']) for the text but reported answer_probs[0] as the confidence, rather than the probability of the selected span.)
lib/__init__.py
DELETED
File without changes
lib/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (151 Bytes)
lib/__pycache__/utils.cpython-310.pyc
DELETED
Binary file (6.56 kB)
lib/utils.py
DELETED
@@ -1,210 +0,0 @@
import numpy as np
from scipy.special import softmax
import collections
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator
import pandas as pd


def preprocess_examples(examples, tokenizer, max_length=384, stride=128):
    """
    Preprocesses and tokenizes examples in preparation for inference

    Parameters:
    -----------
    examples : datasets.Dataset
        The dataset of examples. Must have columns:
        'id', 'question', 'context'
    tokenizer : transformers.AutoTokenizer
        The tokenizer for the model
    max_length : int
        The max length for context truncation
    stride : int
        The stride for context truncation

    Returns:
    --------
    inputs : dict
        The tokenized and processed data dictionary with
        keys 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'.
        All values are lists of length = # of features output by the tokenizer
        inputs['input_ids'][k] : list
            token ids corresponding to tokens in feature k
        inputs['attention_mask'][k] : list
            attention mask for feature k
        inputs['offset_mapping'][k] : list
            offset mapping for feature k
        inputs['example_id'][k] : int
            id of the example from which feature k originated
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Long contexts overflow into several features; map each feature back
    # to the example it came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        # Null out offsets that fall outside the context (sequence id != 1)
        # so candidate answer spans can never land in the question.
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs


def make_predictions(model, tokenizer, inputs, examples,
                     n_best=20, max_answer_length=30):
    """
    Generates a list of prediction data based on logits

    Parameters:
    -----------
    model : transformers.AutoModelForQuestionAnswering
        The trained model
    tokenizer : transformers.AutoTokenizer
        The model's tokenizer
    inputs : dict
        The tokenized and processed data dictionary with
        keys 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'.
        All values are lists of length = # of features output by the tokenizer
        inputs['input_ids'][k] : list
            token ids corresponding to tokens in feature k
        inputs['attention_mask'][k] : list
            attention mask for feature k
        inputs['offset_mapping'][k] : list
            offset mapping for feature k
        inputs['example_id'][k] : int
            id of the example from which feature k originated
    examples : datasets.Dataset
        The dataset of examples. Must have columns:
        'id', 'question', 'context'
    n_best : int
        The number of top start/end (by logit) indices to consider
    max_answer_length : int
        The maximum length (in tokens) allowed for a candidate answer

    Returns:
    --------
    predicted_answers : list(dict)
        predicted_answers[k] has keys 'id', 'prediction_text', 'confidence'
        predicted_answers[k]['id'] : int
            The unique id of the example
        predicted_answers[k]['prediction_text'] : str
            The predicted answer as a string
        predicted_answers[k]['confidence'] : float
            The predicted probability of the answer, i.e. the corresponding
            output of a softmax over the candidate spans' logit scores
    """
    assert n_best <= len(inputs['input_ids'][0]), 'n_best cannot be larger than max_length'

    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    data_for_model = inputs.remove_columns(["example_id", "offset_mapping"])
    data_for_model.set_format("torch", device=device)
    dl = DataLoader(
        data_for_model,
        collate_fn=default_data_collator,
        batch_size=len(inputs),
    )
    model = model.to(device)
    # batch_size == number of features, so this loop runs exactly once.
    for batch in dl:
        outputs = model(**batch)

    start_logits = outputs.start_logits.cpu().detach().numpy()
    end_logits = outputs.end_logits.cpu().detach().numpy()

    # Group feature indices by the example they belong to.
    example_to_inputs = collections.defaultdict(list)
    for idx, feature in enumerate(inputs):
        example_to_inputs[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in examples:
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_inputs[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = inputs[feature_index]['offset_mapping']

            # Indices of the n_best highest start/end logits, best first.
            start_indices = np.argsort(start_logit)[-1:-n_best-1:-1].tolist()
            end_indices = np.argsort(end_logit)[-1:-n_best-1:-1].tolist()

            for start_index in start_indices:
                for end_index in end_indices:
                    # Skip answers with a length that is either < 0 or > max_answer_length.
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    # Skip pairs where exactly one endpoint falls outside the context.
                    if (offsets[start_index] is None) ^ (offsets[end_index] is None):
                        continue
                    # Both endpoints outside the context: a "no answer" candidate.
                    if (offsets[start_index] is None) & (offsets[end_index] is None):
                        answers.append(
                            {
                                "text": '',
                                "logit_score": start_logit[start_index] + end_logit[end_index],
                            }
                        )
                    else:
                        answers.append(
                            {
                                "text": context[offsets[start_index][0]:offsets[end_index][1]],
                                "logit_score": start_logit[start_index] + end_logit[end_index],
                            }
                        )

        if len(answers) > 0:
            answer_logits = [a['logit_score'] for a in answers]
            answer_probs = softmax(answer_logits)
            best_answer_idx = int(np.argmax(answer_logits))
            predicted_answers.append(
                {'id': example_id,
                 'prediction_text': answers[best_answer_idx]['text'],
                 'confidence': answer_probs[best_answer_idx]}
            )
        else:
            # No candidate span survived the filters.
            predicted_answers.append(
                {'id': example_id, 'prediction_text': '', 'confidence': 0.0}
            )

    for pred in predicted_answers:
        if pred['prediction_text'] == '':
            pred['prediction_text'] = "I don't have an answer based on the context provided."
    return predicted_answers


def get_examples():
    """
    Retrieve pre-made examples from a .csv file

    Parameters: None
    -----------

    Returns:
    --------
    questions, contexts : list, list
        Lists of corresponding question-context example pairs
    """
    examples = pd.read_csv('examples.csv')
    questions = list(examples['question'])
    contexts = list(examples['context'])
    return questions, contexts
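For context, a minimal sketch of how these deleted utilities fit together at inference time. The checkpoint name, the integer id scheme, and the examples.csv contents are assumptions for illustration, not taken from this commit; only the lib.utils functions come from the deleted file.

# Hypothetical usage of the deleted lib/utils.py (checkpoint and ids are assumed).
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

from lib.utils import get_examples, make_predictions, preprocess_examples

checkpoint = "distilbert-base-cased-distilled-squad"  # assumed; any extractive QA model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

questions, contexts = get_examples()  # reads examples.csv
examples = Dataset.from_dict({
    "id": list(range(len(questions))),  # assumed id scheme
    "question": questions,
    "context": contexts,
})

# Tokenize; long contexts fan out into multiple features per example.
inputs = examples.map(
    preprocess_examples,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    remove_columns=examples.column_names,
)

for pred in make_predictions(model, tokenizer, inputs, examples):
    print(pred["id"], f"{pred['confidence']:.3f}", pred["prediction_text"])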