etweedy committed
Commit ef183f2 · 1 Parent(s): ac4ecec

Delete lib

lib/.DS_Store DELETED
Binary file (6.15 kB)
 
lib/.ipynb_checkpoints/utils-checkpoint.py DELETED
@@ -1,206 +0,0 @@
- import numpy as np
- from scipy.special import softmax
- import collections
- import torch
- from torch.utils.data import DataLoader
- from transformers import default_data_collator
- import pandas as pd
-
- def preprocess_examples(examples, tokenizer, max_length=384, stride=128):
-     """
-     Preprocesses and tokenizes examples in preparation for inference
-
-     Parameters:
-     -----------
-     examples : datasets.Dataset
-         The dataset of examples. Must have columns:
-         'id', 'question', 'context'
-     tokenizer : transformers.AutoTokenizer
-         The tokenizer for the model
-     max_length : int
-         The max length for context truncation
-     stride : int
-         The stride for context truncation
-
-     Returns:
-     --------
-     inputs : dict
-         The tokenized and processed data dictionary with
-         keys 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
-         All values are lists of length = # of features output by the tokenizer
-         inputs['input_ids'][k] : list
-             token ids corresponding to tokens in feature k
-         inputs['attention_mask'][k] : list
-             attention mask for feature k
-         inputs['offset_mapping'][k] : list
-             offset mapping for feature k
-         inputs['example_id'][k] : int
-             id of the example from which feature k originated
-     """
-     questions = [q.strip() for q in examples["question"]]
-     inputs = tokenizer(
-         questions,
-         examples['context'],
-         max_length=max_length,
-         truncation="only_second",
-         stride=stride,
-         return_overflowing_tokens=True,
-         return_offsets_mapping=True,
-         padding="max_length",
-     )
-
-     sample_map = inputs.pop("overflow_to_sample_mapping")
-     example_ids = []
-
-     for i in range(len(inputs["input_ids"])):
-         sample_idx = sample_map[i]
-         example_ids.append(examples["id"][sample_idx])
-
-         # Null out offsets for tokens outside the context (sequence id != 1)
-         sequence_ids = inputs.sequence_ids(i)
-         offset = inputs["offset_mapping"][i]
-         inputs["offset_mapping"][i] = [
-             o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
-         ]
-
-     inputs["example_id"] = example_ids
-     return inputs
-
-
- def make_predictions(model, tokenizer, inputs, examples,
-                      n_best=20, max_answer_length=30):
-     """
-     Generates a list of prediction data based on logits
-
-     Parameters:
-     -----------
-     model : transformers.AutoModelForQuestionAnswering
-         The trained model
-     tokenizer : transformers.AutoTokenizer
-         The model's tokenizer
-     inputs : datasets.Dataset
-         The tokenized and processed dataset, as produced by mapping
-         preprocess_examples over examples. Must have columns:
-         'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
-         inputs['input_ids'][k] : list
-             token ids corresponding to tokens in feature k
-         inputs['attention_mask'][k] : list
-             attention mask for feature k
-         inputs['offset_mapping'][k] : list
-             offset mapping for feature k
-         inputs['example_id'][k] : int
-             id of the example from which feature k originated
-     examples : datasets.Dataset
-         The dataset of examples. Must have columns:
-         'id', 'question', 'context'
-     n_best : int
-         The number of top start/end (by logit) indices to consider
-     max_answer_length : int
-         The maximum length (in tokens) allowed for a candidate answer
-
-     Returns:
-     --------
-     predicted_answers : list(dict)
-         predicted_answers[k] has keys 'id', 'prediction_text', 'confidence'
-         predicted_answers[k]['id'] : int
-             The unique id of the example
-         predicted_answers[k]['prediction_text'] : str
-             The predicted answer as a string
-         predicted_answers[k]['confidence'] : float
-             The predicted probability corresponding to the answer, i.e. the
-             corresponding output of a softmax function on logits
-     """
-     assert n_best <= len(inputs['input_ids'][0]), 'n_best cannot be larger than max_length'
-
-     # Run inference on the best available device
-     if torch.backends.mps.is_available():
-         device = "mps"
-     elif torch.cuda.is_available():
-         device = "cuda"
-     else:
-         device = "cpu"
-     data_for_model = inputs.remove_columns(["example_id", "offset_mapping"])
-     data_for_model.set_format("torch", device=device)
-     dl = DataLoader(
-         data_for_model,
-         collate_fn=default_data_collator,
-         batch_size=len(inputs),
-     )
-     model = model.to(device)
-     # batch_size == len(inputs), so this loop runs exactly once
-     for batch in dl:
-         outputs = model(**batch)
-
-     start_logits = outputs.start_logits.cpu().detach().numpy()
-     end_logits = outputs.end_logits.cpu().detach().numpy()
-     # Map each example id to the indices of the features generated from it
-     example_to_inputs = collections.defaultdict(list)
-     for idx, feature in enumerate(inputs):
-         example_to_inputs[feature["example_id"]].append(idx)
-
-     predicted_answers = []
-     for example in examples:
-         example_id = example["id"]
-         context = example["context"]
-         answers = []
-
-         for feature_index in example_to_inputs[example_id]:
-             start_logit = start_logits[feature_index]
-             end_logit = end_logits[feature_index]
-             offsets = inputs[feature_index]['offset_mapping']
-
-             # Indices of the n_best largest start/end logits, in descending order
-             start_indices = np.argsort(start_logit)[-1:-n_best-1:-1].tolist()
-             end_indices = np.argsort(end_logit)[-1:-n_best-1:-1].tolist()
-
-             for start_index in start_indices:
-                 for end_index in end_indices:
-                     # Skip answers with a length that is either < 0 or > max_answer_length
-                     if (
-                         end_index < start_index
-                         or end_index - start_index + 1 > max_answer_length
-                     ):
-                         continue
-
-                     # Skip pairs where exactly one endpoint falls outside the context
-                     if (offsets[start_index] is None) ^ (offsets[end_index] is None):
-                         continue
-                     # Both endpoints outside the context: a no-answer candidate
-                     if (offsets[start_index] is None) and (offsets[end_index] is None):
-                         answers.append(
-                             {
-                                 "text": '',
-                                 "logit_score": start_logit[start_index] + end_logit[end_index],
-                             }
-                         )
-                     else:
-                         answers.append(
-                             {
-                                 "text": context[offsets[start_index][0] : offsets[end_index][1]],
-                                 "logit_score": start_logit[start_index] + end_logit[end_index],
-                             }
-                         )
-         answer_logits = [a['logit_score'] for a in answers]
-         answer_probs = softmax(answer_logits)
-
-         if len(answers) > 0:
-             best_answer_idx = int(np.argmax(answer_logits))
-             predicted_answers.append(
-                 {'id': example_id,
-                  'prediction_text': answers[best_answer_idx]['text'],
-                  'confidence': answer_probs[best_answer_idx]}
-             )
-         else:
-             predicted_answers.append({'id': example_id, 'prediction_text': '', 'confidence': 0.0})
-     for pred in predicted_answers:
-         if pred['prediction_text'] == '':
-             pred['prediction_text'] = "I don't have an answer based on the context provided."
-     return predicted_answers
-
- def get_examples():
-     """
-     Retrieve pre-made examples from a .csv file
-
-     Parameters: None
-
-     Returns:
-     --------
-     questions, contexts : list, list
-         Lists of corresponding question-context example pairs
-     """
-     examples = pd.read_csv('examples.csv')
-     questions = list(examples['question'])
-     contexts = list(examples['context'])
-     return questions, contexts
-
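
A quick standalone sanity check of the argsort slice used in make_predictions above: the slice [-1:-n_best-1:-1] walks the ascending argsort backwards, yielding the indices of the n_best largest logits in descending order. The toy logits here are illustrative, not from the repository:

import numpy as np

# Indices of the n_best largest values, largest first
logits = np.array([0.1, 2.3, -1.0, 5.2, 0.7])
n_best = 3
top = np.argsort(logits)[-1:-n_best-1:-1].tolist()
print(top)  # [3, 1, 4] -> values 5.2, 2.3, 0.7
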
lib/__init__.py DELETED
File without changes (empty file)
lib/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (151 Bytes)
 
lib/__pycache__/utils.cpython-310.pyc DELETED
Binary file (6.56 kB)
 
lib/utils.py DELETED
@@ -1,210 +0,0 @@
- import numpy as np
- from scipy.special import softmax
- import collections
- import torch
- from torch.utils.data import DataLoader
- from transformers import default_data_collator
- import pandas as pd
-
- def preprocess_examples(examples, tokenizer, max_length=384, stride=128):
-     """
-     Preprocesses and tokenizes examples in preparation for inference
-
-     Parameters:
-     -----------
-     examples : datasets.Dataset
-         The dataset of examples. Must have columns:
-         'id', 'question', 'context'
-     tokenizer : transformers.AutoTokenizer
-         The tokenizer for the model
-     max_length : int
-         The max length for context truncation
-     stride : int
-         The stride for context truncation
-
-     Returns:
-     --------
-     inputs : dict
-         The tokenized and processed data dictionary with
-         keys 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
-         All values are lists of length = # of features output by the tokenizer
-         inputs['input_ids'][k] : list
-             token ids corresponding to tokens in feature k
-         inputs['attention_mask'][k] : list
-             attention mask for feature k
-         inputs['offset_mapping'][k] : list
-             offset mapping for feature k
-         inputs['example_id'][k] : int
-             id of the example from which feature k originated
-     """
-     questions = [q.strip() for q in examples["question"]]
-     inputs = tokenizer(
-         questions,
-         examples['context'],
-         max_length=max_length,
-         truncation="only_second",
-         stride=stride,
-         return_overflowing_tokens=True,
-         return_offsets_mapping=True,
-         padding="max_length",
-     )
-
-     sample_map = inputs.pop("overflow_to_sample_mapping")
-     example_ids = []
-
-     for i in range(len(inputs["input_ids"])):
-         sample_idx = sample_map[i]
-         example_ids.append(examples["id"][sample_idx])
-
-         # Null out offsets for tokens outside the context (sequence id != 1)
-         sequence_ids = inputs.sequence_ids(i)
-         offset = inputs["offset_mapping"][i]
-         inputs["offset_mapping"][i] = [
-             o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
-         ]
-
-     inputs["example_id"] = example_ids
-     return inputs
-
-
- def make_predictions(model, tokenizer, inputs, examples,
-                      n_best=20, max_answer_length=30):
-     """
-     Generates a list of prediction data based on logits
-
-     Parameters:
-     -----------
-     model : transformers.AutoModelForQuestionAnswering
-         The trained model
-     tokenizer : transformers.AutoTokenizer
-         The model's tokenizer
-     inputs : datasets.Dataset
-         The tokenized and processed dataset, as produced by mapping
-         preprocess_examples over examples. Must have columns:
-         'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
-         inputs['input_ids'][k] : list
-             token ids corresponding to tokens in feature k
-         inputs['attention_mask'][k] : list
-             attention mask for feature k
-         inputs['offset_mapping'][k] : list
-             offset mapping for feature k
-         inputs['example_id'][k] : int
-             id of the example from which feature k originated
-     examples : datasets.Dataset
-         The dataset of examples. Must have columns:
-         'id', 'question', 'context'
-     n_best : int
-         The number of top start/end (by logit) indices to consider
-     max_answer_length : int
-         The maximum length (in tokens) allowed for a candidate answer
-
-     Returns:
-     --------
-     predicted_answers : list(dict)
-         predicted_answers[k] has keys 'id', 'prediction_text', 'confidence'
-         predicted_answers[k]['id'] : int
-             The unique id of the example
-         predicted_answers[k]['prediction_text'] : str
-             The predicted answer as a string
-         predicted_answers[k]['confidence'] : float
-             The predicted probability corresponding to the answer, i.e. the
-             corresponding output of a softmax function on logits
-     """
-     assert n_best <= len(inputs['input_ids'][0]), 'n_best cannot be larger than max_length'
-
-     # Run inference on the best available device
-     if torch.backends.mps.is_available():
-         device = "mps"
-     elif torch.cuda.is_available():
-         device = "cuda"
-     else:
-         device = "cpu"
-     data_for_model = inputs.remove_columns(["example_id", "offset_mapping"])
-     data_for_model.set_format("torch", device=device)
-     dl = DataLoader(
-         data_for_model,
-         collate_fn=default_data_collator,
-         batch_size=len(inputs),
-     )
-     model = model.to(device)
-     # batch_size == len(inputs), so this loop runs exactly once
-     for batch in dl:
-         outputs = model(**batch)
-
-     start_logits = outputs.start_logits.cpu().detach().numpy()
-     end_logits = outputs.end_logits.cpu().detach().numpy()
-     # Map each example id to the indices of the features generated from it
-     example_to_inputs = collections.defaultdict(list)
-     for idx, feature in enumerate(inputs):
-         example_to_inputs[feature["example_id"]].append(idx)
-
-     predicted_answers = []
-     for example in examples:
-         example_id = example["id"]
-         context = example["context"]
-         answers = []
-
-         for feature_index in example_to_inputs[example_id]:
-             start_logit = start_logits[feature_index]
-             end_logit = end_logits[feature_index]
-             offsets = inputs[feature_index]['offset_mapping']
-
-             # Indices of the n_best largest start/end logits, in descending order
-             start_indices = np.argsort(start_logit)[-1:-n_best-1:-1].tolist()
-             end_indices = np.argsort(end_logit)[-1:-n_best-1:-1].tolist()
-
-             for start_index in start_indices:
-                 for end_index in end_indices:
-                     # Skip answers with a length that is either < 0 or > max_answer_length
-                     if (
-                         end_index < start_index
-                         or end_index - start_index + 1 > max_answer_length
-                     ):
-                         continue
-
-                     # Skip pairs where exactly one endpoint falls outside the context
-                     if (offsets[start_index] is None) ^ (offsets[end_index] is None):
-                         continue
-                     # Both endpoints outside the context: a no-answer candidate
-                     if (offsets[start_index] is None) and (offsets[end_index] is None):
-                         answers.append(
-                             {
-                                 "text": '',
-                                 "logit_score": start_logit[start_index] + end_logit[end_index],
-                             }
-                         )
-                     else:
-                         answers.append(
-                             {
-                                 "text": context[offsets[start_index][0] : offsets[end_index][1]],
-                                 "logit_score": start_logit[start_index] + end_logit[end_index],
-                             }
-                         )
-         answer_logits = [a['logit_score'] for a in answers]
-         answer_probs = softmax(answer_logits)
-
-         if len(answers) > 0:
-             best_answer_idx = int(np.argmax(answer_logits))
-             predicted_answers.append(
-                 {'id': example_id,
-                  'prediction_text': answers[best_answer_idx]['text'],
-                  'confidence': answer_probs[best_answer_idx]}
-             )
-         else:
-             predicted_answers.append({'id': example_id, 'prediction_text': '', 'confidence': 0.0})
-     for pred in predicted_answers:
-         if pred['prediction_text'] == '':
-             pred['prediction_text'] = "I don't have an answer based on the context provided."
-     return predicted_answers
-
- def get_examples():
-     """
-     Retrieve pre-made examples from a .csv file
-
-     Parameters: None
-
-     Returns:
-     --------
-     questions, contexts : list, list
-         Lists of corresponding question-context example pairs
-     """
-     examples = pd.read_csv('examples.csv')
-     questions = list(examples['question'])
-     contexts = list(examples['context'])
-     return questions, contexts
-
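
For reference, a minimal sketch of how the deleted lib/utils.py helpers were wired together at inference time, based only on the docstrings above. The checkpoint name and the toy example below are assumptions for illustration, not part of this repository:

# Usage sketch for the deleted lib/utils.py (checkpoint name and toy data are assumptions)
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

from lib.utils import make_predictions, preprocess_examples

# Any extractive-QA checkpoint works here; this name is illustrative
checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

# A toy dataset with the columns the helpers expect
examples = Dataset.from_dict({
    "id": [0],
    "question": ["What does lib/utils.py provide?"],
    "context": ["lib/utils.py provides preprocessing and prediction helpers "
                "for an extractive question-answering demo."],
})

# Tokenize with overflow, so long contexts become several features per example
inputs = examples.map(
    preprocess_examples,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    remove_columns=examples.column_names,
)

predictions = make_predictions(model, tokenizer, inputs, examples)
print(predictions[0]["prediction_text"], predictions[0]["confidence"])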