Mila committed on
Commit
3cff715
·
1 Parent(s): 3139db4

still broken?

Browse files
Files changed (4) hide show
  1. analogy_train.py +300 -300
  2. app_context.py +260 -0
  3. flan-t5-train.py +234 -234
  4. word_embedding.py +6 -0
analogy_train.py CHANGED
@@ -1,301 +1,301 @@
1
- import gradio as gr
2
- import math
3
- import spacy
4
- from datasets import load_dataset
5
- from sentence_transformers import SentenceTransformer
6
- from sentence_transformers import InputExample
7
- from sentence_transformers import losses
8
- from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
9
- from transformers import TrainingArguments, Trainer
10
- import torch
11
- import torch.nn.functional as F
12
- from torch.utils.data import DataLoader
13
- import numpy as np
14
- import evaluate
15
- import nltk
16
- from nltk.corpus import stopwords
17
- import subprocess
18
- import sys
19
- from transformers import DataCollatorWithPadding
20
- from transformers import TrainingArguments
21
- from transformers import (
22
- BertModel,
23
- BertTokenizerFast,
24
- Trainer,
25
- EvalPrediction
26
- )
27
-
28
- # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
29
- # subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
30
- # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
31
- # data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
32
- # nltk.download('stopwords')
33
- # nlp = spacy.load("en_core_web_sm")
34
- # stops = stopwords.words("english")
35
-
36
- # answer = "Pizza"
37
- guesses = []
38
- answer = "Pizza"
39
-
40
- tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
41
- metric = evaluate.load("accuracy")
42
-
43
- def tokenize_function(examples):
44
- return tokenizer(examples["stem"], padding="max_length", truncation=True)
45
-
46
-
47
- #Mean Pooling - Take attention mask into account for correct averaging
48
- def mean_pooling(model_output, attention_mask):
49
- token_embeddings = model_output[0] #First element of model_output contains all token embeddings
50
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
51
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
52
-
53
-
54
- def compute_metrics(eval_pred):
55
- logits, labels = eval_pred
56
- predictions = np.argmax(logits, axis=-1)
57
- metric = evaluate.load("accuracy")
58
- return metric.compute(predictions=predictions, references=labels)
59
-
60
-
61
- # def training():
62
- # dataset_id = "relbert/analogy_questions"
63
- # dataset_sub = "bats"
64
- # print("GETTING DATASET")
65
- # raw_dataset = load_dataset(dataset_id, dataset_sub)
66
- # # data_metric = evaluate.load(dataset_id, dataset_sub)
67
- # checkpoint = "bert-base-uncased"
68
- # model = BertModel.from_pretrained(checkpoint)
69
- # # dataset = dataset["train"]
70
- # # tokenized_datasets = dataset.map(tokenize_function, batched=True)
71
- # # print(raw_dataset)
72
- # test_data = raw_dataset["test"]
73
- # # print(test_data["stem"])
74
- # all_answers = []
75
- # for answer in raw_dataset["answer"]:
76
- # answer = raw_dataset["choice"][answer]
77
- # raw_dataset = raw_dataset.add_column("label", all_answers)
78
-
79
-
80
- # print(raw_dataset)
81
- # print(raw_dataset["label"])
82
- # dataset = raw_dataset.map(
83
- # lambda x: tokenizer(x["stem"], truncation=True),
84
- # batched=True,
85
- # )
86
- # print(dataset)
87
- # dataset = dataset.remove_columns(["stem", "answer", "choice"])
88
- # dataset = dataset.rename_column("label", "labels")
89
- # dataset = dataset.with_format("torch")
90
-
91
- # training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
92
-
93
- # print(dataset)
94
- # # print(f"- The {dataset_id} dataset has {dataset.num_rows} examples.")
95
- # # print(f"- Each example is a {type(dataset[0])} with a {type(dataset[0]['stem'])} as value.")
96
- # # print(f"- Examples look like this: {dataset[0]}")
97
-
98
- # # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
99
- # # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
100
-
101
- # # dataset = dataset["train"].map(tokenize_function, batched=True)
102
- # # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
103
- # # dataset.format['type']
104
-
105
- # # tokenized_news = dataset.map(tokenize_function, batched=True)
106
-
107
- # # model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", num_labels=2)
108
-
109
- # # print(dataset)
110
-
111
- # # Choose the appropriate device based on availability (CUDA or CPU)
112
- # # gpu_available = torch.cuda.is_available()
113
- # # device = torch.device("cuda" if gpu_available else "cpu")
114
- # # model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
115
-
116
- # # tokenized_datasets = dataset.map(tokenize_function, batched=True)
117
- # # print(tokenized_datasets)
118
- # # # small_train_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
119
- # # # small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))
120
-
121
- # # model = model.to(device)
122
-
123
- # # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
124
- # # training_args = TrainingArguments(output_dir="test_trainer")
125
-
126
- # trainer = Trainer(
127
- # model=model,
128
- # args=training_args,
129
- # train_dataset=dataset["test"],
130
- # eval_dataset=dataset["validation"],
131
- # compute_metrics=compute_metrics,
132
- # )
133
-
134
- # output = trainer.train()
135
-
136
- # # train_examples = []
137
- # # train_data = dataset["train"]
138
- # # # For agility we only 1/2 of our available data
139
- # # n_examples = dataset["train"].num_rows // 2
140
-
141
- # # for i in range(n_examples):
142
- # # example = train_data[i]
143
- # # # example_opposite = dataset_clean[-(i)]
144
- # # # print(example["text"])
145
- # # train_examples.append(InputExample(texts=[example['stem'], example]))
146
-
147
-
148
- # # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
149
-
150
- # # print("END DATALOADER")
151
-
152
- # # # print(train_examples)
153
-
154
- # # embeddings = finetune(train_dataloader)
155
- # print(output)
156
-
157
- # model.save("bert-analogies")
158
-
159
- # model.save_to_hub("smhavens/bert-base-analogies")
160
- # return output
161
-
162
-
163
- # def finetune(train_dataloader):
164
- # # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
165
- # model_id = "sentence-transformers/all-MiniLM-L6-v2"
166
- # model = SentenceTransformer(model_id)
167
- # device = torch.device('cuda:0')
168
- # model = model.to(device)
169
-
170
- # # training_args = TrainingArguments(output_dir="test_trainer")
171
-
172
- # # USE THIS LINK
173
- # # https://huggingface.co/blog/how-to-train-sentence-transformers
174
-
175
- # train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
176
-
177
- # print("BEGIN FIT")
178
-
179
- # model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
180
-
181
- # model.save("bert-analogies")
182
-
183
- # model.save_to_hub("smhavens/bert-base-analogies")
184
- # return 0
185
-
186
- def training():
187
- dataset_id = "relbert/analogy_questions"
188
- dataset_sub = "bats"
189
- print("GETTING DATASET")
190
- dataset = load_dataset(dataset_id, dataset_sub)
191
- # dataset = dataset["train"]
192
- # tokenized_datasets = dataset.map(tokenize_function, batched=True)
193
-
194
- print(f"- The {dataset_id} dataset has {dataset['test'].num_rows} examples.")
195
- print(f"- Each example is a {type(dataset['test'][0])} with a {type(dataset['test'][0]['stem'])} as value.")
196
- print(f"- Examples look like this: {dataset['test'][0]}")
197
-
198
- train_examples = []
199
- train_data = dataset["test"]
200
- # For agility we only 1/2 of our available data
201
- n_examples = dataset["test"].num_rows // 2
202
-
203
- for i in range(n_examples):
204
- example = train_data[i]
205
- temp_word_1 = example["stem"][0]
206
- temp_word_2 = example["stem"][1]
207
- temp_word_3 = example["choice"][example["answer"]][0]
208
- temp_word_4 = example["choice"][example["answer"]][1]
209
- comp1 = f"{temp_word_1} to {temp_word_2}"
210
- comp2 = f"{temp_word_3} to {temp_word_4}"
211
- # example_opposite = dataset_clean[-(i)]
212
- # print(example["text"])
213
- train_examples.append(InputExample(texts=[comp1, comp2]))
214
-
215
-
216
- train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
217
-
218
- print("END DATALOADER")
219
-
220
- # print(train_examples)
221
-
222
- embeddings = finetune(train_dataloader)
223
-
224
- return (dataset['test'].num_rows, type(dataset['test'][0]), type(dataset['test'][0]['stem']), dataset['test'][0], embeddings)
225
-
226
-
227
- def finetune(train_dataloader):
228
- # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
229
- model_id = "sentence-transformers/all-MiniLM-L6-v2"
230
- model = SentenceTransformer(model_id)
231
- device = torch.device('cuda:0')
232
- model = model.to(device)
233
-
234
- # training_args = TrainingArguments(output_dir="test_trainer")
235
-
236
- # USE THIS LINK
237
- # https://huggingface.co/blog/how-to-train-sentence-transformers
238
-
239
- train_loss = losses.MegaBatchMarginLoss(model=model)
240
-
241
- print("BEGIN FIT")
242
-
243
- model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
244
-
245
- model.save("bert-analogies")
246
-
247
- # model.save_to_hub("smhavens/bert-base-analogies")
248
- # accuracy = compute_metrics(eval, metric)
249
- return 0
250
-
251
- def greet(name):
252
- return "Hello " + name + "!!"
253
-
254
- def check_answer(guess:str):
255
- global guesses
256
- global answer
257
- guesses.append(guess)
258
- output = ""
259
- for guess in guesses:
260
- output += ("- " + guess + "\n")
261
- output = output[:-1]
262
-
263
- if guess.lower() == answer.lower():
264
- return "Correct!", output
265
- else:
266
- return "Try again!", output
267
-
268
- def main():
269
- print("BEGIN")
270
- word1 = "Black"
271
- word2 = "White"
272
- word3 = "Sun"
273
- global answer
274
- answer = "Moon"
275
- global guesses
276
-
277
- num_rows, data_type, value, example, embeddings = training()
278
-
279
- # prompt = f"{word1} is to {word2} as {word3} is to ____"
280
- # with gr.Blocks() as iface:
281
- # gr.Markdown(prompt)
282
- # with gr.Tab("Guess"):
283
- # text_input = gr.Textbox()
284
- # text_output = gr.Textbox()
285
- # text_button = gr.Button("Submit")
286
- # with gr.Accordion("Open for previous guesses"):
287
- # text_guesses = gr.Textbox()
288
- # with gr.Tab("Testing"):
289
- # gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
290
- # An example is {example}.
291
- # The Embeddings are {embeddings}.""")
292
- # text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
293
- # # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
294
- # iface.launch()
295
-
296
-
297
-
298
-
299
-
300
- if __name__ == "__main__":
301
  main()
 
1
+ import gradio as gr
2
+ import math
3
+ import spacy
4
+ from datasets import load_dataset
5
+ from sentence_transformers import SentenceTransformer
6
+ from sentence_transformers import InputExample
7
+ from sentence_transformers import losses
8
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
9
+ from transformers import TrainingArguments, Trainer
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from torch.utils.data import DataLoader
13
+ import numpy as np
14
+ import evaluate
15
+ import nltk
16
+ from nltk.corpus import stopwords
17
+ import subprocess
18
+ import sys
19
+ from transformers import DataCollatorWithPadding
20
+ from transformers import TrainingArguments
21
+ from transformers import (
22
+ BertModel,
23
+ BertTokenizerFast,
24
+ Trainer,
25
+ EvalPrediction
26
+ )
27
+
28
+ # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
29
+ # subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
30
+ # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
31
+ # data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
32
+ # nltk.download('stopwords')
33
+ # nlp = spacy.load("en_core_web_sm")
34
+ # stops = stopwords.words("english")
35
+
36
+ # answer = "Pizza"
37
+ guesses = []
38
+ answer = "Pizza"
39
+
40
+ tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
41
+ metric = evaluate.load("accuracy")
42
+
43
+ def tokenize_function(examples):
44
+ return tokenizer(examples["stem"], padding="max_length", truncation=True)
45
+
46
+
47
+ #Mean Pooling - Take attention mask into account for correct averaging
48
+ def mean_pooling(model_output, attention_mask):
49
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
50
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
51
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
52
+
53
+
54
+ def compute_metrics(eval_pred):
55
+ logits, labels = eval_pred
56
+ predictions = np.argmax(logits, axis=-1)
57
+ metric = evaluate.load("accuracy")
58
+ return metric.compute(predictions=predictions, references=labels)
59
+
60
+
61
+ # def training():
62
+ # dataset_id = "relbert/analogy_questions"
63
+ # dataset_sub = "bats"
64
+ # print("GETTING DATASET")
65
+ # raw_dataset = load_dataset(dataset_id, dataset_sub)
66
+ # # data_metric = evaluate.load(dataset_id, dataset_sub)
67
+ # checkpoint = "bert-base-uncased"
68
+ # model = BertModel.from_pretrained(checkpoint)
69
+ # # dataset = dataset["train"]
70
+ # # tokenized_datasets = dataset.map(tokenize_function, batched=True)
71
+ # # print(raw_dataset)
72
+ # test_data = raw_dataset["test"]
73
+ # # print(test_data["stem"])
74
+ # all_answers = []
75
+ # for answer in raw_dataset["answer"]:
76
+ # answer = raw_dataset["choice"][answer]
77
+ # raw_dataset = raw_dataset.add_column("label", all_answers)
78
+
79
+
80
+ # print(raw_dataset)
81
+ # print(raw_dataset["label"])
82
+ # dataset = raw_dataset.map(
83
+ # lambda x: tokenizer(x["stem"], truncation=True),
84
+ # batched=True,
85
+ # )
86
+ # print(dataset)
87
+ # dataset = dataset.remove_columns(["stem", "answer", "choice"])
88
+ # dataset = dataset.rename_column("label", "labels")
89
+ # dataset = dataset.with_format("torch")
90
+
91
+ # training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
92
+
93
+ # print(dataset)
94
+ # # print(f"- The {dataset_id} dataset has {dataset.num_rows} examples.")
95
+ # # print(f"- Each example is a {type(dataset[0])} with a {type(dataset[0]['stem'])} as value.")
96
+ # # print(f"- Examples look like this: {dataset[0]}")
97
+
98
+ # # small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
99
+ # # small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
100
+
101
+ # # dataset = dataset["train"].map(tokenize_function, batched=True)
102
+ # # dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
103
+ # # dataset.format['type']
104
+
105
+ # # tokenized_news = dataset.map(tokenize_function, batched=True)
106
+
107
+ # # model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", num_labels=2)
108
+
109
+ # # print(dataset)
110
+
111
+ # # Choose the appropriate device based on availability (CUDA or CPU)
112
+ # # gpu_available = torch.cuda.is_available()
113
+ # # device = torch.device("cuda" if gpu_available else "cpu")
114
+ # # model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
115
+
116
+ # # tokenized_datasets = dataset.map(tokenize_function, batched=True)
117
+ # # print(tokenized_datasets)
118
+ # # # small_train_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
119
+ # # # small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1000))
120
+
121
+ # # model = model.to(device)
122
+
123
+ # # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
124
+ # # training_args = TrainingArguments(output_dir="test_trainer")
125
+
126
+ # trainer = Trainer(
127
+ # model=model,
128
+ # args=training_args,
129
+ # train_dataset=dataset["test"],
130
+ # eval_dataset=dataset["validation"],
131
+ # compute_metrics=compute_metrics,
132
+ # )
133
+
134
+ # output = trainer.train()
135
+
136
+ # # train_examples = []
137
+ # # train_data = dataset["train"]
138
+ # # # For agility we only 1/2 of our available data
139
+ # # n_examples = dataset["train"].num_rows // 2
140
+
141
+ # # for i in range(n_examples):
142
+ # # example = train_data[i]
143
+ # # # example_opposite = dataset_clean[-(i)]
144
+ # # # print(example["text"])
145
+ # # train_examples.append(InputExample(texts=[example['stem'], example]))
146
+
147
+
148
+ # # train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
149
+
150
+ # # print("END DATALOADER")
151
+
152
+ # # # print(train_examples)
153
+
154
+ # # embeddings = finetune(train_dataloader)
155
+ # print(output)
156
+
157
+ # model.save("bert-analogies")
158
+
159
+ # model.save_to_hub("smhavens/bert-base-analogies")
160
+ # return output
161
+
162
+
163
+ # def finetune(train_dataloader):
164
+ # # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
165
+ # model_id = "sentence-transformers/all-MiniLM-L6-v2"
166
+ # model = SentenceTransformer(model_id)
167
+ # device = torch.device('cuda:0')
168
+ # model = model.to(device)
169
+
170
+ # # training_args = TrainingArguments(output_dir="test_trainer")
171
+
172
+ # # USE THIS LINK
173
+ # # https://huggingface.co/blog/how-to-train-sentence-transformers
174
+
175
+ # train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)
176
+
177
+ # print("BEGIN FIT")
178
+
179
+ # model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
180
+
181
+ # model.save("bert-analogies")
182
+
183
+ # model.save_to_hub("smhavens/bert-base-analogies")
184
+ # return 0
185
+
186
+ def training():
187
+ dataset_id = "relbert/analogy_questions"
188
+ dataset_sub = "bats"
189
+ print("GETTING DATASET")
190
+ dataset = load_dataset(dataset_id, dataset_sub)
191
+ # dataset = dataset["train"]
192
+ # tokenized_datasets = dataset.map(tokenize_function, batched=True)
193
+
194
+ print(f"- The {dataset_id} dataset has {dataset['test'].num_rows} examples.")
195
+ print(f"- Each example is a {type(dataset['test'][0])} with a {type(dataset['test'][0]['stem'])} as value.")
196
+ print(f"- Examples look like this: {dataset['test'][0]}")
197
+
198
+ train_examples = []
199
+ train_data = dataset["test"]
200
+ # For agility we only 1/2 of our available data
201
+ n_examples = dataset["test"].num_rows // 2
202
+
203
+ for i in range(n_examples):
204
+ example = train_data[i]
205
+ temp_word_1 = example["stem"][0]
206
+ temp_word_2 = example["stem"][1]
207
+ temp_word_3 = example["choice"][example["answer"]][0]
208
+ temp_word_4 = example["choice"][example["answer"]][1]
209
+ comp1 = f"{temp_word_1} to {temp_word_2}"
210
+ comp2 = f"{temp_word_3} to {temp_word_4}"
211
+ # example_opposite = dataset_clean[-(i)]
212
+ # print(example["text"])
213
+ train_examples.append(InputExample(texts=[comp1, comp2]))
214
+
215
+
216
+ train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=25)
217
+
218
+ print("END DATALOADER")
219
+
220
+ # print(train_examples)
221
+
222
+ embeddings = finetune(train_dataloader)
223
+
224
+ return (dataset['test'].num_rows, type(dataset['test'][0]), type(dataset['test'][0]['stem']), dataset['test'][0], embeddings)
225
+
226
+
227
+ def finetune(train_dataloader):
228
+ # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
229
+ model_id = "sentence-transformers/all-MiniLM-L6-v2"
230
+ model = SentenceTransformer(model_id)
231
+ device = torch.device('cuda:0')
232
+ model = model.to(device)
233
+
234
+ # training_args = TrainingArguments(output_dir="test_trainer")
235
+
236
+ # USE THIS LINK
237
+ # https://huggingface.co/blog/how-to-train-sentence-transformers
238
+
239
+ train_loss = losses.MegaBatchMarginLoss(model=model)
240
+
241
+ print("BEGIN FIT")
242
+
243
+ model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
244
+
245
+ model.save("bert-analogies")
246
+
247
+ # model.save_to_hub("smhavens/bert-base-analogies")
248
+ # accuracy = compute_metrics(eval, metric)
249
+ return 0
250
+
251
+ def greet(name):
252
+ return "Hello " + name + "!!"
253
+
254
+ def check_answer(guess:str):
255
+ global guesses
256
+ global answer
257
+ guesses.append(guess)
258
+ output = ""
259
+ for guess in guesses:
260
+ output += ("- " + guess + "\n")
261
+ output = output[:-1]
262
+
263
+ if guess.lower() == answer.lower():
264
+ return "Correct!", output
265
+ else:
266
+ return "Try again!", output
267
+
268
+ def main():
269
+ print("BEGIN")
270
+ word1 = "Black"
271
+ word2 = "White"
272
+ word3 = "Sun"
273
+ global answer
274
+ answer = "Moon"
275
+ global guesses
276
+
277
+ num_rows, data_type, value, example, embeddings = training()
278
+
279
+ # prompt = f"{word1} is to {word2} as {word3} is to ____"
280
+ # with gr.Blocks() as iface:
281
+ # gr.Markdown(prompt)
282
+ # with gr.Tab("Guess"):
283
+ # text_input = gr.Textbox()
284
+ # text_output = gr.Textbox()
285
+ # text_button = gr.Button("Submit")
286
+ # with gr.Accordion("Open for previous guesses"):
287
+ # text_guesses = gr.Textbox()
288
+ # with gr.Tab("Testing"):
289
+ # gr.Markdown(f"""Number of rows in dataset is {num_rows}, with each having type {data_type} and value {value}.
290
+ # An example is {example}.
291
+ # The Embeddings are {embeddings}.""")
292
+ # text_button.click(check_answer, inputs=[text_input], outputs=[text_output, text_guesses])
293
+ # # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
294
+ # iface.launch()
295
+
296
+
297
+
298
+
299
+
300
+ if __name__ == "__main__":
301
  main()
app_context.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  import math
3
  import spacy
@@ -251,4 +252,263 @@ def main():
251
 
252
 
253
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  main()
 
1
+ <<<<<<< HEAD
2
  import gradio as gr
3
  import math
4
  import spacy
 
252
 
253
 
254
  if __name__ == "__main__":
255
+ =======
256
+ import gradio as gr
257
+ import math
258
+ import spacy
259
+ from datasets import load_dataset
260
+ from sentence_transformers import SentenceTransformer
261
+ from sentence_transformers import InputExample
262
+ from sentence_transformers import losses
263
+ from sentence_transformers import util
264
+ from transformers import pipeline, T5Tokenizer
265
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
266
+ from transformers import TrainingArguments, Trainer, T5ForConditionalGeneration
267
+ import torch
268
+ import torch.nn.functional as F
269
+ from torch.utils.data import DataLoader
270
+ import numpy as np
271
+ import evaluate
272
+ import nltk
273
+ from nltk.corpus import stopwords
274
+ import subprocess
275
+ import sys
276
+ import random
277
+ from textwrap import fill
278
+
279
+ # !pip install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
280
+ subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl'])
281
+ # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
282
+ model_base = "results/checkpoint-17000"
283
+ nltk.download('stopwords')
284
+ nlp = spacy.load("en_core_web_sm")
285
+ stops = stopwords.words("english")
286
+ ROMAN_CONSTANTS = (
287
+ ( "", "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX" ),
288
+ ( "", "X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC" ),
289
+ ( "", "C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM" ),
290
+ ( "", "M", "MM", "MMM", "", "", "-", "", "", "" ),
291
+ ( "", "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix" ),
292
+ ( "", "x", "xx", "xxx", "xl", "l", "lx", "lxx", "lxxx", "xc" ),
293
+ ( "", "c", "cc", "ccc", "cd", "d", "dc", "dcc", "dccc", "cm" ),
294
+ ( "", "m", "mm", "mmm", "", "", "-", "", "", "" ),
295
+ )
296
+
297
+ # answer = "Pizza"
298
+ guesses = []
299
+ return_guesses = []
300
+ answer = "Moon"
301
+ word1 = "Black"
302
+ word2 = "White"
303
+ word3 = "Sun"
304
+ base_prompts = ["Sun is to Moon as ", "Black is to White as ", "Atom is to Element as",
305
+ "Athens is to Greece as ", "Cat is to Dog as ", "Robin is to Bird as",
306
+ "Hunger is to Ambition as "]
307
+
308
+
309
+ #Mean Pooling - Take attention mask into account for correct averaging
310
+ def mean_pooling(model_output, attention_mask):
311
+ token_embeddings = model_output['token_embeddings'] #First element of model_output contains all token embeddings
312
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
313
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
314
+
315
+
316
+ def normalize(comment, lowercase, remove_stopwords):
317
+ if lowercase:
318
+ comment = comment.lower()
319
+ comment = nlp(comment)
320
+ lemmatized = list()
321
+ for word in comment:
322
+ lemma = word.lemma_.strip()
323
+ if lemma:
324
+ if not remove_stopwords or (remove_stopwords and lemma not in stops):
325
+ lemmatized.append(lemma)
326
+ return " ".join(lemmatized)
327
+
328
+
329
+ # def tokenize_function(examples):
330
+ # return tokenizer(examples["text"])
331
+
332
+
333
+ def compute_metrics(eval_pred):
334
+ logits, labels = eval_pred
335
+ predictions = np.argmax(logits, axis=-1)
336
+ metric = evaluate.load("accuracy")
337
+ return metric.compute(predictions=predictions, references=labels)
338
+
339
+
340
+ def get_model():
341
+ global model_base
342
+ # last_checkpoint = "./results/checkpoint-22500"
343
+
344
+ finetuned_model = T5ForConditionalGeneration.from_pretrained(model_base)
345
+ tokenizer = T5Tokenizer.from_pretrained(model_base)
346
+ # model = SentenceTransformer(model_base)
347
+ gpu_available = torch.cuda.is_available()
348
+ device = torch.device("cuda" if gpu_available else "cpu")
349
+ finetuned_model = finetuned_model.to(device)
350
+ return finetuned_model, tokenizer
351
+
352
+
353
+ def cosine_scores(model, sentence):
354
+ global word1
355
+ global word2
356
+ global word3
357
+ # sentence1 = f"{word1} is to {word2} as"
358
+ embeddings1 = model.encode(sentence, convert_to_tensor=True)
359
+
360
+ def embeddings(model, sentences, tokenizer):
361
+ global word1
362
+ global word2
363
+ global word3
364
+ global model_base
365
+ gpu_available = torch.cuda.is_available()
366
+ device = torch.device("cuda" if gpu_available else "cpu")
367
+ # device = torch.device('cuda:0')
368
+ # embeddings = model.encode(sentences)
369
+ question = "Please answer to this question: " + sentences
370
+
371
+ inputs = tokenizer(question, return_tensors="pt")
372
+
373
+ print(inputs)
374
+ # print(inputs.device)
375
+ print(model.device)
376
+ print(inputs['input_ids'].device)
377
+ print(inputs['attention_mask'].device)
378
+
379
+ inputs['attention_mask'] = inputs['attention_mask'].to(device)
380
+ inputs['input_ids'] = inputs['input_ids'].to(device)
381
+
382
+ outputs = model.generate(**inputs)
383
+ answer = tokenizer.decode(outputs[0])
384
+ answer = answer[6:-4]
385
+ # print(fill(answer, width=80))
386
+
387
+ print("ANSWER IS", answer)
388
+
389
+ return answer
390
+
391
+
392
+ def random_word(model, tokenizer):
393
+ global model_base
394
+ vocab = tokenizer.get_vocab()
395
+ # with open(model_base + '/vocab.txt', 'r') as file:
396
+ line = ""
397
+ # content = file.readlines()
398
+ length = tokenizer.vocab_size
399
+ # print(vocab)
400
+ while line == "":
401
+ rand_line = random.randrange(0, length)
402
+ # print("TRYING TO FIND", rand_line, "OUT OF", length, "WITH VOCAB OF TYPE", type(vocab))
403
+ for word, id in vocab.items():
404
+ if id == rand_line and word[0].isalpha() and word not in stops and word not in ROMAN_CONSTANTS:
405
+ # if vocab[rand_line][0].isalpha() and vocab[rand_line][:-1] not in stops and vocab[rand_line][:-1] not in ROMAN_CONSTANTS:
406
+ line = word
407
+ elif id == rand_line:
408
+ print(f"{word} is not alpha or is a stop word")
409
+ # for num, aline in enumerate(file, 1997):
410
+ # if random.randrange(num) and aline.isalpha():
411
+ # continue
412
+ # # elif not aline.isalpha():
413
+
414
+ # line = aline
415
+ print(line)
416
+ return line
417
+
418
+
419
+ def generate_prompt(model, tokenizer):
420
+ global word1
421
+ global word2
422
+ global word3
423
+ global answer
424
+ global base_prompts
425
+ word1 = random_word(model, tokenizer)
426
+ # word2 = random_word()
427
+
428
+ word2 = embeddings(model, f"{base_prompts[random.randint(0, len(base_prompts) - 1)]}{word1} is to ___.", tokenizer)
429
+ word3 = random_word(model, tokenizer)
430
+ sentence = f"{word1} is to {word2} as {word3} is to ___."
431
+ print(sentence)
432
+ answer = embeddings(model, sentence, tokenizer)
433
+ print("ANSWER IS", answer)
434
+ return f"# {word1} is to {word2} as {word3} is to ___."
435
+ # cosine_scores(model, sentence)
436
+
437
+
438
+ def greet(name):
439
+ return "Hello " + name + "!!"
440
+
441
+ def check_answer(guess:str):
442
+ global guesses
443
+ global answer
444
+ global return_guesses
445
+ global word1
446
+ global word2
447
+ global word3
448
+
449
+ model, tokenizer = get_model()
450
+ output = ""
451
+ protected_guess = guess
452
+ sentence = f"{word1} is to {word2} as [MASK] is to {guess}."
453
+
454
+ other_word = embeddings(model, sentence, tokenizer)
455
+ guesses.append(guess)
456
+
457
+
458
+
459
+ for guess in return_guesses:
460
+ output += ("- " + guess + "<br>")
461
+
462
+ # output = output[:-1]
463
+ prompt = f"{word1} is to {word2} as {word3} is to ___."
464
+ # print("IS", protected_guess, "EQUAL TO", answer, ":", protected_guess.lower() == answer.lower())
465
+
466
+ if protected_guess.lower() == answer.lower():
467
+ return_guesses.append(f"{protected_guess}: {word1} is to {word2} as {word3} is to {protected_guess}.")
468
+ output += f"<span style='color:green'>- {return_guesses[-1]}</span><br>"
469
+ new_prompt = generate_prompt(model, tokenizer)
470
+ return new_prompt, "Correct!", output
471
+ else:
472
+ return_guess = f"{protected_guess}: {word1} is to {word2} as {other_word} is to {protected_guess}."
473
+ return_guesses.append(return_guess)
474
+ output += ("- " + return_guess + " <br>")
475
+ return prompt, "Try again!", output
476
+
477
+ def main():
478
+ global word1
479
+ global word2
480
+ global word3
481
+ global answer
482
+ # answer = "Moon"
483
+ global guesses
484
+
485
+
486
+ # num_rows, data_type, value, example, embeddings = training()
487
+ # sent_embeddings = embeddings()
488
+ model, tokenizer = get_model()
489
+ generate_prompt(model, tokenizer)
490
+
491
+ prompt = f"{word1} is to {word2} as {word3} is to ____"
492
+ print(prompt)
493
+ print("TESTING EMBEDDINGS")
494
+ with gr.Blocks() as iface:
495
+ mark_question = gr.Markdown(prompt)
496
+ with gr.Tab("Guess"):
497
+ text_input = gr.Textbox()
498
+ text_output = gr.Textbox()
499
+ text_button = gr.Button("Submit")
500
+ with gr.Accordion("Open for previous guesses"):
501
+ text_guesses = gr.Markdown()
502
+ # with gr.Tab("Testing"):
503
+ # gr.Markdown(f"""The Embeddings are {sent_embeddings}.""")
504
+ text_button.click(check_answer, inputs=[text_input], outputs=[mark_question, text_output, text_guesses])
505
+ # iface = gr.Interface(fn=greet, inputs="text", outputs="text")
506
+ iface.launch()
507
+
508
+
509
+
510
+
511
+
512
+ if __name__ == "__main__":
513
+ >>>>>>> 5058aea (Problems)
514
  main()
flan-t5-train.py CHANGED
@@ -1,235 +1,235 @@
1
- import gradio as gr
2
- import math
3
- from datasets import load_dataset
4
- from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
5
- from transformers import TrainingArguments, Trainer
6
- from transformers import T5Tokenizer, T5ForConditionalGeneration
7
- import torch
8
- import torch.nn.functional as F
9
- from torch.utils.data import DataLoader
10
- import numpy as np
11
- import evaluate
12
- import nltk
13
- from nltk.corpus import stopwords
14
- import subprocess
15
- import sys
16
- from transformers import T5Tokenizer, DataCollatorForSeq2Seq
17
- from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
18
- from transformers import DataCollatorWithPadding, DistilBertTokenizerFast
19
- from transformers import TrainingArguments
20
- from transformers import (
21
- BertModel,
22
- BertTokenizerFast,
23
- Trainer,
24
- EvalPrediction
25
- )
26
-
27
- nltk.download("punkt", quiet=True)
28
- metric = evaluate.load("rouge")
29
-
30
- # Global Parameters
31
- L_RATE = 3e-4
32
- BATCH_SIZE = 8
33
- PER_DEVICE_EVAL_BATCH = 4
34
- WEIGHT_DECAY = 0.01
35
- SAVE_TOTAL_LIM = 3
36
- NUM_EPOCHS = 10
37
-
38
- # Set up training arguments
39
- training_args = Seq2SeqTrainingArguments(
40
- output_dir="./results",
41
- evaluation_strategy="epoch",
42
- learning_rate=L_RATE,
43
- per_device_train_batch_size=BATCH_SIZE,
44
- per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
45
- weight_decay=WEIGHT_DECAY,
46
- save_total_limit=SAVE_TOTAL_LIM,
47
- num_train_epochs=NUM_EPOCHS,
48
- predict_with_generate=True,
49
- push_to_hub=False
50
- )
51
-
52
- model_id = "google/flan-t5-base"
53
- tokenizer = T5Tokenizer.from_pretrained(model_id)
54
- # tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
55
- # metric = evaluate.load("accuracy")
56
-
57
- def tokenize_function(examples):
58
- return tokenizer(examples["stem"], padding="max_length", truncation=True)
59
-
60
-
61
- #Mean Pooling - Take attention mask into account for correct averaging
62
- def mean_pooling(model_output, attention_mask):
63
- token_embeddings = model_output[0] #First element of model_output contains all token embeddings
64
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
65
- return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
66
-
67
-
68
- # def compute_metrics(eval_pred):
69
- # logits, labels = eval_pred
70
- # predictions = np.argmax(logits, axis=-1)
71
- # metric = evaluate.load("accuracy")
72
- # return metric.compute(predictions=predictions, references=labels)
73
-
74
- def compute_metrics(eval_preds):
75
- preds, labels = eval_preds
76
-
77
- # decode preds and labels
78
- labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
79
- decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
80
- decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
81
-
82
- # rougeLSum expects newline after each sentence
83
- decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
84
- decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
85
-
86
- result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
87
-
88
- return result
89
-
90
-
91
- def training():
92
- dataset_id = "tomasmcz/word2vec_analogy"
93
- # dataset_id = "relbert/scientific_and_creative_analogy"
94
- # dataset_sub = "Quadruples_Kmiecik_random_split"
95
- print("GETTING DATASET")
96
- dataset = load_dataset(dataset_id)
97
- # dataset = dataset["train"]
98
- # tokenized_datasets = dataset.map(tokenize_function, batched=True)
99
-
100
- print(dataset)
101
- print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
102
- print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0])} as value.")
103
- print(f"- Examples look like this: {dataset['train'][0]}")
104
-
105
- # for i in dataset["train"]:
106
- # print(i["AB"], "to", i["CD"], "is", i["label"])
107
-
108
- dataset = dataset["train"].train_test_split(test_size=0.3)
109
-
110
- # We prefix our tasks with "answer the question"
111
- prefix = "Please answer this question: "
112
-
113
-
114
- def preprocess_function(examples):
115
- """Add prefix to the sentences, tokenize the text, and set the labels"""
116
- # The "inputs" are the tokenized answer:
117
- inputs = []
118
- # print(examples)
119
- # inputs = [prefix + doc for doc in examples["question"]]
120
- for doc in examples['word_a']:
121
- # print("THE DOC IS:", doc)
122
- # print("THE DOC IS:", examples[i]['AB'], examples[i]['CD'], examples[i]['label'])
123
- prompt = f"{prefix}{doc} is to "
124
- inputs.append(prompt)
125
- # inputs = [prefix + doc for doc in examples["question"]]
126
- for indx, doc in enumerate(examples["word_b"]):
127
- prompt = f"{doc} as "
128
- inputs[indx] += prompt
129
-
130
- for indx, doc in enumerate(examples["word_c"]):
131
- prompt = f"{doc} is to ___."
132
- inputs[indx] += prompt
133
- model_inputs = tokenizer(inputs, max_length=128, truncation=True)
134
-
135
- # print(examples["label"], type(examples["label"]))
136
-
137
- # The "labels" are the tokenized outputs:
138
- labels = tokenizer(text_target=examples["word_d"],
139
- max_length=512,
140
- truncation=True)
141
-
142
- model_inputs["labels"] = labels["input_ids"]
143
- return model_inputs
144
-
145
-
146
-
147
- # Map the preprocessing function across our dataset
148
- tokenized_dataset = dataset.map(preprocess_function, batched=True)
149
-
150
- print("END DATALOADER")
151
-
152
- # print(train_examples)
153
-
154
- embeddings = finetune(tokenized_dataset)
155
-
156
- return 0
157
-
158
-
159
- def finetune(dataset):
160
- # model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
161
- # model_id = "sentence-transformers/all-MiniLM-L6-v2"
162
- model_id = "google/flan-t5-base"
163
- # model_id = "distilbert-base-uncased"
164
- # tokenizer = DistilBertTokenizerFast.from_pretrained(model_id)
165
- tokenizer = T5Tokenizer.from_pretrained(model_id)
166
- model = T5ForConditionalGeneration.from_pretrained(model_id)
167
- data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
168
- device = torch.device('cuda:0')
169
- model = model.to(device)
170
-
171
- # training_args = TrainingArguments(output_dir="test_trainer")
172
-
173
- # USE THIS LINK
174
- # https://huggingface.co/blog/how-to-train-sentence-transformers
175
-
176
- # train_loss = losses.MegaBatchMarginLoss(model=model)
177
- # ds_train, ds_valid = dataset.train_test_split(test_size=0.2, seed=42)
178
-
179
- print("BEGIN FIT")
180
-
181
- trainer = Seq2SeqTrainer(
182
- model=model,
183
- args=training_args,
184
- train_dataset=dataset["train"],
185
- eval_dataset=dataset["test"],
186
- # evaluation_strategy="no"
187
- tokenizer=tokenizer,
188
- data_collator=data_collator,
189
- compute_metrics=compute_metrics
190
- )
191
-
192
- # model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)
193
-
194
- trainer.train()
195
-
196
- # model.save("flan-analogies")
197
-
198
- # model.save_to_hub("smhavens/bert-base-analogies")
199
- # accuracy = compute_metrics(eval, metric)
200
- return 0
201
-
202
- def greet(name):
203
- return "Hello " + name + "!!"
204
-
205
- def check_answer(guess:str):
206
- global guesses
207
- global answer
208
- guesses.append(guess)
209
- output = ""
210
- for guess in guesses:
211
- output += ("- " + guess + "\n")
212
- output = output[:-1]
213
-
214
- if guess.lower() == answer.lower():
215
- return "Correct!", output
216
- else:
217
- return "Try again!", output
218
-
219
- def main():
220
- print("BEGIN")
221
- word1 = "Black"
222
- word2 = "White"
223
- word3 = "Sun"
224
- global answer
225
- answer = "Moon"
226
- global guesses
227
-
228
- training()
229
-
230
-
231
-
232
-
233
-
234
- if __name__ == "__main__":
235
  main()
 
1
+ import gradio as gr
2
+ import math
3
+ from datasets import load_dataset
4
+ from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
5
+ from transformers import TrainingArguments, Trainer
6
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch.utils.data import DataLoader
10
+ import numpy as np
11
+ import evaluate
12
+ import nltk
13
+ from nltk.corpus import stopwords
14
+ import subprocess
15
+ import sys
16
+ from transformers import T5Tokenizer, DataCollatorForSeq2Seq
17
+ from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
18
+ from transformers import DataCollatorWithPadding, DistilBertTokenizerFast
19
+ from transformers import TrainingArguments
20
+ from transformers import (
21
+ BertModel,
22
+ BertTokenizerFast,
23
+ Trainer,
24
+ EvalPrediction
25
+ )
26
+
27
# Download the "punkt" data at import time; compute_metrics below calls
# nltk.sent_tokenize.
nltk.download("punkt", quiet=True)
# ROUGE metric object shared by compute_metrics.
metric = evaluate.load("rouge")

# Global Parameters
L_RATE = 3e-4  # learning rate
BATCH_SIZE = 8  # per-device training batch size
PER_DEVICE_EVAL_BATCH = 4  # per-device evaluation batch size
WEIGHT_DECAY = 0.01
SAVE_TOTAL_LIM = 3  # passed as save_total_limit
NUM_EPOCHS = 10

# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=L_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH,
    weight_decay=WEIGHT_DECAY,
    save_total_limit=SAVE_TOTAL_LIM,
    num_train_epochs=NUM_EPOCHS,
    # Evaluate on generated sequences so compute_metrics can decode them.
    predict_with_generate=True,
    push_to_hub=False
)

# Module-level tokenizer used by tokenize_function, compute_metrics and the
# preprocessing step inside training().
model_id = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_id)
54
+ # tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
55
+ # metric = evaluate.load("accuracy")
56
+
57
def tokenize_function(examples):
    """Tokenize the ``"stem"`` field of ``examples`` with the module tokenizer.

    Sequences are padded to the tokenizer's maximum length and truncated.
    """
    stems = examples["stem"]
    encoded = tokenizer(stems, padding="max_length", truncation=True)
    return encoded
59
+
60
+
61
+ #Mean Pooling - Take attention mask into account for correct averaging
62
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions the mask marks as real.

    Args:
        model_output: indexable whose first element holds the token embeddings.
        attention_mask: mask tensor; it is given a trailing axis and expanded
            to the embeddings' shape (presumably (batch, seq) -- TODO confirm).

    Returns:
        Embeddings summed over dimension 1 and divided by the mask sum, with
        the divisor clamped to at least 1e-9 so an all-zero mask cannot divide
        by zero.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_counts
66
+
67
+
68
+ # def compute_metrics(eval_pred):
69
+ # logits, labels = eval_pred
70
+ # predictions = np.argmax(logits, axis=-1)
71
+ # metric = evaluate.load("accuracy")
72
+ # return metric.compute(predictions=predictions, references=labels)
73
+
74
def compute_metrics(eval_preds):
    """Compute ROUGE scores for generated predictions against reference labels.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays, as handed to
            the ``compute_metrics`` callback registered in ``finetune``.

    Returns:
        The result of ``metric.compute`` on the decoded predictions and labels.
    """
    preds, labels = eval_preds
    # Predictions may arrive wrapped in a tuple; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so
    # replace it with the pad token in BOTH arrays before decoding.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
89
+
90
+
91
def training():
    """Load the analogy dataset, turn it into seq2seq prompts, and fine-tune.

    Loads ``tomasmcz/word2vec_analogy``, prints a short summary, splits its
    ``train`` split into train/test (30% test), tokenizes each row into a
    prompt/label pair, and hands the tokenized dataset to ``finetune``.

    Returns:
        int: always ``0``.
    """
    dataset_id = "tomasmcz/word2vec_analogy"
    print("GETTING DATASET")
    dataset = load_dataset(dataset_id)

    print(dataset)
    print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
    # NOTE: this message prints the type of the first row twice.
    print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0])} as value.")
    print(f"- Examples look like this: {dataset['train'][0]}")

    dataset = dataset["train"].train_test_split(test_size=0.3)

    # We prefix our tasks with "answer the question"
    prefix = "Please answer this question: "

    def preprocess_function(examples):
        """Build one analogy prompt per row, tokenize it, and attach labels."""
        # One pass over the three word columns instead of three coupled loops.
        inputs = [
            f"{prefix}{word_a} is to {word_b} as {word_c} is to ___."
            for word_a, word_b, word_c in zip(
                examples["word_a"], examples["word_b"], examples["word_c"]
            )
        ]
        model_inputs = tokenizer(inputs, max_length=128, truncation=True)

        # The "labels" are the tokenized fourth word of each analogy.
        labels = tokenizer(text_target=examples["word_d"],
                           max_length=512,
                           truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Map the preprocessing function across our dataset
    tokenized_dataset = dataset.map(preprocess_function, batched=True)

    print("END DATALOADER")

    finetune(tokenized_dataset)

    return 0
157
+
158
+
159
def finetune(dataset):
    """Fine-tune ``google/flan-t5-base`` on a tokenized train/test dataset.

    Args:
        dataset: object indexable by ``"train"`` and ``"test"``; those entries
            are passed to the trainer as the train and eval datasets.

    Returns:
        int: always ``0``. The trained model is not saved by this function.
    """
    model_id = "google/flan-t5-base"
    tokenizer = T5Tokenizer.from_pretrained(model_id)
    model = T5ForConditionalGeneration.from_pretrained(model_id)
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    # Fall back to the CPU when no CUDA device exists instead of raising.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    print("BEGIN FIT")

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()

    return 0
201
+
202
def greet(name):
    """Return the greeting ``"Hello <name>!!"`` for the string ``name``."""
    pieces = ("Hello ", name, "!!")
    return "".join(pieces)
204
+
205
def check_answer(guess: str):
    """Record ``guess`` and report whether it matches the current answer.

    Appends the guess to the module-level ``guesses`` list and compares it,
    case-insensitively, with the module-level ``answer``.

    Args:
        guess: the word submitted by the player.

    Returns:
        tuple[str, str]: ``"Correct!"`` or ``"Try again!"``, followed by every
        guess so far as one ``"- <guess>"`` line each, joined by newlines.
    """
    global guesses
    global answer
    guesses.append(guess)
    # A distinct loop name keeps the `guess` parameter from being rebound
    # before the comparison below.
    output = "\n".join("- " + previous for previous in guesses)

    if guess.lower() == answer.lower():
        return "Correct!", output
    return "Try again!", output
218
+
219
def main():
    """Initialise the module-level game state and run the training pipeline."""
    print("BEGIN")
    global answer
    answer = "Moon"
    # `guesses` is read by check_answer but was never bound anywhere in this
    # module; start it empty so check_answer does not raise NameError.
    global guesses
    guesses = []

    training()
229
+
230
+
231
+
232
+
233
+
234
# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
word_embedding.py CHANGED
@@ -1,4 +1,7 @@
1
  <<<<<<< HEAD
 
 
 
2
  from datasets import load_dataset
3
  import shutil
4
  import json
@@ -615,6 +618,7 @@ def main():
615
 
616
 
617
  if __name__ == "__main__":
 
618
  =======
619
  from datasets import load_dataset
620
  import shutil
@@ -1233,4 +1237,6 @@ def main():
1233
 
1234
  if __name__ == "__main__":
1235
  >>>>>>> 7d5b505 (New in-context model with working UI System)
 
 
1236
  main()
 
1
  <<<<<<< HEAD
2
+ <<<<<<< HEAD
3
+ =======
4
+ >>>>>>> 5058aea (Problems)
5
  from datasets import load_dataset
6
  import shutil
7
  import json
 
618
 
619
 
620
  if __name__ == "__main__":
621
+ <<<<<<< HEAD
622
  =======
623
  from datasets import load_dataset
624
  import shutil
 
1237
 
1238
  if __name__ == "__main__":
1239
  >>>>>>> 7d5b505 (New in-context model with working UI System)
1240
+ =======
1241
+ >>>>>>> 5058aea (Problems)
1242
  main()