In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from transformers import get_scheduler
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForSeq2Seq
from accelerate import Accelerator
import evaluate
import datasets

from tqdm.auto import tqdm

In [2]:
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
# prep dataset
dataset = load_dataset("tner/mit_restaurant")

In [4]:
ner_tags = {
    "O": 0,
    "B-Rating": 1,
    "I-Rating": 2,
    "B-Amenity": 3,
    "I-Amenity": 4,
    "B-Location": 5,
    "I-Location": 6,
    "B-Restaurant_Name": 7,
    "I-Restaurant_Name": 8,
    "B-Price": 9,
    "B-Hours": 10,
    "I-Hours": 11,
    "B-Dish": 12,
    "I-Dish": 13,
    "B-Cuisine": 14,
    "I-Price": 15,
    "I-Cuisine": 16,
}


label_names = {v: k for k, v in ner_tags.items()}

In [5]:
def decode_tags(tags, words):
    dict_out = {}
    word_ = ""
    for tag, word in zip(tags[::-1], words[::-1]):
        if tag == 0:
            continue
        word_ = word_ + " " + word
        if label_names[tag].startswith("B"):
            tag_name = label_names[tag][2:]
            word_ = word_.strip()
            if tag_name not in dict_out:
                dict_out[tag_name] = [word_]
            else:
                dict_out[tag_name].append(word_)
            word_ = ""
    return dict_out


def format_to_text(decoded):
    text = ""
    for key, value in decoded.items():
        text += f"{key}: {', '.join(value)}\n"
    return text

In [6]:
def generate_t5_data(example):
    decoded = decode_tags(example["tags"], example["tokens"])
    return {"tokens": " ".join(example["tokens"]), "labels": format_to_text(decoded)}

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# the following 2 hyperparameters are task-specific
max_source_length = 512
max_target_length = 128

# encode the inputs
task_prefix = "What is the user intent?"


def tokenize(example):
    tokenized = tokenizer(
        task_prefix + example["tokens"],
        text_target=example["labels"],
        max_length=512,
        truncation=True,
    )
    return tokenized

In [8]:
tokenized_datasets = dataset.map(generate_t5_data)
tokenized_datasets = tokenized_datasets.remove_columns(["tags"])
tokenized_datasets = tokenized_datasets.map(tokenize)

Map:   0%|          | 0/6900 [00:00<?, ? examples/s]

In [9]:
import evaluate

metric = evaluate.load("sacrebleu")

In [10]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [11]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [12]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"T5 test",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
)

In [13]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.evaluate(max_length=512)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Trainer is attempting to log a value of "{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}" for key "task_specific_params" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.


{'eval_loss': 6.675447940826416,
 'eval_bleu': 0.006728795795564811,
 'eval_runtime': 17.5858,
 'eval_samples_per_second': 43.217,
 'eval_steps_per_second': 0.682}

In [15]:
trainer.train()



Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 456.00 MiB (GPU 0; 11.75 GiB total capacity; 10.26 GiB already allocated; 131.12 MiB free; 10.83 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF