I’m using boilerplate code to train a RoBERTa model on my text corpus.
Everything seems to go okay until I try to load the pretrained tokenizer into the pipeline. I think the pipeline is looking for a config.json file in the tokenizer folder, but the BPE tokenizer only writes out vocab.json and merges.txt. What am I missing here?
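For reference, a quick way to confirm what the training run actually wrote into the tokenizer folder (a minimal sketch using the same path as below; in my case it shows only the two BPE files):

import os

tokenizer_folder = "./gdrive/MyDrive/nlp-chart/chart_bpe_tokenizer/"
# In my run this lists only ['merges.txt', 'vocab.json'] -- there is no
# config.json or tokenizer_config.json for the pipeline to find.
print(sorted(os.listdir(tokenizer_folder)))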
! pip install transformers tokenizers --quiet
from google.colab import drive
drive.mount('/content/gdrive')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
vocab_size = 50000
tokenizer_folder = "./gdrive/MyDrive/nlp-chart/chart_bpe_tokenizer/"
model_folder = './gdrive/MyDrive/nlp-chart/roberta_mlm_2_6_2022/'
%%time
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import RobertaProcessing
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=True)
# Customize training
tokenizer.train(
    files='./gdrive/MyDrive/nlp-chart/train charts.txt',
    vocab_size=vocab_size,
    min_frequency=5,
    show_progress=True,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save the tokenizer to disk
tokenizer.save_model(tokenizer_folder)
CPU times: user 4min 39s, sys: 5.39 s, total: 4min 44s
Wall time: 1min 15s
tokenizer = ByteLevelBPETokenizer(
    tokenizer_folder + 'vocab.json',
    tokenizer_folder + 'merges.txt',
)
tokenizer._tokenizer.post_processor = RobertaProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
tokenizer.encode("55 yo male with congestive heart failure").tokens
['<s>',
'55',
'Ġyo',
'Ġmale',
'Ġwith',
'Ġcongestive',
'Ġheart',
'Ġfailure',
'</s>']
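To double-check that the RobertaProcessing post-processor is wiring up the special tokens, the ids can be inspected alongside the tokens (a small sketch; enc is just a local name I'm introducing here):

# <s> and </s> were the first and third special tokens passed to train(),
# so they should come out as ids 0 and 2, matching the default
# bos_token_id/eos_token_id of the RobertaConfig further down.
enc = tokenizer.encode("55 yo male with congestive heart failure")
print(enc.tokens)
print(enc.ids)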
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=512)
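As an aside, since RobertaTokenizerFast happily loads the vocab.json/merges.txt pair here, one option I'm wondering about (not something I've confirmed fixes the pipeline error) is re-saving it with save_pretrained, which as I understand it also writes the extra tokenizer files into the folder:

# Assumption: save_pretrained() adds tokenizer_config.json,
# special_tokens_map.json and tokenizer.json alongside the existing
# vocab.json and merges.txt.
tokenizer.save_pretrained(tokenizer_folder)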
from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size=vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config)
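A quick sanity check on the size of the freshly initialized model (just an illustrative print; num_parameters() is a standard transformers model method):

# 6-layer RoBERTa with a 50k vocab; the exact count depends on the config above.
print(f"{model.num_parameters():,} parameters")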
%%time
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='./gdrive/MyDrive/nlp-chart/train charts.txt',
    block_size=256,
)
/usr/local/lib/python3.7/dist-packages/transformers/data/datasets/language_modeling.py:125: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_mlm.py
FutureWarning,
CPU times: user 6min 37s, sys: 15.7 s, total: 6min 52s
Wall time: 1min 55s
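To get a feel for what the dataset holds before training, its length and one example can be inspected (a sketch; in this transformers version each item should be a dict with an input_ids tensor, though the exact return format has changed across releases):

print(len(dataset))   # number of non-empty lines that got tokenized
print(dataset[0])     # e.g. {'input_ids': tensor([...])} in this version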
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir=model_folder,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=1000,
    save_total_limit=1,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
%%time
trainer.train()
/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
FutureWarning,
***** Running training *****
Num examples = 28502
Num Epochs = 1
Instantaneous batch size per device = 32
Total train batch size (w. parallel, distributed & accumulation) = 32
Gradient Accumulation steps = 1
Total optimization steps = 891
[891/891 10:03, Epoch 1/1]
Step Training Loss
500 5.730000
Training completed. Do not forget to share your model on huggingface.co/models =)
CPU times: user 10min 4s, sys: 2.39 s, total: 10min 6s
Wall time: 10min 4s
TrainOutput(global_step=891, training_loss=5.301820149191569, metrics={'train_runtime': 604.2323, 'train_samples_per_second': 47.171, 'train_steps_per_second': 1.475, 'total_flos': 1889981764288512.0, 'train_loss': 5.301820149191569, 'epoch': 1.0})
from transformers import pipeline
fill_mask = pipeline(
    "fill-mask",
    model=model_folder + 'checkpoint-600',
    tokenizer=tokenizer_folder,
)
loading configuration file ./gdrive/MyDrive/nlp-chart/roberta_mlm_2_6_2022/checkpoint-600/config.json
Model config RobertaConfig {
"_name_or_path": "./gdrive/MyDrive/nlp-chart/roberta_mlm_2_6_2022/checkpoint-600",
"architectures": [
"RobertaForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"bos_token_id": 0,
"classifier_dropout": null,
"eos_token_id": 2,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 514,
"model_type": "roberta",
"num_attention_heads": 12,
"num_hidden_layers": 6,
"pad_token_id": 1,
"position_embedding_type": "absolute",
"torch_dtype": "float32",
"transformers_version": "4.16.2",
"type_vocab_size": 1,
"use_cache": true,
"vocab_size": 50000
}
loading weights file ./gdrive/MyDrive/nlp-chart/roberta_mlm_2_6_2022/checkpoint-600/pytorch_model.bin
All model checkpoint weights were used when initializing RobertaForMaskedLM.
All the weights of RobertaForMaskedLM were initialized from the model checkpoint at ./gdrive/MyDrive/nlp-chart/roberta_mlm_2_6_2022/checkpoint-600.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaForMaskedLM for predictions without further training.
Could not locate the tokenizer configuration file, will try to use the model config instead.
---------------------------------------------------------------------------
IsADirectoryError Traceback (most recent call last)
<ipython-input-14-63ae434eea1a> in <module>()
4 "fill-mask",
5 model= model_folder+'checkpoint-600',
----> 6 tokenizer= tokenizer_folder
7 )
/usr/local/lib/python3.7/dist-packages/transformers/pipelines/__init__.py in pipeline(task, model, config, tokenizer, feature_extractor, framework, revision, use_fast, use_auth_token, model_kwargs, pipeline_class, **kwargs)
589
590 tokenizer = AutoTokenizer.from_pretrained(
--> 591 tokenizer_identifier, revision=revision, use_fast=use_fast, _from_pipeline=task, **tokenizer_kwargs
592 )
593
/usr/local/lib/python3.7/dist-packages/transformers/models/auto/tokenization_auto.py in from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
469 if not isinstance(config, PretrainedConfig):
470 config = AutoConfig.from_pretrained(
--> 471 pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
472 )
473 config_tokenizer_class = config.tokenizer_class
/usr/local/lib/python3.7/dist-packages/transformers/models/auto/configuration_auto.py in from_pretrained(cls, pretrained_model_name_or_path, **kwargs)
610 kwargs["name_or_path"] = pretrained_model_name_or_path
611 trust_remote_code = kwargs.pop("trust_remote_code", False)
--> 612 config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs)
613 if "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"]:
614 if not trust_remote_code:
/usr/local/lib/python3.7/dist-packages/transformers/configuration_utils.py in get_config_dict(cls, pretrained_model_name_or_path, **kwargs)
535 original_kwargs = copy.deepcopy(kwargs)
536 # Get config dict associated with the base config file
--> 537 config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
538
539 # That config file may point us toward another config file to use.
/usr/local/lib/python3.7/dist-packages/transformers/configuration_utils.py in _get_config_dict(cls, pretrained_model_name_or_path, **kwargs)
633 try:
634 # Load config dict
--> 635 config_dict = cls._dict_from_json_file(resolved_config_file)
636 except (json.JSONDecodeError, UnicodeDecodeError):
637 raise EnvironmentError(
/usr/local/lib/python3.7/dist-packages/transformers/configuration_utils.py in _dict_from_json_file(cls, json_file)
702 @classmethod
703 def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
--> 704 with open(json_file, "r", encoding="utf-8") as reader:
705 text = reader.read()
706 return json.loads(text)
IsADirectoryError: [Errno 21] Is a directory: './gdrive/MyDrive/nlp-chart/chart_bpe_tokenizer/config.json'
result = fill_mask("congestive <mask> failure")
result