diff --git "a/bert/tokenization_utils_base.py" "b/bert/tokenization_utils_base.py" new file mode 100644--- /dev/null +++ "b/bert/tokenization_utils_base.py" @@ -0,0 +1,2317 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Base classes common to both the slow and the fast tokenization classes: + PreTrainedTokenizerBase (host all the user fronting encoding methodes) + Special token mixing (host the special tokens logic) and + BatchEncoding (wrap the dictionnary of output with special method for the Fast tokenizers) +""" + +import copy +import json +import logging +import os +import warnings +from collections import UserDict +from enum import Enum +from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union + +import numpy as np +from tokenizers import AddedToken +from tokenizers import Encoding as EncodingFast + +from .file_utils import ( + add_end_docstrings, + cached_path, + hf_bucket_url, + is_remote_url, + is_tf_available, + is_torch_available, + torch_required, +) + + +if is_tf_available(): + import tensorflow as tf +if is_torch_available(): + import torch + + +logger = logging.getLogger(__name__) + +VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input +LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER + +# Define type aliases and NamedTuples +TextInput = str +PreTokenizedInput = List[str] +EncodedInput = List[int] +TextInputPair = Tuple[str, str] +PreTokenizedInputPair = Tuple[List[str], List[str]] +EncodedInputPair = Tuple[List[int], List[int]] + + +# Slow tokenizers used to be saved in three separated files +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + +# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file +FULL_TOKENIZER_FILE = "tokenizer.json" + + +class ExplicitEnum(Enum): + """ Enum with more explicit error message for missing values. 
+ """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + "%r is not a valid %s, please select one of %s" + % (value, cls.__name__, str(list(cls._value2member_map_.keys()))) + ) + + +class TruncationStrategy(ExplicitEnum): + ONLY_FIRST = "only_first" + ONLY_SECOND = "only_second" + LONGEST_FIRST = "longest_first" + DO_NOT_TRUNCATE = "do_not_truncate" + + +class PaddingStrategy(ExplicitEnum): + LONGEST = "longest" + MAX_LENGTH = "max_length" + DO_NOT_PAD = "do_not_pad" + + +class TensorType(ExplicitEnum): + PYTORCH = "pt" + TENSORFLOW = "tf" + NUMPY = "np" + + +class CharSpan(NamedTuple): + """ Character span in the original string + + Args: + start: index of the first character in the original string + end: index of the character following the last character in the original string + """ + + start: int + end: int + + +class TokenSpan(NamedTuple): + """ Token span in an encoded string (list of tokens) + + Args: + start: index of the first token in the span + end: index of the token following the last token in the span + """ + + start: int + end: int + + +class BatchEncoding(UserDict): + """ BatchEncoding hold the output of the encode and batch_encode methods (tokens, attention_masks, etc). + This class is derived from a python Dictionary and can be used as a dictionnary. + In addition, this class expose utility methods to map from word/char space to token space. + + Args: + data (:obj:`dict`): Dictionary of lists/arrays returned by the encode/batch_encode methods ('input_ids', 'attention_mask'...) + encoding (:obj:`EncodingFast`, :obj:`list(EncodingFast)`, `optional`, defaults to :obj:`None`): + If the tokenizer is a fast tokenizer which outputs additional informations like mapping from word/char space to token space + the `EncodingFast` instance or list of instance (for batches) hold these informations. + tensor_type (:obj:`Union[None, str, TensorType]`, `optional`, defaults to :obj:`None`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TF/Numpy Tensors at initialization + prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to add a batch axis when converting in Tensors (see :obj:`tensor_type` above) + """ + + def __init__( + self, + data: Optional[Dict[str, Any]] = None, + encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, + tensor_type: Union[None, str, TensorType] = None, + prepend_batch_axis: bool = False, + ): + super().__init__(data) + + if isinstance(encoding, EncodingFast): + encoding = [encoding] + + self._encodings = encoding + + self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) + + @property + def is_fast(self): + """ + Indicate if this BatchEncoding was generated from the result of a PreTrainedTokenizerFast + Returns: True if generated from subclasses of PreTrainedTokenizerFast, else otherwise + """ + return self._encodings is not None + + def __getitem__(self, item: Union[int, str]) -> EncodingFast: + """ If the key is a string, get the value of the dict associated to `key` ('input_ids', 'attention_mask'...) 
+ If the key is an integer, get the EncodingFast for batch item with index `key` + """ + if isinstance(item, str): + return self.data[item] + elif self._encodings is not None: + return self._encodings[item] + else: + raise KeyError( + "Indexing with integers (to access backend Encoding for a given batch index) " + "is not available when using Python based tokenizers" + ) + + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + def __getstate__(self): + return {"data": self.data, "encodings": self._encodings} + + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + if "encodings" in state: + self._encodings = state["encodings"] + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + # After this point: + # Extended properties and methods only available for fast (Rust-based) tokenizers + # provided by HuggingFace tokenizers library. + + @property + def encodings(self) -> Optional[List[EncodingFast]]: + """ + Return the list all encoding from the tokenization process + + Returns: List[EncodingFast] or None if input was tokenized through Python (i.e. not fast) tokenizer + """ + return self._encodings + + def tokens(self, batch_index: int = 0) -> List[str]: + if not self._encodings: + raise ValueError("tokens() is not available when using Python based tokenizers") + return self._encodings[batch_index].tokens + + def words(self, batch_index: int = 0) -> List[Optional[int]]: + if not self._encodings: + raise ValueError("words() is not available when using Python based tokenizers") + return self._encodings[batch_index].words + + def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the word corresponding (i.e. comprising) to an encoded token + in a sequence of the batch. + + Can be called as: + + - ``self.token_to_word(token_index)`` if batch size is 1 + - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as + pre-tokenized sequences (i.e. words are defined by the user). In this case it allows + to easily associate encoded tokens with provided tokenized words. + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, + this can be the index of the token in the sequence + token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index + of the token in the sequence. + + Returns: + :obj:`int`: + index of the word in the input sequence. + + """ + + if not self._encodings: + raise ValueError("token_to_word() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_word(token_index) + + def word_to_tokens(self, batch_or_word_index: int, word_index: Optional[int] = None) -> TokenSpan: + """ + Get the encoded token span corresponding to a word in the sequence of the batch. 
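A short sketch of the fast-tokenizer-only helpers above (`tokens()`, `words()`, `token_to_word()`), assuming the `transformers` package and the `bert-base-uncased` checkpoint are available; slow tokenizers raise the ValueError shown in the code instead:

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    encoding = tokenizer("Tokenizers map words to sub-words")

    print(encoding.tokens())          # sub-word tokens, including [CLS] and [SEP]
    print(encoding.words())           # word index for each token, None for special tokens
    print(encoding.token_to_word(2))  # index of the word the third token comes from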
+ + Token spans are returned as a TokenSpan NamedTuple with: + + - start: index of the first token + - end: index of the token following the last token + + Can be called as: + + - ``self.word_to_tokens(word_index)`` if batch size is 1 + - ``self.word_to_tokens(batch_index, word_index)`` if batch size is greater or equal to 1 + + This method is particularly suited when the input sequences are provided as + pre-tokenized sequences (i.e. words are defined by the user). In this case it allows + to easily associate encoded tokens with provided tokenized words. + + Args: + batch_or_word_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, + this can be the index of the word in the sequence + word_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index + of the word in the sequence. + + Returns: + :obj:`TokenSpan`: + Span of tokens in the encoded sequence. + + :obj:`TokenSpan` are NamedTuple with: + + - start: index of the first token + - end: index of the token following the last token + """ + + if not self._encodings: + raise ValueError("word_to_tokens() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if word_index < 0: + word_index = self._seq_len + word_index + return TokenSpan(*(self._encodings[batch_index].word_to_tokens(word_index))) + + def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: + """ + Get the character span corresponding to an encoded token in a sequence of the batch. + + Character spans are returned as a CharSpan NamedTuple with: + + - start: index of the first character in the original string associated to the token + - end: index of the character following the last character in the original string associated to the token + + Can be called as: + + - ``self.token_to_chars(token_index)`` if batch size is 1 + - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1 + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, + this can be the index of the token in the sequence + token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index + of the token or tokens in the sequence. + + Returns: + :obj:`CharSpan`: + Span of characters in the original string. + + :obj:`CharSpan` are NamedTuple with: + + - start: index of the first character in the original string + - end: index of the character following the last character in the original string + """ + + if not self._encodings: + raise ValueError("token_to_chars() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) + + def char_to_token(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: + """ + Get the index of the token in the encoded output comprising a character + in the original string for a sequence of the batch. 
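Continuing the previous sketch (reusing the same fast `tokenizer`), `token_to_chars` and `word_to_tokens` let you map back and forth between tokens, words and the original string:

    text = "Character offsets survive tokenization"
    encoding = tokenizer(text)

    span = encoding.token_to_chars(1)           # CharSpan of the first non-special token
    print(text[span.start:span.end])            # "Character"

    token_span = encoding.word_to_tokens(0)     # TokenSpan covering the first word
    print(encoding.tokens()[token_span.start:token_span.end])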
+ + Can be called as: + + - ``self.char_to_token(char_index)`` if batch size is 1 + - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1 + + This method is particularly suited when the input sequences are provided as + pre-tokenized sequences (i.e. words are defined by the user). In this case it allows + to easily associate encoded tokens with provided tokenized words. + + Args: + batch_or_char_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, + this can be the index of the word in the sequence + char_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index + of the word in the sequence. + + + Returns: + :obj:`int`: Index of the token. + """ + + if not self._encodings: + raise ValueError("char_to_token() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_token(char_index) + + def word_to_chars(self, batch_or_word_index: int, word_index: Optional[int] = None) -> CharSpan: + """ + Get the character span in the original string corresponding to given word in a sequence + of the batch. + + Character spans are returned as a CharSpan NamedTuple with: + + - start: index of the first character in the original string + - end: index of the character following the last character in the original string + + Can be called as: + + - ``self.word_to_chars(word_index)`` if batch size is 1 + - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1 + + Args: + batch_or_word_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, + this can be the index of the word in the sequence + word_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index + of the word in the sequence. + + Returns: + :obj:`CharSpan` or :obj:`List[CharSpan]`: + Span(s) of the associated character or characters in the string. + CharSpan are NamedTuple with: + + - start: index of the first character associated to the token in the original string + - end: index of the character following the last character associated to the token in the original string + """ + + if not self._encodings: + raise ValueError("word_to_chars() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index))) + + def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None) -> int: + """ + Get the word in the original string corresponding to a character in the original string of + a sequence of the batch. + + Can be called as: + + - ``self.char_to_word(char_index)`` if batch size is 1 + - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as + pre-tokenized sequences (i.e. words are defined by the user). In this case it allows + to easily associate encoded tokens with provided tokenized words. + + Args: + batch_or_char_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, + this can be the index of the character in the orginal string. 
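Going the other way, from a character position back to token and word indices (still reusing `text` and `encoding` from the sketch above):

    char_index = text.index("survive")
    token_index = encoding.char_to_token(char_index)   # token covering that character
    word_index = encoding.char_to_word(char_index)     # word covering that character

    char_span = encoding.word_to_chars(word_index)     # CharSpan of that whole word
    print(text[char_span.start:char_span.end])         # "survive"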
+ char_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index + of the character in the orginal string. + + + Returns: + :obj:`int` or :obj:`List[int]`: + Index or indices of the associated encoded token(s). + """ + + if not self._encodings: + raise ValueError("char_to_word() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_word(char_index) + + def convert_to_tensors(self, tensor_type: Union[None, str, TensorType], prepend_batch_axis: bool = False): + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW and is_tf_available(): + as_tensor = tf.constant + elif tensor_type == TensorType.PYTORCH and is_torch_available(): + as_tensor = torch.tensor + elif tensor_type == TensorType.NUMPY: + as_tensor = np.asarray + else: + raise ImportError( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + tensor_type + ) + ) + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if prepend_batch_axis: + value = [value] + + tensor = as_tensor(value) + + # at-least2d + if tensor.ndim > 2: + tensor = tensor.squeeze(0) + elif tensor.ndim < 2: + tensor = tensor[None, :] + + self[key] = tensor + except: # noqa E722 + raise ValueError( + "Unable to create tensor, you should probably activate truncation and/or padding " + "with 'padding=True' 'truncation=True' to have batched tensors with the same length." + ) + + return self + + @torch_required + def to(self, device: str): + """Send all values to device by calling v.to(device)""" + self.data = {k: v.to(device) for k, v in self.data.items()} + return self + + +# class AddedToken(UserString): +# """ AddedToken represents a token to be added to a Tokenizer + +# An AddedToken can have special options defining the way it should behave. + +# Args: +# content: str: +# The content of the token + +# single_word: bool +# Whether this token should only match against single word. If True, +# this token will never match inside of a word. + +# lstrip: bool +# Whether this token should strip all potential whitespaces on the left side. +# If True, this token will greedily match any whitespace on the left and then strip +# them out. + +# rstrip: bool +# Whether this token should strip all potential whitespaces on the right side. +# If True, this token will greedily match any whitespace on the right and then strip +# them out. +# """ + +# def __init__( +# self, data: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False, +# ): +# super().__init__(data) + +# self._single_word = single_word +# self._lstrip = lstrip +# self._rstrip = rstrip + +# def lower(self): +# return AddedToken(self.data.lower(), self._single_word, self._lstrip, self._rstrip) + + +class SpecialTokensMixin: + """ SpecialTokensMixin is derived by ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` and + handles specific behaviors related to special tokens. In particular, this class hold the + attributes which can be used to directly access to these special tokens in a + model-independant manner and allow to set and update the special tokens. 
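A sketch of the tensor-conversion path defined just above, assuming PyTorch is installed (the same fast `tokenizer` is reused):

    import torch

    batch = tokenizer(
        ["first sequence", "a slightly longer second sequence"],
        padding=True,
        return_tensors="pt",
    )
    print(batch["input_ids"].shape)        # torch.Size([2, padded_length])

    if torch.cuda.is_available():
        batch = batch.to("cuda")           # moves every tensor with v.to(device)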
+ """ + + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__(self, verbose=True, **kwargs): + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._pad_token_type_id = 0 + self._additional_special_tokens = [] + self.verbose = verbose + + # We directly set the hidden value to allow initialization with special tokens + # which are not yet in the vocabulary. Necesssary for serialization/de-serialization + # TODO clean this up at some point (probably by sitching to fast tokenizers) + for key, value in kwargs.items(): + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) + setattr(self, key, value) + elif isinstance(value, (str, AddedToken)): + setattr(self, key, value) + else: + raise TypeError( + "special token {} has to be either str or AddedToken but got: {}".format(key, type(value)) + ) + + def sanitize_special_tokens(self) -> int: + """ Make sure that all the special tokens attributes of the tokenizer (tokenizer.mask_token, tokenizer.cls_token, ...) + are in the vocabulary. Add the missing ones to the vocabulary if needed. + + Return: + Number of tokens added in the vocaulary during the operation. + """ + return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) + + def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: + """ + Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them + to class attributes. If special tokens are NOT in the vocabulary, they are added + to it (indexed starting from the last index of the current vocabulary). + + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - special tokens are carefully handled by the tokenizer (they are never split) + - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') + + Args: + special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: + [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, + ``additional_special_tokens``]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. + + Examples:: + + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + + special_tokens_dict = {'cls_token': ''} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
+ + assert tokenizer.cls_token == '' + """ + if not special_tokens_dict: + return 0 + + added_tokens = 0 + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES + + if self.verbose: + logger.info("Assigning %s to the %s key of the tokenizer", value, key) + setattr(self, key, value) + + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, (str, AddedToken)) for t in value + ), f"Tokens {value} for key {key} should all be str or AddedToken instances" + added_tokens += self.add_tokens(value, special_tokens=True) + else: + assert isinstance( + value, (str, AddedToken) + ), f"Token {value} for key {key} should be a str or an AddedToken instance" + added_tokens += self.add_tokens([value], special_tokens=True) + + return added_tokens + + def add_tokens(self, new_tokens: Union[str, AddedToken, List[str], List[AddedToken]], special_tokens=False) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the + vocabulary, they are added to it with indices starting from length of the current vocabulary. + + Args: + new_tokens: string or list of string or :class:`~transformers.AddedToken`. Each string is a token to add. + Tokens are only added if they are not already in the vocabulary. AddedToken wrap a string token to + let you personnalize it's behavior (Whether this token should only match against single word, whether + this token should strip all potential whitespaces on the left side, Whether this token should strip + all potential whitespaces on the right side...). + special_token: can be used to specify if the token is a special token. This mostly change the normalization + behavior (special tokens like CLS or [MASK] are usually not lower-cased for instance) + + See details for :class:`~transformers.AddedToken` in HuggingFace tokenizers library. + + Returns: + Number of tokens added to the vocabulary. + + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + """ + if not new_tokens: + return 0 + + if not isinstance(new_tokens, (list, tuple)): + new_tokens = [new_tokens] + + return self._add_tokens(new_tokens, special_tokens=special_tokens) + + @property + def bos_token(self): + """ Beginning of sentence token (string). Log an error if used while not having been set. """ + if self._bos_token is None and self.verbose: + logger.error("Using bos_token, but it is not set yet.") + return None + return str(self._bos_token) + + @property + def eos_token(self): + """ End of sentence token (string). Log an error if used while not having been set. """ + if self._eos_token is None and self.verbose: + logger.error("Using eos_token, but it is not set yet.") + return None + return str(self._eos_token) + + @property + def unk_token(self): + """ Unknown token (string). Log an error if used while not having been set. """ + if self._unk_token is None and self.verbose: + logger.error("Using unk_token, but it is not set yet.") + return None + return str(self._unk_token) + + @property + def sep_token(self): + """ Separation token (string). E.g. 
separate context and query in an input sequence. Log an error if used while not having been set. """ + if self._sep_token is None and self.verbose: + logger.error("Using sep_token, but it is not set yet.") + return None + return str(self._sep_token) + + @property + def pad_token(self): + """ Padding token (string). Log an error if used while not having been set. """ + if self._pad_token is None and self.verbose: + logger.error("Using pad_token, but it is not set yet.") + return None + return str(self._pad_token) + + @property + def cls_token(self): + """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + if self._cls_token is None and self.verbose: + logger.error("Using cls_token, but it is not set yet.") + return None + return str(self._cls_token) + + @property + def mask_token(self): + """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ + if self._mask_token is None and self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ + if self._additional_special_tokens is None and self.verbose: + logger.error("Using additional_special_tokens, but it is not set yet.") + return None + return [str(tok) for tok in self._additional_special_tokens] + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ + if self._bos_token is None: + return None + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ + if self._eos_token is None: + return None + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self): + """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ + if self._unk_token is None: + return None + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self): + """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ + if self._sep_token is None: + return None + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. 
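How these attribute/id pairs are typically read, assuming the `bert-base-uncased` tokenizer from the earlier sketch (the exact strings and ids depend on the checkpoint):

    print(tokenizer.pad_token, tokenizer.pad_token_id)   # '[PAD]' 0 for bert-base-uncased
    print(tokenizer.sep_token, tokenizer.sep_token_id)   # '[SEP]' 102
    print(tokenizer.bos_token)                           # None: BERT defines no bos_token, so an error is logged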
""" + if self._pad_token is None: + return None + return self.convert_tokens_to_ids(self.pad_token) + + @property + def pad_token_type_id(self): + """ Id of the padding token type in the vocabulary.""" + return self._pad_token_type_id + + @property + def cls_token_id(self): + """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + if self._cls_token is None: + return None + return self.convert_tokens_to_ids(self.cls_token) + + @property + def mask_token_id(self): + """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ + if self._mask_token is None: + return None + return self.convert_tokens_to_ids(self.mask_token) + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.additional_special_tokens) + + @property + def special_tokens_map(self): + """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their + values ('', ''...) + Convert tokens of AddedToken type in string. + All returned tokens are strings + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = str(attr_value) + return set_attr + + @property + def special_tokens_map_extended(self): + """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their + values ('', ''...) + Keep the tokens as AddedToken if they are of this type. + + AddedToken can be used to control more finely how special tokens are tokenized. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self): + """ List all the special tokens ('', ''...) mapped to class attributes + Convert tokens of AddedToken type in string. + All returned tokens are strings + (cls_token, unk_token...). + """ + all_toks = [str(s) for s in self.all_special_tokens_extended] + return all_toks + + @property + def all_special_tokens_extended(self): + """ List all the special tokens ('', ''...) mapped to class attributes + Keep the tokens as AddedToken if they are of this type. + + AddedToken can be used to control more finely how special tokens are tokenized. + """ + all_toks = [] + set_attr = self.special_tokens_map_extended + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """ List the vocabulary indices of the special tokens ('', ''...) mapped to + class attributes (cls_token, unk_token...). + """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + +ENCODE_KWARGS_DOCSTRING = r""" + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + `padding` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`): + Activate and control padding. 
Accepts the following values: + + * `True` or `'longest'`: pad to the longest sequence in the batch (or no padding if only a single sequence if provided), + * `'max_length'`: pad to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`) + * `False` or `'do_not_pad'` (default): No padding (i.e. can output batch with sequences of uneven lengths) + `truncation` (:obj:`Union[bool, str]`, `optional`, defaults to :obj:`False`): + Activate and control truncation. Accepts the following values: + + * `True` or `'longest_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided, + * `'only_first'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided, + * `'only_second'`: truncate to a max length specified in `max_length` or to the max acceptable input length for the model if no length is provided (`max_length=None`). This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided, + * `False` or `'do_not_truncate'` (default): No truncation (i.e. can output batch with sequences length greater than the model max admissible input size) + `max_length` (:obj:`Union[int, None]`, `optional`, defaults to :obj:`None`): + Control the length for padding/truncation. Accepts the following values + + * `None` (default): This will use the predefined model max length if required by one of the truncation/padding parameters. If the model has no specific max input length (e.g. XLNet) truncation/padding to max length is deactivated. + * `any integer value` (e.g. `42`): Use this specific maximum length value if required by one of the truncation/padding parameters. + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned when `return_overflowing_tokens=True` + will contain some tokens from the end of the truncated sequence returned to provide some overlap between truncated and overflow ing sequences. + The value of this argument defines the number of overlapping tokens. + is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Set to True to indicate the input is already tokenized + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, + PyTorch :obj:`torch.Tensor` or Numpy :oj: `np.ndarray` instead of a list of python integers. +""" + +ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? 
<../glossary.html#token-type-ids>`_ + return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`none`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token sequences (default False). + return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). + If using Python's tokenizer, this method will raise NotImplementedError. + This one is only available on fast tokenizers inheriting from PreTrainedTokenizerFast. + **kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + token_type_ids: list[int] if return_token_type_ids is True (default) + attention_mask: list[int] if return_attention_mask is True (default) + overflowing_tokens: list[int] if the tokenizer is a slow tokenize, else a List[List[int]] if a ``max_length`` is specified and ``return_overflowing_tokens=True`` + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` + and return_special_tokens_mask is True + } + + With the fields: + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens sequences if a max length is specified and ``return_overflowing_tokens=True``. + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. +""" + + +class PreTrainedTokenizerBase(SpecialTokensMixin): + """ Base class for slow and fast tokenizers. + + Handle shared (mostly boiler plate) methods for slow and fast tokenizers. + """ + + vocab_files_names: Dict[str, str] = {} + pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} + pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} + max_model_input_sizes: Dict[str, int] = {} + model_input_names: List[str] = ["token_type_ids", "attention_mask"] + + padding_side: str = "right" + + def __init__(self, **kwargs): + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + self.init_kwargs = kwargs + + # For backward compatibility we fallback to set model_max_length from max_len if provided + model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) + self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER + + # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. 
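Putting the padding/truncation and `return_*` arguments documented above together; a sketch assuming the same fast BERT tokenizer as in the earlier examples:

    batch = tokenizer(
        ["short sequence", "a noticeably longer second sequence in the same batch"],
        padding="max_length",       # PaddingStrategy.MAX_LENGTH
        truncation=True,            # TruncationStrategy.LONGEST_FIRST
        max_length=16,
        return_tensors="np",
    )
    print(batch["input_ids"].shape)        # (2, 16)
    print(batch["attention_mask"][0])      # 1 for real tokens, 0 for padding positions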
+ self.padding_side = kwargs.pop("padding_side", self.padding_side) + assert self.padding_side in [ + "right", + "left", + ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" + self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + + super().__init__(**kwargs) + + @property + def max_len(self) -> int: + """ Kept here for backward compatibility. + Now renamed to `model_max_length` to avoid ambiguity. + """ + return self.model_max_length + + @property + def max_len_single_sentence(self) -> int: + return self.model_max_length - self.num_special_tokens_to_add(pair=False) + + @property + def max_len_sentences_pair(self) -> int: + return self.model_max_length - self.num_special_tokens_to_add(pair=True) + + @max_len_single_sentence.setter + def max_len_single_sentence(self, value) -> int: + """ For backward compatibility, allow to try to setup 'max_len_single_sentence' """ + if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose: + logger.warning( + "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." + ) + else: + raise ValueError( + "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." + ) + + @max_len_sentences_pair.setter + def max_len_sentences_pair(self, value) -> int: + """ For backward compatibility, allow to try to setup 'max_len_sentences_pair' """ + if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose: + logger.warning( + "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." + ) + else: + raise ValueError( + "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." + ) + + @classmethod + def from_pretrained(cls, *inputs, **kwargs): + r""" + Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. + + Args: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. + + resume_download: (`optional`) boolean, default False: + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. 
+ + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. + + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details. + + Examples:: + + # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer + + # Download vocabulary from S3 and cache. + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # Download vocabulary from S3 (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') + + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='') + # You should be sure '' is in the vocabulary when doing that. + # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) + assert tokenizer.unk_token == '' + + """ + return cls._from_pretrained(*inputs, **kwargs) + + @classmethod + def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + + s3_models = list(cls.max_model_input_sizes.keys()) + vocab_files = {} + init_configuration = {} + if pretrained_model_name_or_path in s3_models: + # Get the vocabulary from AWS S3 bucket + for file_id, map_list in cls.pretrained_vocab_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + if ( + cls.pretrained_init_configuration + and pretrained_model_name_or_path in cls.pretrained_init_configuration + ): + init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() + else: + # Get the vocabulary from local files + logger.info( + "Model name '{}' not found in model shortcut name list ({}). " + "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( + pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path + ) + ) + + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + if len(cls.vocab_files_names) > 1: + raise ValueError( + "Calling {}.from_pretrained() with the path to a single file or url is not supported." 
+ "Use a model identifier or the path to a directory instead.".format(cls.__name__) + ) + logger.warning( + "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format( + cls.__name__ + ) + ) + file_id = list(cls.vocab_files_names.keys())[0] + vocab_files[file_id] = pretrained_model_name_or_path + else: + # At this point pretrained_model_name_or_path is either a directory or a model identifier name + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + "full_tokenizer_file": FULL_TOKENIZER_FILE, + } + # Look for the tokenizer files + for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): + if os.path.isdir(pretrained_model_name_or_path): + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) + full_file_name = None + else: + full_file_name = hf_bucket_url( + pretrained_model_name_or_path, filename=file_name, use_cdn=False + ) + + vocab_files[file_id] = full_file_name + + # Get files from url, cache, or disk depending on the case + try: + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + else: + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + ) + except EnvironmentError: + if pretrained_model_name_or_path in s3_models: + msg = "Couldn't reach server at '{}' to download vocabulary files." + else: + msg = ( + "Model name '{}' was not found in tokenizers model name list ({}). " + "We assumed '{}' was a path or url to a directory containing vocabulary files " + "named {}, but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) + + raise EnvironmentError(msg) + + if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): + raise EnvironmentError( + "Model name '{}' was not found in tokenizers model name list ({}). " + "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files " + "named {} but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) + + for file_id, file_path in vocab_files.items(): + if file_path == resolved_vocab_files[file_id]: + logger.info("loading file {}".format(file_path)) + else: + logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? 
+ tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + if tokenizer_config_file is not None: + with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: + init_kwargs = json.load(tokenizer_config_handle) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + init_kwargs = init_configuration + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] + if model_max_length is not None and isinstance(model_max_length, (int, float)): + init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + + # Instantiate tokenizer. + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except OSError: + raise OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." + ) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + tokenizer.init_inputs = init_inputs + tokenizer.init_kwargs = init_kwargs + + # If there is a complementary special token map, load it + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) + if special_tokens_map_file is not None: + with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + + for key, value in special_tokens_map.items(): + if isinstance(value, dict): + value = AddedToken(**value) + setattr(tokenizer, key, value) + + # Add supplementary tokens. + special_tokens = tokenizer.all_special_tokens + if added_tokens_file is not None: + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + + # Sort added tokens by index + added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1])) + + for token, index in added_tok_encoder_sorted: + assert index == len(tokenizer), ( + f"Non-consecutive added token '{token}' found. " + f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." + ) + tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens)) + + # Check all our special tokens are registrered as "no split" token (we don't cut them) and are in the vocab + added_tokens = tokenizer.sanitize_special_tokens() + if added_tokens: + logger.warning( + "Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained." + ) + + return tokenizer + + def save_pretrained(self, save_directory) -> Tuple[str]: + """ Save the tokenizer vocabulary files together with: + - added tokens, + - special-tokens-to-class-attributes-mapping, + - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). + + Warning: This won't save modifications you may have applied to the tokenizer after the instantiation + (e.g. 
modifying tokenizer.do_lower_case after creation). + + This method make sure the full tokenizer can then be re-loaded using the + :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + if os.path.isfile(save_directory): + logger.error("Provided path ({}) should be a directory, not a file".format(save_directory)) + return + os.makedirs(save_directory, exist_ok=True) + + special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) + added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) + tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + if len(self.init_inputs) > 0: + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + with open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + + with open(special_tokens_map_file, "w", encoding="utf-8") as f: + write_dict = {} + for key, value in self.special_tokens_map_extended.items(): + if isinstance(value, AddedToken): + write_dict[key] = value.__getstate__() + else: + write_dict[key] = value + f.write(json.dumps(write_dict, ensure_ascii=False)) + + added_vocab = self.get_added_vocab() + if added_vocab: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(added_vocab, ensure_ascii=False) + f.write(out_str) + + vocab_files = self.save_vocabulary(save_directory) + + return vocab_files + (special_tokens_map_file, added_tokens_file) + + @add_end_docstrings( + ENCODE_KWARGS_DOCSTRING, + """ + **kwargs: passed to the `self.tokenize()` method. + """, + ) + def encode( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + stride: int = 0, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs + ): + """ + Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + + Args: + text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. 
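A sketch of the save/re-load round trip described above, reusing the earlier tokenizer (the target directory is illustrative):

    saved_files = tokenizer.save_pretrained("./my_tokenizer")   # hypothetical directory
    print(saved_files)                                          # tuple of file paths that were written

    reloaded = BertTokenizerFast.from_pretrained("./my_tokenizer")
    assert reloaded.encode("round trip") == tokenizer.encode("round trip")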
This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + """ + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + raise NotImplementedError + + def _get_padding_truncation_strategies( + self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs + ): + """ Find the correct padding/truncation strategy with backward compatibility + for old arguments (truncation_strategy and pad_to_max_length) and behaviors. + """ + old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") + old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) + + # Backward compatibility for previous behavior, maybe we should deprecate it: + # If you only set max_length, it activates truncation for max_length + if max_length is not None and padding is False and truncation is False: + if verbose: + logger.warning( + "Truncation was not explicitely activated but `max_length` is provided a specific value, " + "please use `truncation=True` to explicitely truncate examples to max length. " + "Defaulting to 'longest_first' truncation strategy. " + "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy " + "more precisely by providing a specific strategy to `truncation`." + ) + truncation = "longest_first" + + # Get padding strategy + if padding is False and old_pad_to_max_length: + if verbose: + warnings.warn( + "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " + "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " + "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " + "maximal input size of the model (e.g. 512 for Bert).", + DeprecationWarning, + ) + if max_length is None: + padding_strategy = PaddingStrategy.LONGEST + else: + padding_strategy = PaddingStrategy.MAX_LENGTH + elif padding is not False: + if padding is True: + padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch + elif not isinstance(padding, PaddingStrategy): + padding_strategy = PaddingStrategy(padding) + else: + padding_strategy = PaddingStrategy.DO_NOT_PAD + + # Get truncation strategy + if truncation is False and old_truncation_strategy != "do_not_truncate": + if verbose: + warnings.warn( + "The `truncation_strategy` argument is deprecated and will be removed in a future version, " + "use `truncation=True` to truncate examples to a max length. You can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the " + "maximal input size of the model (e.g. 512 for Bert). 
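A sketch of `encode` and of the backward-compatibility path above: in this version, passing only `max_length` (without `truncation`) logs a warning and falls back to `'longest_first'` truncation (same tokenizer as before):

    ids = tokenizer.encode("encode returns plain token ids")
    print(ids)                     # list of ints, including the [CLS]/[SEP] ids

    # Only max_length given: a warning is logged and 'longest_first' truncation is applied.
    ids = tokenizer.encode("a sentence that is clearly too long " * 20, max_length=10)
    print(len(ids))                # 10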
" + " If you have pairs of inputs, you can give a specific truncation strategy selected among " + "`truncation='only_first'` (will only truncate the first sentence in the pairs) " + "`truncation='only_second'` (will only truncate the second sentence in the pairs) " + "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).", + DeprecationWarning, + ) + truncation_strategy = TruncationStrategy(old_truncation_strategy) + elif truncation is not False: + if truncation is True: + truncation_strategy = ( + TruncationStrategy.LONGEST_FIRST + ) # Default to truncate the longest sequences in pairs of inputs + elif not isinstance(truncation, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation) + else: + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + + # Set max length if needed + if max_length is None: + if padding_strategy == PaddingStrategy.MAX_LENGTH: + if self.model_max_length > LARGE_INTEGER: + if verbose: + logger.warning( + "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. " + "Default to no padding." + ) + padding_strategy = PaddingStrategy.DO_NOT_PAD + else: + max_length = self.model_max_length + + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: + if self.model_max_length > LARGE_INTEGER: + if verbose: + logger.warning( + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. " + "Default to no truncation." + ) + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + else: + max_length = self.model_max_length + + # Test if we have a padding token + if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0): + raise ValueError( + "Asking to pad but the tokenizer does not have a padding token. " + "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " + "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." + ) + + # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided + if ( + truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE + and padding_strategy != PaddingStrategy.DO_NOT_PAD + and pad_to_multiple_of is not None + and max_length is not None + and (max_length % pad_to_multiple_of != 0) + ): + raise ValueError( + f"Truncation and padding are both activated but " + f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." 
+ ) + + return padding_strategy, truncation_strategy, max_length, kwargs + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]``): + The sequence or batch of sequences to be encoded. + Each sequence can be a string or a list of strings (pre-tokenized string). + If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True` + (to lift the ambiguity with a batch of sequences) + text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]``): + The sequence or batch of sequences to be encoded. + Each sequence can be a string or a list of strings (pre-tokenized string). + If the sequences are provided as list of strings (pretokenized), you must set `is_pretokenized=True` + (to lift the ambiguity with a batch of sequences) + """ + # Input type checking for clearer error + assert isinstance(text, str) or ( + isinstance(text, (list, tuple)) + and ( + len(text) == 0 + or ( + isinstance(text[0], str) + or (isinstance(text[0], (list, tuple)) and (len(text[0]) == 0 or isinstance(text[0][0], str))) + ) + ) + ), ( + "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + assert ( + text_pair is None + or isinstance(text_pair, str) + or ( + isinstance(text_pair, (list, tuple)) + and ( + len(text_pair) == 0 + or ( + isinstance(text_pair[0], str) + or ( + isinstance(text_pair[0], (list, tuple)) + and (len(text_pair[0]) == 0 or isinstance(text_pair[0][0], str)) + ) + ) + ) + ) + ), ( + "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." 
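+            # Illustrative call patterns accepted here (a sketch; `tokenizer` stands for an
+            # instantiated concrete subclass, e.g. one loaded via `from_pretrained`):
+            #   tokenizer("Hello world")                                            # single sequence
+            #   tokenizer("Hello world", "How are you?")                            # sequence pair
+            #   tokenizer(["Hello world", "Goodbye"])                               # batch of sequences
+            #   tokenizer(["Hello", "world"], is_pretokenized=True)                 # one pre-tokenized sequence
+            #   tokenizer([["Hello", "world"], ["Goodbye"]], is_pretokenized=True)  # batch, pre-tokenized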
+ ) + + is_batched = bool( + (not is_pretokenized and isinstance(text, (list, tuple))) + or (is_pretokenized and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))) + ) + + if is_batched: + batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text + return self.batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_pretokenized=is_pretokenized, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_pretokenized=is_pretokenized, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the later only for not-fast tokenizers)): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. 
This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_pretokenized=is_pretokenized, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + raise NotImplementedError + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding: Union[bool, str] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, + :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, + and for not-fast tokenizers, also: + :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`): + Batch of sequences or pair of sequences to be encoded. 
+ This can be a list of string/string-sequences/int-sequences or a list of pair of + string/string-sequences/int-sequence (see details in encode_plus) + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_pretokenized=is_pretokenized, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_pretokenized: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + raise NotImplementedError + + def pad( + self, + encoded_inputs: Union[ + BatchEncoding, + List[BatchEncoding], + Dict[str, EncodedInput], + Dict[str, List[EncodedInput]], + List[Dict[str, EncodedInput]], + ], + padding: Union[bool, str] = True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + ) -> BatchEncoding: + """ Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch. + + Padding side (left/right) padding token ids are defined at the tokenizer level + (with ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``) + + Args: + encoded_inputs: Dictionary of tokenized inputs (`Dict[str, List[int]]`) or batch of tokenized inputs. + Batch of tokenized inputs can be given as dicts of lists or lists of dicts, both work so you can + use ``tokenizer.pad()`` during pre-processing as well as in a PyTorch Dataloader collate function. + (`Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`). + padding: Boolean or specific strategy to use for padding. 
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: + - 'longest' (or `True`, the default): Pad to the longest sequence in the batch + - 'max_length': Pad to a maximum length specified with `max_length` (or to the model's maximum input length if `max_length` is not provided) + - 'do_not_pad' (or `False`): Do not pad + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + pad_to_multiple_of: (optional) Integer; if set, will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf', 'pt' or 'np' to return respectively TensorFlow :obj:`tf.constant`, + PyTorch :obj:`torch.Tensor` or Numpy :obj:`np.ndarray` instead of a list of Python integers. + verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): + Set to ``False`` to avoid printing info messages and warnings. + """ + # If we have a list of dicts, let's convert it into a dict of lists + if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)): + encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} + + assert "input_ids" in encoded_inputs, ( + "You should supply an encoding or a list of encodings to this method. " + "An encoding is the output of one of the encoding methods of the tokenizer, i.e. " + "__call__/encode_plus/batch_encode_plus. " + ) + + if not encoded_inputs["input_ids"]: + if return_attention_mask: + encoded_inputs["attention_mask"] = [] + return encoded_inputs + + # Convert the padding argument to a PaddingStrategy + padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( + padding=padding, max_length=max_length, verbose=verbose + ) + + if encoded_inputs["input_ids"] and not isinstance(encoded_inputs["input_ids"][0], (list, tuple)): + encoded_inputs = self._pad( + encoded_inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + return BatchEncoding(encoded_inputs, tensor_type=return_tensors) + + batch_size = len(encoded_inputs["input_ids"]) + assert all( + len(v) == batch_size for v in encoded_inputs.values() + ), "Some items in the output dictionary have a different batch size than others."
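+        # Illustrative example of the padding behaviour (a sketch, assuming a concrete tokenizer
+        # with `padding_side="right"` and `pad_token_id=0`; real ids depend on the vocabulary):
+        #
+        #   batch = [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}]
+        #   tokenizer.pad(batch, padding=True)
+        #   # -> BatchEncoding({"input_ids": [[1, 2, 3], [4, 5, 0]],
+        #   #                   "attention_mask": [[1, 1, 1], [1, 1, 0]]})
+        #
+        # Because it accepts a list of dicts, `tokenizer.pad` can also serve as (part of) a
+        # DataLoader collate function for dynamic padding.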
+ + if padding_strategy == PaddingStrategy.LONGEST: + max_length = max(len(inputs) for inputs in encoded_inputs["input_ids"]) + padding_strategy = PaddingStrategy.MAX_LENGTH + + batch_outputs = {} + for i in range(batch_size): + inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) + outputs = self._pad( + inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + return BatchEncoding(batch_outputs, tensor_type=return_tensors) + + def create_token_type_ids_from_sequences(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List[int]: + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def build_inputs_with_special_tokens(self, token_ids_0: List, token_ids_1: Optional[List] = None) -> List: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. This implementation does not add special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + return token_ids_0 + token_ids_1 + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def prepare_for_model( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str] = False, + truncation: Union[bool, str] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs + ) -> BatchEncoding: + """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. + It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens + + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + """ + + if "return_lengths" in kwargs: + if verbose: + warnings.warn( + "The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. 
" + "Please use `return_length` instead.", + FutureWarning, + ) + return_length = kwargs["return_lengths"] + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + # Load from model defaults + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + # Compute the total size of the returned encodings + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + # Check lengths + if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose: + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + "for this model ({} > {}). Running this sequence through the model will result in " + "indexing errors".format(len(ids), self.model_max_length) + ) + + # Padding + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + + batch_outputs = BatchEncoding( + encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis + ) + + return batch_outputs + + def truncate_sequences( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + num_tokens_to_remove: int = 0, + truncation_strategy: Union[str, TruncationStrategy] = "longest_first", + stride: int = 0, + ) -> Tuple[List[int], List[int], List[int]]: + """ Truncates a sequence pair in place to the maximum length. + + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. 
+ pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): + number of tokens to remove using the truncation strategy + truncation_strategy (:obj:`string`, `optional`, defaults to "longest_first"): + String selected in the following options: + + - 'longest_first' (default): Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate' + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if not isinstance(truncation_strategy, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation_strategy) + + overflowing_tokens = [] + if truncation_strategy == TruncationStrategy.LONGEST_FIRST: + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + if not overflowing_tokens: + window_len = min(len(ids), stride + 1) + else: + window_len = 1 + overflowing_tokens.extend(ids[-window_len:]) + ids = ids[:-1] + else: + if not overflowing_tokens: + window_len = min(len(pair_ids), stride + 1) + else: + window_len = 1 + overflowing_tokens.extend(pair_ids[-window_len:]) + pair_ids = pair_ids[:-1] + elif truncation_strategy == TruncationStrategy.ONLY_FIRST: + if len(ids) > num_tokens_to_remove: + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input" + f"but the first sequence has a length {len(ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + f"for instance 'longest_first' or 'only_second'." + ) + elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: + if len(pair_ids) > num_tokens_to_remove: + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input" + f"but the second sequence has a length {len(pair_ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + f"for instance 'longest_first' or 'only_first'." + ) + + return (ids, pair_ids, overflowing_tokens) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ Pad encoded inputs (on left/right and up to predefined legnth or max length in the batch) + + Args: + encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). 
+ max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(encoded_inputs["input_ids"]) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = ( + padding_strategy != PaddingStrategy.DO_NOT_PAD and len(encoded_inputs["input_ids"]) != max_length + ) + + if needs_to_be_padded: + difference = max_length - len(encoded_inputs["input_ids"]) + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + else: + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + + return encoded_inputs + + def batch_decode(self, sequences: List[List[int]], **kwargs) -> List[str]: + return [self.decode(seq, **kwargs) for seq in sequences] + + def decode( + self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True + ) -> str: + """ + Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary + with options to remove special tokens and clean up tokenization spaces. + Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. + + Args: + token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods. 
+ skip_special_tokens: if set to True, will remove special tokens. + clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces. + """ + raise NotImplementedError + + def get_special_tokens_mask( + self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formatted with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + assert already_has_special_tokens and token_ids_1 is None, ( + "You cannot use ``already_has_special_tokens=False`` with this tokenizer. " + "Please use a slow (full python) tokenizer to activate this argument. " + "Or set `return_special_tokens_mask=True` when calling the encoding method " + "to get the special tokens mask in any tokenizer. " + ) + + all_special_ids = self.all_special_ids # cache the property + + special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0] + + return special_tokens_mask + + @staticmethod + def clean_up_tokenization(out_string: str) -> str: + """ Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms. + """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string
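+    # End-to-end illustration of the encode/decode round trip (a sketch, assuming the standard
+    # `transformers` package and a concrete subclass such as BertTokenizer; the decoded strings
+    # shown are indicative for an uncased BERT vocabulary):
+    #
+    #   from transformers import BertTokenizer
+    #   tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    #   enc = tokenizer("Hello world!")
+    #   tokenizer.decode(enc["input_ids"])                            # "[CLS] hello world! [SEP]"
+    #   tokenizer.decode(enc["input_ids"], skip_special_tokens=True)  # "hello world!"
+    #
+    #   # clean_up_tokenization removes detached punctuation/contraction artifacts:
+    #   tokenizer.clean_up_tokenization("Do n't stop , please .")
+    #   # -> "Don't stop, please."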