|
from onmt.transforms import register_transform |
|
from .transform import Transform |
|
import unicodedata |
|
import random |
|
|
|
|
|
@register_transform(name="uppercase")
class UpperCaseTransform(Transform):
    """Convert source and target examples to uppercase.

    The uppercased text is decomposed with `unicodedata` (NFD) and every
    nonspacing mark (Unicode category ``Mn``) is dropped, as this is needed
    for some languages (e.g. Greek). The result is then recomposed (NFC) so
    that characters which merely pass through the decomposition step, such
    as Hangul syllables, are not left in decomposed form.

    One known limitation is that this removes all diacritics and accents
    from the uppercased strings, even though on a few occasions some
    diacritics should be kept in the uppercased form.
    """

    def __init__(self, opts):
        """Initialise the transform by delegating to the base class."""
        super().__init__(opts)

    @classmethod
    def add_options(cls, parser):
        """Add an option for the corpus ratio to apply this transform."""

        group = parser.add_argument_group("Transform/Uppercase")
        group.add(
            "--upper_corpus_ratio",
            "-upper_corpus_ratio",
            type=float,
            default=0.01,
            help="Corpus ratio to apply uppercasing.",
        )

    def _parse_opts(self):
        """Copy the `upper_corpus_ratio` option onto the instance."""
        self.upper_corpus_ratio = self.opts.upper_corpus_ratio

    @staticmethod
    def _uppercase_tokens(tokens):
        """Return `tokens` uppercased with nonspacing marks removed.

        Args:
            tokens: iterable of string tokens.

        Returns:
            A new list of tokens, obtained by joining the input on single
            spaces, uppercasing, stripping ``Mn`` characters, recomposing
            to NFC and splitting on whitespace again.
        """
        text = " ".join(tokens).upper()
        stripped = "".join(
            char
            for char in unicodedata.normalize("NFD", text)
            if unicodedata.category(char) != "Mn"
        )
        # NFD also splits characters that carry no nonspacing mark at all
        # (e.g. Hangul syllables into jamo); recompose so that only the
        # accents are lost and everything else keeps its composed form.
        return unicodedata.normalize("NFC", stripped).split()

    def apply(self, example, is_train=False, stats=None, **kwargs):
        """Convert source and target examples to uppercase.

        The example is returned untouched whenever a fresh draw from
        `random.random()` exceeds `self.upper_corpus_ratio`; otherwise its
        ``"src"`` tokens, and its ``"tgt"`` tokens when they are not None,
        are replaced in place by their uppercased form.

        Args:
            example: mapping holding a ``"src"`` token list and a ``"tgt"``
                entry that is either a token list or None.
            is_train: unused by this transform.
            stats: unused by this transform.
            **kwargs: unused by this transform.

        Returns:
            The same `example` object, possibly modified.
        """
        if random.random() > self.upper_corpus_ratio:
            return example

        example["src"] = self._uppercase_tokens(example["src"])

        if example["tgt"] is not None:
            example["tgt"] = self._uppercase_tokens(example["tgt"])

        return example
|
|