ReactSeq / onmt /transforms /uppercase.py
Oopstom's picture
Upload 313 files
c668e80 verified
raw
history blame
1.95 kB
from onmt.transforms import register_transform
from .transform import Transform
import unicodedata
import random
@register_transform(name="uppercase")
class UpperCaseTransform(Transform):
    """Convert source and target examples to uppercase.

    Each example is uppercased with a probability given by the
    ``upper_corpus_ratio`` option; the remaining examples are returned
    untouched.

    This transform uses `unicodedata` to normalize the converted
    uppercase strings as this is needed for some languages (e.g. Greek).
    The uppercased text is decomposed with NFD and every nonspacing mark
    (Unicode category ``Mn``) is dropped.

    One issue is that this removes all diacritics and accents from the
    uppercased strings, even though on a few occasions some diacritics
    should be kept even in the uppercased form.

    NOTE(review): the result is left in decomposed (NFD) form and is not
    recomposed to NFC afterwards, so characters with a canonical
    decomposition that are not purely "base + Mn" stay decomposed --
    confirm this is acceptable for the languages in use.
    """

    def __init__(self, opts):
        """Initialize the transform by forwarding ``opts`` to the base class."""
        super().__init__(opts)

    @classmethod
    def add_options(cls, parser):
        """Add an option for the corpus ratio to apply this transform.

        Args:
            parser: Option parser; must provide ``add_argument_group`` and
                the returned group must provide ``add``.
        """
        group = parser.add_argument_group("Transform/Uppercase")
        group.add(
            "--upper_corpus_ratio",
            "-upper_corpus_ratio",
            type=float,
            default=0.01,
            help="Corpus ratio to apply uppercasing.",
        )

    def _parse_opts(self):
        """Copy ``upper_corpus_ratio`` from ``self.opts`` onto the instance.

        Presumably invoked by the base class during initialization --
        verify against ``Transform``.
        """
        self.upper_corpus_ratio = self.opts.upper_corpus_ratio

    @staticmethod
    def _uppercase_tokens(tokens):
        """Return ``tokens`` uppercased with nonspacing marks removed.

        The tokens are joined with single spaces, processed as one string
        and split on whitespace again, so the returned list is always a
        new list of strings.
        """
        decomposed = unicodedata.normalize("NFD", " ".join(tokens).upper())
        # After NFD, accents are separate code points of category "Mn";
        # dropping them leaves only the base characters.
        stripped = "".join(
            char for char in decomposed if unicodedata.category(char) != "Mn"
        )
        return stripped.split()

    def apply(self, example, is_train=False, stats=None, **kwargs):
        """Convert source and target examples to uppercase.

        Args:
            example: Mapping with a ``"src"`` token sequence and a
                ``"tgt"`` entry that is either a token sequence or
                ``None``. It is modified in place.
            is_train: Unused by this transform.
            stats: Unused by this transform.
            **kwargs: Unused by this transform.

        Returns:
            The same ``example`` object, either unchanged or with its
            ``"src"`` (and ``"tgt"``, when not ``None``) replaced by the
            uppercased, diacritic-free token lists.
        """
        # random.random() is drawn from [0.0, 1.0): using ">=" makes a
        # ratio of 0.0 strictly disable the transform while a ratio of
        # 1.0 still applies it to every example.
        if random.random() >= self.upper_corpus_ratio:
            return example

        example["src"] = self._uppercase_tokens(example["src"])
        if example["tgt"] is not None:
            example["tgt"] = self._uppercase_tokens(example["tgt"])
        return example