from onmt.transforms import register_transform
from .transform import Transform
import unicodedata
import random


@register_transform(name="uppercase")
class UpperCaseTransform(Transform):
    """
    Convert source and target examples to uppercase.

    The uppercased text is decomposed with `unicodedata` (NFD) so that
    combining marks can be dropped, which is needed for some languages
    (e.g. Greek). It is then recomposed (NFC) so that characters whose
    canonical decomposition does not consist of nonspacing marks (e.g.
    Hangul syllables) are returned in composed form rather than being
    left split into their parts.
    One known limitation is that every nonspacing mark (category "Mn")
    is removed from the uppercased strings, even though on a few
    occasions some diacritics should be kept even in the uppercased form.
    """

    def __init__(self, opts):
        super().__init__(opts)

    @classmethod
    def add_options(cls, parser):
        """Add an option for the corpus ratio to apply this transform.

        Registers `--upper_corpus_ratio` (float, default 0.01) in the
        "Transform/Uppercase" argument group of `parser`.
        """

        group = parser.add_argument_group("Transform/Uppercase")
        group.add(
            "--upper_corpus_ratio",
            "-upper_corpus_ratio",
            type=float,
            default=0.01,
            help="Corpus ratio to apply uppercasing.",
        )

    def _parse_opts(self):
        """Copy the uppercasing ratio from the parsed options."""
        self.upper_corpus_ratio = self.opts.upper_corpus_ratio

    @staticmethod
    def _uppercase_tokens(tokens):
        """Return `tokens` uppercased with nonspacing marks removed.

        The tokens are joined with single spaces, processed as one string
        and split on whitespace again, so a token that ends up empty
        (e.g. one made only of combining marks) is dropped from the result.
        """
        decomposed = unicodedata.normalize("NFD", " ".join(tokens).upper())
        stripped = "".join(
            c for c in decomposed if unicodedata.category(c) != "Mn"
        )
        # NFD also splits characters that are not "base + accent" (Hangul
        # syllables, some two-part vowel signs); recompose what survived
        # the stripping so those characters are not left decomposed.
        return unicodedata.normalize("NFC", stripped).split()

    def apply(self, example, is_train=False, stats=None, **kwargs):
        """Convert source and target examples to uppercase.

        Each call uppercases the example with probability
        `upper_corpus_ratio` and otherwise returns it untouched.

        Args:
            example: mapping with a "src" token sequence and, optionally,
                a "tgt" token sequence (a missing or ``None`` target is
                left as is).
            is_train: unused by this transform.
            stats: unused by this transform.
            **kwargs: unused by this transform.

        Returns:
            The same `example` object, modified in place when the
            transform fires.
        """

        # random.random() lies in [0.0, 1.0): with `>=` a ratio of 0 never
        # fires (even on an exact 0.0 draw) and a ratio of 1 always fires.
        if random.random() >= self.upper_corpus_ratio:
            return example

        example["src"] = self._uppercase_tokens(example["src"])

        tgt = example.get("tgt")
        if tgt is not None:
            example["tgt"] = self._uppercase_tokens(tgt)

        return example