ReactSeq / onmt /transforms /sampling.py
Oopstom's picture
Upload 313 files
c668e80 verified
raw
history blame
8.2 kB
"""Transforms relate to hamming distance sampling."""
import random
import numpy as np
from onmt.constants import DefaultTokens
from onmt.transforms import register_transform
from .transform import Transform, ObservableStats
class HammingDistanceSampling(object):
"""Functions related to (negative) Hamming Distance Sampling."""
def _softmax(self, x):
softmax = np.exp(x) / sum(np.exp(x))
return softmax
def _sample_replace(self, vocab, reject):
"""Sample a token from `vocab` other than `reject`."""
token = reject
while token == reject:
token = random.choice(vocab)
return token
def _sample_distance(self, tokens, temperature):
"""Sample number of tokens to corrupt from `tokens`."""
n_tokens = len(tokens)
indices = np.arange(n_tokens)
logits = indices * -1 * temperature
probs = self._softmax(logits)
distance = np.random.choice(indices, p=probs)
return distance
def _sample_position(self, tokens, distance):
n_tokens = len(tokens)
chosen_indices = random.sample(range(n_tokens), k=distance)
return chosen_indices
class HammingDistanceSamplingTransform(Transform, HammingDistanceSampling):
"""Abstract Transform class based on HammingDistanceSampling."""
def _set_seed(self, seed):
"""set seed to ensure reproducibility."""
np.random.seed(seed)
random.seed(seed)
class SwitchOutStats(ObservableStats):
"""Runing statistics for counting tokens being switched out."""
__slots__ = ["changed", "total"]
def __init__(self, changed: int, total: int):
self.changed = changed
self.total = total
def update(self, other: "SwitchOutStats"):
self.changed += other.changed
self.total += other.total
@register_transform(name="switchout")
class SwitchOutTransform(HammingDistanceSamplingTransform):
"""
SwitchOut.
:cite:`DBLP:journals/corr/abs-1808-07512`
"""
def __init__(self, opts):
super().__init__(opts)
@classmethod
def require_vocab(cls):
"""Override this method to inform it need vocab to start."""
return True
@classmethod
def add_options(cls, parser):
"""Avalilable options relate to this Transform."""
group = parser.add_argument_group("Transform/SwitchOut")
group.add(
"-switchout_temperature",
"--switchout_temperature",
type=float,
default=1.0,
help="Sampling temperature for SwitchOut. :math:`\\tau^{-1}`"
" in :cite:`DBLP:journals/corr/abs-1808-07512`. "
"Smaller value makes data more diverse.",
)
def _parse_opts(self):
self.temperature = self.opts.switchout_temperature
def _switchout(self, tokens, vocab, stats=None):
# 1. sample number of tokens to corrupt
n_chosen = self._sample_distance(tokens, self.temperature)
# 2. sample positions to corrput
chosen_indices = self._sample_position(tokens, distance=n_chosen)
# 3. sample corrupted values
for i in chosen_indices:
tokens[i] = self._sample_replace(vocab, reject=tokens[i])
if stats is not None:
stats.update(SwitchOutStats(n_chosen, len(tokens)))
return tokens
def apply(self, example, is_train=False, stats=None, **kwargs):
"""Apply switchout to both src and tgt side tokens."""
if is_train:
example["src"] = self._switchout(
example["src"], self.vocabs["src"].ids_to_tokens, stats
)
example["tgt"] = self._switchout(
example["tgt"], self.vocabs["tgt"].ids_to_tokens, stats
)
return example
def _repr_args(self):
"""Return str represent key arguments for class."""
return "{}={}".format("switchout_temperature", self.temperature)
class TokenDropStats(ObservableStats):
"""Runing statistics for counting tokens being switched out."""
__slots__ = ["dropped", "total"]
def __init__(self, dropped: int, total: int):
self.dropped = dropped
self.total = total
def update(self, other: "TokenDropStats"):
self.dropped += other.dropped
self.total += other.total
@register_transform(name="tokendrop")
class TokenDropTransform(HammingDistanceSamplingTransform):
"""Random drop tokens from sentence."""
def __init__(self, opts):
super().__init__(opts)
@classmethod
def add_options(cls, parser):
"""Avalilable options relate to this Transform."""
group = parser.add_argument_group("Transform/Token_Drop")
group.add(
"-tokendrop_temperature",
"--tokendrop_temperature",
type=float,
default=1.0,
help="Sampling temperature for token deletion.",
)
def _parse_opts(self):
self.temperature = self.opts.tokendrop_temperature
def _token_drop(self, tokens, stats=None):
n_items = len(tokens)
# 1. sample number of tokens to corrupt
n_chosen = self._sample_distance(tokens, self.temperature)
# 2. sample positions to corrput
chosen_indices = self._sample_position(tokens, distance=n_chosen)
# 3. Drop token on chosen position
out = [tok for (i, tok) in enumerate(tokens) if i not in chosen_indices]
if stats is not None:
stats.update(TokenDropStats(n_chosen, n_items))
return out
def apply(self, example, is_train=False, stats=None, **kwargs):
"""Apply token drop to both src and tgt side tokens."""
if is_train:
example["src"] = self._token_drop(example["src"], stats)
example["tgt"] = self._token_drop(example["tgt"], stats)
return example
def _repr_args(self):
"""Return str represent key arguments for class."""
return "{}={}".format("tokendrop_temperature", self.temperature)
class TokenMaskStats(ObservableStats):
"""Runing statistics for counting tokens being switched out."""
__slots__ = ["masked", "total"]
def __init__(self, masked: int, total: int):
self.masked = masked
self.total = total
def update(self, other: "TokenMaskStats"):
self.masked += other.masked
self.total += other.total
@register_transform(name="tokenmask")
class TokenMaskTransform(HammingDistanceSamplingTransform):
"""Random mask tokens from src sentence."""
MASK_TOK = DefaultTokens.MASK
def __init__(self, opts):
super().__init__(opts)
@classmethod
def add_options(cls, parser):
"""Avalilable options relate to this Transform."""
group = parser.add_argument_group("Transform/Token_Mask")
group.add(
"-tokenmask_temperature",
"--tokenmask_temperature",
type=float,
default=1.0,
help="Sampling temperature for token masking.",
)
def _parse_opts(self):
self.temperature = self.opts.tokenmask_temperature
@classmethod
def get_specials(cls, opts):
"""Get special vocabs added by prefix transform."""
return ({cls.MASK_TOK}, set())
def _token_mask(self, tokens, stats=None):
# 1. sample number of tokens to corrupt
n_chosen = self._sample_distance(tokens, self.temperature)
# 2. sample positions to corrput
chosen_indices = self._sample_position(tokens, distance=n_chosen)
# 3. mask word on chosen position
for i in chosen_indices:
tokens[i] = self.MASK_TOK
if stats is not None:
stats.update(TokenDropStats(n_chosen, len(tokens)))
return tokens
def apply(self, example, is_train=False, stats=None, **kwargs):
"""Apply word drop to both src and tgt side tokens."""
if is_train:
example["src"] = self._token_mask(example["src"], stats)
return example
def _repr_args(self):
"""Return str represent key arguments for class."""
return "{}={}".format("tokenmask_temperature", self.temperature)