Spaces:

Oopstom
/

ReactSeq

Sleeping

App Files Files Community

ReactSeq / onmt /transforms /sampling.py

Oopstom

Upload 313 files

c668e80 verified 6 months ago

raw

history blame

8.2 kB

	"""Transforms relate to hamming distance sampling."""
	import random
	import numpy as np
	from onmt.constants import DefaultTokens
	from onmt.transforms import register_transform
	from .transform import Transform, ObservableStats


	class HammingDistanceSampling(object):
	"""Functions related to (negative) Hamming Distance Sampling."""

	def _softmax(self, x):
	softmax = np.exp(x) / sum(np.exp(x))
	return softmax

	def _sample_replace(self, vocab, reject):
	"""Sample a token from `vocab` other than `reject`."""
	token = reject
	while token == reject:
	token = random.choice(vocab)
	return token

	def _sample_distance(self, tokens, temperature):
	"""Sample number of tokens to corrupt from `tokens`."""
	n_tokens = len(tokens)
	indices = np.arange(n_tokens)
	logits = indices * -1 * temperature
	probs = self._softmax(logits)
	distance = np.random.choice(indices, p=probs)
	return distance

	def _sample_position(self, tokens, distance):
	n_tokens = len(tokens)
	chosen_indices = random.sample(range(n_tokens), k=distance)
	return chosen_indices


	class HammingDistanceSamplingTransform(Transform, HammingDistanceSampling):
	"""Abstract Transform class based on HammingDistanceSampling."""

	def _set_seed(self, seed):
	"""set seed to ensure reproducibility."""
	np.random.seed(seed)
	random.seed(seed)


	class SwitchOutStats(ObservableStats):
	"""Runing statistics for counting tokens being switched out."""

	__slots__ = ["changed", "total"]

	def __init__(self, changed: int, total: int):
	self.changed = changed
	self.total = total

	def update(self, other: "SwitchOutStats"):
	self.changed += other.changed
	self.total += other.total


	@register_transform(name="switchout")
	class SwitchOutTransform(HammingDistanceSamplingTransform):
	"""
	SwitchOut.
	:cite:`DBLP:journals/corr/abs-1808-07512`
	"""

	def __init__(self, opts):
	super().__init__(opts)

	@classmethod
	def require_vocab(cls):
	"""Override this method to inform it need vocab to start."""
	return True

	@classmethod
	def add_options(cls, parser):
	"""Avalilable options relate to this Transform."""
	group = parser.add_argument_group("Transform/SwitchOut")
	group.add(
	"-switchout_temperature",
	"--switchout_temperature",
	type=float,
	default=1.0,
	help="Sampling temperature for SwitchOut. :math:`\\tau^{-1}`"
	" in :cite:`DBLP:journals/corr/abs-1808-07512`. "
	"Smaller value makes data more diverse.",
	)

	def _parse_opts(self):
	self.temperature = self.opts.switchout_temperature

	def _switchout(self, tokens, vocab, stats=None):
	# 1. sample number of tokens to corrupt
	n_chosen = self._sample_distance(tokens, self.temperature)
	# 2. sample positions to corrput
	chosen_indices = self._sample_position(tokens, distance=n_chosen)
	# 3. sample corrupted values
	for i in chosen_indices:
	tokens[i] = self._sample_replace(vocab, reject=tokens[i])
	if stats is not None:
	stats.update(SwitchOutStats(n_chosen, len(tokens)))
	return tokens

	def apply(self, example, is_train=False, stats=None, **kwargs):
	"""Apply switchout to both src and tgt side tokens."""
	if is_train:
	example["src"] = self._switchout(
	example["src"], self.vocabs["src"].ids_to_tokens, stats
	)
	example["tgt"] = self._switchout(
	example["tgt"], self.vocabs["tgt"].ids_to_tokens, stats
	)
	return example

	def _repr_args(self):
	"""Return str represent key arguments for class."""
	return "{}={}".format("switchout_temperature", self.temperature)


	class TokenDropStats(ObservableStats):
	"""Runing statistics for counting tokens being switched out."""

	__slots__ = ["dropped", "total"]

	def __init__(self, dropped: int, total: int):
	self.dropped = dropped
	self.total = total

	def update(self, other: "TokenDropStats"):
	self.dropped += other.dropped
	self.total += other.total


	@register_transform(name="tokendrop")
	class TokenDropTransform(HammingDistanceSamplingTransform):
	"""Random drop tokens from sentence."""

	def __init__(self, opts):
	super().__init__(opts)

	@classmethod
	def add_options(cls, parser):
	"""Avalilable options relate to this Transform."""
	group = parser.add_argument_group("Transform/Token_Drop")
	group.add(
	"-tokendrop_temperature",
	"--tokendrop_temperature",
	type=float,
	default=1.0,
	help="Sampling temperature for token deletion.",
	)

	def _parse_opts(self):
	self.temperature = self.opts.tokendrop_temperature

	def _token_drop(self, tokens, stats=None):
	n_items = len(tokens)
	# 1. sample number of tokens to corrupt
	n_chosen = self._sample_distance(tokens, self.temperature)
	# 2. sample positions to corrput
	chosen_indices = self._sample_position(tokens, distance=n_chosen)
	# 3. Drop token on chosen position
	out = [tok for (i, tok) in enumerate(tokens) if i not in chosen_indices]
	if stats is not None:
	stats.update(TokenDropStats(n_chosen, n_items))
	return out

	def apply(self, example, is_train=False, stats=None, **kwargs):
	"""Apply token drop to both src and tgt side tokens."""
	if is_train:
	example["src"] = self._token_drop(example["src"], stats)
	example["tgt"] = self._token_drop(example["tgt"], stats)
	return example

	def _repr_args(self):
	"""Return str represent key arguments for class."""
	return "{}={}".format("tokendrop_temperature", self.temperature)


	class TokenMaskStats(ObservableStats):
	"""Runing statistics for counting tokens being switched out."""

	__slots__ = ["masked", "total"]

	def __init__(self, masked: int, total: int):
	self.masked = masked
	self.total = total

	def update(self, other: "TokenMaskStats"):
	self.masked += other.masked
	self.total += other.total


	@register_transform(name="tokenmask")
	class TokenMaskTransform(HammingDistanceSamplingTransform):
	"""Random mask tokens from src sentence."""

	MASK_TOK = DefaultTokens.MASK

	def __init__(self, opts):
	super().__init__(opts)

	@classmethod
	def add_options(cls, parser):
	"""Avalilable options relate to this Transform."""
	group = parser.add_argument_group("Transform/Token_Mask")
	group.add(
	"-tokenmask_temperature",
	"--tokenmask_temperature",
	type=float,
	default=1.0,
	help="Sampling temperature for token masking.",
	)

	def _parse_opts(self):
	self.temperature = self.opts.tokenmask_temperature

	@classmethod
	def get_specials(cls, opts):
	"""Get special vocabs added by prefix transform."""
	return ({cls.MASK_TOK}, set())

	def _token_mask(self, tokens, stats=None):
	# 1. sample number of tokens to corrupt
	n_chosen = self._sample_distance(tokens, self.temperature)
	# 2. sample positions to corrput
	chosen_indices = self._sample_position(tokens, distance=n_chosen)
	# 3. mask word on chosen position
	for i in chosen_indices:
	tokens[i] = self.MASK_TOK
	if stats is not None:
	stats.update(TokenDropStats(n_chosen, len(tokens)))
	return tokens

	def apply(self, example, is_train=False, stats=None, **kwargs):
	"""Apply word drop to both src and tgt side tokens."""
	if is_train:
	example["src"] = self._token_mask(example["src"], stats)
	return example

	def _repr_args(self):
	"""Return str represent key arguments for class."""
	return "{}={}".format("tokenmask_temperature", self.temperature)