"""Position feed-forward network from "Attention is All You Need".""" |
|
|
|
|
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
from torch.utils.checkpoint import checkpoint |
|
from onmt.modules.rmsnorm import RMSNorm |
|
from torch.nn.utils import skip_init |
|
import torch.distributed as dist |
|
|
|
|
|
class ActivationFunction(object): |
|
relu = "relu" |
|
gelu = "gelu" |
|
silu = "silu" |
|
gated_gelu = "gated-gelu" |
|
|
|
|
|
|
|
ACTIVATION_FUNCTIONS = { |
|
ActivationFunction.relu: F.relu, |
|
ActivationFunction.gelu: F.gelu, |
|
ActivationFunction.silu: F.silu, |
|
ActivationFunction.gated_gelu: F.gelu, |
|
} |
|
|
|
|
|
class PositionwiseFeedForward(nn.Module):
    """A two-layer Feed-Forward Network with residual layer norm.

    Args:
        d_model (int): size of the input (and output) of the FFN.
        d_ff (int): hidden size of the FFN (output size of the first layer,
            input size of the second).
        dropout (float): dropout probability in :math:`[0, 1)`.
        activation_fn (ActivationFunction): activation function used.
        add_ffnbias (bool): whether the linear layers use a bias term.
        parallel_residual (bool): if True, no layer norm is applied inside this module.
        layer_norm (string): 'standard' or 'rms'.
        norm_eps (float): epsilon used by the layer norm.
        use_ckpting (list): module names to gradient-checkpoint ("ffn" is the relevant entry here).
        parallel_gpu (int): number of tensor-parallel partitions; d_ff must be divisible by it.
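    Example (a minimal sketch; the sizes below are illustrative only)::

        import torch

        ffn = PositionwiseFeedForward(d_model=512, d_ff=2048, dropout=0.1)
        out = ffn(torch.rand(8, 32, 512))  # same shape as the input: (8, 32, 512)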
""" |
|
|
|
    def __init__(
        self,
        d_model,
        d_ff,
        dropout=0.1,
        activation_fn=ActivationFunction.relu,
        add_ffnbias=True,
        parallel_residual=False,
        layer_norm="standard",
        norm_eps=1e-6,
        use_ckpting=[],
        parallel_gpu=1,
    ):
        super(PositionwiseFeedForward, self).__init__()
        assert (
            d_ff % parallel_gpu == 0
        ), "Model intermediate ffn size must be divisible by the number of partitions"
        self.w_1 = skip_init(
            nn.Linear,
            in_features=d_model,
            out_features=d_ff // parallel_gpu,
            bias=add_ffnbias,
        )
        self.w_2 = skip_init(
            nn.Linear,
            in_features=d_ff // parallel_gpu,
            out_features=d_model,
            bias=add_ffnbias,
        )
        if layer_norm == "standard" and not parallel_residual:
            self.layer_norm = nn.LayerNorm(d_model, eps=norm_eps)
        elif layer_norm == "rms" and not parallel_residual:
            self.layer_norm = RMSNorm(d_model, eps=norm_eps)
        elif not parallel_residual:
            raise ValueError(f"{layer_norm} layer norm type is not supported")
        self.parallel_residual = parallel_residual
        self.dropout_1 = nn.Dropout(dropout)
        self.activation = ACTIVATION_FUNCTIONS[activation_fn]
        self.dropout_2 = nn.Dropout(dropout)
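        # Gated activations (SiLU / gated-GELU) need a second input projection,
        # w_3, that acts as the gate: forward() multiplies activation(w_1(x)) by w_3(x).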
if activation_fn == "silu" or activation_fn == "gated-gelu": |
|
self.w_3 = skip_init( |
|
nn.Linear, |
|
in_features=d_model, |
|
out_features=d_ff // parallel_gpu, |
|
bias=add_ffnbias, |
|
) |
|
else: |
|
self.w_3 = None |
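        # Optionally wrap the linear projections in gradient checkpointing
        # ("ffn" in use_ckpting) to trade recomputation for activation memory.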
        self.maybe_ckpt = checkpoint if "ffn" in use_ckpting else lambda f, x: f(x)
        self.parallel_gpu = parallel_gpu

    def forward(self, x):
        """Layer definition.

        Args:
            x: ``(batch_size, input_len, model_dim)``

        Returns:
            (FloatTensor): Output ``(batch_size, input_len, model_dim)``.
        """
        if not self.parallel_residual:
            norm_x = self.layer_norm(x)
        else:
            norm_x = x.clone()
        inter = self.maybe_ckpt(self.w_1, norm_x)
        inter = self.activation(inter)
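        # Element-wise gating: multiply the activated w_1 path by the w_3 projection.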
        if self.w_3 is not None:
            inter.mul_(self.maybe_ckpt(self.w_3, norm_x))
        inter = self.dropout_1(inter)
        inter = self.maybe_ckpt(self.w_2, inter)
        inter = self.dropout_2(inter)
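        # Tensor parallelism: each rank computed a partial output of w_2 from its
        # slice of the hidden dimension, so sum the partial results across ranks.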
        if self.parallel_gpu > 1:
            dist.all_reduce(inter)

        return inter + x

    def update_dropout(self, dropout):
        self.dropout_1.p = dropout
        self.dropout_2.p = dropout