# mitre_466m/modeling_mitre.py
# coding=utf-8
import math
from typing import List, Optional, Tuple, Union, Dict, Any
import torch
from torch import nn
from .configuration_mitre import MitreConfig
from transformers.utils import logging
from transformers.generation import GenerationMixin
from transformers.modeling_utils import PreTrainedModel
from transformers.activations import ACT2FN
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
from transformers.integrations.fsdp import is_fsdp_managed_module
from transformers.modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from transformers.generation.configuration_utils import GenerationConfig
from transformers.generation.beam_search import BeamSearchScorer
from transformers.generation.logits_process import LogitsProcessorList
from transformers.generation.stopping_criteria import StoppingCriteriaList
logger = logging.get_logger(__name__)
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
"""
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
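# Worked example: with padding_idx = 1 and input_ids = [[5, 6, 7, 1, 1]],
# mask = [[1, 1, 1, 0, 0]] and cumsum(mask) * mask = [[1, 2, 3, 0, 0]], so the function
# returns [[2, 3, 4, 1, 1]]: real tokens count up from padding_idx + 1, pads keep padding_idx.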
# Modified from transformers.models.m2m_100.modeling_m2m_100.M2M100Attention
# and transformers.models.m2m_100.modeling_m2m_100.M2M100SdpaAttention
class MitreSdpaAttention(nn.Module):
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
bias: bool = True,
config: Optional[MitreConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""
Input shape: Batch x Time x Channel
Output objects: attn_output, attn_weights (always be None), past_key_value
"""
"""
1. MitreModel is using MitreSdpaAttention, which is modifed from M2M100SdpaAttention.
Notabley, both of them do not support `output_attentions=True` or `layer_head_mask` not None,
leading to 'attn_weights' always being None in output.
The plan of improving this point has a low priority.
2. We plan to improve this code with Flash Attention v2.
"""
bsz, tgt_len, _ = hidden_states.size()
# get query proj
query_states = self.q_proj(hidden_states)
if past_key_value is not None:
            # self-attention with cache: compute new key/value states and append them to the cached ones
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
else:
# self_attention
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
past_key_value = (key_states, value_states)
query_states = self._shape(query_states, tgt_len, bsz)
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=attention_mask,
dropout_p=self.dropout if self.training else 0.0,
is_causal=False,
)
if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2)
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
# partitioned across GPUs when using tensor-parallelism.
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, None, past_key_value
# Modified from transformers.models.m2m_100.modeling_m2m_100.M2M100DecoderLayer
class MitreDecoderLayer(nn.Module):
def __init__(self, config: MitreConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = MitreSdpaAttention(
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
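    # Note: unlike a standard encoder-decoder layer, there is no cross-attention block here.
    # Source tokens, registers and target tokens all go through the same self-attention,
    # and their interaction is controlled by the attention mask built in MitreDecoder.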
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = True,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
# Self Attention
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# add present self-attn cache to positions 1,2 of present_key_value tuple
hidden_states, _, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if use_cache:
outputs += (present_key_value,)
return outputs
class MitrePreTrainedModel(PreTrainedModel):
config_class = MitreConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["MitreDecoderLayer"]
    # we plan to implement code for flash attention v2
_supports_flash_attn_2 = False
_supports_sdpa = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
class MitreDecoder(MitrePreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MitreDecoderLayer`]
Args:
config: MitreConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: MitreConfig):
super().__init__(config)
self.dropout = config.dropout
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = MitreScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
self.src_embed_positions = MitreSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
)
self.register_embed_positions = MitreSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
)
self.tgt_embed_positions = MitreSinusoidalPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
self.padding_idx,
)
self.layers = nn.ModuleList([MitreDecoderLayer(config) for _ in range(config.decoder_layers)])
if config._attn_implementation != "sdpa":
            raise NotImplementedError("Other attention mechanisms are not implemented yet.")
        # TODO: implement flash attention v2 for MITRE
# self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self._use_sdpa = config._attn_implementation == "sdpa"
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
self._future_mask = torch.empty(0)
# Initialize weights and apply final processing
self.post_init()
def create_registers(self, input_ids):
'''
        create registers by duplicating the language tag of each sentence.
length(registers) = length(real_tokens) = length(tokens) - length(pads)
'''
register_nums = (~input_ids.eq(self.padding_idx)).sum(dim=1)
max_register_nums = register_nums.max().item()
total_token_nums = input_ids.size(1) + max_register_nums
batch_size = input_ids.size(0)
registers = input_ids[range(batch_size), torch.argmax(input_ids, dim=-1)].unsqueeze(1).repeat(1, max_register_nums)
return registers, register_nums, total_token_nums
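    # Hedged worked example (assuming the language tag has the largest id in each row,
    # e.g. 250001, and pad_token_id = 1):
    #   input_ids         = [[250001, 5, 6, 1]]
    #   register_nums     = [3]          (three non-pad tokens)
    #   max_register_nums = 3, total_token_nums = 4 + 3 = 7
    #   registers         = [[250001, 250001, 250001]]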
def get_token_indices(self, input_ids, total_token_nums, register_nums):
'''
        return token_indices for selecting source tokens from expanded_src_tokens
'''
token_indices = torch.arange(total_token_nums).expand(input_ids.size(0), -1).to(input_ids.device)
token_indices = token_indices + register_nums.unsqueeze(1)
return token_indices
def get_batch_indices(self, input_ids, token_indices):
'''
        return batch_indices for selecting source tokens from expanded_src_tokens
'''
batch_indices = torch.arange(input_ids.shape[0]).unsqueeze(1).expand(-1, token_indices.size(1)).contiguous()
return batch_indices
def combine_src_and_registers(self, input_ids, registers):
'''
        return expanded_src_tokens for positional embedding.
'''
pads = torch.full_like(registers, self.padding_idx)
expanded_src_tokens = torch.cat((pads, input_ids, registers), dim=1)
return expanded_src_tokens
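    # Hedged sketch of how the helpers above fit together (assuming pad = 1 and language tag = 9):
    #   input_ids           = [[7, 8, 9, 1]]                    -> registers = [[9, 9, 9]]
    #   expanded_src_tokens = [[1, 1, 1, 7, 8, 9, 1, 9, 9, 9]]  (pads | input_ids | registers)
    #   token_indices       = arange(7) + register_nums = [[3, 4, 5, 6, 7, 8, 9]]
    #   source_tokens       = expanded_src_tokens[batch_indices, token_indices]
    #                       = [[7, 8, 9, 1, 9, 9, 9]]
    # i.e. each row is shifted by its own register count so it ends with exactly that many registers.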
def source_tokens_embedding_with_positions(self, expanded_src_tokens, total_token_nums, batch_indices, indices):
'''
        return the embeddings of the source tokens
'''
inputs_embeds = self.embed_tokens(expanded_src_tokens)
inputs_embeds_1 = inputs_embeds[:,:total_token_nums,:] + self.src_embed_positions(expanded_src_tokens[:,:total_token_nums])
inputs_embeds_2 = inputs_embeds[:,total_token_nums:,:] + self.register_embed_positions(expanded_src_tokens[:,total_token_nums:])
inputs_embeds = torch.cat((inputs_embeds_1, inputs_embeds_2), dim=1)
inputs_embeds = inputs_embeds[batch_indices, indices]
return inputs_embeds
def fill_with_neg_inf(self, t):
return t.float().fill_(float("-inf")).type_as(t)
def check_contiguous(self, t: torch.Tensor):
return t if t.is_contiguous() else t.contiguous()
def build_future_mask(self, embeds, src_length, register_nums, past_key_values_length=0):
b = register_nums.size(0)
ns = src_length - register_nums
if past_key_values_length == 0:
# in training
            # 1. create the causal mask (cached in self._future_mask and resized when needed)
dim = embeds.size(1)
if (
self._future_mask.size(0) == 0
or self._future_mask.size(0) < dim
):
self._future_mask = torch.triu(self.fill_with_neg_inf(torch.zeros([dim, dim])), 1)
if self._future_mask.device == embeds.device:
mask = self._future_mask[:dim, :dim].clone()
else:
mask = self._future_mask[:dim, :dim].to(embeds, copy=True)
# 2. bi-directional attention in source tokens and registers
mask[ :src_length, :src_length] = 0.
# 3. create batch mask
batch_mask = mask.unsqueeze(0).expand(b, -1, -1).clone().contiguous()
# 4. mask source tokens -> registers
# 5. mask target -> source tokens
batch_indices = torch.arange(b).to(batch_mask.device).view(-1, 1, 1).expand(b, dim, dim).contiguous()
row_indices = torch.arange(dim).to(batch_mask.device).view(1, -1, 1).expand(b, dim, dim).contiguous()
col_indices = torch.arange(dim).to(batch_mask.device).view(1, 1, -1).expand(b, dim, dim).contiguous()
            source_indices = (row_indices < ns.view(-1, 1, 1)) & (col_indices >= ns.view(-1, 1, 1)) & (col_indices < (ns + register_nums).view(-1, 1, 1))
            target_indices = (row_indices >= (ns + register_nums).view(-1, 1, 1)) & (col_indices < ns.view(-1, 1, 1))
# 4
batch_mask[batch_indices[source_indices], row_indices[source_indices], col_indices[source_indices]] = float('-inf')
# 5
batch_mask[batch_indices[target_indices], row_indices[target_indices], col_indices[target_indices]] = float('-inf')
# shape: batch_size, head_num (1 for broadcasting), seq_len, seq_len
batch_mask = batch_mask.unsqueeze(1)
elif past_key_values_length > 0:
# in generation
mask = torch.zeros(past_key_values_length + 1)
mask = mask.to(embeds, copy=True)
batch_mask = mask.unsqueeze(0).expand(b, -1).clone().contiguous()
batch_indices = torch.arange(b).view(-1, 1).expand(b, past_key_values_length + 1).to(batch_mask.device)
token_indices = torch.arange(past_key_values_length + 1).view(1, -1).expand(b, past_key_values_length + 1).to(batch_mask.device)
target_to_source_mask = token_indices < ns.view(-1, 1)
batch_mask[batch_indices[target_to_source_mask], token_indices[target_to_source_mask]] = float('-inf')
batch_mask = batch_mask.unsqueeze(1)
batch_mask = batch_mask.view(b, 1, batch_mask.shape[-2], batch_mask.shape[-1])
return batch_mask
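    # Sketch of the training-time mask built above, for one sentence with ns = 2 source
    # tokens, 1 register and 2 target tokens (rows attend to columns, x = -inf):
    #          s1  s2  reg  t1  t2
    #   s1   [  0   0   x    x   x ]
    #   s2   [  0   0   x    x   x ]
    #   reg  [  0   0   0    x   x ]
    #   t1   [  x   x   0    0   x ]
    #   t2   [  x   x   0    0   0 ]
    # Source tokens attend bidirectionally to each other but not to the register; the register
    # sees the source tokens and itself; target tokens see only the register and previous targets.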
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
registering_cache: dict = None,
):
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
# past_key_values_length
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
if past_key_values_length > 0:
register_nums = registering_cache["register_nums"]
src_length = registering_cache["src_length"]
if input_ids is not None and past_key_values_length == 0:
# ensure contiguous
input_ids = self.check_contiguous(input_ids)
decoder_input_ids = self.check_contiguous(decoder_input_ids)
if attention_mask is None:
# create registers from input_ids
registers, register_nums, total_token_nums = self.create_registers(input_ids)
# 'expanded_src_tokens' is combined by input_ids, registers, and pads.
expanded_src_tokens = self.combine_src_and_registers(input_ids, registers)
token_indices = self.get_token_indices(input_ids, total_token_nums, register_nums)
batch_indices = self.get_batch_indices(input_ids, token_indices)
# source tokens (input_ids + registers)
source_tokens = expanded_src_tokens[batch_indices, token_indices]
else:
                # Although we do not pass an attention mask during training or in the 1st step of
                # generation, we keep this branch for registers generated by external code.
                if registering_cache is None or not all(
                    key in registering_cache
                    for key in (
                        "register_nums", "total_token_nums", "expanded_src_tokens",
                        "batch_indices", "token_indices", "source_tokens",
                    )
                ):
                    raise ValueError(
                        "If you generate registers with external code, you must provide "
                        "'register_nums', 'total_token_nums', 'expanded_src_tokens', "
                        "'batch_indices', 'token_indices' and 'source_tokens' in "
                        "'registering_cache'."
                    )
register_nums, total_token_nums = registering_cache["register_nums"], registering_cache["total_token_nums"]
expanded_src_tokens = registering_cache["expanded_src_tokens"]
batch_indices, token_indices = registering_cache["batch_indices"], registering_cache["token_indices"]
source_tokens = registering_cache["source_tokens"]
# ensure contiguous
expanded_src_tokens = self.check_contiguous(expanded_src_tokens)
source_tokens = self.check_contiguous(source_tokens)
src_length = source_tokens.shape[1]
# get embeds with positions for source tokens (input_ids + registers)
inputs_embeds = self.source_tokens_embedding_with_positions(expanded_src_tokens, total_token_nums, batch_indices, token_indices)
            # replace the inference trigger token with the target language tag (the last source
            # position, i.e. a register), namely the enc-tgt-dec-tgt strategy
if decoder_input_ids[0][0].item() != source_tokens[0][-1].item():
decoder_input_ids[:, 0] = source_tokens[:, -1]
tokens = torch.cat([source_tokens, decoder_input_ids], dim=1)
decoder_inputs_embeds = self.embed_tokens(decoder_input_ids)
decoder_inputs_embeds = decoder_inputs_embeds + self.tgt_embed_positions(decoder_input_ids, past_key_values_length, src_length=src_length)
if past_key_values_length == 0:
hidden_states = torch.cat([inputs_embeds, decoder_inputs_embeds], dim=1)
else:
hidden_states = decoder_inputs_embeds
# ensure contiguous
hidden_states = self.check_contiguous(hidden_states)
# if attention_mask is NOT given, we build the attention mask from current hyperparams
# if attention_mask is given, check the shape of attention mask
if attention_mask is None:
attention_mask = self.build_future_mask(hidden_states, src_length, register_nums, past_key_values_length)
else:
            bsz = hidden_states.shape[0]
            query_length = hidden_states.shape[1]
            key_length = hidden_states.shape[1] if past_key_values_length == 0 else past_key_values_length + 1
            if attention_mask.size() != (bsz, 1, query_length, key_length):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, query_length, key_length)}, but is {attention_mask.size()}"
                )
# ensure contiguous
attention_mask = self.check_contiguous(attention_mask)
        # this parameter controls kv-cache truncation;
        # in training it stays None, i.e. truncation is disabled
max_register_num = None
        # mask pads in the attention mask during training or in the 1st step of generation
        if past_key_values_length == 0:
            # kv-cache truncation is only activated when the cache will be kept (use_cache=True, i.e. generation)
            max_register_num = register_nums.max().item() if use_cache else None
padding_mask = tokens.eq(self.padding_idx)
if padding_mask.any():
padding_mask = padding_mask.unsqueeze(1).unsqueeze(2)
attention_mask = attention_mask.masked_fill(padding_mask == 1, float('-inf'))
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting" " `use_cache=False`..."
)
use_cache = False
# decoder layers
all_hidden_states = () if output_hidden_states else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.layers):
if output_hidden_states:
all_hidden_states += (hidden_states,)
past_key_value = past_key_values[idx] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
attention_mask,
past_key_value=None,
use_cache=use_cache,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
past_key_value=past_key_value,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
if past_key_values_length > 0:
next_decoder_cache += (layer_outputs[1],)
else:
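                    # 1st step of generation: drop the cached source-token keys/values that precede
                    # the register block; later steps only attend to registers and previously
                    # generated targets (see build_future_mask), so only those need to be cached.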
cache_key, cache_value = layer_outputs[1]
clipped_rep = (
cache_key[:, :, src_length - max_register_num:, :],
cache_value[:, :, src_length - max_register_num:, :]
)
next_decoder_cache += (clipped_rep,)
if past_key_values_length == 0:
hidden_states = hidden_states[:,src_length:,:]
hidden_states = self.layer_norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
model_output = BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
)
# the registering cache used in generation
        # in the 1st step, we truncate the kv cache to save cost, so src_length has to be updated accordingly
if use_cache:
model_output.registering_cache = {
"register_nums": register_nums,
"src_length": src_length if past_key_values_length > 0 else max_register_num,
"attention_mask": attention_mask if past_key_values_length > 0 else None
}
else:
model_output.registering_cache = None
return model_output
# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ScaledWordEmbedding
class MitreScaledWordEmbedding(nn.Embedding):
"""
    This module overrides nn.Embedding's forward by multiplying it with the embedding scale.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
super().__init__(num_embeddings, embedding_dim, padding_idx)
self.embed_scale = embed_scale
def forward(self, input_ids: torch.Tensor):
return super().forward(input_ids) * self.embed_scale
class MitreSinusoidalPositionalEmbedding(nn.Module):
"""This module produces sinusoidal positional embeddings of any length."""
def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
super().__init__()
self.offset = 2
self.embedding_dim = embedding_dim
self.padding_idx = padding_idx
self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
if hasattr(self, "weights"):
# in forward put the weights on the correct dtype and device of the param
emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
self.register_buffer("weights", emb_weights, persistent=False)
@staticmethod
def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
"""
Build sinusoidal embeddings.
This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
"Attention Is All You Need".
"""
half_dim = embedding_dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
if embedding_dim % 2 == 1:
# zero pad
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
if padding_idx is not None:
emb[padding_idx, :] = 0
return emb.to(torch.get_default_dtype())
@torch.no_grad()
def forward(
self, input_ids: torch.Tensor = None, past_key_values_length: int = 0, src_length: int = 0
):
bsz, seq_len = input_ids.size()
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
input_ids.device
)
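        # During incremental decoding the cache length also counts the (truncated) source/register
        # positions, so non-pad positions are shifted back by src_length to keep target positions
        # counting from padding_idx + 1; positions equal to 1 (pads, given pad_token_id = 1) are kept.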
if past_key_values_length > 0 and src_length > 0:
position_ids = torch.where(position_ids == 1, position_ids, position_ids - src_length)
# expand embeddings if needed
max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
if max_pos > self.weights.size(0):
self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
class MitreModel(MitrePreTrainedModel):
_tied_weights_keys = ["decoder.embed_tokens.weight"]
def __init__(self, config: MitreConfig):
super().__init__(config)
self.decoder = MitreDecoder(config)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.decoder.embed_tokens
def get_decoder(self):
return self.decoder
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
registering_cache: dict = None,
) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
decoder_outputs = self.decoder(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_hidden_states=output_hidden_states,
registering_cache=registering_cache
)
model_output = Seq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
)
model_output.registering_cache = decoder_outputs.registering_cache
return model_output
class MitreForConditionalGeneration(MitrePreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_tied_weights_keys = ["decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: MitreConfig):
super().__init__(config)
self.model = MitreModel(config)
self.lm_head = nn.Linear(config.d_model, self.model.decoder.embed_tokens.num_embeddings, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_decoder(self):
return self.model.get_decoder()
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
registering_cache: dict = None,
) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
outputs = self.model(
input_ids=input_ids,
decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_hidden_states=output_hidden_states,
registering_cache=registering_cache,
)
lm_logits = self.lm_head(outputs[0])
if labels is not None:
raise NotImplementedError("Please implement your loss function here.")
model_output = Seq2SeqLMOutput(
loss=None,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
)
model_output.registering_cache = outputs.registering_cache
return model_output
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
@staticmethod
def _reorder_register_nums(register_nums, beam_idx):
return register_nums.index_select(0, beam_idx.to(register_nums.device))
@staticmethod
def _expand_inputs_for_generation(
input_ids: Optional[torch.LongTensor] = None,
beam_size: int = 1,
) -> torch.LongTensor:
"""
        Expands input_ids from [batch_size, seq_len] to [batch_size * beam_size, seq_len].
This is simplified from 'transformers.generation.utils.GenerationMixin._expand_inputs_for_generation'
"""
if beam_size == 1:
return input_ids
return input_ids.repeat_interleave(beam_size, dim=0)
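    # Example: with beam_size = 3, input_ids [[5, 6], [7, 8]] becomes
    # [[5, 6], [5, 6], [5, 6], [7, 8], [7, 8], [7, 8]], so each beam shares its source sentence.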
def generate(self,
input_ids: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
**kwargs: Dict
):
"""
Inference with beam search.
This code is simplified from 'transformers.generation.utils.GenerationMixin.generate'.
This code follows the style of m2m and nllb.
Therefore, there are two points need improvement.
TODO
1. early_stop in beam search.
Current early_stop is at the beam search level instead of model level. Specficially,
although beamscorer generates eos to the sequence, the sequence is filled by 'pad(1)'.
As a result, the sequence, which has already finished, will be computed by the model
continuously. We plan to remove the finished token as Fairseq's style.
"""
        if generation_config is not None:
assert type(generation_config) is GenerationConfig
self.generation_config = generation_config
self.generation_config.update(**kwargs)
generation_config = self.generation_config
batch_size = input_ids.shape[0]
beam_size = generation_config.num_beams
device = input_ids.device
max_cache_length = generation_config.max_length
eos_token_id = torch.Tensor([generation_config.eos_token_id])
        # initialize the target tokens
decoder_input_ids = torch.full(
(batch_size, 1),
self.generation_config.decoder_start_token_id,
dtype=input_ids.dtype,
device=device
)
beam_scorer = BeamSearchScorer(
batch_size=batch_size,
num_beams=beam_size,
device=device,
length_penalty=self.generation_config.length_penalty,
do_early_stopping=self.generation_config.early_stopping,
num_beam_hyps_to_keep=self.generation_config.num_return_sequences,
max_length=max_cache_length,
)
input_ids = self._expand_inputs_for_generation(input_ids, beam_size)
decoder_input_ids = self._expand_inputs_for_generation(decoder_input_ids, beam_size)
cur_len = decoder_input_ids.shape[1]
this_peer_finished = False
past_key_values = None
        registering_cache = None
attention_mask = None
logits_processor = LogitsProcessorList()
stopping_criteria = StoppingCriteriaList()
beam_scores = torch.zeros((batch_size, beam_size), dtype=torch.float, device=input_ids.device)
beam_scores[:, 1:] = -1e9
beam_scores = beam_scores.view((batch_size * beam_size,))
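        # standard beam-search loop: after the 1st step, only the newest target token is fed to
        # the model, and the cached attention mask is extended by one column for the new position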
while not this_peer_finished:
if past_key_values is not None:
decoder_input_ids_for_generation = decoder_input_ids[:, -1:]
attention_mask = registering_cache["attention_mask"]
if attention_mask is not None:
attention_mask = torch.cat((attention_mask, attention_mask[..., -1:]), dim=-1)
else:
decoder_input_ids_for_generation = decoder_input_ids
outputs = self(
input_ids,
decoder_input_ids_for_generation,
attention_mask=attention_mask,
past_key_values=past_key_values,
use_cache=True,
registering_cache=registering_cache
)
del input_ids
input_ids = None
past_key_values = outputs.past_key_values
registering_cache = outputs.registering_cache
next_token_logits = outputs.logits[:, -1, :].clone().float()
next_token_logits = next_token_logits.to(device)
next_token_scores = nn.functional.log_softmax(
next_token_logits, dim=-1
) # (batch_size * num_beams, vocab_size)
next_token_scores_processed = logits_processor(decoder_input_ids, next_token_scores)
next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
next_token_scores_processed
)
# reshape for beam search
vocab_size = next_token_scores.shape[-1]
next_token_scores = next_token_scores.view(batch_size, beam_size * vocab_size)
# Beam token selection: pick 1 + eos_token_id.shape[0] next tokens for each beam so we have at least 1
# non eos token per beam.
n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0
n_tokens_to_keep = max(2, 1 + n_eos_tokens) * beam_size
next_token_scores, next_tokens = torch.topk(
next_token_scores, n_tokens_to_keep, dim=1, largest=True, sorted=True
)
next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
next_tokens = next_tokens % vocab_size
beam_outputs = beam_scorer.process(
decoder_input_ids,
next_token_scores,
next_tokens,
next_indices,
pad_token_id=generation_config.pad_token_id,
eos_token_id=generation_config.eos_token_id,
decoder_prompt_len=1,
)
beam_scores = beam_outputs["next_beam_scores"]
beam_next_tokens = beam_outputs["next_beam_tokens"]
beam_idx = beam_outputs["next_beam_indices"]
decoder_input_ids = torch.cat([decoder_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
del outputs
past_key_values = self._reorder_cache(past_key_values, beam_idx)
registering_cache["register_nums"] = self._reorder_register_nums(registering_cache["register_nums"], beam_idx)
cur_len = cur_len + 1
if beam_scorer.is_done:
this_peer_finished = True
sequence_outputs = beam_scorer.finalize(
decoder_input_ids,
beam_scores,
next_tokens,
next_indices,
pad_token_id=generation_config.pad_token_id,
eos_token_id=eos_token_id,
max_length=stopping_criteria.max_length,
decoder_prompt_len=1,
)
return sequence_outputs["sequences"]
MitreForConditionalGeneration.register_for_auto_class("AutoModel")
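# Hedged usage sketch. The checkpoint id below ("zhiqu22/mitre_466m") and the tokenizer call
# pattern are assumptions based on the file header and typical Hugging Face remote-code usage,
# not verified against the actual repository:
#
#   from transformers import AutoModel, AutoTokenizer
#
#   tokenizer = AutoTokenizer.from_pretrained("zhiqu22/mitre_466m", trust_remote_code=True)
#   model = AutoModel.from_pretrained("zhiqu22/mitre_466m", trust_remote_code=True)
#
#   # the tokenizer is assumed to append the source/target language tags itself
#   inputs = tokenizer(["Hello, world!"], return_tensors="pt")
#   generated = model.generate(inputs["input_ids"], num_beams=5, max_length=64)
#   print(tokenizer.batch_decode(generated, skip_special_tokens=True))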