Gijs Wijngaard committed on
Commit
ab3f8fd
·
1 Parent(s): 47bcf45
Files changed (5)
  1. README.md +4 -0
  2. app.py +35 -56
  3. config.json +22 -0
  4. hf_wrapper.py +1964 -0
  5. pytorch_model.bin +3 -0
README.md CHANGED
@@ -1,5 +1,9 @@
  ---
+ <<<<<<< HEAD
  title: Audio Captioning Small
+ =======
+ title: Efficient Audio Captioning
+ >>>>>>> 901f564 (Test)
  emoji: 🔊
  colorFrom: blue
  colorTo: pink
app.py CHANGED
@@ -1,34 +1,44 @@
+ """
+ Audio Captioning Model
+ 
+ This script implements an audio captioning model based on the Effb2-Trm architecture.
+ It uses a pre-trained model to generate captions for audio inputs.
+ 
+ The original implementation is based on:
+ https://github.com/wsntxxn/Effb2-Trm-AudioCaps-Captioning/
+ 
+ """
+ 
  from functools import partial
  import gradio as gr
  import spaces
  import torch
  from torchaudio.functional import resample
  from transformers import AutoModel, PreTrainedTokenizerFast
+ from hf_wrapper import Effb2TrmConfig, Effb2TrmCaptioningModel
  
+ # Load the configuration
+ config = Effb2TrmConfig.from_pretrained("config.json")
+ 
+ # Load the model
+ model = Effb2TrmCaptioningModel(config)
+ 
+ # Load the state dict from the local pytorch_model.bin file
+ state_dict = torch.load("pytorch_model.bin", map_location="cpu")
+ model.load_state_dict(state_dict)
+ 
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ 
+ # Move the model to the appropriate device
+ model = model.to(device)
+ tokenizer = PreTrainedTokenizerFast.from_pretrained(
+     "wsntxxn/audiocaps-simple-tokenizer"
+ )
+ target_sr = model.config.sample_rate
  
- def load_model(model_name,
-                device):
-     if model_name == "AudioCaps":
-         model = AutoModel.from_pretrained(
-             "wsntxxn/effb2-trm-audiocaps-captioning",
-             trust_remote_code=True
-         ).to(device)
-         tokenizer = PreTrainedTokenizerFast.from_pretrained(
-             "wsntxxn/audiocaps-simple-tokenizer"
-         )
-     elif model_name == "Clotho":
-         model = AutoModel.from_pretrained(
-             "wsntxxn/effb2-trm-clotho-captioning",
-             trust_remote_code=True
-         ).to(device)
-         tokenizer = PreTrainedTokenizerFast.from_pretrained(
-             "wsntxxn/clotho-simple-tokenizer"
-         )
-     return model, tokenizer
- 
  @spaces.GPU
- def infer(file, runner):
-     sr, wav = file
+ def infer(input_audio):
+     sr, wav = input_audio
      wav = torch.as_tensor(wav)
      if wav.dtype == torch.short:
          wav = wav / 2 ** 15
@@ -36,38 +46,17 @@ def infer(file, runner):
          wav = wav / 2 ** 31
      if wav.ndim > 1:
          wav = wav.mean(1)
-     wav = resample(wav, sr, runner.target_sr)
+     wav = resample(wav, sr, target_sr)
      wav_len = len(wav)
      wav = wav.float().unsqueeze(0)
      with torch.no_grad():
-         word_idx = runner.model(
+         word_idx = model(
              audio=wav,
              audio_length=[wav_len]
          )[0]
-     cap = runner.tokenizer.decode(word_idx, skip_special_tokens=True)
+     cap = tokenizer.decode(word_idx, skip_special_tokens=True)
      return cap
  
- # def input_toggle(input_type):
- #     if input_type == "file":
- #         return gr.update(visible=True), gr.update(visible=False)
- #     elif input_type == "mic":
- #         return gr.update(visible=False), gr.update(visible=True)
- 
- class InferRunner:
-     def __init__(self, model_name):
-         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         self.model, self.tokenizer = load_model(model_name, self.device)
-         self.target_sr = self.model.config.sample_rate
- 
-     def change_model(self, model_name):
-         self.model, self.tokenizer = load_model(model_name, self.device)
-         self.target_sr = self.model.config.sample_rate
- 
- 
- def change_model(radio):
-     global infer_runner
-     infer_runner.change_model(radio)
- 
  
  with gr.Blocks() as demo:
      with gr.Row():
@@ -79,23 +68,13 @@ with gr.Blocks() as demo:
          """)
      with gr.Row():
          with gr.Column():
-             radio = gr.Radio(
-                 ["AudioCaps", "Clotho"],
-                 value="AudioCaps",
-                 label="Select model"
-             )
-             infer_runner = InferRunner(radio.value)
              file = gr.Audio(label="Input", visible=True)
-             radio.change(fn=change_model, inputs=[radio,],)
              btn = gr.Button("Run")
          with gr.Column():
              output = gr.Textbox(label="Output")
          btn.click(
-             fn=partial(infer,
-                        runner=infer_runner),
+             fn=partial(infer),
              inputs=[file,],
              outputs=output
          )
- 
  demo.launch()
- 
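Illustration only: the refactored app.py above exposes the model, tokenizer and target sample rate at module level, so the captioner can also be exercised outside the Gradio UI. A minimal sketch that repeats the same loading steps, assuming the three committed files sit in the working directory and that example.wav is a hypothetical local audio file:

import torch
import torchaudio
from torchaudio.functional import resample
from transformers import PreTrainedTokenizerFast
from hf_wrapper import Effb2TrmConfig, Effb2TrmCaptioningModel

# Load the captioning model exactly as app.py does
config = Effb2TrmConfig.from_pretrained("config.json")
model = Effb2TrmCaptioningModel(config)
model.load_state_dict(torch.load("pytorch_model.bin", map_location="cpu"))
model.eval()
tokenizer = PreTrainedTokenizerFast.from_pretrained("wsntxxn/audiocaps-simple-tokenizer")

wav, sr = torchaudio.load("example.wav")              # hypothetical input file
wav = resample(wav.mean(0), sr, model.config.sample_rate)  # mono, 16 kHz
with torch.no_grad():
    word_idx = model(audio=wav.float().unsqueeze(0), audio_length=[len(wav)])[0]
print(tokenizer.decode(word_idx, skip_special_tokens=True))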
 
config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "_name_or_path": "gijs/audio-captioning-small",
+   "architectures": [
+     "Effb2TrmCaptioningModel"
+   ],
+   "attn_emb_dim": 1408,
+   "auto_map": {
+     "AutoConfig": "hf_wrapper.Effb2TrmConfig",
+     "AutoModel": "hf_wrapper.Effb2TrmCaptioningModel"
+   },
+   "decoder_dropout": 0.2,
+   "decoder_emb_dim": 256,
+   "decoder_n_layers": 2,
+   "decoder_we_tie_weights": true,
+   "fc_emb_dim": 1408,
+   "sample_rate": 16000,
+   "shared_dim": 1024,
+   "tchr_dim": 768,
+   "torch_dtype": "float32",
+   "transformers_version": "4.30.2",
+   "vocab_size": 4981
+ }
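Side note, as an illustration rather than part of the commit: the auto_map block above registers the custom classes from hf_wrapper.py with the transformers auto classes, so the same checkpoint can also be loaded with trust_remote_code instead of being wired up by hand as app.py does. A sketch, assuming config.json, hf_wrapper.py and pytorch_model.bin sit together in the current directory (or in the gijs/audio-captioning-small repo named in _name_or_path):

from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained(".", trust_remote_code=True)   # resolves to Effb2TrmConfig
model = AutoModel.from_pretrained(".", trust_remote_code=True)     # resolves to Effb2TrmCaptioningModel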
hf_wrapper.py ADDED
@@ -0,0 +1,1964 @@
1
+ from typing import Dict, Callable, Union, List
2
+ import random
3
+ import math
4
+ import sys
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence, pad_packed_sequence
11
+ from torchaudio import transforms
12
+ from efficientnet_pytorch import EfficientNet
13
+ from efficientnet_pytorch import utils as efficientnet_utils
14
+ from einops import rearrange, reduce
15
+ from transformers import PretrainedConfig, PreTrainedModel
16
+
17
+
18
+ def sort_pack_padded_sequence(input, lengths):
19
+ sorted_lengths, indices = torch.sort(lengths, descending=True)
20
+ tmp = pack_padded_sequence(input[indices], sorted_lengths.cpu(), batch_first=True)
21
+ inv_ix = indices.clone()
22
+ inv_ix[indices] = torch.arange(0,len(indices)).type_as(inv_ix)
23
+ return tmp, inv_ix
24
+
25
+ def pad_unsort_packed_sequence(input, inv_ix):
26
+ tmp, _ = pad_packed_sequence(input, batch_first=True)
27
+ tmp = tmp[inv_ix]
28
+ return tmp
29
+
30
+ def pack_wrapper(module, attn_feats, attn_feat_lens):
31
+ packed, inv_ix = sort_pack_padded_sequence(attn_feats, attn_feat_lens)
32
+ if isinstance(module, torch.nn.RNNBase):
33
+ return pad_unsort_packed_sequence(module(packed)[0], inv_ix)
34
+ else:
35
+ return pad_unsort_packed_sequence(PackedSequence(module(packed[0]), packed[1]), inv_ix)
36
+
37
+ def embedding_pooling(x, lens, pooling="mean"):
38
+ if pooling == "max":
39
+ fc_embs = max_with_lens(x, lens)
40
+ elif pooling == "mean":
41
+ fc_embs = mean_with_lens(x, lens)
42
+ elif pooling == "mean+max":
43
+ x_mean = mean_with_lens(x, lens)
44
+ x_max = max_with_lens(x, lens)
45
+ fc_embs = x_mean + x_max
46
+ elif pooling == "last":
47
+ indices = (lens - 1).reshape(-1, 1, 1).repeat(1, 1, x.size(-1))
48
+ # indices: [N, 1, hidden]
49
+ fc_embs = torch.gather(x, 1, indices).squeeze(1)
50
+ else:
51
+ raise Exception(f"pooling method {pooling} not supported")
52
+ return fc_embs
53
+
54
+ def interpolate(x, ratio):
55
+ """Interpolate data in time domain. This is used to compensate the
56
+ resolution reduction in downsampling of a CNN.
57
+
58
+ Args:
59
+ x: (batch_size, time_steps, classes_num)
60
+ ratio: int, ratio to interpolate
61
+ Returns:
62
+ upsampled: (batch_size, time_steps * ratio, classes_num)
63
+ """
64
+ (batch_size, time_steps, classes_num) = x.shape
65
+ upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
66
+ upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
67
+ return upsampled
68
+
69
+ def pad_framewise_output(framewise_output, frames_num):
70
+ """Pad framewise_output to the same length as input frames. The pad value
71
+ is the same as the value of the last frame.
72
+ Args:
73
+ framewise_output: (batch_size, frames_num, classes_num)
74
+ frames_num: int, number of frames to pad
75
+ Outputs:
76
+ output: (batch_size, frames_num, classes_num)
77
+ """
78
+ pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
79
+ """tensor for padding"""
80
+
81
+ output = torch.cat((framewise_output, pad), dim=1)
82
+ """(batch_size, frames_num, classes_num)"""
83
+
84
+ return output
85
+
86
+ def find_contiguous_regions(activity_array):
87
+ """Find contiguous regions from bool valued numpy.array.
88
+ Copy of https://dcase-repo.github.io/dcase_util/_modules/dcase_util/data/decisions.html#DecisionEncoder
89
+ Reason is:
90
+ 1. This does not belong to a class necessarily
91
+ 2. Importing DecisionEncoder requires sndfile over some other imports, which causes some problems on clusters
92
+ """
93
+
94
+ # Find the changes in the activity_array
95
+ change_indices = np.logical_xor(activity_array[1:],
96
+ activity_array[:-1]).nonzero()[0]
97
+
98
+ # Shift change_index with one, focus on frame after the change.
99
+ change_indices += 1
100
+
101
+ if activity_array[0]:
102
+ # If the first element of activity_array is True add 0 at the beginning
103
+ change_indices = np.r_[0, change_indices]
104
+
105
+ if activity_array[-1]:
106
+ # If the last element of activity_array is True, add the length of the array
107
+ change_indices = np.r_[change_indices, activity_array.size]
108
+
109
+ # Reshape the result into two columns
110
+ return change_indices.reshape((-1, 2))
111
+
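Illustration (not part of the file): find_contiguous_regions turns a boolean activity array into half-open [onset, offset) index pairs, one row per run of True values. A tiny worked example, assuming hf_wrapper is importable:

import numpy as np
from hf_wrapper import find_contiguous_regions

activity = np.array([False, True, True, False, True])
print(find_contiguous_regions(activity))
# [[1 3]
#  [4 5]]   <- True runs at indices 1-2 and at index 4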
112
+ def double_threshold(x, high_thres, low_thres, n_connect=1):
113
+ """double_threshold
114
+ Helper function to calculate double threshold for n-dim arrays
115
+ :param x: input array
116
+ :param high_thres: high threshold value
117
+ :param low_thres: Low threshold value
118
+ :param n_connect: Distance of <= n clusters will be merged
119
+ """
120
+ assert x.ndim <= 3, "Whoops something went wrong with the input ({}), check if it has <= 3 dims".format(
121
+ x.shape)
122
+ if x.ndim == 3:
123
+ apply_dim = 1
124
+ elif x.ndim < 3:
125
+ apply_dim = 0
126
+ # x is assumed to be 3d: (batch, time, dim)
127
+ # Assumed to be 2d : (time, dim)
128
+ # Assumed to be 1d : (time)
129
+ # time axis is therefore at 1 for 3d and 0 for 2d (
130
+ return np.apply_along_axis(lambda x: _double_threshold(
131
+ x, high_thres, low_thres, n_connect=n_connect),
132
+ axis=apply_dim,
133
+ arr=x)
134
+
135
+ def _double_threshold(x, high_thres, low_thres, n_connect=1, return_arr=True):
136
+ """_double_threshold
137
+ Computes a double threshold over the input array
138
+ :param x: input array, needs to be 1d
139
+ :param high_thres: High threshold over the array
140
+ :param low_thres: Low threshold over the array
141
+ :param n_connect: Postprocessing, maximal distance between clusters to connect
142
+ :param return_arr: By default this function returns the filtered indices, but if return_arr = True it returns an array of the same size as x filled with ones and zeros.
143
+ """
144
+ assert x.ndim == 1, "Input needs to be 1d"
145
+ high_locations = np.where(x > high_thres)[0]
146
+ locations = x > low_thres
147
+ encoded_pairs = find_contiguous_regions(locations)
148
+
149
+ filtered_list = list(
150
+ filter(
151
+ lambda pair:
152
+ ((pair[0] <= high_locations) & (high_locations <= pair[1])).any(),
153
+ encoded_pairs))
154
+
155
+ filtered_list = connect_(filtered_list, n_connect)
156
+ if return_arr:
157
+ zero_one_arr = np.zeros_like(x, dtype=int)
158
+ for sl in filtered_list:
159
+ zero_one_arr[sl[0]:sl[1]] = 1
160
+ return zero_one_arr
161
+ return filtered_list
162
+
163
+ def connect_(pairs, n=1):
164
+ """connect_
165
+ Connects two adjacent clusters if their distance is <= n
166
+ :param pairs: Clusters of iterables, e.g., [(1,5),(7,10)]
167
+ :param n: distance between two clusters
168
+ """
169
+ if len(pairs) == 0:
170
+ return []
171
+ start_, end_ = pairs[0]
172
+ new_pairs = []
173
+ for i, (next_item, cur_item) in enumerate(zip(pairs[1:], pairs[0:])):
174
+ end_ = next_item[1]
175
+ if next_item[0] - cur_item[1] <= n:
176
+ pass
177
+ else:
178
+ new_pairs.append((start_, cur_item[1]))
179
+ start_ = next_item[0]
180
+ new_pairs.append((start_, end_))
181
+ return new_pairs
182
+
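A small worked example (illustration only) of the double-threshold post-processing defined above, assuming hf_wrapper is importable: frames above the low threshold are kept only if their region also crosses the high threshold.

import numpy as np
from hf_wrapper import double_threshold

scores = np.array([0.1, 0.8, 0.6, 0.2, 0.55, 0.1])
# the run at indices 1-2 crosses the 0.75 high threshold, the run at index 4 does not
print(double_threshold(scores, high_thres=0.75, low_thres=0.5))
# [0 1 1 0 0 0]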
183
+ def segments_to_temporal_tag(segments, thre=0.5):
184
+ after_flag, while_flag = 0, 0
185
+ for j in range(len(segments)):
186
+ for k in range(len(segments)):
187
+ if segments[j][0] == segments[k][0]:
188
+ continue
189
+ min_duration = min(segments[j][2] - segments[j][1], segments[k][2] - segments[k][1])
190
+ overlap = segments[j][2] - segments[k][1]
191
+ if overlap < thre * min_duration:
192
+ after_flag = 2
193
+ if segments[j][1] < segments[k][1] and overlap > thre * min_duration:
194
+ while_flag = 1
195
+ return after_flag + while_flag
196
+
197
+ def decode_with_timestamps(labels, time_resolution):
198
+ batch_results = []
199
+ for lab in labels:
200
+ segments = []
201
+ for i, label_column in enumerate(lab.T):
202
+ change_indices = find_contiguous_regions(label_column)
203
+ # append [onset, offset] in the result list
204
+ for row in change_indices:
205
+ segments.append((i, row[0] * time_resolution, row[1] * time_resolution))
206
+ temporal_tag = segments_to_temporal_tag(segments)
207
+ batch_results.append(temporal_tag)
208
+ return batch_results
209
+
210
+ class _EffiNet(nn.Module):
211
+ """A proxy for efficient net models"""
212
+ def __init__(self,
213
+ blocks_args=None,
214
+ global_params=None,
215
+ ) -> None:
216
+ super().__init__()
217
+ self.eff_net = EfficientNet(blocks_args=blocks_args,
218
+ global_params=global_params)
219
+
220
+
221
+ def forward(self, x: torch.Tensor):
222
+ x = rearrange(x, 'b f t -> b 1 f t')
223
+ x = self.eff_net.extract_features(x)
224
+ return reduce(x, 'b c f t -> b t c', 'mean')
225
+
226
+
227
+ def get_effb2_model() -> _EffiNet:
228
+ blocks_args, global_params = efficientnet_utils.get_model_params(
229
+ 'efficientnet-b2', {'include_top': False})
230
+ model = _EffiNet(blocks_args=blocks_args,
231
+ global_params=global_params)
232
+ model.eff_net._change_in_channels(1)
233
+ return model
234
+
235
+ def merge_load_state_dict(state_dict,
236
+ model: torch.nn.Module,
237
+ output_fn: Callable = sys.stdout.write):
238
+ model_dict = model.state_dict()
239
+ pretrained_dict = {}
240
+ mismatch_keys = []
241
+ for key, value in state_dict.items():
242
+ if key in model_dict and model_dict[key].shape == value.shape:
243
+ pretrained_dict[key] = value
244
+ else:
245
+ mismatch_keys.append(key)
246
+ output_fn(f"Loading pre-trained model, with mismatched keys {mismatch_keys}\n")
247
+ model_dict.update(pretrained_dict)
248
+ model.load_state_dict(model_dict, strict=True)
249
+ return pretrained_dict.keys()
250
+
251
+
252
+ class EfficientNetB2(nn.Module):
253
+
254
+ def __init__(self,
255
+ n_mels: int = 64,
256
+ win_length: int = 32,
257
+ hop_length: int = 10,
258
+ f_min: int = 0,
259
+ freeze: bool = False,):
260
+ super().__init__()
261
+ sample_rate = 16000
262
+ self.melspec_extractor = transforms.MelSpectrogram(
263
+ sample_rate=sample_rate,
264
+ n_fft=win_length * sample_rate // 1000,
265
+ win_length=win_length * sample_rate // 1000,
266
+ hop_length=hop_length * sample_rate // 1000,
267
+ f_min=f_min,
268
+ n_mels=n_mels,
269
+ )
270
+ self.hop_length = 10 * sample_rate // 1000
271
+ self.db_transform = transforms.AmplitudeToDB(top_db=120)
272
+ self.backbone = get_effb2_model()
273
+ self.fc_emb_size = self.backbone.eff_net._conv_head.out_channels
274
+ self.downsample_ratio = 32
275
+ if freeze:
276
+ for param in self.parameters():
277
+ param.requires_grad = False
278
+
279
+ def forward(self, input_dict):
280
+
281
+ waveform = input_dict["wav"]
282
+ wave_length = input_dict["wav_len"]
283
+ specaug = input_dict["specaug"]
284
+ x = self.melspec_extractor(waveform)
285
+ x = self.db_transform(x) # (batch_size, mel_bins, time_steps)
286
+
287
+ x = rearrange(x, 'b f t -> b 1 t f')
288
+ if self.training and specaug:
289
+ x = self.spec_augmenter(x)
290
+ x = rearrange(x, 'b 1 t f -> b f t')
291
+
292
+ x = self.backbone(x)
293
+ attn_emb = x
294
+
295
+ wave_length = torch.as_tensor(wave_length)
296
+ feat_length = torch.div(wave_length, self.hop_length,
297
+ rounding_mode="floor") + 1
298
+ feat_length = torch.div(feat_length, self.downsample_ratio,
299
+ rounding_mode="floor")
300
+ fc_emb = mean_with_lens(attn_emb, feat_length)
301
+
302
+ output_dict = {
303
+ 'fc_emb': fc_emb,
304
+ 'attn_emb': attn_emb,
305
+ 'attn_emb_len': feat_length
306
+ }
307
+ return output_dict
308
+
309
+
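Quick sanity check (illustration only) of the attn_emb_len computed in EfficientNetB2.forward: with the 16 kHz front end the hop length is 160 samples and the EfficientNet backbone downsamples time by 32, so a 10-second clip yields 31 attention frames.

sample_rate = 16000
hop_length = 10 * sample_rate // 1000       # 160 samples per mel frame
downsample_ratio = 32
wav_len = 10 * sample_rate                  # 10-second clip
n_frames = wav_len // hop_length + 1        # 1001 mel frames
print(n_frames // downsample_ratio)         # 31 attention time steps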
310
+ def generate_length_mask(lens, max_length=None):
311
+ lens = torch.as_tensor(lens)
312
+ N = lens.size(0)
313
+ if max_length is None:
314
+ max_length = max(lens)
315
+ if isinstance(max_length, torch.Tensor):
316
+ max_length = max_length.item()
317
+ idxs = torch.arange(max_length).repeat(N).view(N, max_length)
318
+ idxs = idxs.to(lens.device)
319
+ mask = (idxs < lens.view(-1, 1))
320
+ return mask
321
+
322
+ def mean_with_lens(features, lens):
323
+ """
324
+ features: [N, T, ...] (assume the second dimension represents length)
325
+ lens: [N,]
326
+ """
327
+ lens = torch.as_tensor(lens)
328
+ if max(lens) != features.size(1):
329
+ max_length = features.size(1)
330
+ mask = generate_length_mask(lens, max_length)
331
+ else:
332
+ mask = generate_length_mask(lens)
333
+ mask = mask.to(features.device) # [N, T]
334
+
335
+ while mask.ndim < features.ndim:
336
+ mask = mask.unsqueeze(-1)
337
+ feature_mean = features * mask
338
+ feature_mean = feature_mean.sum(1)
339
+ while lens.ndim < feature_mean.ndim:
340
+ lens = lens.unsqueeze(1)
341
+ feature_mean = feature_mean / lens.to(features.device)
342
+ # feature_mean = features * mask.unsqueeze(-1)
343
+ # feature_mean = feature_mean.sum(1) / lens.unsqueeze(1).to(features.device)
344
+ return feature_mean
345
+
346
+ def max_with_lens(features, lens):
347
+ """
348
+ features: [N, T, ...] (assume the second dimension represents length)
349
+ lens: [N,]
350
+ """
351
+ lens = torch.as_tensor(lens)
352
+ if max(lens) != features.size(1):
353
+ max_length = features.size(1)
354
+ mask = generate_length_mask(lens, max_length)
355
+ else:
356
+ mask = generate_length_mask(lens)
357
+ mask = mask.to(features.device) # [N, T]
358
+
359
+ feature_max = features.clone()
360
+ feature_max[~mask] = float("-inf")
361
+ feature_max, _ = feature_max.max(1)
362
+ return feature_max
363
+
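Illustration (not part of the file): the helpers above average or max-pool padded batches without letting the padding leak into the statistics. A minimal sketch, assuming hf_wrapper is importable:

import torch
from hf_wrapper import generate_length_mask, mean_with_lens

feats = torch.tensor([[[1.], [2.], [3.]],
                      [[4.], [5.], [0.]]])      # second sequence is padded to length 3
lens = torch.tensor([3, 2])
print(generate_length_mask(lens))               # [[True, True, True], [True, True, False]]
print(mean_with_lens(feats, lens).squeeze(-1))  # tensor([2.0000, 4.5000])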
364
+ def repeat_tensor(x, n):
365
+ return x.unsqueeze(0).repeat(n, *([1] * len(x.shape)))
366
+
367
+
368
+ class CaptionMetaMixin:
369
+ pad_idx = 0
370
+ start_idx = 1
371
+ end_idx = 2
372
+ max_length = 20
373
+
374
+ @classmethod
375
+ def set_index(cls, start_idx, end_idx, pad_idx):
376
+ cls.start_idx = start_idx
377
+ cls.end_idx = end_idx
378
+ cls.pad_idx = pad_idx
379
+
380
+
381
+ class CaptionModel(nn.Module, CaptionMetaMixin):
382
+ """
383
+ Encoder-decoder captioning model.
384
+ """
385
+
386
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
387
+ super().__init__()
388
+ self.encoder = encoder
389
+ self.decoder = decoder
390
+ self.vocab_size = decoder.vocab_size
391
+ self.train_forward_keys = ["cap", "cap_len", "ss_ratio"]
392
+ self.inference_forward_keys = ["sample_method", "max_length", "temp"]
393
+ freeze_encoder = kwargs.get("freeze_encoder", False)
394
+ if freeze_encoder:
395
+ for param in self.encoder.parameters():
396
+ param.requires_grad = False
397
+ self.check_decoder_compatibility()
398
+
399
+ def check_decoder_compatibility(self):
400
+ compatible_decoders = [x.__class__.__name__ for x in self.compatible_decoders]
401
+ assert isinstance(self.decoder, self.compatible_decoders), \
402
+ f"{self.decoder.__class__.__name__} is incompatible with " \
403
+ f"{self.__class__.__name__}, please use decoder in {compatible_decoders} "
404
+
405
+ def forward(self, input_dict: Dict):
406
+ """
407
+ input_dict: {
408
+ (required)
409
+ mode: train/inference,
410
+ [spec, spec_len],
411
+ [fc],
412
+ [attn, attn_len],
413
+ [wav, wav_len],
414
+ [sample_method: greedy],
415
+ [temp: 1.0] (in case of no teacher forcing)
416
+ (optional, mode=train)
417
+ cap,
418
+ cap_len,
419
+ ss_ratio,
420
+ (optional, mode=inference)
421
+ sample_method: greedy/beam,
422
+ max_length,
423
+ temp,
424
+ beam_size (optional, sample_method=beam),
425
+ n_best (optional, sample_method=beam),
426
+ }
427
+ """
428
+ encoder_output_dict = self.encoder(input_dict)
429
+ output = self.forward_decoder(input_dict, encoder_output_dict)
430
+ return output
431
+
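Usage note (illustration only): for inference, the input_dict documented above needs the encoder inputs plus the sampling options; the Effb2TrmCaptioningModel wrapper near the end of this file builds exactly this dictionary. A hypothetical beam-search call on an assembled TransformerModel instance, here called caption_model, would be:

import torch

wav = torch.randn(1, 160000)          # placeholder for 10 s of 16 kHz audio
input_dict = {
    "wav": wav,
    "wav_len": [wav.shape[1]],
    "specaug": False,
    "mode": "inference",
    "sample_method": "beam",
    "beam_size": 3,
    "max_length": 20,
    "temp": 1.0,
}
seq = caption_model(input_dict)["seq"]   # decoded token indices, one sequence per input clip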
432
+ def forward_decoder(self, input_dict: Dict, encoder_output_dict: Dict):
433
+ if input_dict["mode"] == "train":
434
+ forward_dict = {
435
+ "mode": "train", "sample_method": "greedy", "temp": 1.0
436
+ }
437
+ for key in self.train_forward_keys:
438
+ forward_dict[key] = input_dict[key]
439
+ forward_dict.update(encoder_output_dict)
440
+ output = self.train_forward(forward_dict)
441
+ elif input_dict["mode"] == "inference":
442
+ forward_dict = {"mode": "inference"}
443
+ default_args = { "sample_method": "greedy", "max_length": self.max_length, "temp": 1.0 }
444
+ for key in self.inference_forward_keys:
445
+ if key in input_dict:
446
+ forward_dict[key] = input_dict[key]
447
+ else:
448
+ forward_dict[key] = default_args[key]
449
+
450
+ if forward_dict["sample_method"] == "beam":
451
+ forward_dict["beam_size"] = input_dict.get("beam_size", 3)
452
+ forward_dict["n_best"] = input_dict.get("n_best", False)
453
+ forward_dict["n_best_size"] = input_dict.get("n_best_size", forward_dict["beam_size"])
454
+ elif forward_dict["sample_method"] == "dbs":
455
+ forward_dict["beam_size"] = input_dict.get("beam_size", 6)
456
+ forward_dict["group_size"] = input_dict.get("group_size", 3)
457
+ forward_dict["diversity_lambda"] = input_dict.get("diversity_lambda", 0.5)
458
+ forward_dict["group_nbest"] = input_dict.get("group_nbest", True)
459
+
460
+ forward_dict.update(encoder_output_dict)
461
+ output = self.inference_forward(forward_dict)
462
+ else:
463
+ raise Exception("mode should be either 'train' or 'inference'")
464
+ output.update(encoder_output_dict)
465
+ return output
466
+
467
+ def prepare_output(self, input_dict):
468
+ output = {}
469
+ batch_size = input_dict["fc_emb"].size(0)
470
+ if input_dict["mode"] == "train":
471
+ max_length = input_dict["cap"].size(1) - 1
472
+ elif input_dict["mode"] == "inference":
473
+ max_length = input_dict["max_length"]
474
+ else:
475
+ raise Exception("mode should be either 'train' or 'inference'")
476
+ device = input_dict["fc_emb"].device
477
+ output["seq"] = torch.full((batch_size, max_length), self.end_idx,
478
+ dtype=torch.long)
479
+ output["logit"] = torch.empty(batch_size, max_length,
480
+ self.vocab_size).to(device)
481
+ output["sampled_logprob"] = torch.zeros(batch_size, max_length)
482
+ output["embed"] = torch.empty(batch_size, max_length,
483
+ self.decoder.d_model).to(device)
484
+ return output
485
+
486
+ def train_forward(self, input_dict):
487
+ if input_dict["ss_ratio"] != 1: # scheduled sampling training
488
+ input_dict["mode"] = "train"
489
+ return self.stepwise_forward(input_dict)
490
+ output = self.seq_forward(input_dict)
491
+ self.train_process(output, input_dict)
492
+ return output
493
+
494
+ def seq_forward(self, input_dict):
495
+ raise NotImplementedError
496
+
497
+ def train_process(self, output, input_dict):
498
+ pass
499
+
500
+ def inference_forward(self, input_dict):
501
+ if input_dict["sample_method"] == "beam":
502
+ return self.beam_search(input_dict)
503
+ elif input_dict["sample_method"] == "dbs":
504
+ return self.diverse_beam_search(input_dict)
505
+ return self.stepwise_forward(input_dict)
506
+
507
+ def stepwise_forward(self, input_dict):
508
+ """Step-by-step decoding"""
509
+ output = self.prepare_output(input_dict)
510
+ max_length = output["seq"].size(1)
511
+ # start sampling
512
+ for t in range(max_length):
513
+ input_dict["t"] = t
514
+ self.decode_step(input_dict, output)
515
+ if input_dict["mode"] == "inference": # decide whether to stop when sampling
516
+ unfinished_t = output["seq"][:, t] != self.end_idx
517
+ if t == 0:
518
+ unfinished = unfinished_t
519
+ else:
520
+ unfinished *= unfinished_t
521
+ output["seq"][:, t][~unfinished] = self.end_idx
522
+ if unfinished.sum() == 0:
523
+ break
524
+ self.stepwise_process(output)
525
+ return output
526
+
527
+ def decode_step(self, input_dict, output):
528
+ """Decoding operation of timestep t"""
529
+ decoder_input = self.prepare_decoder_input(input_dict, output)
530
+ # feed to the decoder to get logit
531
+ output_t = self.decoder(decoder_input)
532
+ logit_t = output_t["logit"]
533
+ # assert logit_t.ndim == 3
534
+ if logit_t.size(1) == 1:
535
+ logit_t = logit_t.squeeze(1)
536
+ embed_t = output_t["embed"].squeeze(1)
537
+ elif logit_t.size(1) > 1:
538
+ logit_t = logit_t[:, -1, :]
539
+ embed_t = output_t["embed"][:, -1, :]
540
+ else:
541
+ raise Exception("no logit output")
542
+ # sample the next input word and get the corresponding logit
543
+ sampled = self.sample_next_word(logit_t,
544
+ method=input_dict["sample_method"],
545
+ temp=input_dict["temp"])
546
+
547
+ output_t.update(sampled)
548
+ output_t["t"] = input_dict["t"]
549
+ output_t["logit"] = logit_t
550
+ output_t["embed"] = embed_t
551
+ self.stepwise_process_step(output, output_t)
552
+
553
+ def prepare_decoder_input(self, input_dict, output):
554
+ """Prepare the inp ut dict for the decoder"""
555
+ """Prepare the input dict for the decoder"""
556
+
557
+ def stepwise_process_step(self, output, output_t):
558
+ """Postprocessing (save output values) after each timestep t"""
559
+ t = output_t["t"]
560
+ output["logit"][:, t, :] = output_t["logit"]
561
+ output["seq"][:, t] = output_t["word"]
562
+ output["sampled_logprob"][:, t] = output_t["probs"]
563
+ output["embed"][:, t, :] = output_t["embed"]
564
+
565
+ def stepwise_process(self, output):
566
+ """Postprocessing after the whole step-by-step autoregressive decoding"""
567
+ pass
568
+
569
+ def sample_next_word(self, logit, method, temp):
570
+ """Sample the next word, given probs output by the decoder"""
571
+ logprob = torch.log_softmax(logit, dim=1)
572
+ if method == "greedy":
573
+ sampled_logprob, word = torch.max(logprob.detach(), 1)
574
+ elif method == "gumbel":
575
+ def sample_gumbel(shape, eps=1e-20):
576
+ U = torch.rand(shape).to(logprob.device)
577
+ return -torch.log(-torch.log(U + eps) + eps)
578
+ def gumbel_softmax_sample(logit, temperature):
579
+ y = logit + sample_gumbel(logit.size())
580
+ return torch.log_softmax(y / temperature, dim=-1)
581
+ _logprob = gumbel_softmax_sample(logprob, temp)
582
+ _, word = torch.max(_logprob.data, 1)
583
+ sampled_logprob = logprob.gather(1, word.unsqueeze(-1))
584
+ else:
585
+ logprob = logprob / temp
586
+ if method.startswith("top"):
587
+ top_num = float(method[3:])
588
+ if 0 < top_num < 1: # top-p sampling
589
+ probs = torch.softmax(logit, dim=1)
590
+ sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=1)
591
+ _cumsum = sorted_probs.cumsum(1)
592
+ mask = _cumsum < top_num
593
+ mask = torch.cat([torch.ones_like(mask[:,:1]), mask[:,:-1]], 1)
594
+ sorted_probs = sorted_probs * mask.to(sorted_probs)
595
+ sorted_probs = sorted_probs / sorted_probs.sum(1, keepdim=True)
596
+ logprob.scatter_(1, sorted_indices, sorted_probs.log())
597
+ else: # top-k sampling
598
+ k = int(top_num)
599
+ tmp = torch.empty_like(logprob).fill_(float('-inf'))
600
+ topk, indices = torch.topk(logprob, k, dim=1)
601
+ tmp = tmp.scatter(1, indices, topk)
602
+ logprob = tmp
603
+ word = torch.distributions.Categorical(logits=logprob.detach()).sample()
604
+ sampled_logprob = logprob.gather(1, word.unsqueeze(-1)).squeeze(1)
605
+ word = word.detach().long()
606
+ # sampled_logprob: [N,], word: [N,]
607
+ return {"word": word, "probs": sampled_logprob}
608
+
609
+ def beam_search(self, input_dict):
610
+ output = self.prepare_output(input_dict)
611
+ max_length = input_dict["max_length"]
612
+ beam_size = input_dict["beam_size"]
613
+ if input_dict["n_best"]:
614
+ n_best_size = input_dict["n_best_size"]
615
+ batch_size, max_length = output["seq"].size()
616
+ output["seq"] = torch.full((batch_size, n_best_size, max_length),
617
+ self.end_idx, dtype=torch.long)
618
+
619
+ temp = input_dict["temp"]
620
+ # instance by instance beam search
621
+ for i in range(output["seq"].size(0)):
622
+ output_i = self.prepare_beamsearch_output(input_dict)
623
+ input_dict["sample_idx"] = i
624
+ for t in range(max_length):
625
+ input_dict["t"] = t
626
+ output_t = self.beamsearch_step(input_dict, output_i)
627
+ #######################################
628
+ # merge with previous beam and select the current max prob beam
629
+ #######################################
630
+ logit_t = output_t["logit"]
631
+ if logit_t.size(1) == 1:
632
+ logit_t = logit_t.squeeze(1)
633
+ elif logit_t.size(1) > 1:
634
+ logit_t = logit_t[:, -1, :]
635
+ else:
636
+ raise Exception("no logit output")
637
+ logprob_t = torch.log_softmax(logit_t, dim=1)
638
+ logprob_t = torch.log_softmax(logprob_t / temp, dim=1)
639
+ logprob_t = output_i["topk_logprob"].unsqueeze(1) + logprob_t
640
+ if t == 0: # for the first step, all k seq will have the same probs
641
+ topk_logprob, topk_words = logprob_t[0].topk(
642
+ beam_size, 0, True, True)
643
+ else: # unroll and find top logprob, and their unrolled indices
644
+ topk_logprob, topk_words = logprob_t.view(-1).topk(
645
+ beam_size, 0, True, True)
646
+ topk_words = topk_words.cpu()
647
+ output_i["topk_logprob"] = topk_logprob
648
+ # output_i["prev_words_beam"] = topk_words // self.vocab_size # [beam_size,]
649
+ output_i["prev_words_beam"] = torch.div(topk_words, self.vocab_size,
650
+ rounding_mode='trunc')
651
+ output_i["next_word"] = topk_words % self.vocab_size # [beam_size,]
652
+ if t == 0:
653
+ output_i["seq"] = output_i["next_word"].unsqueeze(1)
654
+ else:
655
+ output_i["seq"] = torch.cat([
656
+ output_i["seq"][output_i["prev_words_beam"]],
657
+ output_i["next_word"].unsqueeze(1)], dim=1)
658
+
659
+ # add finished beams to results
660
+ is_end = output_i["next_word"] == self.end_idx
661
+ if t == max_length - 1:
662
+ is_end.fill_(1)
663
+
664
+ for beam_idx in range(beam_size):
665
+ if is_end[beam_idx]:
666
+ final_beam = {
667
+ "seq": output_i["seq"][beam_idx].clone(),
668
+ "score": output_i["topk_logprob"][beam_idx].item()
669
+ }
670
+ final_beam["score"] = final_beam["score"] / (t + 1)
671
+ output_i["done_beams"].append(final_beam)
672
+ output_i["topk_logprob"][is_end] -= 1000
673
+
674
+ self.beamsearch_process_step(output_i, output_t)
675
+
676
+ if len(output_i["done_beams"]) == beam_size:
677
+ break
678
+
679
+ self.beamsearch_process(output, output_i, input_dict)
680
+ return output
681
+
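For reference, finished beams above are ranked by a length-normalized log-probability, which is what final_beam["score"] / (t + 1) computes:

score(w_{1:T}) = (1 / T) * \sum_{t=1}^{T} \log p(w_t \mid w_{<t}, \text{audio})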
682
+ def prepare_beamsearch_output(self, input_dict):
683
+ beam_size = input_dict["beam_size"]
684
+ device = input_dict["fc_emb"].device
685
+ output = {
686
+ "topk_logprob": torch.zeros(beam_size).to(device),
687
+ "seq": None,
688
+ "prev_words_beam": None,
689
+ "next_word": None,
690
+ "done_beams": [],
691
+ }
692
+ return output
693
+
694
+ def beamsearch_step(self, input_dict, output_i):
695
+ decoder_input = self.prepare_beamsearch_decoder_input(input_dict, output_i)
696
+ output_t = self.decoder(decoder_input)
697
+ output_t["t"] = input_dict["t"]
698
+ return output_t
699
+
700
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
701
+ raise NotImplementedError
702
+
703
+ def beamsearch_process_step(self, output_i, output_t):
704
+ pass
705
+
706
+ def beamsearch_process(self, output, output_i, input_dict):
707
+ i = input_dict["sample_idx"]
708
+ done_beams = sorted(output_i["done_beams"], key=lambda x: -x["score"])
709
+ if input_dict["n_best"]:
710
+ done_beams = done_beams[:input_dict["n_best_size"]]
711
+ for out_idx, done_beam in enumerate(done_beams):
712
+ seq = done_beam["seq"]
713
+ output["seq"][i][out_idx, :len(seq)] = seq
714
+ else:
715
+ seq = done_beams[0]["seq"]
716
+ output["seq"][i][:len(seq)] = seq
717
+
718
+ def diverse_beam_search(self, input_dict):
719
+
720
+ def add_diversity(seq_table, logprob, t, divm, diversity_lambda, bdash):
721
+ local_time = t - divm
722
+ unaug_logprob = logprob.clone()
723
+
724
+ if divm > 0:
725
+ change = torch.zeros(logprob.size(-1))
726
+ for prev_choice in range(divm):
727
+ prev_decisions = seq_table[prev_choice][..., local_time]
728
+ for prev_labels in range(bdash):
729
+ change.scatter_add_(0, prev_decisions[prev_labels], change.new_ones(1))
730
+
731
+ change = change.to(logprob.device)
732
+ logprob = logprob - repeat_tensor(change, bdash) * diversity_lambda
733
+
734
+ return logprob, unaug_logprob
735
+
736
+ output = self.prepare_output(input_dict)
737
+ group_size = input_dict["group_size"]
738
+ batch_size = output["seq"].size(0)
739
+ beam_size = input_dict["beam_size"]
740
+ bdash = beam_size // group_size
741
+ input_dict["bdash"] = bdash
742
+ diversity_lambda = input_dict["diversity_lambda"]
743
+ device = input_dict["fc_emb"].device
744
+ max_length = input_dict["max_length"]
745
+ temp = input_dict["temp"]
746
+ group_nbest = input_dict["group_nbest"]
747
+ batch_size, max_length = output["seq"].size()
748
+ if group_nbest:
749
+ output["seq"] = torch.full((batch_size, beam_size, max_length),
750
+ self.end_idx, dtype=torch.long)
751
+ else:
752
+ output["seq"] = torch.full((batch_size, group_size, max_length),
753
+ self.end_idx, dtype=torch.long)
754
+
755
+
756
+ for i in range(batch_size):
757
+ input_dict["sample_idx"] = i
758
+ seq_table = [torch.LongTensor(bdash, 0) for _ in range(group_size)] # group_size x [bdash, 0]
759
+ logprob_table = [torch.zeros(bdash).to(device) for _ in range(group_size)]
760
+ done_beams_table = [[] for _ in range(group_size)]
761
+
762
+ output_i = {
763
+ "prev_words_beam": [None for _ in range(group_size)],
764
+ "next_word": [None for _ in range(group_size)],
765
+ "state": [None for _ in range(group_size)]
766
+ }
767
+
768
+ for t in range(max_length + group_size - 1):
769
+ input_dict["t"] = t
770
+ for divm in range(group_size):
771
+ input_dict["divm"] = divm
772
+ if t >= divm and t <= max_length + divm - 1:
773
+ local_time = t - divm
774
+ decoder_input = self.prepare_dbs_decoder_input(input_dict, output_i)
775
+ output_t = self.decoder(decoder_input)
776
+ output_t["divm"] = divm
777
+ logit_t = output_t["logit"]
778
+ if logit_t.size(1) == 1:
779
+ logit_t = logit_t.squeeze(1)
780
+ elif logit_t.size(1) > 1:
781
+ logit_t = logit_t[:, -1, :]
782
+ else:
783
+ raise Exception("no logit output")
784
+ logprob_t = torch.log_softmax(logit_t, dim=1)
785
+ logprob_t = torch.log_softmax(logprob_t / temp, dim=1)
786
+ logprob_t, unaug_logprob_t = add_diversity(seq_table, logprob_t, t, divm, diversity_lambda, bdash)
787
+ logprob_t = logprob_table[divm].unsqueeze(-1) + logprob_t
788
+ if local_time == 0: # for the first step, all k seq will have the same probs
789
+ topk_logprob, topk_words = logprob_t[0].topk(
790
+ bdash, 0, True, True)
791
+ else: # unroll and find top logprob, and their unrolled indices
792
+ topk_logprob, topk_words = logprob_t.view(-1).topk(
793
+ bdash, 0, True, True)
794
+ topk_words = topk_words.cpu()
795
+ logprob_table[divm] = topk_logprob
796
+ output_i["prev_words_beam"][divm] = topk_words // self.vocab_size # [bdash,]
797
+ output_i["next_word"][divm] = topk_words % self.vocab_size # [bdash,]
798
+ if local_time > 0:
799
+ seq_table[divm] = seq_table[divm][output_i["prev_words_beam"][divm]]
800
+ seq_table[divm] = torch.cat([
801
+ seq_table[divm],
802
+ output_i["next_word"][divm].unsqueeze(-1)], -1)
803
+
804
+ is_end = seq_table[divm][:, t-divm] == self.end_idx
805
+ assert seq_table[divm].shape[-1] == t - divm + 1
806
+ if t == max_length + divm - 1:
807
+ is_end.fill_(1)
808
+ for beam_idx in range(bdash):
809
+ if is_end[beam_idx]:
810
+ final_beam = {
811
+ "seq": seq_table[divm][beam_idx].clone(),
812
+ "score": logprob_table[divm][beam_idx].item()
813
+ }
814
+ final_beam["score"] = final_beam["score"] / (t - divm + 1)
815
+ done_beams_table[divm].append(final_beam)
816
+ logprob_table[divm][is_end] -= 1000
817
+ self.dbs_process_step(output_i, output_t)
818
+ done_beams_table = [sorted(done_beams_table[divm], key=lambda x: -x["score"])[:bdash] for divm in range(group_size)]
819
+ if group_nbest:
820
+ done_beams = sum(done_beams_table, [])
821
+ else:
822
+ done_beams = [group_beam[0] for group_beam in done_beams_table]
823
+ for _, done_beam in enumerate(done_beams):
824
+ output["seq"][i, _, :len(done_beam["seq"])] = done_beam["seq"]
825
+
826
+ return output
827
+
828
+ def prepare_dbs_decoder_input(self, input_dict, output_i):
829
+ raise NotImplementedError
830
+
831
+ def dbs_process_step(self, output_i, output_t):
832
+ pass
833
+
834
+
835
+ class TransformerModel(CaptionModel):
836
+
837
+ def __init__(self, encoder: nn.Module, decoder: nn.Module, **kwargs):
838
+ if not hasattr(self, "compatible_decoders"):
839
+ self.compatible_decoders = (
840
+ TransformerDecoder,
841
+ )
842
+ super().__init__(encoder, decoder, **kwargs)
843
+
844
+ def seq_forward(self, input_dict):
845
+ cap = input_dict["cap"]
846
+ cap_padding_mask = (cap == self.pad_idx).to(cap.device)
847
+ cap_padding_mask = cap_padding_mask[:, :-1]
848
+ output = self.decoder(
849
+ {
850
+ "word": cap[:, :-1],
851
+ "attn_emb": input_dict["attn_emb"],
852
+ "attn_emb_len": input_dict["attn_emb_len"],
853
+ "cap_padding_mask": cap_padding_mask
854
+ }
855
+ )
856
+ return output
857
+
858
+ def prepare_decoder_input(self, input_dict, output):
859
+ decoder_input = {
860
+ "attn_emb": input_dict["attn_emb"],
861
+ "attn_emb_len": input_dict["attn_emb_len"]
862
+ }
863
+ t = input_dict["t"]
864
+
865
+ ###############
866
+ # determine input word
867
+ ################
868
+ if input_dict["mode"] == "train" and random.random() < input_dict["ss_ratio"]: # training, scheduled sampling
869
+ word = input_dict["cap"][:, :t+1]
870
+ else:
871
+ start_word = torch.tensor([self.start_idx,] * input_dict["attn_emb"].size(0)).unsqueeze(1).long()
872
+ if t == 0:
873
+ word = start_word
874
+ else:
875
+ word = torch.cat((start_word, output["seq"][:, :t]), dim=-1)
876
+ # word: [N, T]
877
+ decoder_input["word"] = word
878
+
879
+ cap_padding_mask = (word == self.pad_idx).to(input_dict["attn_emb"].device)
880
+ decoder_input["cap_padding_mask"] = cap_padding_mask
881
+ return decoder_input
882
+
883
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
884
+ decoder_input = {}
885
+ t = input_dict["t"]
886
+ i = input_dict["sample_idx"]
887
+ beam_size = input_dict["beam_size"]
888
+ ###############
889
+ # prepare attn embeds
890
+ ################
891
+ if t == 0:
892
+ attn_emb = repeat_tensor(input_dict["attn_emb"][i], beam_size)
893
+ attn_emb_len = repeat_tensor(input_dict["attn_emb_len"][i], beam_size)
894
+ output_i["attn_emb"] = attn_emb
895
+ output_i["attn_emb_len"] = attn_emb_len
896
+ decoder_input["attn_emb"] = output_i["attn_emb"]
897
+ decoder_input["attn_emb_len"] = output_i["attn_emb_len"]
898
+ ###############
899
+ # determine input word
900
+ ################
901
+ start_word = torch.tensor([self.start_idx,] * beam_size).unsqueeze(1).long()
902
+ if t == 0:
903
+ word = start_word
904
+ else:
905
+ word = torch.cat((start_word, output_i["seq"]), dim=-1)
906
+ decoder_input["word"] = word
907
+ cap_padding_mask = (word == self.pad_idx).to(input_dict["attn_emb"].device)
908
+ decoder_input["cap_padding_mask"] = cap_padding_mask
909
+
910
+ return decoder_input
911
+
912
+
913
+ class BaseDecoder(nn.Module):
914
+ """
915
+ Take word/audio embeddings and output the next word probs
916
+ """
917
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim,
918
+ attn_emb_dim, dropout=0.2, tie_weights=False):
919
+ super().__init__()
920
+ self.emb_dim = emb_dim
921
+ self.vocab_size = vocab_size
922
+ self.fc_emb_dim = fc_emb_dim
923
+ self.attn_emb_dim = attn_emb_dim
924
+ self.tie_weights = tie_weights
925
+ self.word_embedding = nn.Embedding(vocab_size, emb_dim)
926
+ self.in_dropout = nn.Dropout(dropout)
927
+
928
+ def forward(self, x):
929
+ raise NotImplementedError
930
+
931
+ def load_word_embedding(self, weight, freeze=True):
932
+ embedding = np.load(weight)
933
+ assert embedding.shape[0] == self.vocab_size, "vocabulary size mismatch"
934
+ assert embedding.shape[1] == self.emb_dim, "embed size mismatch"
935
+
936
+ # embeddings = torch.as_tensor(embeddings).float()
937
+ # self.word_embeddings.weight = nn.Parameter(embeddings)
938
+ # for para in self.word_embeddings.parameters():
939
+ # para.requires_grad = tune
940
+ self.word_embedding = nn.Embedding.from_pretrained(embedding,
941
+ freeze=freeze)
942
+
943
+
944
+ class PositionalEncoding(nn.Module):
945
+
946
+ def __init__(self, d_model, dropout=0.1, max_len=100):
947
+ super(PositionalEncoding, self).__init__()
948
+ self.dropout = nn.Dropout(p=dropout)
949
+
950
+ pe = torch.zeros(max_len, d_model)
951
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
952
+ div_term = torch.exp(torch.arange(0, d_model, 2).float() * \
953
+ (-math.log(10000.0) / d_model))
954
+ pe[:, 0::2] = torch.sin(position * div_term)
955
+ pe[:, 1::2] = torch.cos(position * div_term)
956
+ pe = pe.unsqueeze(0).transpose(0, 1)
957
+ # self.register_buffer("pe", pe)
958
+ self.register_parameter("pe", nn.Parameter(pe, requires_grad=False))
959
+
960
+ def forward(self, x):
961
+ # x: [T, N, E]
962
+ x = x + self.pe[:x.size(0), :]
963
+ return self.dropout(x)
964
+
965
+
966
+ class TransformerDecoder(BaseDecoder):
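For reference, the table registered above is the standard sinusoidal positional encoding; in the code's notation, with pos the time step and d_model the embedding width:

PE(pos, 2i)   = \sin( pos / 10000^{2i / d_model} )
PE(pos, 2i+1) = \cos( pos / 10000^{2i / d_model} )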
967
+
968
+ def __init__(self,
969
+ emb_dim,
970
+ vocab_size,
971
+ fc_emb_dim,
972
+ attn_emb_dim,
973
+ dropout,
974
+ freeze=False,
975
+ tie_weights=False,
976
+ **kwargs):
977
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
978
+ dropout=dropout, tie_weights=tie_weights)
979
+ self.d_model = emb_dim
980
+ self.nhead = kwargs.get("nhead", self.d_model // 64)
981
+ self.nlayers = kwargs.get("nlayers", 2)
982
+ self.dim_feedforward = kwargs.get("dim_feedforward", self.d_model * 4)
983
+
984
+ self.pos_encoder = PositionalEncoding(self.d_model, dropout)
985
+ layer = nn.TransformerDecoderLayer(d_model=self.d_model,
986
+ nhead=self.nhead,
987
+ dim_feedforward=self.dim_feedforward,
988
+ dropout=dropout)
989
+ self.model = nn.TransformerDecoder(layer, self.nlayers)
990
+ self.classifier = nn.Linear(self.d_model, vocab_size, bias=False)
991
+ if tie_weights:
992
+ self.classifier.weight = self.word_embedding.weight
993
+ self.attn_proj = nn.Sequential(
994
+ nn.Linear(self.attn_emb_dim, self.d_model),
995
+ nn.ReLU(),
996
+ nn.Dropout(dropout),
997
+ nn.LayerNorm(self.d_model)
998
+ )
999
+ self.init_params()
1000
+
1001
+ self.freeze = freeze
1002
+ if freeze:
1003
+ for p in self.parameters():
1004
+ p.requires_grad = False
1005
+
1006
+ def init_params(self):
1007
+ for p in self.parameters():
1008
+ if p.dim() > 1:
1009
+ nn.init.xavier_uniform_(p)
1010
+
1011
+ def load_pretrained(self, pretrained, output_fn):
1012
+ checkpoint = torch.load(pretrained, map_location="cpu")
1013
+
1014
+ if "model" in checkpoint:
1015
+ checkpoint = checkpoint["model"]
1016
+ if next(iter(checkpoint)).startswith("decoder."):
1017
+ state_dict = {}
1018
+ for k, v in checkpoint.items():
1019
+ state_dict[k[8:]] = v
1020
+
1021
+ loaded_keys = merge_load_state_dict(state_dict, self, output_fn)
1022
+ if self.freeze:
1023
+ for name, param in self.named_parameters():
1024
+ if name in loaded_keys:
1025
+ param.requires_grad = False
1026
+ else:
1027
+ param.requires_grad = True
1028
+
1029
+
1030
+ def generate_square_subsequent_mask(self, max_length):
1031
+ mask = (torch.triu(torch.ones(max_length, max_length)) == 1).transpose(0, 1)
1032
+ mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
1033
+ return mask
1034
+
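Illustration only: for max_length = 3 the mask built above is the usual causal mask, where 0.0 means the position may be attended to and -inf blocks attention to future positions (decoder here is a hypothetical TransformerDecoder instance):

mask = decoder.generate_square_subsequent_mask(3)
# tensor([[0., -inf, -inf],
#         [0., 0., -inf],
#         [0., 0., 0.]])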
1035
+ def forward(self, input_dict):
1036
+ word = input_dict["word"]
1037
+ attn_emb = input_dict["attn_emb"]
1038
+ attn_emb_len = input_dict["attn_emb_len"]
1039
+ cap_padding_mask = input_dict["cap_padding_mask"]
1040
+
1041
+ p_attn_emb = self.attn_proj(attn_emb)
1042
+ p_attn_emb = p_attn_emb.transpose(0, 1) # [T_src, N, emb_dim]
1043
+ word = word.to(attn_emb.device)
1044
+ embed = self.in_dropout(self.word_embedding(word)) * math.sqrt(self.emb_dim) # [N, T, emb_dim]
1045
+ embed = embed.transpose(0, 1) # [T, N, emb_dim]
1046
+ embed = self.pos_encoder(embed)
1047
+
1048
+ tgt_mask = self.generate_square_subsequent_mask(embed.size(0)).to(attn_emb.device)
1049
+ memory_key_padding_mask = ~generate_length_mask(attn_emb_len, attn_emb.size(1)).to(attn_emb.device)
1050
+ output = self.model(embed, p_attn_emb, tgt_mask=tgt_mask,
1051
+ tgt_key_padding_mask=cap_padding_mask,
1052
+ memory_key_padding_mask=memory_key_padding_mask)
1053
+ output = output.transpose(0, 1)
1054
+ output = {
1055
+ "embed": output,
1056
+ "logit": self.classifier(output),
1057
+ }
1058
+ return output
1059
+
1060
+
1061
+ class ContraEncoderKdWrapper(nn.Module, CaptionMetaMixin):
1062
+
1063
+ def __init__(self,
1064
+ model: nn.Module,
1065
+ shared_dim: int,
1066
+ tchr_dim: int,
1067
+ ):
1068
+ super().__init__()
1069
+ self.model = model
1070
+ self.tchr_dim = tchr_dim
1071
+ if hasattr(model, "encoder"):
1072
+ self.stdnt_proj = nn.Linear(model.encoder.fc_emb_size,
1073
+ shared_dim)
1074
+ else:
1075
+ self.stdnt_proj = nn.Linear(model.fc_emb_size,
1076
+ shared_dim)
1077
+ self.tchr_proj = nn.Linear(tchr_dim, shared_dim)
1078
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
1079
+
1080
+ def forward(self, input_dict: Dict):
1081
+ unsup = input_dict.get("unsup", False)
1082
+ if unsup is False:
1083
+ output_dict = self.model(input_dict)
1084
+ else:
1085
+ output_dict = self.model.encoder(input_dict)
1086
+ if "tchr_output" in input_dict:
1087
+ stdnt_emb = output_dict["fc_emb"]
1088
+ stdnt_emb = self.stdnt_proj(stdnt_emb)
1089
+ tchr_emb = input_dict["tchr_output"]["embedding"]
1090
+ thcr_emb = self.tchr_proj(tchr_emb)
1091
+
1092
+ stdnt_emb = F.normalize(stdnt_emb, dim=-1)
1093
+ thcr_emb = F.normalize(thcr_emb, dim=-1)
1094
+
1095
+ unscaled_logit = stdnt_emb @ thcr_emb.transpose(0, 1)
1096
+ logit = self.logit_scale * unscaled_logit
1097
+ label = torch.arange(logit.shape[0]).to(logit.device)
1098
+ loss1 = F.cross_entropy(logit, label)
1099
+ loss2 = F.cross_entropy(logit.transpose(0, 1), label)
1100
+ loss = (loss1 + loss2) / 2
1101
+ output_dict["enc_kd_loss"] = loss
1102
+ return output_dict
1103
+
1104
+
1105
+ class Effb2TrmConfig(PretrainedConfig):
1106
+
1107
+ def __init__(
1108
+ self,
1109
+ sample_rate: int = 16000,
1110
+ tchr_dim: int = 768,
1111
+ shared_dim: int = 1024,
1112
+ fc_emb_dim: int = 1408,
1113
+ attn_emb_dim: int = 1408,
1114
+ decoder_n_layers: int = 2,
1115
+ decoder_we_tie_weights: bool = True,
1116
+ decoder_emb_dim: int = 256,
1117
+ decoder_dropout: float = 0.2,
1118
+ vocab_size: int = 4981,
1119
+ **kwargs
1120
+ ):
1121
+ self.sample_rate = sample_rate
1122
+ self.tchr_dim = tchr_dim
1123
+ self.shared_dim = shared_dim
1124
+ self.fc_emb_dim = fc_emb_dim
1125
+ self.attn_emb_dim = attn_emb_dim
1126
+ self.decoder_n_layers = decoder_n_layers
1127
+ self.decoder_we_tie_weights = decoder_we_tie_weights
1128
+ self.decoder_emb_dim = decoder_emb_dim
1129
+ self.decoder_dropout = decoder_dropout
1130
+ self.vocab_size = vocab_size
1131
+ super().__init__(**kwargs)
1132
+
1133
+
1134
+ class Effb2TrmCaptioningModel(PreTrainedModel):
1135
+ config_class = Effb2TrmConfig
1136
+
1137
+ def __init__(self, config):
1138
+ super().__init__(config)
1139
+ encoder = EfficientNetB2()
1140
+ decoder = TransformerDecoder(
1141
+ emb_dim=config.decoder_emb_dim,
1142
+ vocab_size=config.vocab_size,
1143
+ fc_emb_dim=config.fc_emb_dim,
1144
+ attn_emb_dim=config.attn_emb_dim,
1145
+ dropout=config.decoder_dropout,
1146
+ nlayers=config.decoder_n_layers,
1147
+ tie_weights=config.decoder_we_tie_weights
1148
+ )
1149
+ model = TransformerModel(encoder, decoder)
1150
+ self.model = ContraEncoderKdWrapper(model, config.shared_dim, config.tchr_dim)
1151
+
1152
+ def forward(self,
1153
+ audio: torch.Tensor,
1154
+ audio_length: Union[List, np.ndarray, torch.Tensor],
1155
+ sample_method: str = "beam",
1156
+ beam_size: int = 3,
1157
+ max_length: int = 20,
1158
+ temp: float = 1.0,):
1159
+ device = self.device
1160
+ input_dict = {
1161
+ "wav": audio.to(device),
1162
+ "wav_len": audio_length,
1163
+ "specaug": False,
1164
+ "mode": "inference",
1165
+ "sample_method": sample_method,
1166
+ "max_length": max_length,
1167
+ "temp": temp,
1168
+ }
1169
+ if sample_method == "beam":
1170
+ input_dict["beam_size"] = beam_size
1171
+ return self.model(input_dict)["seq"].cpu()
1172
+
1173
+
1174
+ class ConvBlock(nn.Module):
1175
+
1176
+ def __init__(self, in_channels, out_channels):
1177
+
1178
+ super(ConvBlock, self).__init__()
1179
+
1180
+ self.conv1 = nn.Conv2d(in_channels=in_channels,
1181
+ out_channels=out_channels,
1182
+ kernel_size=(3, 3), stride=(1, 1),
1183
+ padding=(1, 1), bias=False)
1184
+
1185
+ self.conv2 = nn.Conv2d(in_channels=out_channels,
1186
+ out_channels=out_channels,
1187
+ kernel_size=(3, 3), stride=(1, 1),
1188
+ padding=(1, 1), bias=False)
1189
+
1190
+ self.bn1 = nn.BatchNorm2d(out_channels)
1191
+ self.bn2 = nn.BatchNorm2d(out_channels)
1192
+
1193
+ def forward(self, input, pool_size=(2, 2), pool_type='avg'):
1194
+
1195
+ x = input
1196
+ x = F.relu_(self.bn1(self.conv1(x)))
1197
+ x = F.relu_(self.bn2(self.conv2(x)))
1198
+ if pool_type == 'max':
1199
+ x = F.max_pool2d(x, kernel_size=pool_size)
1200
+ elif pool_type == 'avg':
1201
+ x = F.avg_pool2d(x, kernel_size=pool_size)
1202
+ elif pool_type == 'avg+max':
1203
+ x1 = F.avg_pool2d(x, kernel_size=pool_size)
1204
+ x2 = F.max_pool2d(x, kernel_size=pool_size)
1205
+ x = x1 + x2
1206
+ else:
1207
+ raise Exception('Incorrect argument!')
1208
+
1209
+ return x
1210
+
1211
+
1212
+ class Cnn14Encoder(nn.Module):
1213
+
1214
+ def __init__(self, sample_rate=32000):
1215
+ super().__init__()
1216
+ sr_to_fmax = {
1217
+ 32000: 14000,
1218
+ 16000: 8000
1219
+ }
1220
+ # Logmel spectrogram extractor
1221
+ self.melspec_extractor = transforms.MelSpectrogram(
1222
+ sample_rate=sample_rate,
1223
+ n_fft=32 * sample_rate // 1000,
1224
+ win_length=32 * sample_rate // 1000,
1225
+ hop_length=10 * sample_rate // 1000,
1226
+ f_min=50,
1227
+ f_max=sr_to_fmax[sample_rate],
1228
+ n_mels=64,
1229
+ norm="slaney",
1230
+ mel_scale="slaney"
1231
+ )
1232
+ self.hop_length = 10 * sample_rate // 1000
1233
+ self.db_transform = transforms.AmplitudeToDB()
1234
+
1235
+ self.bn0 = nn.BatchNorm2d(64)
1236
+
1237
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1238
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1239
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1240
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1241
+ self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
1242
+ self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
1243
+
1244
+ self.downsample_ratio = 32
1245
+
1246
+ self.fc1 = nn.Linear(2048, 2048, bias=True)
1247
+ self.fc_emb_size = 2048
1248
+
1249
+ def forward(self, input_dict):
1250
+ lms = input_dict["lms"]
1251
+ wave_length = input_dict["wav_len"]
1252
+
1253
+ x = lms # (batch_size, mel_bins, time_steps)
1254
+ x = x.transpose(1, 2)
1255
+ x = x.unsqueeze(1) # (batch_size, 1, time_steps, mel_bins)
1256
+
1257
+ x = x.transpose(1, 3)
1258
+ x = self.bn0(x)
1259
+ x = x.transpose(1, 3)
1260
+
1261
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
1262
+ x = F.dropout(x, p=0.2, training=self.training)
1263
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
1264
+ x = F.dropout(x, p=0.2, training=self.training)
1265
+ x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
1266
+ x = F.dropout(x, p=0.2, training=self.training)
1267
+ x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
1268
+ x = F.dropout(x, p=0.2, training=self.training)
1269
+ x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
1270
+ x = F.dropout(x, p=0.2, training=self.training)
1271
+ x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
1272
+ x = F.dropout(x, p=0.2, training=self.training)
1273
+ x = torch.mean(x, dim=3)
1274
+ attn_emb = x.transpose(1, 2)
1275
+
1276
+ wave_length = torch.as_tensor(wave_length)
1277
+ feat_length = torch.div(wave_length, self.hop_length,
1278
+ rounding_mode="floor") + 1
1279
+ feat_length = torch.div(feat_length, self.downsample_ratio,
1280
+ rounding_mode="floor")
1281
+ x_max = max_with_lens(attn_emb, feat_length)
1282
+ x_mean = mean_with_lens(attn_emb, feat_length)
1283
+ x = x_max + x_mean
1284
+ x = F.dropout(x, p=0.5, training=self.training)
1285
+ x = F.relu_(self.fc1(x))
1286
+ fc_emb = F.dropout(x, p=0.5, training=self.training)
1287
+
1288
+ output_dict = {
1289
+ 'fc_emb': fc_emb,
1290
+ 'attn_emb': attn_emb,
1291
+ 'attn_emb_len': feat_length
1292
+ }
1293
+
1294
+ return output_dict
1295
+
1296
+
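The attn_emb_len computed above follows from the 10 ms hop and the 32x time downsampling of the five stride-2 pooled blocks; a worked example for 10 s of 32 kHz audio:

hop_length = 10 * 32000 // 1000       # 320 samples per log-mel frame
n_frames = 320000 // hop_length + 1   # 1001 frames
attn_len = n_frames // 32             # 31 attention time steps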
1297
+ class RnnEncoder(nn.Module):
1298
+
1299
+ def __init__(self,
1300
+ attn_feat_dim,
1301
+ pooling="mean",
1302
+ **kwargs):
1303
+ super().__init__()
1304
+ self.pooling = pooling
1305
+ self.hidden_size = kwargs.get('hidden_size', 512)
1306
+ self.bidirectional = kwargs.get('bidirectional', False)
1307
+ self.num_layers = kwargs.get('num_layers', 1)
1308
+ self.dropout = kwargs.get('dropout', 0.2)
1309
+ self.rnn_type = kwargs.get('rnn_type', "GRU")
1310
+ self.in_bn = kwargs.get('in_bn', False)
1311
+ self.embed_dim = self.hidden_size * (self.bidirectional + 1)
1312
+ self.network = getattr(nn, self.rnn_type)(
1313
+ attn_feat_dim,
1314
+ self.hidden_size,
1315
+ num_layers=self.num_layers,
1316
+ bidirectional=self.bidirectional,
1317
+ dropout=self.dropout,
1318
+ batch_first=True)
1319
+ if self.in_bn:
1320
+ self.bn = nn.BatchNorm1d(self.embed_dim)
1321
+
1322
+ def forward(self, input_dict):
1323
+ x = input_dict["attn"]
1324
+ lens = input_dict["attn_len"]
1325
+ lens = torch.as_tensor(lens)
1326
+ # x: [N, T, E]
1327
+ if self.in_bn:
1328
+ x = pack_wrapper(self.bn, x, lens)
1329
+ out = pack_wrapper(self.network, x, lens)
1330
+ # out: [N, T, hidden]
1331
+ attn_emb = out
1332
+ fc_emb = embedding_pooling(out, lens, self.pooling)
1333
+ return {
1334
+ "attn_emb": attn_emb,
1335
+ "fc_emb": fc_emb,
1336
+ "attn_emb_len": lens
1337
+ }
1338
+
1339
+
1340
+ class Cnn14RnnEncoder(nn.Module):
1341
+
1342
+ def __init__(self,
1343
+ sample_rate,
1344
+ rnn_bidirectional,
1345
+ rnn_hidden_size,
1346
+ rnn_dropout,
1347
+ rnn_num_layers):
1348
+ super().__init__()
1349
+ self.cnn = Cnn14Encoder(sample_rate=sample_rate)
1350
+ self.rnn = RnnEncoder(
1351
+ 2048,
1352
+ bidirectional=rnn_bidirectional,
1353
+ hidden_size=rnn_hidden_size,
1354
+ dropout=rnn_dropout,
1355
+ num_layers=rnn_num_layers,
1356
+ )
1357
+
1358
+ def forward(self, input_dict):
1359
+ output_dict = self.cnn(input_dict)
1360
+ output_dict["attn"] = output_dict["attn_emb"]
1361
+ output_dict["attn_len"] = output_dict["attn_emb_len"]
1362
+ del output_dict["attn_emb"], output_dict["attn_emb_len"]
1363
+ output_dict = self.rnn(output_dict)
1364
+ return output_dict
1365
+
1366
+
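A dimensionality sketch for the combined encoder: with a bidirectional GRU of hidden size 256, each output frame embedding has size 512, which matches attn_emb_dim in the config further below.

enc = Cnn14RnnEncoder(sample_rate=32000, rnn_bidirectional=True,
                      rnn_hidden_size=256, rnn_dropout=0.5, rnn_num_layers=3)
print(enc.rnn.embed_dim)   # 512 = 256 * 2 directions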
1367
+ class Seq2SeqAttention(nn.Module):
1368
+
1369
+ def __init__(self, hs_enc, hs_dec, attn_size):
1370
+ """
1371
+ Args:
1372
+ hs_enc: encoder hidden size
1373
+ hs_dec: decoder hidden size
1374
+ attn_size: attention vector size
1375
+ """
1376
+ super(Seq2SeqAttention, self).__init__()
1377
+ self.h2attn = nn.Linear(hs_enc + hs_dec, attn_size)
1378
+ self.v = nn.Parameter(torch.randn(attn_size))
1379
+
1380
+ def forward(self, h_dec, h_enc, src_lens):
1381
+ """
1382
+ Args:
1383
+ h_dec: decoder hidden (query), [N, hs_dec]
1384
+ h_enc: encoder memory (key/value), [N, src_max_len, hs_enc]
1385
+ src_lens: source (encoder memory) lengths, [N, ]
1386
+ """
1387
+ N = h_enc.size(0)
1388
+ src_max_len = h_enc.size(1)
1389
+ h_dec = h_dec.unsqueeze(1).repeat(1, src_max_len, 1) # [N, src_max_len, hs_dec]
1390
+
1391
+ attn_input = torch.cat((h_dec, h_enc), dim=-1)
1392
+ attn_out = torch.tanh(self.h2attn(attn_input)) # [N, src_max_len, attn_size]
1393
+
1394
+ v = self.v.repeat(N, 1).unsqueeze(1) # [N, 1, attn_size]
1395
+ score = torch.bmm(v, attn_out.transpose(1, 2)).squeeze(1) # [N, src_max_len]
1396
+
1397
+ idxs = torch.arange(src_max_len).repeat(N).view(N, src_max_len)
1398
+ mask = (idxs < src_lens.view(-1, 1)).to(h_dec.device)
1399
+
1400
+ score = score.masked_fill(mask == 0, -1e10)
1401
+ weights = torch.softmax(score, dim=-1) # [N, src_max_len]
1402
+ ctx = torch.bmm(weights.unsqueeze(1), h_enc).squeeze(1) # [N, hs_enc]
1403
+
1404
+ return ctx, weights
1405
+
1406
+
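An illustrative shape check for the additive attention above: a query of size hs_dec attends over a padded memory whose per-sample lengths are given by src_lens.

import torch

attn = Seq2SeqAttention(hs_enc=512, hs_dec=512, attn_size=512)
h_dec = torch.randn(4, 512)               # decoder query
h_enc = torch.randn(4, 7, 512)            # encoder memory, padded to length 7
src_lens = torch.tensor([7, 5, 3, 7])
ctx, weights = attn(h_dec, h_enc, src_lens)
print(ctx.shape, weights.shape)           # torch.Size([4, 512]) torch.Size([4, 7])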
1407
+ class RnnDecoder(BaseDecoder):
1408
+
1409
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
1410
+ dropout, d_model, **kwargs):
1411
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
1412
+ dropout,)
1413
+ self.d_model = d_model
1414
+ self.num_layers = kwargs.get('num_layers', 1)
1415
+ self.bidirectional = kwargs.get('bidirectional', False)
1416
+ self.rnn_type = kwargs.get('rnn_type', "GRU")
1417
+ self.classifier = nn.Linear(
1418
+ self.d_model * (self.bidirectional + 1), vocab_size)
1419
+
1420
+ def forward(self, x):
1421
+ raise NotImplementedError
1422
+
1423
+ def init_hidden(self, bs, device):
1424
+ num_dire = self.bidirectional + 1
1425
+ n_layer = self.num_layers
1426
+ hid_dim = self.d_model
1427
+ if self.rnn_type == "LSTM":
1428
+ return (torch.zeros(num_dire * n_layer, bs, hid_dim).to(device),
1429
+ torch.zeros(num_dire * n_layer, bs, hid_dim).to(device))
1430
+ else:
1431
+ return torch.zeros(num_dire * n_layer, bs, hid_dim).to(device)
1432
+
1433
+
1434
+ class BahAttnCatFcDecoder(RnnDecoder):
1435
+
1436
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
1437
+ dropout, d_model, **kwargs):
1438
+ """
1439
+         Concatenate the fc embedding, attention context, and word embedding as the RNN input.
1440
+ """
1441
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
1442
+ dropout, d_model, **kwargs)
1443
+ attn_size = kwargs.get("attn_size", self.d_model)
1444
+ self.model = getattr(nn, self.rnn_type)(
1445
+ input_size=self.emb_dim * 3,
1446
+ hidden_size=self.d_model,
1447
+ batch_first=True,
1448
+ num_layers=self.num_layers,
1449
+ bidirectional=self.bidirectional)
1450
+ self.attn = Seq2SeqAttention(self.attn_emb_dim,
1451
+ self.d_model * (self.bidirectional + 1) * \
1452
+ self.num_layers,
1453
+ attn_size)
1454
+ self.fc_proj = nn.Linear(self.fc_emb_dim, self.emb_dim)
1455
+ self.ctx_proj = nn.Linear(self.attn_emb_dim, self.emb_dim)
1456
+
1457
+ def forward(self, input_dict):
1458
+ word = input_dict["word"]
1459
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
1460
+ fc_emb = input_dict["fc_emb"]
1461
+ attn_emb = input_dict["attn_emb"]
1462
+ attn_emb_len = input_dict["attn_emb_len"]
1463
+
1464
+ word = word.to(fc_emb.device)
1465
+ embed = self.in_dropout(self.word_embedding(word))
1466
+
1467
+ # embed: [N, 1, embed_size]
1468
+ if state is None:
1469
+ state = self.init_hidden(word.size(0), fc_emb.device)
1470
+ if self.rnn_type == "LSTM":
1471
+ query = state[0].transpose(0, 1).flatten(1)
1472
+ else:
1473
+ query = state.transpose(0, 1).flatten(1)
1474
+ c, attn_weight = self.attn(query, attn_emb, attn_emb_len)
1475
+
1476
+ p_fc_emb = self.fc_proj(fc_emb)
1477
+ p_ctx = self.ctx_proj(c)
1478
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), p_fc_emb.unsqueeze(1)),
1479
+ dim=-1)
1480
+
1481
+ out, state = self.model(rnn_input, state)
1482
+
1483
+ output = {
1484
+ "state": state,
1485
+ "embed": out,
1486
+ "logit": self.classifier(out),
1487
+ "attn_weight": attn_weight
1488
+ }
1489
+ return output
1490
+
1491
+
1492
+ class TemporalBahAttnDecoder(BahAttnCatFcDecoder):
1493
+
1494
+ def __init__(self, emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
1495
+ dropout, d_model, **kwargs):
1496
+ """
1497
+         Same as BahAttnCatFcDecoder, but the first decoding step is fed a temporal tag embedding.
1498
+ """
1499
+ super().__init__(emb_dim, vocab_size, fc_emb_dim, attn_emb_dim,
1500
+ dropout, d_model, **kwargs)
1501
+ self.temporal_embedding = nn.Embedding(4, emb_dim)
1502
+
1503
+ def forward(self, input_dict):
1504
+ word = input_dict["word"]
1505
+ state = input_dict.get("state", None) # [n_layer * n_dire, bs, d_model]
1506
+ fc_embs = input_dict["fc_emb"]
1507
+ attn_embs = input_dict["attn_emb"]
1508
+ attn_emb_lens = input_dict["attn_emb_len"]
1509
+ temporal_tag = input_dict["temporal_tag"]
1510
+
1511
+ if input_dict["t"] == 0:
1512
+ embed = self.in_dropout(
1513
+ self.temporal_embedding(temporal_tag)).unsqueeze(1)
1514
+ elif word.size(-1) == self.fc_emb_dim: # fc_embs
1515
+ embed = word.unsqueeze(1)
1516
+ elif word.size(-1) == 1: # word
1517
+ word = word.to(fc_embs.device)
1518
+ embed = self.in_dropout(self.word_embedding(word))
1519
+ else:
1520
+             raise ValueError(f"Unexpected word input size: {word.size()}")
1521
+
1522
+ # embed: [N, 1, embed_size]
1523
+ if state is None:
1524
+ state = self.init_hidden(word.size(0), fc_embs.device)
1525
+ if self.rnn_type == "LSTM":
1526
+ query = state[0].transpose(0, 1).flatten(1)
1527
+ else:
1528
+ query = state.transpose(0, 1).flatten(1)
1529
+ c, attn_weight = self.attn(query, attn_embs, attn_emb_lens)
1530
+
1531
+ p_ctx = self.ctx_proj(c)
1532
+ p_fc_embs = self.fc_proj(fc_embs)
1534
+ rnn_input = torch.cat((embed, p_ctx.unsqueeze(1), p_fc_embs.unsqueeze(1)), dim=-1)
1535
+
1536
+ out, state = self.model(rnn_input, state)
1537
+
1538
+ output = {
1539
+ "state": state,
1540
+ "embed": out,
1541
+ "logit": self.classifier(out),
1542
+ "attn_weight": attn_weight
1543
+ }
1544
+ return output
1545
+
1546
+
1547
+ class Seq2SeqAttnModel(CaptionModel):
1548
+
1549
+ def __init__(self, encoder, decoder, **kwargs):
1550
+ if not hasattr(self, "compatible_decoders"):
1551
+ self.compatible_decoders = (
1552
+ BahAttnCatFcDecoder,
1553
+ )
1554
+ super().__init__(encoder, decoder, **kwargs)
1555
+
1556
+
1557
+ def seq_forward(self, input_dict):
1558
+         # Bahdanau attention only supports step-by-step decoding, so forward runs
1559
+         # step by step in both training and evaluation
1560
+ return self.stepwise_forward(input_dict)
1561
+
1562
+ def prepare_output(self, input_dict):
1563
+ output = super().prepare_output(input_dict)
1564
+ attn_weight = torch.empty(output["seq"].size(0),
1565
+ input_dict["attn_emb"].size(1), output["seq"].size(1))
1566
+ output["attn_weight"] = attn_weight
1567
+ return output
1568
+
1569
+ def prepare_decoder_input(self, input_dict, output):
1570
+ decoder_input = {
1571
+ "fc_emb": input_dict["fc_emb"],
1572
+ "attn_emb": input_dict["attn_emb"],
1573
+ "attn_emb_len": input_dict["attn_emb_len"]
1574
+ }
1575
+ t = input_dict["t"]
1576
+ ###############
1577
+ # determine input word
1578
+ ################
1579
+ if input_dict["mode"] == "train" and random.random() < input_dict["ss_ratio"]: # training, scheduled sampling
1580
+ word = input_dict["cap"][:, t]
1581
+ else:
1582
+ if t == 0:
1583
+ word = torch.tensor([self.start_idx,] * input_dict["fc_emb"].size(0)).long()
1584
+ else:
1585
+ word = output["seq"][:, t-1]
1586
+ # word: [N,]
1587
+ decoder_input["word"] = word.unsqueeze(1)
1588
+
1589
+ ################
1590
+ # prepare rnn state
1591
+ ################
1592
+ if t > 0:
1593
+ decoder_input["state"] = output["state"]
1594
+ return decoder_input
1595
+
1596
+ def stepwise_process_step(self, output, output_t):
1597
+ super().stepwise_process_step(output, output_t)
1598
+ output["state"] = output_t["state"]
1599
+ t = output_t["t"]
1600
+ output["attn_weight"][:, :, t] = output_t["attn_weight"]
1601
+
1602
+ def prepare_beamsearch_output(self, input_dict):
1603
+ output = super().prepare_beamsearch_output(input_dict)
1604
+ beam_size = input_dict["beam_size"]
1605
+ max_length = input_dict["max_length"]
1606
+ output["attn_weight"] = torch.empty(beam_size,
1607
+ max(input_dict["attn_emb_len"]), max_length)
1608
+ return output
1609
+
1610
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
1611
+ decoder_input = {}
1612
+ t = input_dict["t"]
1613
+ i = input_dict["sample_idx"]
1614
+ beam_size = input_dict["beam_size"]
1615
+ ###############
1616
+ # prepare fc embeds
1617
+ ################
1618
+ if t == 0:
1619
+ fc_emb = repeat_tensor(input_dict["fc_emb"][i], beam_size)
1620
+ output_i["fc_emb"] = fc_emb
1621
+ decoder_input["fc_emb"] = output_i["fc_emb"]
1622
+
1623
+ ###############
1624
+ # prepare attn embeds
1625
+ ################
1626
+ if t == 0:
1627
+ attn_emb = repeat_tensor(input_dict["attn_emb"][i], beam_size)
1628
+ attn_emb_len = repeat_tensor(input_dict["attn_emb_len"][i], beam_size)
1629
+ output_i["attn_emb"] = attn_emb
1630
+ output_i["attn_emb_len"] = attn_emb_len
1631
+ decoder_input["attn_emb"] = output_i["attn_emb"]
1632
+ decoder_input["attn_emb_len"] = output_i["attn_emb_len"]
1633
+
1634
+ ###############
1635
+ # determine input word
1636
+ ################
1637
+ if t == 0:
1638
+ word = torch.tensor([self.start_idx,] * beam_size).long()
1639
+ else:
1640
+ word = output_i["next_word"]
1641
+ decoder_input["word"] = word.unsqueeze(1)
1642
+
1643
+ ################
1644
+ # prepare rnn state
1645
+ ################
1646
+ if t > 0:
1647
+ if self.decoder.rnn_type == "LSTM":
1648
+ decoder_input["state"] = (output_i["state"][0][:, output_i["prev_words_beam"], :].contiguous(),
1649
+ output_i["state"][1][:, output_i["prev_words_beam"], :].contiguous())
1650
+ else:
1651
+ decoder_input["state"] = output_i["state"][:, output_i["prev_words_beam"], :].contiguous()
1652
+
1653
+ return decoder_input
1654
+
1655
+ def beamsearch_process_step(self, output_i, output_t):
1656
+ t = output_t["t"]
1657
+ output_i["state"] = output_t["state"]
1658
+ output_i["attn_weight"][..., t] = output_t["attn_weight"]
1659
+ output_i["attn_weight"] = output_i["attn_weight"][output_i["prev_words_beam"], ...]
1660
+
1661
+ def beamsearch_process(self, output, output_i, input_dict):
1662
+ super().beamsearch_process(output, output_i, input_dict)
1663
+ i = input_dict["sample_idx"]
1664
+ output["attn_weight"][i] = output_i["attn_weight"][0]
1665
+
1666
+ def prepare_dbs_decoder_input(self, input_dict, output_i):
1667
+ decoder_input = {}
1668
+ t = input_dict["t"]
1669
+ i = input_dict["sample_idx"]
1670
+ bdash = input_dict["bdash"]
1671
+ divm = input_dict["divm"]
1672
+
1673
+ local_time = t - divm
1674
+ ###############
1675
+ # prepare fc embeds
1676
+ ################
1677
+         # repeat only at the first timestep to save memory
1678
+ if t == 0:
1679
+ fc_emb = repeat_tensor(input_dict["fc_emb"][i], bdash).unsqueeze(1)
1680
+ output_i["fc_emb"] = fc_emb
1681
+ decoder_input["fc_emb"] = output_i["fc_emb"]
1682
+
1683
+ ###############
1684
+ # prepare attn embeds
1685
+ ################
1686
+ if t == 0:
1687
+ attn_emb = repeat_tensor(input_dict["attn_emb"][i], bdash)
1688
+ attn_emb_len = repeat_tensor(input_dict["attn_emb_len"][i], bdash)
1689
+ output_i["attn_emb"] = attn_emb
1690
+ output_i["attn_emb_len"] = attn_emb_len
1691
+ decoder_input["attn_emb"] = output_i["attn_emb"]
1692
+ decoder_input["attn_emb_len"] = output_i["attn_emb_len"]
1693
+
1694
+ ###############
1695
+ # determine input word
1696
+ ################
1697
+ if local_time == 0:
1698
+ word = torch.tensor([self.start_idx,] * bdash).long()
1699
+ else:
1700
+ word = output_i["next_word"][divm]
1701
+ decoder_input["word"] = word.unsqueeze(1)
1702
+
1703
+ ################
1704
+ # prepare rnn state
1705
+ ################
1706
+ if local_time > 0:
1707
+ if self.decoder.rnn_type == "LSTM":
1708
+ decoder_input["state"] = (
1709
+ output_i["state"][0][divm][
1710
+ :, output_i["prev_words_beam"][divm], :].contiguous(),
1711
+ output_i["state"][1][divm][
1712
+ :, output_i["prev_words_beam"][divm], :].contiguous()
1713
+ )
1714
+ else:
1715
+ decoder_input["state"] = output_i["state"][divm][
1716
+ :, output_i["prev_words_beam"][divm], :].contiguous()
1717
+
1718
+ return decoder_input
1719
+
1720
+ def dbs_process_step(self, output_i, output_t):
1721
+ divm = output_t["divm"]
1722
+ output_i["state"][divm] = output_t["state"]
1723
+ # TODO attention weight
1724
+
1725
+
1726
+ class TemporalSeq2SeqAttnModel(Seq2SeqAttnModel):
1727
+
1728
+ def __init__(self, encoder, decoder, **kwargs):
1729
+ if not hasattr(self, "compatible_decoders"):
1730
+ self.compatible_decoders = (
1731
+ TemporalBahAttnDecoder,
1732
+ )
1733
+ super().__init__(encoder, decoder, **kwargs)
1734
+ self.train_forward_keys = ["cap", "cap_len", "ss_ratio", "temporal_tag"]
1735
+ self.inference_forward_keys = ["sample_method", "max_length", "temp", "temporal_tag"]
1736
+
1737
+
1738
+ def prepare_decoder_input(self, input_dict, output):
1739
+ decoder_input = super().prepare_decoder_input(input_dict, output)
1740
+ decoder_input["temporal_tag"] = input_dict["temporal_tag"]
1741
+ decoder_input["t"] = input_dict["t"]
1742
+
1743
+ return decoder_input
1744
+
1745
+
1746
+ def prepare_beamsearch_decoder_input(self, input_dict, output_i):
1747
+ decoder_input = super().prepare_beamsearch_decoder_input(input_dict, output_i)
1748
+ t = input_dict["t"]
1749
+ i = input_dict["sample_idx"]
1750
+ beam_size = input_dict["beam_size"]
1751
+ ###############
1752
+ # prepare temporal_tag
1753
+ ################
1754
+ if t == 0:
1755
+ temporal_tag = repeat_tensor(input_dict["temporal_tag"][i], beam_size)
1756
+ output_i["temporal_tag"] = temporal_tag
1757
+ decoder_input["temporal_tag"] = output_i["temporal_tag"]
1758
+ decoder_input["t"] = input_dict["t"]
1759
+
1760
+ return decoder_input
1761
+
1762
+ def prepare_dbs_decoder_input(self, input_dict, output_i):
1763
+         decoder_input = super().prepare_dbs_decoder_input(input_dict, output_i)
1764
+ t = input_dict["t"]
1765
+ i = input_dict["sample_idx"]
1766
+ bdash = input_dict["bdash"]
1767
+
1768
+ ###############
1769
+ # prepare temporal tag
1770
+ ################
1771
+         # repeat only at the first timestep to save memory
1772
+ if t == 0:
1773
+ temporal_tag = repeat_tensor(input_dict["temporal_tag"][i], bdash)
1774
+ output_i["temporal_tag"] = temporal_tag
1775
+ decoder_input["temporal_tag"] = output_i["temporal_tag"]
1776
+ decoder_input["t"] = input_dict["t"]
1777
+
1778
+ return decoder_input
1779
+
1780
+
1781
+ class Cnn8rnnSedModel(nn.Module):
1782
+ def __init__(self, classes_num):
1783
+
1784
+ super().__init__()
1785
+
1786
+ self.time_resolution = 0.01
1787
+         self.interpolate_ratio = 4  # Downsampling ratio of the segment-wise output
1788
+
1789
+ self.bn0 = nn.BatchNorm2d(64)
1790
+
1791
+ self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
1792
+ self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
1793
+ self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
1794
+ self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
1795
+
1796
+ self.fc1 = nn.Linear(512, 512, bias=True)
1797
+ self.rnn = nn.GRU(512, 256, bidirectional=True, batch_first=True)
1798
+ self.fc_audioset = nn.Linear(512, classes_num, bias=True)
1799
+
1800
+ def forward(self, lms):
1801
+ output = self.forward_prob(lms)
1802
+ framewise_output = output["framewise_output"].cpu().numpy()
1803
+ thresholded_predictions = double_threshold(
1804
+ framewise_output, 0.75, 0.25)
1805
+ decoded_tags = decode_with_timestamps(
1806
+ thresholded_predictions, self.time_resolution
1807
+ )
1808
+ return decoded_tags
1809
+
1810
+ def forward_prob(self, lms):
1811
+ """
1812
+ lms: (batch_size, mel_bins, time_steps)"""
1813
+
1814
+ x = lms
1815
+ x = x.transpose(1, 2)
1816
+ x = x.unsqueeze(1)
1817
+
1818
+ frames_num = x.shape[2]
1819
+
1820
+ x = x.transpose(1, 3)
1821
+ x = self.bn0(x)
1822
+ x = x.transpose(1, 3)
1823
+
1824
+ x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg+max')
1825
+ x = F.dropout(x, p=0.2, training=self.training)
1826
+ x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg+max')
1827
+ x = F.dropout(x, p=0.2, training=self.training)
1828
+ x = self.conv_block3(x, pool_size=(1, 2), pool_type='avg+max')
1829
+ x = F.dropout(x, p=0.2, training=self.training)
1830
+ x = self.conv_block4(x, pool_size=(1, 2), pool_type='avg+max')
1831
+         x = F.dropout(x, p=0.2, training=self.training)  # (batch_size, 512, time_steps / 4, mel_bins / 16)
1832
+ x = torch.mean(x, dim=3)
1833
+
1834
+ x = x.transpose(1, 2)
1835
+ x = F.dropout(x, p=0.5, training=self.training)
1836
+ x = F.relu_(self.fc1(x))
1837
+ x, _ = self.rnn(x)
1838
+ segmentwise_output = torch.sigmoid(self.fc_audioset(x)).clamp(1e-7, 1.)
1839
+
1840
+ framewise_output = interpolate(segmentwise_output,
1841
+ self.interpolate_ratio)
1842
+ framewise_output = pad_framewise_output(framewise_output, frames_num)
1843
+
1844
+ output_dict = {
1845
+ "segmentwise_output": segmentwise_output,
1846
+ 'framewise_output': framewise_output,
1847
+ }
1848
+
1849
+ return output_dict
1850
+
1851
+
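An illustrative call into the probability path of the SED model; interpolate, pad_framewise_output, double_threshold, and decode_with_timestamps are helpers defined earlier in this file, and forward_prob returns per-frame event probabilities before thresholding.

import torch

sed = Cnn8rnnSedModel(classes_num=447).eval()
lms = torch.randn(1, 64, 1001)            # (batch, mel_bins, time_steps), ~10 s at a 10 ms hop
with torch.no_grad():
    probs = sed.forward_prob(lms)["framewise_output"]   # (batch, time_steps, 447) event probabilities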
1852
+ class Cnn14RnnTempAttnGruConfig(PretrainedConfig):
1853
+
1854
+ def __init__(
1855
+ self,
1856
+ sample_rate: int = 32000,
1857
+ encoder_rnn_bidirectional: bool = True,
1858
+ encoder_rnn_hidden_size: int = 256,
1859
+ encoder_rnn_dropout: float = 0.5,
1860
+ encoder_rnn_num_layers: int = 3,
1861
+ decoder_emb_dim: int = 512,
1862
+ vocab_size: int = 4981,
1863
+ fc_emb_dim: int = 512,
1864
+ attn_emb_dim: int = 512,
1865
+ decoder_rnn_type: str = "GRU",
1866
+ decoder_num_layers: int = 1,
1867
+ decoder_d_model: int = 512,
1868
+ decoder_dropout: float = 0.5,
1869
+ **kwargs
1870
+ ):
1871
+ self.sample_rate = sample_rate
1872
+ self.encoder_rnn_bidirectional = encoder_rnn_bidirectional
1873
+ self.encoder_rnn_hidden_size = encoder_rnn_hidden_size
1874
+ self.encoder_rnn_dropout = encoder_rnn_dropout
1875
+ self.encoder_rnn_num_layers = encoder_rnn_num_layers
1876
+ self.decoder_emb_dim = decoder_emb_dim
1877
+ self.vocab_size = vocab_size
1878
+ self.fc_emb_dim = fc_emb_dim
1879
+ self.attn_emb_dim = attn_emb_dim
1880
+ self.decoder_rnn_type = decoder_rnn_type
1881
+ self.decoder_num_layers = decoder_num_layers
1882
+ self.decoder_d_model = decoder_d_model
1883
+ self.decoder_dropout = decoder_dropout
1884
+ super().__init__(**kwargs)
1885
+
1886
+
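A minimal sketch of instantiating the config together with the wrapper model defined below; the values shown are the defaults declared above, and the base classes (BaseDecoder, CaptionModel) come from earlier in this file.

config = Cnn14RnnTempAttnGruConfig(sample_rate=32000, vocab_size=4981,
                                   encoder_rnn_hidden_size=256, decoder_d_model=512)
model = Cnn14RnnTempAttnGruModel(config).eval()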
1887
+ class Cnn14RnnTempAttnGruModel(PreTrainedModel):
1888
+ config_class = Cnn14RnnTempAttnGruConfig
1889
+
1890
+ def __init__(self, config):
1891
+ super().__init__(config)
1892
+ sample_rate = config.sample_rate
1893
+ sr_to_fmax = {
1894
+ 32000: 14000,
1895
+ 16000: 8000
1896
+ }
1897
+ self.melspec_extractor = transforms.MelSpectrogram(
1898
+ sample_rate=sample_rate,
1899
+ n_fft=32 * sample_rate // 1000,
1900
+ win_length=32 * sample_rate // 1000,
1901
+ hop_length=10 * sample_rate // 1000,
1902
+ f_min=50,
1903
+ f_max=sr_to_fmax[sample_rate],
1904
+ n_mels=64,
1905
+ norm="slaney",
1906
+ mel_scale="slaney"
1907
+ )
1908
+ self.db_transform = transforms.AmplitudeToDB()
1909
+
1910
+ encoder = Cnn14RnnEncoder(
1911
+ sample_rate=config.sample_rate,
1912
+ rnn_bidirectional=config.encoder_rnn_bidirectional,
1913
+ rnn_hidden_size=config.encoder_rnn_hidden_size,
1914
+ rnn_dropout=config.encoder_rnn_dropout,
1915
+ rnn_num_layers=config.encoder_rnn_num_layers
1916
+ )
1917
+ decoder = TemporalBahAttnDecoder(
1918
+ emb_dim=config.decoder_emb_dim,
1919
+ vocab_size=config.vocab_size,
1920
+ fc_emb_dim=config.fc_emb_dim,
1921
+ attn_emb_dim=config.attn_emb_dim,
1922
+ rnn_type=config.decoder_rnn_type,
1923
+ num_layers=config.decoder_num_layers,
1924
+ d_model=config.decoder_d_model,
1925
+ dropout=config.decoder_dropout,
1926
+ )
1927
+ cap_model = TemporalSeq2SeqAttnModel(encoder, decoder)
1928
+ sed_model = Cnn8rnnSedModel(classes_num=447)
1929
+ self.cap_model = cap_model
1930
+ self.sed_model = sed_model
1931
+
1932
+ def forward(self,
1933
+ audio: torch.Tensor,
1934
+ audio_length: Union[List, np.ndarray, torch.Tensor],
1935
+ temporal_tag: Union[List, np.ndarray, torch.Tensor] = None,
1936
+ sample_method: str = "beam",
1937
+ beam_size: int = 3,
1938
+ max_length: int = 20,
1939
+ temp: float = 1.0,):
1940
+ device = self.device
1941
+ mel_spec = self.melspec_extractor(audio.to(device))
1942
+ log_mel_spec = self.db_transform(mel_spec)
1943
+
1944
+ sed_tag = self.sed_model(log_mel_spec)
1945
+ sed_tag = torch.as_tensor(sed_tag).to(device)
1946
+ if temporal_tag is not None:
1947
+ temporal_tag = torch.as_tensor(temporal_tag).to(device)
1948
+ temporal_tag = torch.stack([temporal_tag, sed_tag], dim=0)
1949
+ temporal_tag = torch.min(temporal_tag, dim=0).values
1950
+ else:
1951
+ temporal_tag = sed_tag
1952
+
1953
+ input_dict = {
1954
+ "lms": log_mel_spec,
1955
+ "wav_len": audio_length,
1956
+ "temporal_tag": temporal_tag,
1957
+ "mode": "inference",
1958
+ "sample_method": sample_method,
1959
+ "max_length": max_length,
1960
+ "temp": temp,
1961
+ }
1962
+ if sample_method == "beam":
1963
+ input_dict["beam_size"] = beam_size
1964
+ return self.cap_model(input_dict)["seq"].cpu()
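An end-to-end inference sketch for the model above: the SED branch derives a temporal tag automatically, and an explicitly passed temporal_tag is combined with it via an element-wise minimum. Weights here are random and the helpers come from earlier in the file, so the output is only illustrative.

import torch

model = Cnn14RnnTempAttnGruModel(Cnn14RnnTempAttnGruConfig()).eval()
wav = torch.randn(1, 32000 * 10)          # 10 s of audio at the 32 kHz default sample rate
with torch.no_grad():
    seq = model(audio=wav,
                audio_length=[wav.shape[1]],
                sample_method="beam", beam_size=3, max_length=20)
# seq holds token indices; decode them with a tokenizer matching vocab_size=4981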
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d5d51984ac220288b130d04f53652a2aec21e7f2cd275c0a48ed1648f6ace16
3
+ size 55324025