I am trying to train Vision Encoder Decoder with VIT encoder and Hindi GPT2
(surajp/gpt2-hindi at main) decoder, for Hindi Image captioning, which my team are doing as a part of Huggingface course project.
Currently my code is this
For creating a dataset
import torch
from torch.utils.data import Dataset
from PIL import Image
class Image_Caption_Dataset(Dataset):
def __init__(self,root_dir,df, feature_extractor,tokenizer,max_target_length=128):
self.root_dir = root_dir
self.df = df
self.feature_extractor = feature_extractor
self.tokenizer = tokenizer
self.max_length=max_target_length
def __len__(self,df):
return self.df.shape[0]
def __getitem__(self,idx):
#return image
image_path = self.df['images'][idx]
text = self.df['text'][idx]
#prepare image
image = Image.open(self.root_dir+'/'+image_path).convert("RGB")
pixel_values = self.feature_extractor(image, return_tensors="pt").pixel_values
#add captions by encoding the input
captions = self.tokenizer(text,
padding='max_length',
max_length=self.max_length).input_ids
captions = [caption if caption != self.tokenizer.pad_token_id else -100 for caption in captions]
encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(captions)}
return encoding
from transformers import ViTFeatureExtractor,AutoTokenizer
encoder_checkpoint = 'google/vit-base-patch16-224'
decoder_checkpoint = 'surajp/gpt2-hindi'
feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
root_dir = "../input/flickr8k/Images"
train_dataset = Image_Caption_Dataset(root_dir=root_dir,
df=train_df,
feature_extractor=feature_extractor,
tokenizer=tokenizer)
val_dataset = Image_Caption_Dataset(root_dir=root_dir,
df=test_df,
feature_extractor=feature_extractor,
tokenizer=tokenizer)
from transformers import VisionEncoderDecoderModel
# initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_checkpoint, decoder_checkpoint)
#model.to(device)
After initializing the model I configured the model arguments
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size
# set beam search parameters
model.config.eos_token_id = tokenizer.sep_token_id
model.config.max_length = 128
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4
Then started Training
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
predict_with_generate=True,
evaluation_strategy="steps",
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
fp16=True,
output_dir="./",
logging_steps=2,
save_steps=1000,
eval_steps=200,
)
from transformers import default_data_collator
# instantiate trainer
trainer = Seq2SeqTrainer(
model=model,
tokenizer=feature_extractor,
args=training_args,
compute_metrics=compute_metrics,
train_dataset=train_dataset,
eval_dataset=val_dataset,
data_collator=default_data_collator,
)
trainer.train()
But then I got the error, ValueError: Make sure to set the decoder_start_token_id attribute of the model’s configuration
If you noticed the above code, I have already set this, but when I checked that the tokenizer.cls_token_id was None
so I manually set the tokenizer.cls_token_id=’’
but then I got the Index out of range in self, error
Is there any workaround for this, the code is inspired for` here written by @nielsr
I have also tried with custom training loop, I get the same error