from transformers import AutoTokenizer,VisionEncoderDecoderModel,ViTImageProcessor
model_name = "nlpconnect/vit-gpt2-image-captioning"
model=VisionEncoderDecoderModel.from_pretrained(model_name)
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
from torch.utils.data import Dataset
class CustomDataset(Dataset):
def __init__(self, images, captions):
self.images = images
self.captions = captions
def __len__(self):
return len(self.images)
def __getitem__(self, idx):
image = self.images[idx]
caption = self.captions[idx]
preprocessed_image=feature_extractor(image,return_tensors="pt",max_length=model_max_length,padding=True).pixel_values
preprocessed_text=tokenizer(caption,padding=True,truncation=True,max_length=model_max_length,return_tensors="pt").input_ids
inputs = {
'pixel_values': preprocessed_image,
'labels': preprocessed_text,
}
return inputs
dataset=CustomDataset(fused_images,fused_images_phrases)
for i in range(10):
print(dataset[i]['labels'].shape) # to print the output tensor shape of descriptions
'''
for this I am getting the output as :
torch.Size([1, 6])
torch.Size([1, 7])
torch.Size([1, 4])
torch.Size([1, 4])
torch.Size([1, 8])
torch.Size([1, 8])
torch.Size([1, 6])
torch.Size([1, 6])
torch.Size([1, 6])
torch.Size([1, 6])
i dont know why my tensors arent getting shaped into uniform tensors?