ValueError - number of spatial dimensions

Hello everyone!

First topic and first post for me, so if I miss any explanation, I will update it as soon as possible.

I am fine-tuning a MaskFormer model with a custom COCO dataset. I run into an issue during the forward call that computes the loss (the get_loss_dict function).

Some of my pictures do not raise any error, while others raise the error below, and I don't understand why.
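
For context, I load the processor and model roughly like this (the checkpoint name is only an example of my local setup; id2label / label2id are the mappings built in the dataset class below):

from transformers import MaskFormerForInstanceSegmentation, MaskFormerImageProcessor

# example checkpoint; id2label / label2id are the category mappings from my dataset
checkpoint = "facebook/maskformer-swin-base-coco"
processor = MaskFormerImageProcessor.from_pretrained(checkpoint)
model = MaskFormerForInstanceSegmentation.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)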

Here is the code of the dataset:

import json
import os

import imantics
import numpy as np
import torch
from PIL import Image


class CocoInstanceDataset(torch.utils.data.Dataset):
    def __init__(self, ann_file, img_folder, processor=None, transform=None, increment_instance_ids: bool = True, increment_class_ids: bool = True):
      with open(ann_file, 'r') as f:
          self.coco = json.load(f)
      # keep images sorted by id so that indexing in __getitem__ is deterministic
      self.coco['images'] = sorted(self.coco['images'], key=lambda x: x['id'])
      self.img_folder = img_folder
      self.processor = processor
      self.transform = transform
      # booleans used as 0/1 offsets when building instance and class ids
      self.increment_instance_ids = increment_instance_ids
      self.increment_class_ids = increment_class_ids
      # mappings between COCO category ids and names
      self.id2label = {cat['id']: cat['name'] for cat in self.coco['categories']}
      self.label2id = {cat['name']: cat['id'] for cat in self.coco['categories']}

    def get_mask_from_segmentation(self, segmentation, instance_id, width, height):
      # rasterize the COCO polygon segmentation and fill it with the instance id
      mask = imantics.Polygons(segmentation).mask(width=width, height=height).array.astype(np.int64)
      mask[mask > 0] = instance_id
      if mask.sum() > 0:
        return mask
      # empty masks return None and are skipped by the caller
      return None
    
    def __getitem__(self, idx):
      # get image & image infos
      img_path = os.path.join(self.img_folder, self.coco['images'][idx]['file_name'])
      image = np.array(Image.open(img_path)) #.convert('RGB')).transpose(2, 0, 1) #see if convert RGB is needed
      
      # get annotations
      annotations = [ann for ann in self.coco['annotations'] if ann['image_id']==self.coco['images'][idx]['id']]
      masks = []
      for i, ann in enumerate(annotations):
        mask = self.get_mask_from_segmentation(segmentation=ann['segmentation'], 
                                               instance_id = i+self.increment_instance_ids, 
                                               width=self.coco['images'][idx]['width'], 
                                               height=self.coco['images'][idx]['height'])
        if mask is not None:
          masks.append(mask)
          
      instance_id_2_category_id = {i+self.increment_instance_ids:ann['category_id']+self.increment_class_ids for i,ann in enumerate(annotations)}
      
      # apply transformation
      if self.transform is not None:
        transformed = self.transform(image=image, masks=masks)
        image, masks = transformed['image'], transformed['masks']
      
      # convert to C, H, W
      # image = image.transpose(2,0,1)
      
      # tweak to process overlapping masks: call the processor once per mask and collect its labels
      mask_labels=[]
      class_labels=[]
      for i, mask in enumerate(masks):          
        encoding = self.processor(images=[image], segmentation_maps=[mask], instance_id_to_semantic_id=instance_id_2_category_id, return_tensors="pt")
        if i==0:
          pixel_values=encoding['pixel_values']
          pixel_mask=encoding['pixel_mask']
          
        mask_labels.append(encoding['mask_labels'][0])
        class_labels.append(encoding['class_labels'][0])  
        
      inputs =  {'pixel_values':pixel_values, 
                 'pixel_mask':pixel_mask, 
                 'mask_labels': torch.stack(mask_labels).squeeze(), 
                 'class_labels': torch.stack(class_labels).squeeze()
                }

      inputs = {k: v.squeeze() if isinstance(v, torch.Tensor) else v[0] for k,v in inputs.items()}

      if len(class_labels) == 1:
        inputs["class_labels"] = torch.stack(class_labels).squeeze(dim=1)

      return inputs

    def __len__(self):
      return len(self.coco['images'])
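
For completeness, the batches are built roughly like this (a simplified sketch of my DataLoader setup; the paths are placeholders, and it assumes all processed images end up with the same spatial size so that pixel_values can be stacked):

from torch.utils.data import DataLoader

def collate_fn(examples):
    # pixel_values / pixel_mask have a fixed shape and can be stacked;
    # mask_labels / class_labels vary in length per image, so they stay as lists
    return {
        "pixel_values": torch.stack([ex["pixel_values"] for ex in examples]),
        "pixel_mask": torch.stack([ex["pixel_mask"] for ex in examples]),
        "mask_labels": [ex["mask_labels"] for ex in examples],
        "class_labels": [ex["class_labels"] for ex in examples],
    }

train_dataset = CocoInstanceDataset(ann_file="annotations/train.json",  # placeholder path
                                    img_folder="images/train",          # placeholder path
                                    processor=processor)
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)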

Below is the model call used to compute the loss:

outputs = model(
          pixel_values=batch["pixel_values"],
          mask_labels=batch["mask_labels"],
          class_labels=batch["class_labels"],
      )
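
The loss is then taken from the outputs for a standard training step (the optimizer and learning rate below are just examples of what I use):

from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)  # example optimizer / learning rate

loss = outputs.loss
loss.backward()
optimizer.step()
optimizer.zero_grad()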

And below is the error trace: