Hi, I've been hitting a bug in my object detection training for the longest time. My dataset is in the right format, and changing the batch size makes no difference; I get the same error either way. I haven't been able to find a solution. How do I fix it?
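For context, this is roughly how I check a single preprocessed sample (a sketch; the key names assume the standard DETR annotation format):

# Rough inspection of one preprocessed sample (illustrative; key names
# assume the standard DETR label format).
item = train_dataset[0]
print(item["pixel_values"].shape)   # e.g. torch.Size([3, H, W])
print(item["labels"].keys())        # expect keys like "class_labels" and "boxes"

My training setup: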
training_args = TrainingArguments(
    output_dir="./model",
    num_train_epochs=2,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    # dataloader_num_workers=4,
    weight_decay=1e-4,
    logging_dir="./logs",
    logging_steps=70,
    gradient_accumulation_steps=4,
    metric_for_best_model="eval_map",
    greater_is_better=True,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # eval_do_concat_batches=False,
    learning_rate=5e-3,
    lr_scheduler_type="cosine",
    fp16=True,
    max_grad_norm=0.01,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=lambda batch: collate_fn(batch, image_processor),
    tokenizer=image_processor,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=eval_metrics,
)
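My collate_fn follows the usual DETR fine-tuning pattern; this is a rough sketch of it rather than the exact code:

def collate_fn(batch, image_processor):
    # Pad images to a common size and build a pixel mask; labels stay a
    # list of per-image dicts, which is the format the DETR loss expects.
    pixel_values = [item["pixel_values"] for item in batch]
    encoding = image_processor.pad(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    return {
        "pixel_values": encoding["pixel_values"],
        "pixel_mask": encoding["pixel_mask"],
        "labels": labels,
    }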
The error:
IndexError Traceback (most recent call last)
Cell In[67], line 1
----> 1 trainer.train()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2164, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2162 hf_hub_utils.enable_progress_bars()
2163 else:
-> 2164 return inner_training_loop(
2165 args=args,
2166 resume_from_checkpoint=resume_from_checkpoint,
2167 trial=trial,
2168 ignore_keys_for_eval=ignore_keys_for_eval,
2169 )
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2524, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2517 context = (
2518 functools.partial(self.accelerator.no_sync, model=model)
2519 if i != len(batch_samples) - 1
2520 and self.accelerator.distributed_type != DistributedType.DEEPSPEED
2521 else contextlib.nullcontext
2522 )
2523 with context():
-> 2524 tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
2526 if (
2527 args.logging_nan_inf_filter
2528 and not is_torch_xla_available()
2529 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2530 ):
2531 # if loss is nan or inf simply add the average of previous logged losses
2532 tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3654, in Trainer.training_step(self, model, inputs, num_items_in_batch)
3651 return loss_mb.reduce_mean().detach().to(self.args.device)
3653 with self.compute_loss_context_manager():
-> 3654 loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
3656 del inputs
3657 if (
3658 self.args.torch_empty_cache_steps is not None
3659 and self.state.global_step % self.args.torch_empty_cache_steps == 0
3660 ):
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:3708, in Trainer.compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
3706 loss_kwargs["num_items_in_batch"] = num_items_in_batch
3707 inputs = {**inputs, **loss_kwargs}
-> 3708 outputs = model(**inputs)
3709 # Save past state if it exists
3710 # TODO: this needs to be fixed and made cleaner later.
3711 if self.args.past_index >= 0:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:186, in DataParallel.forward(self, *inputs, **kwargs)
184 return self.module(*inputs[0], **module_kwargs[0])
185 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
-> 186 outputs = self.parallel_apply(replicas, inputs, module_kwargs)
187 return self.gather(outputs, self.output_device)
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:201, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
200 def parallel_apply(self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) -> List[Any]:
-> 201 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:108, in parallel_apply(modules, inputs, kwargs_tup, devices)
106 output = results[i]
107 if isinstance(output, ExceptionWrapper):
-> 108 output.reraise()
109 outputs.append(output)
110 return outputs
File /opt/conda/lib/python3.10/site-packages/torch/_utils.py:706, in ExceptionWrapper.reraise(self)
702 except TypeError:
703 # If the exception takes multiple arguments, don't try to
704 # instantiate since we don't know how to
705 raise RuntimeError(msg) from None
-> 706 raise exception
IndexError: Caught IndexError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
output = module(*input, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/transformers/models/detr/modeling_detr.py", line 1466, in forward
loss, loss_dict, auxiliary_outputs = self.loss_function(
File "/opt/conda/lib/python3.10/site-packages/transformers/loss/loss_for_object_detection.py", line 552, in ForObjectDetectionLoss
loss_dict = criterion(outputs_loss, labels)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/transformers/loss/loss_for_object_detection.py", line 360, in forward
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
File "/opt/conda/lib/python3.10/site-packages/transformers/loss/loss_for_object_detection.py", line 360, in <listcomp>
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
IndexError: index 5 is out of bounds for dimension 0 with size 5
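In case it helps narrow things down, this is the kind of sanity check I can run on a single collated batch (a sketch; the "class_labels" key assumes the usual DETR label format):

# Illustrative check: the number of label dicts should match the batch
# dimension of pixel_values, otherwise the matcher can index past the end
# of the split cost matrix.
sample = [train_dataset[i] for i in range(5)]
batch = collate_fn(sample, image_processor)
print(batch["pixel_values"].shape)   # expect (5, 3, H, W)
print(len(batch["labels"]))          # expect 5
print([len(t["class_labels"]) for t in batch["labels"]])  # boxes per image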