How to use FSDP or DDP with Seq2SeqTrainer?

I have two GTX 1080 Ti GPUs (11 GB VRAM each) and I want to fine-tune openai/whisper-small, one of the Hugging Face Transformers models. I want to use Fully Sharded Data Parallel (FSDP) via Seq2SeqTrainer, but I get the error below. torch version: '2.0.0+cu117'

Here is my data preparation code:

def prepare_dataset(batch):
    # compute log-Mel input features from the raw audio (16 kHz)
    batch["input_features"] = feature_extractor(batch["audio"], sampling_rate=16000).input_features[0]
    # encode the transcript to label ids
    batch["labels"] = tokenizer(batch["text"]).input_ids

    batch["input_features"] = torch.tensor(batch["input_features"])
    batch["labels"] = torch.tensor(batch["labels"])

    return batch
train_ds = train_ds.map(prepare_dataset, remove_columns=train_ds.column_names)
val_ds = val_ds.map(prepare_dataset, remove_columns=val_ds.column_names)
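
For completeness, data_collator (passed to the trainer below) is the standard speech seq2seq padding collator from the Whisper fine-tuning guide; a sketch, assuming processor is the matching WhisperProcessor:

import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # pad the log-Mel features into a batch tensor
        input_features = [{"input_features": f["input_features"]} for f in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # pad the label ids and replace padding with -100 so the loss ignores it
        label_features = [{"input_ids": f["labels"]} for f in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # drop the BOS token if tokenization prepended one; the model re-adds it at generation
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)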

This is how I build the model and the training arguments:

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small",
                                                        activation_dropout=0.1,
                                                        attention_dropout=0.1,
                                                        dropout=0.1)
os.environ['RANK'] = '0'
os.environ['WORLD_SIZE'] = '2'
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'
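# Note: these four variables are what a distributed launcher such as
# `torchrun --nproc_per_node=2 train.py` (script name hypothetical) would
# export separately for each worker process; init_process_group then
# blocks until all WORLD_SIZE ranks have joined the rendezvous.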
training_args = Seq2SeqTrainingArguments(
    output_dir="/home/whisper_small_16_2_outputs/",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=936,
    fp16=True,
    local_rank=0,
    save_strategy='steps',
    evaluation_strategy="steps",
    gradient_checkpointing=True,
    predict_with_generate=True,
    generation_max_length=210,
    save_steps=600,
    eval_steps=300,
    logging_steps=300,
    num_train_epochs=30,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    save_total_limit=5,
    fsdp='full_shard',
    fsdp_config='/home/fsdp_config.json'
)
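# The contents of /home/fsdp_config.json are not shown above; a sketch of
# what such a config typically contains (using Whisper's layer class names
# as the wrapping policy is my assumption, keys per the transformers FSDP
# config schema of this version):
# {
#     "fsdp_transformer_layer_cls_to_wrap": ["WhisperEncoderLayer", "WhisperDecoderLayer"]
# }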
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)
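
compute_metrics computes the word error rate, which is what metric_for_best_model="wer" refers to; a sketch assuming the standard setup from the Whisper fine-tuning guide, using the evaluate library:

import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    # -100 was used to mask padding in the loss; restore it before decoding
    label_ids[label_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}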

And this is the error I get:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[21], line 1
----> 1 training_args = Seq2SeqTrainingArguments(
      2     output_dir="/home/whisper_small_16_2_outputs/",
      3     per_device_train_batch_size=8,
      4     gradient_accumulation_steps=2,
      5     learning_rate=1e-5,
      6     warmup_steps=936,
      7     fp16=True,
      8     local_rank=0,
      9     save_strategy='steps',
     10     evaluation_strategy="steps",
     11     gradient_checkpointing=True,
     12     predict_with_generate=True,
     13     generation_max_length=210,
     14     save_steps=600,
     15     eval_steps=300,
     16     logging_steps=300,
     17     num_train_epochs=30,
     18     load_best_model_at_end=True,
     19     metric_for_best_model="wer",
     20     greater_is_better=False,
     21     save_total_limit=5,
     22     fsdp='full_shard',
     23     fsdp_config='/home/fsdp_config.json'
     24 )

File <string>:115, in __init__(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, no_cuda, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, xpu_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, sharded_ddp, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, dataloader_pin_memory, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, gradient_checkpointing, include_inputs_for_metrics, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, torch_compile_backend, torch_compile_mode, sortish_sampler, predict_with_generate, generation_max_length, generation_num_beams, generation_config)

File ~/.local/lib/python3.10/site-packages/transformers/training_args.py:1259, in TrainingArguments.__post_init__(self)
   1253     if version.parse(version.parse(torch.__version__).base_version) == version.parse("2.0.0") and self.fp16:
   1254         raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0")
   1256 if (
   1257     self.framework == "pt"
   1258     and is_torch_available()
-> 1259     and (self.device.type != "cuda")
   1260     and (get_xla_device_type(self.device) != "GPU")
   1261     and (self.fp16 or self.fp16_full_eval)
   1262 ):
   1263     raise ValueError(
   1264         "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
   1265         " (`--fp16_full_eval`) can only be used on CUDA devices."
   1266     )
   1268 if (
   1269     self.framework == "pt"
   1270     and is_torch_available()
   (...)
   1275     and (self.bf16 or self.bf16_full_eval)
   1276 ):

File ~/.local/lib/python3.10/site-packages/transformers/training_args.py:1694, in TrainingArguments.device(self)
   1690 """
   1691 The device used by this process.
   1692 """
   1693 requires_backends(self, ["torch"])
-> 1694 return self._setup_devices

File ~/.local/lib/python3.10/site-packages/transformers/utils/generic.py:54, in cached_property.__get__(self, obj, objtype)
     52 cached = getattr(obj, attr, None)
     53 if cached is None:
---> 54     cached = self.fget(obj)
     55     setattr(obj, attr, cached)
     56 return cached

File ~/.local/lib/python3.10/site-packages/transformers/training_args.py:1679, in TrainingArguments._setup_devices(self)
   1677         torch.distributed.init_process_group(backend=self.xpu_backend, timeout=self.ddp_timeout_delta)
   1678     else:
-> 1679         torch.distributed.init_process_group(backend="nccl", timeout=self.ddp_timeout_delta)
   1680 device = torch.device("cuda", self.local_rank)
   1681 self._n_gpu = 1

File ~/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:920, in init_process_group(backend, init_method, timeout, world_size, rank, store, group_name, pg_options)
    916     barrier()
    917 else:
    918     # Use store based barrier here since barrier() used a bunch of
    919     # default devices and messes up NCCL internal state.
--> 920     _store_based_barrier(rank, store, timeout)

File ~/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py:459, in _store_based_barrier(rank, store, timeout)
    456         log_time = time.time()
    458     if timedelta(seconds=(time.time() - start)) > timeout:
--> 459         raise RuntimeError(
    460             "Timed out initializing process group in store based barrier on "
    461             "rank: {}, for key: {} (world_size={}, worker_count={}, timeout={})".format(
    462                 rank, store_key, world_size, worker_count, timeout
    463             )
    464         )
    466 logger.info(
    467     f"Rank {rank}: Completed store-based barrier for key:{store_key} with {world_size} nodes."
    468 )

RuntimeError: Timed out initializing process group in store based barrier on rank: 0, for key: store_based_barrier_key:1 (world_size=2, worker_count=1, timeout=0:30:00)