Hey guys! I wanted to create a vision dataset of my own using HuggingFace Datasets by extending GeneratorBasedBuilder, like so:
from typing import AnyStr

import cv2
import datasets
import h5py
from datasets import DatasetInfo, DownloadManager


class AutoEncoderDataset(datasets.GeneratorBasedBuilder):
    VERSION = datasets.Version("1.0.0")

    def __init__(self, training_pickle: AnyStr, validation_pickle: AnyStr, *args, writer_batch_size=None, **kwargs):
        super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
        # Paths to the HDF5 files holding the encoded images for each split
        self.training_pickle = training_pickle
        self.validation_pickle = validation_pickle

    def _info(self) -> DatasetInfo:
        features = datasets.Features({
            "image": datasets.Image()
        })
        return datasets.DatasetInfo(features=features)

    def _split_generators(self, dl_manager: DownloadManager):
        # One generator per split, each pointing at its own HDF5 file
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"h5_path": self.training_pickle}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"h5_path": self.validation_pickle}
            )
        ]

    def _generate_examples(self, h5_path: AnyStr):
        with h5py.File(h5_path, "r") as infile:
            images = infile["images"]
            for _id in range(images.shape[0]):
                # Decode the stored buffer back into a BGR image array
                yield _id, {
                    "image": cv2.imdecode(images[_id][-1], cv2.IMREAD_COLOR)
                }
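For context, here is roughly how I build and load the dataset before handing it to the Trainer (a simplified sketch rather than my exact script; the .h5 paths are placeholders):

builder = AutoEncoderDataset(
    training_pickle="train_images.h5",         # placeholder path
    validation_pickle="validation_images.h5",  # placeholder path
)
builder.download_and_prepare()  # materializes the Arrow data for both splits
train_dataset = builder.as_dataset(split=datasets.Split.TRAIN)
eval_dataset = builder.as_dataset(split=datasets.Split.VALIDATION)

These two splits are then passed as train_dataset and eval_dataset to the Trainer.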
But when I use this dataset with my Trainer, I get the following error:
***** Running training *****
Num examples = 0
Num Epochs = 10
Instantaneous batch size per device = 32
Total train batch size (w. parallel, distributed & accumulation) = 32
Gradient Accumulation steps = 1
Total optimization steps = 320
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
wandb: Currently logged in as: nishantbhattacharya (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.12.18 is available! To upgrade, please run:
wandb: $ pip install wandb --upgrade
wandb: Tracking run with wandb version 0.12.11
wandb: Run data is saved locally in C:\Users\nisha\Documents\Imagine\wandb\run-20220619_200416-1o365d21
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run outputs\all\AutoEncoder_assets_2
wandb: View project at https://wandb.ai/nishantbhattacharya/huggingface
wandb: View run at https://wandb.ai/nishantbhattacharya/huggingface/runs/1o365d21
0%| | 0/320 [00:00<?, ?it/s]
Traceback (most recent call last):
File "C:\Users\nisha\Documents\Imagine\main.py", line 60, in <module>
main()
File "C:\Users\nisha\Documents\Imagine\main.py", line 52, in main
atrain(args.dataset, args.subdatasets)
File "C:\Users\nisha\Documents\Imagine\autoencoder_trainer.py", line 83, in train
train_single_asset(subdataset, dataset_tag)
File "C:\Users\nisha\Documents\Imagine\autoencoder_trainer.py", line 67, in train_single_asset
trainer.train()
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\transformers\trainer.py", line 1339, in train
for step, inputs in enumerate(epoch_iterator):
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\torch\utils\data\dataloader.py", line 521, in __next__
data = self._next_data()
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\torch\utils\data\dataloader.py", line 561, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\torch\utils\data\_utils\fetch.py", line 49, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\datasets\arrow_dataset.py", line 1764, in __getitem__
return self._getitem(
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\datasets\arrow_dataset.py", line 1748, in _getitem
pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\datasets\formatting\formatting.py", line 486, in query_table
_check_valid_index_key(key, size)
File "C:\Users\nisha\.conda\envs\imagine\lib\site-packages\datasets\formatting\formatting.py", line 429, in _check_valid_index_key
raise IndexError(f"Invalid key: {key} is out of bounds for size {size}")
IndexError: Invalid key: 664 is out of bounds for size 0
wandb: Waiting for W&B process to finish... (failed 1). Press Ctrl-C to abort syncing.
wandb: - 0.001 MB of 0.001 MB uploaded (0.000 MB deduped)
wandb:
wandb: Synced outputs\all\AutoEncoder_assets_2: https://wandb.ai/nishantbhattacharya/huggingface/runs/1o365d21
wandb: Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
wandb: Find logs at: .\wandb\run-20220619_200416-1o365d21\logs
Process finished with exit code 1
I can’t figure out what is wrong. I looped through the entire dataset object and there were no errors. I also noticed that the number of examples is reported as 0 for some reason. Any help would be really appreciated!
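For reference, the sanity check I mention above looked roughly like this (simplified; train_dataset is the split produced by as_dataset in the snippet further up):

count = 0
for example in train_dataset:
    _ = example["image"]  # force each image to decode
    count += 1
print(count)  # iterating the whole split raised no errors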