I'm trying to upload a dataset of hundreds of thousands of audio samples (the total volume is not very large, about 60 GB) to the Hub with push_to_hub, and it doesn't work.
Every now and then one piece of the data (a Parquet shard) gets pushed, and then I get RemoteDisconnected even though my internet connection is stable.
Please help.
Thanks
Hi! Calling push_to_hub after an error resumes the upload, so one solution is to call it as many times as needed until all the shards are uploaded. Reducing the shard size (max_shard_size) should also help.
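For reference, a minimal retry loop along these lines could look like the sketch below. It assumes dataset is already loaded and <repo_id> is your dataset repo; the 200 MB shard size and the 30-second pause are arbitrary choices, not requirements.
import time

# keep retrying push_to_hub; it resumes from the shards already uploaded
while True:
    try:
        dataset.push_to_hub("<repo_id>", max_shard_size="200MB")
        break
    except Exception as err:  # e.g. the RemoteDisconnected error mentioned above
        print(f"Upload interrupted ({err!r}), retrying in 30 seconds...")
        time.sleep(30)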
Hi,
I tried both for a few days; it keeps crashing in the middle.
Isn't there a more stable way to do it?
I think the only option then is to manually convert the dataset to Parquet shards and push them to the Hub using huggingface_hub (or Git).
from datasets.table import embed_table_storage

# embed the external audio/image files into the Arrow table
# (will not be needed in Datasets 3.0)
format = dataset.format
dataset = dataset.with_format("arrow")
dataset = dataset.map(embed_table_storage, batched=True)
dataset = dataset.with_format(**format)

max_shard_size = 500 << 20  # 500 MB
num_shards = max(int(dataset.data.nbytes / max_shard_size) + 1, 1)
for i in range(num_shards):
    shard = dataset.shard(num_shards=num_shards, index=i, contiguous=True)
    shard.to_parquet(f"shard_{i}.parquet")
    # then push each shard with `huggingface_hub.upload_file`
    # (or use `huggingface_hub.create_commit` to push all the files at once)

# to preserve the feature types (will not be needed in the next release of Datasets;
# `to_parquet` will embed this info in the file's schema)
import huggingface_hub

card = huggingface_hub.DatasetCard("")
card.data["features"] = dataset.features._to_yaml_list()
card.push_to_hub("<repo_id>")
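If you'd rather push everything in one commit, the create_commit route mentioned in the comment above looks roughly like this. It's only a sketch: it assumes the shards shard_0.parquet, shard_1.parquet, ... were written to the current directory and <repo_id> is your dataset repo.
from huggingface_hub import CommitOperationAdd, HfApi

api = HfApi()
operations = [
    CommitOperationAdd(
        path_in_repo=f"data/shard_{i}.parquet",
        path_or_fileobj=f"shard_{i}.parquet",
    )
    for i in range(num_shards)
]
api.create_commit(
    repo_id="<repo_id>",
    operations=operations,
    commit_message="Upload Parquet shards",
    repo_type="dataset",
)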
Hi mariosasko,
I managed to upload the dataset manually using your example. Is there a way to generate the dataset card automatically from a DatasetDict object? I mean the same way the push_to_hub function does it, with the splits and their num_bytes etc.
I used the snippet below:
from huggingface_hub import HfApi

api = HfApi()
max_shard_size = 4 << 30  # 4 GB

for split in ds.keys():
    dataset = ds[split]
    shard_name = "shard_" + split + "_{}.parquet"
    num_shards = max(int(dataset.data.nbytes / max_shard_size) + 1, 1)
    for i in range(num_shards):
        shard = dataset.shard(num_shards=num_shards, index=i, contiguous=True)
        shard.to_parquet(save_dir + shard_name.format(i))

    # upload this split's shards before moving on to the next split
    for i in range(num_shards):
        api.upload_file(
            path_or_fileobj=save_dir + shard_name.format(i),
            path_in_repo="data/" + shard_name.format(i),
            repo_id=_REPO,
            repo_type="dataset",
        )
I managed to do it by hacking together some parts from dataset_dict.py, like this:
from huggingface_hub import (
    CommitOperationAdd,
    CommitOperationDelete,
    DatasetCard,
    DatasetCardData,
    HfApi,
)
from pathlib import Path
from datasets.utils import logging
from datasets.utils.metadata import MetadataConfigs
from datasets.info import DatasetInfo, DatasetInfosDict
from datasets.splits import NamedSplit, Split, SplitDict, SplitInfo
logger = logging.get_logger(__name__)
# setup adapted from DatasetDict.push_to_hub in dataset_dict.py
split = "train"  # overwritten by the loop below
api = HfApi()
repo_id = _REPO
config_name = "default"
revision = None
repo_with_dataset_card, repo_with_dataset_infos = False, False
deletions, deleted_size = [], 0
repo_splits = [] # use a list to keep the order of the splits
data_dir = config_name if config_name != "default" else "data"
self = ds  # keep the variable name used in dataset_dict.py; ds is the DatasetDict
info_to_dump: DatasetInfo = next(iter(self.values())).info.copy()
info_to_dump.config_name = config_name
info_to_dump.splits = SplitDict()
total_uploaded_size = 0
total_dataset_nbytes = 0
for split in self.keys():
    dataset_nbytes = self[split]._estimate_nbytes()
    info_to_dump.splits[split] = SplitInfo(
        str(split), num_bytes=dataset_nbytes, num_examples=len(self[split])
    )
    total_uploaded_size += 0  # the actual uploaded (Parquet) size is not tracked here
    total_dataset_nbytes += dataset_nbytes
info_to_dump.download_checksums = None
info_to_dump.download_size = total_uploaded_size
info_to_dump.dataset_size = total_dataset_nbytes
info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes
metadata_config_to_dump = {
    "data_files": [
        # get_shard_prefix(split) should return the prefix used for the uploaded
        # files, e.g. "shard_train", so the glob matches them
        {"split": split, "path": f"data/{get_shard_prefix(split)}_*"}
        for split in self.keys()
    ],
}
try:
    dataset_card_path = api.hf_hub_download(
        repo_id, "README.md", repo_type="dataset", revision=revision
    )
    dataset_card = DatasetCard.load(Path(dataset_card_path))
    dataset_card_data = dataset_card.data
    metadata_configs = MetadataConfigs.from_dataset_card_data(dataset_card_data)
except Exception:
    print("No dataset card found")
    dataset_card = None
    dataset_card_data = DatasetCardData()
    metadata_configs = MetadataConfigs()
DatasetInfosDict({config_name: info_to_dump}).to_dataset_card_data(dataset_card_data)
MetadataConfigs({config_name: metadata_config_to_dump}).to_dataset_card_data(
    dataset_card_data
)
dataset_card = (
    DatasetCard(f"---\n{dataset_card_data}\n---\n")
    if dataset_card is None
    else dataset_card
)
additions = []
additions.append(
    CommitOperationAdd(
        path_in_repo="README.md", path_or_fileobj=str(dataset_card).encode()
    )
)
commit_message = "Generate dataset card"
api.create_commit(
    repo_id,
    operations=additions,
    commit_message=commit_message,
    repo_type="dataset",
    revision=revision,
)
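As a quick sanity check (just a sketch, reusing the _REPO id from above), you can reload the dataset from the Hub and confirm that the splits from the generated card are picked up:
from datasets import load_dataset

# reload from the Hub; the splits defined in the generated README should appear
ds_check = load_dataset(_REPO)
print(ds_check)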