winglian committed on
Commit
5ac3392
·
unverified ·
1 Parent(s): e356b29

support for datasets with multiple names (#480)

Browse files

* support for datasets with multiple names

* update docs

Files changed (2) hide show
  1. README.md +9 -0
  2. src/axolotl/utils/data.py +10 -1
README.md CHANGED
@@ -328,6 +328,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
328
  name: enron_emails
329
  type: completion # format from earlier
330
 
 
 
 
 
 
 
 
 
 
331
  # local
332
  datasets:
333
  - path: data.jsonl # or json
 
328
  name: enron_emails
329
  type: completion # format from earlier
330
 
331
+ # huggingface repo with multiple named configurations/subsets
332
+ datasets:
333
+ - path: bigcode/commitpackft
334
+ name:
335
+ - ruby
336
+ - python
337
+ - typescript
338
+ type: ... # unimplemented custom format
339
+
340
  # local
341
  datasets:
342
  - path: data.jsonl # or json
src/axolotl/utils/data.py CHANGED
@@ -134,8 +134,17 @@ def load_tokenized_prepared_datasets(
134
  seed = 42
135
 
136
  datasets = []
 
 
 
 
 
 
 
 
 
137
  # pylint: disable=invalid-name
138
- for d in cfg.datasets:
139
  ds: Union[Dataset, DatasetDict] = None
140
  ds_from_hub = False
141
  try:
 
134
  seed = 42
135
 
136
  datasets = []
137
+
138
+ def for_d_in_datasets(dataset_configs):
139
+ for dataset in dataset_configs:
140
+ if dataset.name and isinstance(dataset.name, list):
141
+ for name in dataset.name:
142
+ yield DictDefault({**dataset, "name": name})
143
+ else:
144
+ yield dataset
145
+
146
  # pylint: disable=invalid-name
147
+ for d in for_d_in_datasets(cfg.datasets):
148
  ds: Union[Dataset, DatasetDict] = None
149
  ds_from_hub = False
150
  try: