support for datasets with multiple names (#480)
Browse files
* support for datasets with multiple names
* update docs
- README.md +9 -0
- src/axolotl/utils/data.py +10 -1
README.md
CHANGED
@@ -328,6 +328,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
|
328 |
name: enron_emails
|
329 |
type: completion # format from earlier
|
330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
331 |
# local
|
332 |
datasets:
|
333 |
- path: data.jsonl # or json
|
|
|
328 |
name: enron_emails
|
329 |
type: completion # format from earlier
|
330 |
|
331 |
+
# huggingface repo with multiple named configurations/subsets
|
332 |
+
datasets:
|
333 |
+
- path: bigcode/commitpackft
|
334 |
+
name:
|
335 |
+
- ruby
|
336 |
+
- python
|
337 |
+
- typescript
|
338 |
+
type: ... # unimplemented custom format
|
339 |
+
|
340 |
# local
|
341 |
datasets:
|
342 |
- path: data.jsonl # or json
|
src/axolotl/utils/data.py
CHANGED
@@ -134,8 +134,17 @@ def load_tokenized_prepared_datasets(
|
|
134 |
seed = 42
|
135 |
|
136 |
datasets = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
# pylint: disable=invalid-name
|
138 |
-
for d in cfg.datasets:
|
139 |
ds: Union[Dataset, DatasetDict] = None
|
140 |
ds_from_hub = False
|
141 |
try:
|
|
|
134 |
seed = 42
|
135 |
|
136 |
datasets = []
|
137 |
+
|
138 |
+
def for_d_in_datasets(dataset_configs):
|
139 |
+
for dataset in dataset_configs:
|
140 |
+
if dataset.name and isinstance(dataset.name, list):
|
141 |
+
for name in dataset.name:
|
142 |
+
yield DictDefault({**dataset, "name": name})
|
143 |
+
else:
|
144 |
+
yield dataset
|
145 |
+
|
146 |
# pylint: disable=invalid-name
|
147 |
+
for d in for_d_in_datasets(cfg.datasets):
|
148 |
ds: Union[Dataset, DatasetDict] = None
|
149 |
ds_from_hub = False
|
150 |
try:
|