ich
committed on
Fix bug when using pretokenized datasets (#652)
Browse files
* fix pretokenized datasets readme
* check if dataset type is not set to handle pretokenized datasets
- README.md +1 -1
- src/axolotl/utils/config.py +2 -0
README.md
CHANGED
@@ -317,7 +317,7 @@ Using file:
|
|
317 |
#### How to use your custom pretokenized dataset
|
318 |
|
319 |
- Do not pass a `type:`
|
320 |
-
- Dataset must
|
321 |
|
322 |
|
323 |
### Config
|
|
|
317 |
#### How to use your custom pretokenized dataset
|
318 |
|
319 |
- Do not pass a `type:`
|
320 |
+
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
|
321 |
|
322 |
|
323 |
### Config
|
src/axolotl/utils/config.py
CHANGED
@@ -293,6 +293,8 @@ def validate_config(cfg):
|
|
293 |
|
294 |
if cfg.datasets:
|
295 |
for idx, ds_cfg in enumerate(cfg.datasets):
|
|
|
|
|
296 |
if ds_cfg.type == "sharegpt:chat":
|
297 |
LOG.warning(
|
298 |
PendingDeprecationWarning(
|
|
|
293 |
|
294 |
if cfg.datasets:
|
295 |
for idx, ds_cfg in enumerate(cfg.datasets):
|
296 |
+
if not ds_cfg.type:
|
297 |
+
continue
|
298 |
if ds_cfg.type == "sharegpt:chat":
|
299 |
LOG.warning(
|
300 |
PendingDeprecationWarning(
|