Frank Ruis winglian committed on
Commit
7477a53
1 Parent(s): 7d1d22f

wrap prepared_ds_path in str() to avoid TypeError in fsspec package (#1548)

Browse files

* wrap prepared_ds_path in str() to avoid TypeError in fsspec package

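`fsspec` evaluates `if "::" in path` on `prepared_ds_path`, which raises a `TypeError` if it is a `PosixPath` object, because `PosixPath` does not support the `in` membership test; converting the path to `str` first avoids this.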

* update test too

---------

Co-authored-by: Wing Lian <[email protected]>

src/axolotl/utils/data/sft.py CHANGED
@@ -421,7 +421,7 @@ def load_tokenized_prepared_datasets(
421
 
422
  if cfg.local_rank == 0:
423
  LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
424
- dataset.save_to_disk(prepared_ds_path)
425
  if cfg.push_dataset_to_hub:
426
  LOG.info(
427
  f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
 
421
 
422
  if cfg.local_rank == 0:
423
  LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
424
+ dataset.save_to_disk(str(prepared_ds_path))
425
  if cfg.push_dataset_to_hub:
426
  LOG.info(
427
  f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
tests/test_datasets.py CHANGED
@@ -110,7 +110,7 @@ class TestDatasetPreparation(unittest.TestCase):
110
  """Usual use case. Verify datasets saved via `save_to_disk` can be loaded."""
111
  with tempfile.TemporaryDirectory() as tmp_dir:
112
  tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
113
- self.dataset.save_to_disk(tmp_ds_name)
114
 
115
  prepared_path = Path(tmp_dir) / "prepared"
116
  cfg = DictDefault(
 
110
  """Usual use case. Verify datasets saved via `save_to_disk` can be loaded."""
111
  with tempfile.TemporaryDirectory() as tmp_dir:
112
  tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
113
+ self.dataset.save_to_disk(str(tmp_ds_name))
114
 
115
  prepared_path = Path(tmp_dir) / "prepared"
116
  cfg = DictDefault(