chargoddard
committed on
Commit
·
88089e8
1
Parent(s):
168a7a0
Add ability to pass 'name' argument to load_dataset
Browse files
- src/axolotl/utils/data.py +13 -14
src/axolotl/utils/data.py
CHANGED
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
|
|
94 |
try:
|
95 |
load_dataset(
|
96 |
d.path,
|
|
|
97 |
streaming=True,
|
98 |
use_auth_token=use_auth_token,
|
99 |
)
|
@@ -107,6 +108,7 @@ def load_tokenized_prepared_datasets(
|
|
107 |
if local_path.is_dir():
|
108 |
ds = load_dataset(
|
109 |
d.path,
|
|
|
110 |
data_files=d.data_files,
|
111 |
streaming=False,
|
112 |
split=None,
|
@@ -114,6 +116,7 @@ def load_tokenized_prepared_datasets(
|
|
114 |
elif local_path.is_file():
|
115 |
ds = load_dataset(
|
116 |
"json",
|
|
|
117 |
data_files=d.path,
|
118 |
streaming=False,
|
119 |
split=None,
|
@@ -123,26 +126,22 @@ def load_tokenized_prepared_datasets(
|
|
123 |
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
124 |
)
|
125 |
elif ds_from_hub:
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
else:
|
134 |
-
ds = load_dataset(
|
135 |
-
d.path,
|
136 |
-
streaming=False,
|
137 |
-
use_auth_token=use_auth_token,
|
138 |
-
)
|
139 |
else:
|
140 |
fp = hf_hub_download(
|
141 |
repo_id=d.path,
|
142 |
repo_type="dataset",
|
143 |
filename=d.data_files,
|
144 |
)
|
145 |
-
ds = load_dataset(
|
|
|
|
|
146 |
if not ds:
|
147 |
raise ValueError("unhandled dataset load")
|
148 |
# support for using a subset of the data
|
|
|
94 |
try:
|
95 |
load_dataset(
|
96 |
d.path,
|
97 |
+
name=d.name,
|
98 |
streaming=True,
|
99 |
use_auth_token=use_auth_token,
|
100 |
)
|
|
|
108 |
if local_path.is_dir():
|
109 |
ds = load_dataset(
|
110 |
d.path,
|
111 |
+
name=d.name,
|
112 |
data_files=d.data_files,
|
113 |
streaming=False,
|
114 |
split=None,
|
|
|
116 |
elif local_path.is_file():
|
117 |
ds = load_dataset(
|
118 |
"json",
|
119 |
+
name=d.name,
|
120 |
data_files=d.path,
|
121 |
streaming=False,
|
122 |
split=None,
|
|
|
126 |
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
127 |
)
|
128 |
elif ds_from_hub:
|
129 |
+
ds = load_dataset(
|
130 |
+
d.path,
|
131 |
+
name=d.name,
|
132 |
+
streaming=False,
|
133 |
+
data_files=d.data_files,
|
134 |
+
use_auth_token=use_auth_token,
|
135 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
136 |
else:
|
137 |
fp = hf_hub_download(
|
138 |
repo_id=d.path,
|
139 |
repo_type="dataset",
|
140 |
filename=d.data_files,
|
141 |
)
|
142 |
+
ds = load_dataset(
|
143 |
+
"json", name=d.name, data_files=fp, streaming=False, split=None
|
144 |
+
)
|
145 |
if not ds:
|
146 |
raise ValueError("unhandled dataset load")
|
147 |
# support for using a subset of the data
|