tokenizer training
scripts/pretrain_datasets.py
CHANGED
@@ -42,6 +42,10 @@ pretrain_datasets = [
     # stem
     #
     # 1.44 GB, 63,357
+    *[
+        {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['abstract']}
+        for i in range(0, 100, 5)
+    ],
     *[
         {'path': 'neuralwork/arxiver', 'split': f'train[{i}%:{i + 5}%]', 'format': lambda n: n['markdown']}
         for i in range(0, 100, 5)