pretrain: fixed the bigcode/the-stack-smol-xl dataset config
Browse files
scripts/prepare_pretrain_dataset.py
CHANGED
@@ -94,7 +94,7 @@ datasets_configs = [
|
|
94 |
],
|
95 |
[
|
96 |
# ~3 GB, 4,976,850
|
97 |
-
{'path': 'saillab/taco-datasets', '
|
98 |
for name in [
|
99 |
# 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
|
100 |
'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
|
|
|
94 |
],
|
95 |
[
|
96 |
# ~3 GB, 4,976,850
|
97 |
+
{'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
|
98 |
for name in [
|
99 |
# 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
|
100 |
'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
|
scripts/train_tokenizer.py
CHANGED
@@ -23,24 +23,25 @@ def batch_iterator():
|
|
23 |
|
24 |
# code
|
25 |
dataset = (
|
26 |
-
load_dataset('bigcode/the-stack-smol-xs',
|
27 |
-
for
|
28 |
-
|
|
|
29 |
'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
|
30 |
-
'augeas', 'awk', '
|
31 |
-
'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
|
32 |
-
'cuda', 'dart', 'dockerfile', 'elixir',
|
33 |
'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
|
34 |
-
'groovy', 'haskell', 'idris', 'isabelle', 'java',
|
35 |
-
'java-server-pages', 'julia', 'kotlin', 'lean',
|
36 |
'literate-agda', 'literate-coffeescript', 'literate-haskell',
|
37 |
-
'lua', 'makefile', 'maple', 'mathematica', 'matlab',
|
38 |
-
'ocaml', 'pascal', 'perl', 'php', '
|
39 |
-
'protocol-buffer', 'r', 'racket', 'restructuredtext',
|
40 |
'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
|
41 |
'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
|
42 |
'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
|
43 |
-
'thrift', 'verilog', 'vhdl', 'visual-basic', 'xslt',
|
44 |
'yacc', 'zig',
|
45 |
]
|
46 |
)
|
|
|
23 |
|
24 |
# code
|
25 |
dataset = (
|
26 |
+
load_dataset('bigcode/the-stack-smol-xs', data_dir=f'data/{name}', split='train', trust_remote_code=True)
|
27 |
+
for name in [
|
28 |
+
# 'batchfile' - unsafe
|
29 |
+
# 'powershell' - unsafe
|
30 |
'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
|
31 |
+
'augeas', 'awk', 'bison', 'bluespec', 'c',
|
32 |
+
'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
|
33 |
+
'css', 'cuda', 'dart', 'dockerfile', 'elixir',
|
34 |
'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
|
35 |
+
'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
|
36 |
+
'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
|
37 |
'literate-agda', 'literate-coffeescript', 'literate-haskell',
|
38 |
+
'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
|
39 |
+
'ocaml', 'pascal', 'perl', 'php', 'prolog',
|
40 |
+
'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
|
41 |
'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
|
42 |
'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
|
43 |
'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
|
44 |
+
'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
|
45 |
'yacc', 'zig',
|
46 |
]
|
47 |
)
|