mtasic85 committed on
Commit
dfc94d9
·
1 Parent(s): df39d2c

pretrain fixed bigcode/the-stack-smol-xl dataset

Browse files
scripts/prepare_pretrain_dataset.py CHANGED
@@ -94,7 +94,7 @@ datasets_configs = [
94
  ],
95
  [
96
  # ~3 GB, 4,976,850
97
- {'path': 'saillab/taco-datasets', 'name': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
98
  for name in [
99
  # 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
100
  'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
 
94
  ],
95
  [
96
  # ~3 GB, 4,976,850
97
+ {'path': 'saillab/taco-datasets', 'data_dir': name, 'split': 'train', 'format': '{instruction} {input} {output}'}
98
  for name in [
99
  # 'multilingual-instruction-tuning-dataset /multilingual-alpaca-52k-gpt-4',
100
  'multilingual-instruction-tuning-dataset /multilinugal-dolly-15k',
scripts/train_tokenizer.py CHANGED
@@ -23,24 +23,25 @@ def batch_iterator():
23
 
24
  # code
25
  dataset = (
26
- load_dataset('bigcode/the-stack-smol-xs', lang, split='train', trust_remote_code=True)
27
- for lang in [
28
- 'python', 'javascript', 'typescript', 'html', 'css', 'c', 'c++', 'markdown',
 
29
  'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
30
- 'augeas', 'awk', 'batchfile', 'bison', 'bluespec',
31
- 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
32
- 'cuda', 'dart', 'dockerfile', 'elixir',
33
  'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
34
- 'groovy', 'haskell', 'idris', 'isabelle', 'java',
35
- 'java-server-pages', 'julia', 'kotlin', 'lean',
36
  'literate-agda', 'literate-coffeescript', 'literate-haskell',
37
- 'lua', 'makefile', 'maple', 'mathematica', 'matlab',
38
- 'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',
39
- 'protocol-buffer', 'r', 'racket', 'restructuredtext',
40
  'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
41
  'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
42
  'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
43
- 'thrift', 'verilog', 'vhdl', 'visual-basic', 'xslt',
44
  'yacc', 'zig',
45
  ]
46
  )
 
23
 
24
  # code
25
  dataset = (
26
+ load_dataset('bigcode/the-stack-smol-xs', data_dir=f'data/{name}', split='train', trust_remote_code=True)
27
+ for name in [
28
+ # 'batchfile' - unsafe
29
+ # 'powershell' - unsafe
30
  'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly',
31
+ 'augeas', 'awk', 'bison', 'bluespec', 'c',
32
+ 'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp',
33
+ 'css', 'cuda', 'dart', 'dockerfile', 'elixir',
34
  'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go',
35
+ 'groovy', 'haskell','html', 'idris', 'isabelle', 'java',
36
+ 'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean',
37
  'literate-agda', 'literate-coffeescript', 'literate-haskell',
38
+ 'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab',
39
+ 'ocaml', 'pascal', 'perl', 'php', 'prolog',
40
+ 'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext',
41
  'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme',
42
  'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan',
43
  'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex',
44
+ 'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt',
45
  'yacc', 'zig',
46
  ]
47
  )