mtasic85 commited on
Commit
992b12b
·
1 Parent(s): d5265f8

prepare dataset

Browse files
scripts/TRAIN.md ADDED
File without changes
scripts/model.yaml ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
2
+ # ``model_config``. (type: Optional[str], default: null)
3
+ model_name: "tiny-llama-1.1b"
4
+
5
+ # A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
6
+ # ``model_name``. (type: Optional[Config], default: null)
7
+ model_config:
8
+ padded_vocab_size: 32768
9
+ vocab_size: 32768
10
+ block_size: 32768
11
+ n_layer: 10
12
+ n_head: 12
13
+ head_size: null
14
+ n_embd: 312
15
+ n_query_groups: 4
16
+ rotary_percentage: 1.0
17
+ parallel_residual: false
18
+ bias: false
19
+ norm_class_name: "RMSNorm"
20
+ norm_eps: 1e-05
21
+ mlp_class_name: "LLaMAMLP"
22
+ intermediate_size: 1092
23
+ rope_base: 500000
24
+
25
+ # Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
26
+ # /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
27
+ out_dir: "../out/contrain/"
28
+
29
+ # The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
30
+ # precision: bf16-mixed
31
+ precision: bf16-true
32
+
33
+ # Optional path to a checkpoint directory to initialize the model from.
34
+ # Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
35
+ initial_checkpoint_dir: "tangledgroup/tangled-llama-33m-32k-base-v0.1"
36
+
37
+ # Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
38
+ # from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
39
+ # ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
40
+ # (type: Union[bool, Literal["auto"], Path], default: False)
41
+ # resume: false
42
+ resume: "auto"
43
+
44
+ # Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
45
+ data:
46
+ class_path: LitData
47
+
48
+ init_args:
49
+ data_path: "../data/"
50
+ num_workers: 16
51
+ seq_length: 32768
52
+
53
+ # Training-related arguments. See ``litgpt.args.TrainArgs`` for details
54
+ train:
55
+ # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
56
+ save_interval: 1000
57
+
58
+ # Number of iterations between logging calls (type: int, default: 1)
59
+ log_interval: 1
60
+
61
+ # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
62
+ global_batch_size: 512
63
+
64
+ # Number of samples per data-parallel rank (type: int, default: 4)
65
+ micro_batch_size: 16
66
+ # micro_batch_size: 14
67
+
68
+ # Number of iterations with learning rate warmup active (type: int, default: 2000)
69
+ lr_warmup_steps: 2000
70
+
71
+ # Number of epochs to train on (type: Optional[int], default: null)
72
+ epochs:
73
+
74
+ # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
75
+ # max_tokens: 3000000000000
76
+ max_tokens: 9782206713 # 1591379 * 2049 * 3
77
+
78
+ # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
79
+ max_steps:
80
+
81
+ # Limits the length of samples. Off by default (type: Optional[int], default: null)
82
+ max_seq_length: 32768
83
+
84
+ # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
85
+ tie_embeddings:
86
+
87
+ # (type: Optional[float], default: 1.0)
88
+ max_norm: 1.0
89
+
90
+ # (type: float, default: 4e-05)
91
+ min_lr: 4.0e-05
92
+
93
+ # Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
94
+ eval:
95
+ # Number of optimizer steps between evaluation calls (type: int, default: 1000)
96
+ interval: 100
97
+
98
+ # Number of tokens to generate (type: Optional[int], default: null)
99
+ max_new_tokens:
100
+
101
+ # Number of iterations (type: int, default: 100)
102
+ max_iters: 100
103
+
104
+ # Whether to evaluate on the validation set at the beginning of the training
105
+ initial_validation: false
106
+
107
+ # Whether to evaluate on the validation set at the end of the training
108
+ final_validation: true
109
+
110
+ # Optimizer-related arguments
111
+ optimizer:
112
+ # class_path: torch.optim.AdamW
113
+ class_path: grokadamw.GrokAdamW
114
+ # class_path: bitsandbytes.optim.AdamW8bit
115
+ # class_path: bitsandbytes.optim.PagedAdamW8bit
116
+
117
+ init_args:
118
+ # (type: float, default: 0.001)
119
+ lr: 1.0e-3
120
+
121
+ # (type: float, default: 0.01)
122
+ weight_decay: 0.1
123
+
124
+ # (type: tuple, default: (0.9,0.999))
125
+ betas:
126
+ - 0.9
127
+ - 0.95
128
+
129
+ # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
130
+ devices: auto
131
+
132
+ # How many nodes to use. (type: int, default: 1)
133
+ num_nodes: 1
134
+
135
+ # Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
136
+ # modules require this. (type: Optional[Path], default: null)
137
+ tokenizer_dir: "../"
138
+
139
+ # The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
140
+ logger_name: "wandb"
141
+
142
+ # The random seed to use for reproducibility. (type: int, default: 42)
143
+ seed: 42
scripts/prepare_contrain_dataset.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+
3
+ from datasets import load_dataset
4
+ from litdata import optimize, TokensLoader
5
+ from litgpt.tokenizer import Tokenizer
6
+ from functools import partial
7
+
8
+
9
# (hub id, split, column holding the list of chat messages).
# A ``None`` column marks the instruction/input/output row layout.
_DATASET_SPECS = (
    ('Replete-AI/Everything_Instruct_Multilingual', 'train', None),
    ('HuggingFaceH4/ultrachat_200k', 'train_sft', 'messages'),
    ('HuggingFaceH4/no_robots', 'train', 'messages'),
    ('datatab/ultrafeedback_binarized_serbian', 'train_sft', 'chosen'),
    ('datatab/alpaca-cleaned-serbian-full', 'train', None),
)

# (role, row key) pairs, in output order, for the instruction-style layout.
_INSTRUCT_FIELDS = (
    ('system', 'instruction'),
    ('user', 'input'),
    ('assistant', 'output'),
)


def _chatml_turn(role, content):
    """Render a single conversation turn wrapped in ChatML markers."""
    return f'<|im_start|>{role}\n{content}<|im_end|>'


def _join_turns(turns):
    """Join rendered turns with newlines and end the sample with a newline."""
    return '\n'.join(turns) + '\n'


def _format_instruct_row(row):
    """Render an instruction/input/output row, skipping falsy fields."""
    return _join_turns([
        _chatml_turn(role, row[key])
        for role, key in _INSTRUCT_FIELDS
        if row[key]
    ])


def _format_messages(messages):
    """Render a list of ``{'role': ..., 'content': ...}`` messages."""
    return _join_turns([
        _chatml_turn(message['role'], message['content'])
        for message in messages
    ])


def batch_iterator(name=None):
    """Yield ChatML-formatted training texts from the configured datasets.

    Args:
        name: Hub id of a single dataset to read. ``None`` iterates over
            every dataset in ``_DATASET_SPECS``, in order. An id that is
            not listed there yields nothing.

    Yields:
        One string per dataset row: the row's turns rendered as
        ``<|im_start|>{role}\\n{content}<|im_end|>``, joined with newlines
        and terminated by a trailing newline.
    """
    for dataset_id, split, messages_column in _DATASET_SPECS:
        if name not in (None, dataset_id):
            continue

        dataset = load_dataset(dataset_id, split=split)

        # Every row is emitted. The earlier version stopped after the first
        # row of each dataset because of an unconditional ``break``.
        for row in dataset:
            if messages_column is None:
                yield _format_instruct_row(row)
            else:
                yield _format_messages(row[messages_column])

        # Drop the reference and force a collection before the next
        # dataset is loaded.
        del dataset
        gc.collect()
119
+
120
+
121
def tokenize_fn(dataset_name, tokenizer=None):
    """Yield the encoded form of every text produced for ``dataset_name``.

    Args:
        dataset_name: Forwarded unchanged to ``batch_iterator``.
        tokenizer: Object exposing ``encode(text, bos=..., eos=...)``. The
            ``None`` default is only a placeholder; a real tokenizer must
            be supplied, since ``encode`` is called on it for each sample.

    Yields:
        The result of ``tokenizer.encode(sample, bos=False, eos=True)`` for
        each sample, in the order ``batch_iterator`` produces them.
    """
    samples = batch_iterator(dataset_name)

    for sample in samples:
        yield tokenizer.encode(sample, bos=False, eos=True)
125
+
126
+
127
# Hub ids of the datasets to convert; handed to ``optimize`` as its inputs.
datasets_names = [
    'Replete-AI/Everything_Instruct_Multilingual',
    'HuggingFaceH4/ultrachat_200k',
    'HuggingFaceH4/no_robots',
    'datatab/ultrafeedback_binarized_serbian',
    'datatab/alpaca-cleaned-serbian-full',
]

# ``optimize`` is started with 16 workers. Keeping the call behind the main
# guard means it only runs when this file is executed as a script, not when
# the module is imported (for example by a worker process that re-imports it).
if __name__ == '__main__':
    outputs = optimize(
        fn=partial(tokenize_fn, tokenizer=Tokenizer('..')),
        inputs=datasets_names,
        output_dir='../data/',
        # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
        chunk_size=((32768 + 1) * 500),
        num_workers=16,
    )
    # NOTE(review): ``TokensLoader`` is imported at the top of the file but
    # never used; confirm whether ``item_loader=TokensLoader()`` was meant
    # to be passed to ``optimize``.
scripts/requirements.in ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
2
+ tqdm
3
+ datasets
4
+ jinja2
5
+ transformers
6
+ bitsandbytes
7
+ wandb
8
+ # litgpt[all]
9
+ litgpt[all] @ git+https://github.com/Lightning-AI/litgpt.git
10
+ litdata
11
+ grokadamw