model
Browse files - scripts/model.yaml +3 -3
scripts/model.yaml
CHANGED
@@ -73,7 +73,7 @@ train:
|
|
73 |
|
74 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
75 |
# max_tokens: 3000000000000
|
76 |
-
max_tokens:
|
77 |
|
78 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
79 |
max_steps:
|
@@ -110,9 +110,9 @@ eval:
|
|
110 |
# Optimizer-related arguments
|
111 |
optimizer:
|
112 |
# class_path: torch.optim.AdamW
|
113 |
-
|
114 |
# class_path: bitsandbytes.optim.AdamW8bit
|
115 |
-
class_path: bitsandbytes.optim.PagedAdamW8bit
|
116 |
|
117 |
init_args:
|
118 |
# (type: float, default: 0.001)
|
|
|
73 |
|
74 |
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
|
75 |
# max_tokens: 3000000000000
|
76 |
+
max_tokens: 4252334823 # 129767 * 32769 * 1
|
77 |
|
78 |
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
|
79 |
max_steps:
|
|
|
110 |
# Optimizer-related arguments
|
111 |
optimizer:
|
112 |
# class_path: torch.optim.AdamW
|
113 |
+
class_path: grokadamw.GrokAdamW
|
114 |
# class_path: bitsandbytes.optim.AdamW8bit
|
115 |
+
# class_path: bitsandbytes.optim.PagedAdamW8bit
|
116 |
|
117 |
init_args:
|
118 |
# (type: float, default: 0.001)
|