Crystalcareai committed · verified · Commit fb98720 · 1 Parent(s): 0b90483

Update modeling_quiet.py

Files changed (1)
  1. modeling_quiet.py +5 -5
modeling_quiet.py CHANGED
@@ -60,7 +60,7 @@ def model_init(params):
     trust_remote_code=True,
     device_map="auto",
     # load_in_4bit=True,
-    # attn_implementation="flash_attention_2",
+    # attn_implementation="flash_attention_2", #flash-attn currently unsupported.
     )
     print("Loaded model")
 
@@ -115,7 +115,7 @@ training_args = TrainingArguments(
     # beta2=0.95,
     # auto_find_batch_size=True
     learning_rate=2e-07,
-    max_grad_norm=1.0, # Gradient clipping with a maximum gradient norm of 0.3
+    max_grad_norm=1.0,
     warmup_steps=10,
     lr_scheduler_type="cosine",
     push_to_hub=False,
@@ -125,12 +125,12 @@ training_args = TrainingArguments(
 
 # Training is currently bugged with lora/qlora
 # peft_config = LoraConfig(
-# r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+# r = 16,
 # target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
 # "gate_proj", "up_proj", "down_proj",],
 # lora_alpha = 16,
-# lora_dropout = 0, # Supports any, but = 0 is optimized
-# bias = "none", # Enable Dora method
+# lora_dropout = 0,
+# bias = "none",
 # use_dora=False,
 # )
 
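
For context, here is a minimal, assumption-based sketch of the model-loading call that the first hunk touches. It is not code from this commit: the model id and the return statement are placeholders, and only the keyword arguments visible in the diff above are taken from the source.

from transformers import AutoModelForCausalLM

def model_init(params):
    # Sketch only: "your-org/your-model" is a placeholder, not the repo's actual checkpoint.
    model = AutoModelForCausalLM.from_pretrained(
        "your-org/your-model",
        trust_remote_code=True,
        device_map="auto",
        # load_in_4bit=True,
        # attn_implementation="flash_attention_2",  # flash-attn currently unsupported.
    )
    print("Loaded model")
    return model

Per the diff's own note, the attn_implementation kwarg stays commented out because flash-attn is currently unsupported for this model.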