---
# Hydra/OmegaConf-style training configuration for a process reward model
# (PRM) fine-tuned from a LLaMA-2 7B SFT checkpoint on LogiQA-v2 ReAct
# trajectories. `${...}` values are OmegaConf interpolations resolved from
# this file's root; `_target_` keys are instantiated by the training harness.
#
# NOTE(review): this file was recovered from a whitespace-collapsed dump.
# Nesting was reconstructed from the DeepSpeed config schema and the visible
# key order — confirm against the original run config before reuse.

# DeepSpeed engine configuration (forwarded verbatim to deepspeed.initialize).
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  scheduler:
    type: WarmupDecayLR
    params:
      total_num_steps: 1688
      warmup_max_lr: ${learning_rate}
      warmup_num_steps: 200
      warmup_type: linear
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      eps: 1.0e-06
      weight_decay: ${weight_decay}
  bf16:
    enabled: true
  zero_optimization:
    stage: 1
    # stage3_* thresholds are inert at stage 1 but kept as written.
    stage3_param_persistence_threshold: 100000.0
    stage3_max_live_parameters: 100000000.0
    stage3_prefetch_bucket_size: 100000000.0
    memory_efficient_linear: false
  steps_per_print: 25
  gradient_clipping: 1.0
  prescale_gradients: false

# Source SFT checkpoint and the data artifacts derived from it.
sft_model_dir: experiments/llama2.7b.chat.logiqav2.llama-2-70b-chat.dpo-sft.A6K.w4.v1.0/checkpoint-1600/
fix_hack_data_dir: ${sft_model_dir}/fix_hack_data_dir/
attempt_response_file: ${sft_model_dir}/react-inter-states/process-rm/logiqav2-train.qa.react.v1.0.0shot.sample10.inter_ver2.0.rs0.2.r0.3.sample3.json
inter_states_file_train: ${sft_model_dir}/react-inter-states/logiqav2-train.full.qa.react.v1.0.0shot.sample10.clean_inter_ver2.0.rs0.2.r0.3.[1-2]-of-20.json
inter_states_file_dev: ${sft_model_dir}/react-inter-states/logiqav2-train.full.qa.react.v1.0.0shot.sample10.clean_inter_ver2.0.rs0.2.r0.3.0-of-20.json
train_file: ${attempt_response_file}
dev_file: ${attempt_response_file}
test_file: null

# Model / tokenizer construction (instantiated via `_target_`).
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16
tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: experiments/llama2.7b.chat.logiqav2.llama-2-70b-chat.dpo-sft.A6K.w4.v1.0/checkpoint-1600/
  padding_side: left
device_map:
  _target_: models.llama.return_single_device_map
model:
  _target_: models.llama.LlamaModelForSequenceClassification.from_pretrained
  num_labels: 4
  gradient_checkpointing: true
  pad_token_id: 0
  attn_implementation: flash_attention_2
  torch_dtype: ${torch_dtype}
  device_map: ${device_map}

# Training dataset: attempt→value reward-modeling over partial trajectories
# (train split of the intermediate-state files).
read_tensor_train:
  _target_: data.general.Attempt2ValueRewardModelingDataset
  max_value: 3
  original_data_file: ../research.data/LogiQA2.0/logiqa/DATA/LOGIQA/train.txt
  original_reader:
    _target_: data.logiqav2.LogicQAReader
    flat_options: true
  instruction:
    _target_: data.prompts.logiqav2.react.prompts.get_prompt
    prompt_name: react_v2
  reader:
    _target_: data.general.PartialTrajAttemptsReader
    partial_traj_file: ${inter_states_file_train}
  # NOTE(review): the blank-line separators inside this prompt template were
  # lost in the collapsed dump; reconstructed as "\n\n" — confirm against the
  # prompt builder in data.general.
  template: "Context: {}\n\nQuestion: {}\n\nOptions: {}\n\nThought 1: "
  compose_keys:
    - context
    - question
    - option_list

# Evaluation dataset: identical pipeline, dev split of intermediate states.
read_tensor:
  _target_: data.general.Attempt2ValueRewardModelingDataset
  max_value: 3
  original_data_file: ../research.data/LogiQA2.0/logiqa/DATA/LOGIQA/train.txt
  original_reader:
    _target_: data.logiqav2.LogicQAReader
    flat_options: true
  instruction:
    _target_: data.prompts.logiqav2.react.prompts.get_prompt
    prompt_name: react_v2
  reader:
    _target_: data.general.PartialTrajAttemptsReader
    partial_traj_file: ${inter_states_file_dev}
  # NOTE(review): same reconstruction caveat as read_tensor_train.template.
  template: "Context: {}\n\nQuestion: {}\n\nOptions: {}\n\nThought 1: "
  compose_keys:
    - context
    - question
    - option_list

dist_load_data_barrier: false
extended_vocab: null

# Batch collation.
collator:
  _target_: data.general.Attempt2ValueCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 4096
num_workers: 8
prefetch_factor: 2

# Experiment bookkeeping.
model_name_or_path: ${sft_model_dir}
pretrain: null
resume: null
exp_name: llama2.7b.chat.logiqav2.70b-distil.prm.fix_hack.A100.w4.v1.2.s${seed}
exp_notes: null
output_dir: experiments/${exp_name}

# Trainer switches and hyperparameters.
do_train: true
evaluate_during_training: true
do_eval: false
eval_sub_path: checkpoint-*
per_gpu_train_batch_size: 4
per_gpu_eval_batch_size: 8
learning_rate: 1.0e-06
gradient_accumulation_steps: 4
weight_decay: 0.01
adam_epsilon: 1.0e-06
# Stored as a string tuple literal; the trainer parses it (DeepSpeed's own
# optimizer above uses betas 0.9/0.95 and takes precedence when ds_cfg runs).
adam_betas: (0.9, 0.98)
total_dataset_len: 54071
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0
warmup_steps: 200
optimizer: null
use_nvlamb: null
bit_training: null
logging_steps: 5
save_ds_state: false
save_steps: 200
save_best: false
eval_steps: 200
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0
# NOTE(review): fp16 flags coexist with ds_cfg.bf16.enabled — presumably the
# fp16_bfloat16 switch routes mixed precision to bf16; verify in the trainer.
fp16: true
fp16_opt_level: O1
fp16_bfloat16: true

# Best-checkpoint tracking (written back by the harness after evaluation).
prediction_cfg:
  metric: acc
  measure: 1
  best_checkpoint: experiments/llama2.7b.chat.logiqav2.70b-distil.prm.fix_hack.A100.w4.v1.2.s42/checkpoint-200
  best_result: 0.5669135624910111
eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn
post_process:
  _target_: post_processors.dpo.ResponseClsPostProcessor
summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  outputs_index_or_keys: null

# Runtime values (typically filled in by the launcher, not hand-edited).
n_gpu: 1
device: cuda:0
train_batch_size: 4
eval_batch_size: 8
world_size: 4