# 30k-Llama3-8B / config.yaml
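# Data: prompt template `llama3_formal` over a 30k-sample SusGen instruction file;
# with `val: null`, a 0.5% slice of the training set is presumably held out for validation.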
data:
  prompt: llama3_formal
  train: ../data/susgen/FINAL/PER_3500/FINAL_PER3500_30k.json
  val: null
  val_split_ratio: 0.005
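# Runtime: single-process CUDA run (local_rank 0); `instruct_mask` presumably masks
# prompt tokens out of the loss so only response tokens contribute to training.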
device: cuda
instruct_mask: true
local_rank: 0
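# Model: Meta-Llama-3-8B-Instruct loaded in 4-bit (`quantization: int4` presumably
# selects int4_config below, leaving int8_config unused) with LoRA fine-tuning enabled.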
model:
  acceleration: null
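  # int4_config mirrors a bitsandbytes BitsAndBytesConfig: 4-bit NF4 weights with
  # double quantization and bfloat16 compute, the standard QLoRA recipe.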
  int4_config:
    bnb_4bit_compute_dtype: bfloat16
    bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant: true
    load_in_4bit: true
    load_in_8bit: false
  int8_config:
    load_in_4bit: false
    load_in_8bit: true
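  # lora mirrors a PEFT LoraConfig: rank-16 adapters with alpha 32 (scaling
  # alpha/r = 2) and 0.1 dropout on all attention/MLP projections plus lm_head.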
  lora:
    bias: none
    inference_mode: false
    lora_alpha: 32
    lora_dropout: 0.1
    r: 16
    target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - up_proj
    - down_proj
    - lm_head
    task_type: CAUSAL_LM
  lora_path: false
  model_path: ../ckpts/Meta-Llama-3-8B-Instruct
  quantization: int4
  seed: 2024
  show_config: false
  use_lora: true
  window: null
name: 30k-Llama3-8B
output_dir: ../results/
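# Tokenizer: left padding (typical for batched decoder-only models) and truncation to
# 2048 tokens; note the tokenizer loads from the base Meta-Llama-3-8B path while the
# model weights above come from the Instruct checkpoint.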
tokenizer:
  add_bos_token: true
  add_eos_token: false
  add_prefix_space: false
  encode:
    max_length: 2048
    return_tensors: pt
    truncation: true
  model_max_length: 2048
  padding_side: left
  pretrained_model_name_or_path: ../ckpts/Meta-Llama-3-8B
  truncation_side: right
  use_fast: true
trainer: NewTrainer
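# Training: DeepSpeed ZeRO stage 2 with bf16; per-device batch 16 x 16 accumulation
# steps = 256 sequences per optimizer step per GPU; paged 32-bit AdamW, cosine schedule
# with 100 warmup steps over 301 total steps; checkpoints every 20 steps, logging to wandb.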
training:
  bf16: true
  deepspeed: ./configs/ds_configs/ds_config_stage_2.json
  gradient_accumulation_steps: 16
  learning_rate: 1.0e-05
  logging_steps: 1
  lr_scheduler_type: cosine
  max_steps: 301
  optim: paged_adamw_32bit
  per_device_train_batch_size: 16
  remove_unused_columns: false
  report_to: wandb
  resume_from_checkpoint: null
  save_steps: 20
  save_strategy: steps
  warmup_steps: 100
  weight_decay: 0.01