2024-05-21 13:11:53.195 | INFO | __main__:setup_everything:52 - train_args:TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'gradient_accumulation_kwargs': None},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=2,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=16,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0002,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_13-11-52_ts-ccabdee9f774458487b5dd0f562f0b70-launcher,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=100,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=constant_with_warmup,
max_grad_norm=0.3,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=1,
optim=paged_adamw_32bit,
optim_args=None,
optim_target_modules=None,
output_dir=output/firefly-qwen-7b-sft-qlora,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=1,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=False,
report_to=['tensorboard'],
resume_from_checkpoint=None,
run_name=output/firefly-qwen-7b-sft-qlora,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=100,
save_strategy=steps,
save_total_limit=1,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=100,
weight_decay=0,
)
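
For reference, the non-default fields in the dump above map onto a plain TrainingArguments construction. A minimal sketch (Firefly itself appears to fill these in from a JSON config via HfArgumentParser, so this is illustrative, not the repo's code); note the effective batch size is per_device_train_batch_size x gradient_accumulation_steps = 1 x 16 = 16 samples per optimizer step on the single GPU (_n_gpu=1):

from transformers import TrainingArguments

train_args = TrainingArguments(
    output_dir="output/firefly-qwen-7b-sft-qlora",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,    # effective batch size: 1 * 16 = 16
    learning_rate=2e-4,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=100,
    max_grad_norm=0.3,                 # tight clipping, typical for QLoRA recipes
    weight_decay=0,
    optim="paged_adamw_32bit",         # paged AdamW from bitsandbytes
    fp16=True,
    gradient_checkpointing=True,
    logging_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,                # keep only the newest checkpoint
    seed=42,
    remove_unused_columns=False,       # the custom SFT dataset needs its columns
    report_to=["tensorboard"],
)
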
2024-05-21 13:11:53.197 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 13:12:44.163 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the first dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_13-12-43_ts-ccabdee9f774458487b5dd0f562f0b70-launcher]
2024-05-21 13:12:44.166 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 13:12:44.696 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-21 13:12:44.697 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-21 13:12:44.697 | INFO | __main__:load_model:221 - Train model with qlora
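
"Train model with qlora" means the base weights are loaded 4-bit-quantized through bitsandbytes before LoRA adapters are attached. A minimal sketch of that load, assuming the usual QLoRA quantization settings (NF4 with double quantization; the exact flags live in Firefly's load_model and are not in this log):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store base weights in 4 bit
    bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.float16,  # matches fp16=True in the args above
)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat",
    quantization_config=bnb_config,
    trust_remote_code=True,  # Qwen-7B-Chat ships custom modeling code
)
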
2024-05-21 13:13:28.400 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the first dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_13-13-28_ts-ccabdee9f774458487b5dd0f562f0b70-launcher]
2024-05-21 13:13:28.403 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 13:13:28.848 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-21 13:13:28.849 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-21 13:13:28.849 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-21 13:13:57.006 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_proj', 'w1', 'c_attn', 'w2']
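
find_all_linear_names walks the quantized model and collects the leaf name of every 4-bit linear layer as a LoRA target; because the names are gathered into a Python set, their order differs from run to run in this log while the set itself stays the same. A sketch of the standard QLoRA helper, consistent with the output above:

import bitsandbytes as bnb

def find_all_linear_names(model):
    names = set()
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            names.add(name.split(".")[-1])  # keep the leaf name, e.g. "c_attn"
    names.discard("lm_head")  # never adapt the output head
    return list(names)

# For Qwen-7B this yields {'c_attn', 'c_proj', 'w1', 'w2'}: the attention
# input/output projections plus the two gated-MLP projections.
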
2024-05-21 13:14:01.246 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-21 13:14:01.253 | INFO | __main__:load_model:295 - Total model params: 4626.45M
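
The ~4626M figure is well below Qwen-7B's nominal parameter count because bitsandbytes packs two 4-bit weights into each stored int8 element, so quantized layers report half their true numel while the non-quantized embedding and output layers report in full. Both logged numbers can be reproduced with something like:

total = sum(p.numel() for p in model.parameters())
print(f"Total model params: {total / 1e6:.2f}M")                        # ~4626.45M here
print(f"memory footprint: {model.get_memory_footprint() / 2**30} GB")   # ~8.19 GB here
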
2024-05-21 13:14:01.254 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-21 13:14:01.254 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-21 13:14:01.254 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-21 13:14:01.260 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-21 13:14:01.260 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
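
UnifiedSFTDataset reads one JSON object per line and later renders each conversation with the "qwen" chat template; "33 data" means 33 conversations. A minimal stand-in loader, assuming the multi-turn schema Firefly's dummy data uses (a "conversation" list of human/assistant turns; verify against the repo's data docs):

import json
from torch.utils.data import Dataset

class JsonlConversations(Dataset):
    """Minimal stand-in for UnifiedSFTDataset: one conversation per line."""
    def __init__(self, path):
        with open(path, encoding="utf-8") as f:
            self.rows = [json.loads(line) for line in f if line.strip()]
    def __len__(self):
        return len(self.rows)
    def __getitem__(self, idx):
        # e.g. {"conversation": [{"human": "...", "assistant": "..."}, ...]}
        return self.rows[idx]

print(len(JsonlConversations("./data/dummy_data.jsonl")))  # 33 in this log
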
2024-05-21 13:14:01.303 | INFO | __main__:main:387 - *** starting training ***
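
Between model loading and "*** starting training ***", the LoRA adapters are attached and a standard Trainer takes over. A sketch continuing the ones above; the rank/alpha/dropout values are assumed QLoRA-style defaults, not read from this log, and Firefly's own SFT data collator is omitted:

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import Trainer

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
lora_config = LoraConfig(
    r=64, lora_alpha=16, lora_dropout=0.05,           # assumed hyperparameters
    target_modules=["c_attn", "c_proj", "w1", "w2"],  # from find_all_linear_names
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights train

trainer = Trainer(
    model=model,
    args=train_args,              # the TrainingArguments sketched earlier
    train_dataset=train_dataset,  # the 33-conversation SFT dataset
    data_collator=data_collator,  # Firefly's SFT collator (not shown here)
)
trainer.train()
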
2024-05-21 13:14:18.008 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the first dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_13-14-17_ts-ccabdee9f774458487b5dd0f562f0b70-launcher]
2024-05-21 13:14:18.011 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 13:14:18.461 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-21 13:14:18.461 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-21 13:14:18.462 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-21 13:14:30.427 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_proj', 'w2', 'w1', 'c_attn']
2024-05-21 13:14:34.822 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-21 13:14:34.830 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-21 13:14:34.830 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-21 13:14:34.830 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-21 13:14:34.831 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-21 13:14:34.832 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-21 13:14:34.832 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-21 13:14:34.873 | INFO | __main__:main:387 - *** starting training ***
2024-05-21 13:15:24.386 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the first dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_13-15-24_ts-ccabdee9f774458487b5dd0f562f0b70-launcher]
2024-05-21 13:15:24.389 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 13:15:24.833 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-21 13:15:24.833 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-21 13:15:24.834 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-21 13:15:36.611 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['w1', 'w2', 'c_proj', 'c_attn']
2024-05-21 13:15:40.843 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-21 13:15:40.851 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-21 13:15:40.851 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-21 13:15:40.851 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-21 13:15:40.852 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-21 13:15:40.853 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-21 13:15:40.854 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-21 13:15:40.895 | INFO | __main__:main:387 - *** starting training ***
2024-05-21 15:20:14.959 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the first dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_15-20-14_ts-ccabdee9f774458487b5dd0f562f0b70-launcher]
2024-05-21 15:20:14.962 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 15:20:15.449 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-21 15:20:15.450 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-21 15:20:15.450 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-21 15:20:42.755 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_proj', 'w1', 'c_attn', 'w2']
2024-05-21 15:20:47.021 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-21 15:20:47.029 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-21 15:20:47.029 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-21 15:20:47.029 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-21 15:20:47.029 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-21 15:20:47.034 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-21 15:20:47.034 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-21 15:20:47.077 | INFO | __main__:main:387 - *** starting training ***
2024-05-21 15:37:02.692 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the first dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_15-37-02_ts-ccabdee9f774458487b5dd0f562f0b70-launcher]
2024-05-21 15:37:02.695 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 15:37:03.274 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-21 15:37:03.274 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-21 15:37:03.275 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-21 15:37:31.115 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['w2', 'w1', 'c_proj', 'c_attn']
2024-05-21 15:38:12.051 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-21 15:38:12.058 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-21 15:38:12.059 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-21 15:38:12.059 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-21 15:38:12.059 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-21 15:38:12.063 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-21 15:38:12.063 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-21 15:38:12.105 | INFO | __main__:main:387 - *** starting training ***
2024-05-21 15:48:05.000 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the first dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_15-48-04_ts-ccabdee9f774458487b5dd0f562f0b70-launcher]
2024-05-21 15:48:05.004 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 15:48:05.571 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-21 15:48:05.572 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-21 15:48:05.572 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-21 15:48:32.241 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_attn', 'w2', 'c_proj', 'w1']
2024-05-21 15:49:13.600 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-21 15:49:13.607 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-21 15:49:13.607 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-21 15:49:13.607 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-21 15:49:13.608 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-21 15:49:13.611 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-21 15:49:13.612 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-21 15:49:13.654 | INFO | __main__:main:387 - *** starting training ***
2024-05-21 15:57:36.477 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the first dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May21_15-57-36_ts-ccabdee9f774458487b5dd0f562f0b70-launcher]
2024-05-21 15:57:36.480 | INFO | __main__:init_components:333 - Initializing components...
2024-05-21 15:57:37.043 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-21 15:57:37.043 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-21 15:57:37.044 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-21 15:58:02.666 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['w2', 'w1', 'c_proj', 'c_attn']
2024-05-21 15:58:45.333 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-21 15:58:45.341 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-21 15:58:45.341 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-21 15:58:45.341 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-21 15:58:45.341 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-21 15:58:45.346 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-21 15:58:45.346 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-21 15:58:45.387 | INFO | __main__:main:387 - *** starting training ***
2024-05-23 10:33:52.253 | INFO | __main__:setup_everything:52 - train_args:TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=16,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0002,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_10-33-52_ts-2f5a9417da5f4de29903e0323581dcaa-launcher,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=100,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=constant_with_warmup,
max_grad_norm=0.3,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=1,
optim=paged_adamw_32bit,
optim_args=None,
output_dir=output/firefly-qwen-7b-sft-qlora,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=1,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=False,
report_to=['tensorboard'],
resume_from_checkpoint=None,
run_name=output/firefly-qwen-7b-sft-qlora,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=100,
save_strategy=steps,
save_total_limit=1,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=100,
weight_decay=0,
)
2024-05-23 10:33:52.256 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 10:33:52.746 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 10:33:52.747 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 10:33:52.747 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 10:34:24.498 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['w1', 'w2', 'c_proj', 'c_attn']
2024-05-23 10:34:28.849 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 10:34:28.856 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 10:34:28.857 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 10:34:28.857 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 10:34:28.857 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 10:34:28.862 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 10:34:28.862 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-23 10:34:28.908 | INFO | __main__:main:387 - *** starting training ***
2024-05-23 10:46:55.771 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the May 23 dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_10-46-55_ts-2f5a9417da5f4de29903e0323581dcaa-launcher]
2024-05-23 10:46:55.774 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 10:46:56.246 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 10:46:56.246 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 10:46:56.247 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 10:47:29.916 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['w2', 'w1', 'c_attn', 'c_proj']
2024-05-23 10:47:34.339 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 10:47:34.404 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 10:47:34.405 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 10:47:34.405 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 10:47:34.405 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 10:47:34.411 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 10:47:34.411 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-23 10:47:34.456 | INFO | __main__:main:387 - *** starting training ***
2024-05-23 11:03:38.908 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the May 23 dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_11-03-38_ts-2f5a9417da5f4de29903e0323581dcaa-launcher]
2024-05-23 11:03:38.911 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 11:03:39.404 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 11:03:39.405 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 11:03:39.405 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 11:04:06.829 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_proj', 'c_attn', 'w1', 'w2']
2024-05-23 11:04:11.229 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 11:04:11.237 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 11:04:11.237 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 11:04:11.238 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 11:04:11.238 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 11:04:11.242 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 11:04:11.242 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-23 11:04:11.282 | INFO | __main__:main:387 - *** starting training ***
2024-05-23 11:09:34.482 | INFO | __main__:setup_everything:52 - train_args: [TrainingArguments dump identical to the May 23 dump above, except logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_11-09-34_ts-2f5a9417da5f4de29903e0323581dcaa-launcher]
2024-05-23 11:09:34.485 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 11:09:34.949 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 11:09:34.949 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 11:09:34.950 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 11:10:02.282 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_proj', 'c_attn', 'w2', 'w1']
2024-05-23 11:10:07.418 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 11:10:07.426 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 11:10:07.427 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 11:10:07.427 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 11:10:07.427 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 11:10:07.431 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 11:10:07.431 | INFO | component.dataset:__init__:23 - There are 33 data in dataset
2024-05-23 11:10:07.479 | INFO | __main__:main:387 - *** starting training ***
2024-05-23 11:18:23.072 | INFO | __main__:setup_everything:52 - train_args:TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=16,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0002,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_11-18-23_ts-2f5a9417da5f4de29903e0323581dcaa-launcher,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=100,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=constant_with_warmup,
max_grad_norm=0.3,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=1,
optim=paged_adamw_32bit,
optim_args=None,
output_dir=output/firefly-qwen-7b-sft-qlora,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=1,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=False,
report_to=['tensorboard'],
resume_from_checkpoint=None,
run_name=output/firefly-qwen-7b-sft-qlora,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=100,
save_strategy=steps,
save_total_limit=1,
seed=42,
skip_memory_metrics=True,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=100,
weight_decay=0,
)
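The dump above can be reproduced programmatically. A minimal sketch, assuming the hyperparameters are parsed from a JSON config file via HfArgumentParser (the file name here is hypothetical):

```python
from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser(TrainingArguments)
# hypothetical config path; its keys mirror the fields dumped above
train_args, = parser.parse_json_file(json_file='train_args/qwen-7b-sft-qlora.json')
assert train_args.learning_rate == 2e-4
assert train_args.gradient_accumulation_steps == 16
```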
2024-05-23 11:18:23.076 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 11:18:23.596 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 11:18:23.597 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 11:18:23.597 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 11:18:50.129 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_proj', 'w2', 'c_attn', 'w1']
2024-05-23 11:18:54.414 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 11:18:54.422 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 11:18:54.422 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 11:18:54.422 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 11:18:54.423 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 11:18:54.428 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 11:18:54.428 | INFO | component.dataset:__init__:23 - There are 33 samples in the dataset
2024-05-23 11:18:54.469 | INFO | __main__:main:387 - *** starting training ***
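The two model-size numbers logged above are straightforward to compute. A sketch, assuming `model` is the quantized model just loaded; note that bitsandbytes 4-bit layers pack two weights per int8 element, which is roughly why a ~7B-parameter model reports only ~4.6B parameter elements:

```python
# memory footprint in GiB, counting parameters and buffers
footprint_gb = model.get_memory_footprint() / (1024 ** 3)
# element count is roughly halved for the 4-bit-packed linear layers
total_params = sum(p.numel() for p in model.parameters())
print(f'memory footprint of model: {footprint_gb} GB')
print(f'Total model params: {total_params / 1e6:.2f}M')
```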
2024-05-23 11:23:01.591 | INFO | __main__:setup_everything:52 - train_args:TrainingArguments(
[train_args identical to the 11:18:23 dump above, apart from logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_11-23-01_ts-2f5a9417da5f4de29903e0323581dcaa-launcher]
)
2024-05-23 11:23:01.594 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 11:23:02.063 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 11:23:02.064 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 11:23:02.064 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 11:23:29.599 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_attn', 'c_proj', 'w2', 'w1']
2024-05-23 11:23:33.944 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 11:23:33.951 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 11:23:33.952 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 11:23:33.952 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 11:23:33.952 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 11:23:33.957 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 11:23:33.957 | INFO | component.dataset:__init__:23 - There are 33 samples in the dataset
2024-05-23 11:23:34.049 | INFO | __main__:main:387 - *** starting training ***
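The three dataset log lines suggest a one-JSON-object-per-line loader. A minimal sketch of such a dataset, with argument names beyond the file path being assumptions:

```python
import json
from loguru import logger
from torch.utils.data import Dataset

class UnifiedSFTDataset(Dataset):
    def __init__(self, file, tokenizer, max_seq_length, template_name):
        logger.info(f'Loading data: {file}')
        with open(file, 'r', encoding='utf8') as f:
            self.data_list = f.readlines()
        logger.info(f'Use template "{template_name}" for training')
        logger.info(f'There are {len(self.data_list)} samples in the dataset')
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, index):
        # each line of dummy_data.jsonl holds one conversation record
        return json.loads(self.data_list[index])
```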
2024-05-23 11:24:24.214 | INFO | __main__:setup_everything:52 - train_args:TrainingArguments(
[train_args identical to the 11:18:23 dump above, apart from logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_11-24-24_ts-2f5a9417da5f4de29903e0323581dcaa-launcher]
)
2024-05-23 11:24:24.217 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 11:24:24.676 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 11:24:24.677 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 11:24:24.677 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 11:24:36.170 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_proj', 'w1', 'c_attn', 'w2']
2024-05-23 11:24:40.516 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 11:24:40.524 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 11:24:40.524 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 11:24:40.524 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 11:24:40.524 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 11:24:40.526 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 11:24:40.526 | INFO | component.dataset:__init__:23 - There are 33 samples in the dataset
2024-05-23 11:24:40.570 | INFO | __main__:main:387 - *** starting training ***
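It is worth working out what this configuration means for a 33-sample dataset. With per_device_train_batch_size=1, gradient_accumulation_steps=16 and one GPU, the effective batch size is 16, so a single epoch takes only ceil(33 / 16) = 3 optimizer steps; logging_steps=100 and save_steps=100 therefore never fire mid-run, and the constant_with_warmup schedule never finishes its 100 warmup steps on this toy run:

```python
import math

per_device_train_batch_size = 1
gradient_accumulation_steps = 16
n_gpu = 1
dataset_size = 33

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * n_gpu
optimizer_steps_per_epoch = math.ceil(dataset_size / effective_batch_size)
print(effective_batch_size, optimizer_steps_per_epoch)  # 16, 3
```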
2024-05-23 11:30:05.839 | INFO | __main__:setup_everything:52 - train_args:TrainingArguments(
[train_args identical to the 11:18:23 dump above, apart from logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_11-30-05_ts-2f5a9417da5f4de29903e0323581dcaa-launcher]
)
2024-05-23 11:30:05.843 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 11:30:06.357 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 11:30:06.358 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 11:30:06.358 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 11:30:34.001 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['w1', 'w2', 'c_proj', 'c_attn']
2024-05-23 11:30:38.329 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 11:30:38.337 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 11:30:38.337 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 11:30:38.337 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 11:30:38.338 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 11:30:38.342 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 11:30:38.342 | INFO | component.dataset:__init__:23 - There are 33 samples in the dataset
2024-05-23 11:30:38.383 | INFO | __main__:main:387 - *** starting training ***
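The target-module list logged above is what gets handed to PEFT. A sketch of that wiring, assuming standard peft usage; r, lora_alpha and lora_dropout do not appear in the log, so the values below are assumptions:

```python
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=64,               # assumed; not shown in the log
    lora_alpha=16,      # assumed; not shown in the log
    lora_dropout=0.05,  # assumed; not shown in the log
    target_modules=['c_attn', 'c_proj', 'w1', 'w2'],  # from the log line above
    bias='none',
    task_type='CAUSAL_LM',
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```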
2024-05-23 11:58:07.400 | INFO | __main__:setup_everything:52 - train_args:TrainingArguments(
[train_args identical to the 11:18:23 dump above, apart from logging_dir=output/firefly-qwen-7b-sft-qlora/runs/May23_11-58-07_ts-2f5a9417da5f4de29903e0323581dcaa-launcher]
)
2024-05-23 11:58:07.403 | INFO | __main__:init_components:333 - Initializing components...
2024-05-23 11:58:07.884 | INFO | __main__:load_tokenizer:211 - vocab_size of tokenizer: 151851
2024-05-23 11:58:07.884 | INFO | __main__:load_model:220 - Loading model from base model: Qwen/Qwen-7B-Chat
2024-05-23 11:58:07.885 | INFO | __main__:load_model:221 - Train model with qlora
2024-05-23 11:58:35.827 | INFO | __main__:find_all_linear_names:85 - LoRA target module names: ['c_attn', 'c_proj', 'w1', 'w2']
2024-05-23 11:58:40.220 | INFO | __main__:load_model:283 - memory footprint of model: 8.189956784248352 GB
2024-05-23 11:58:40.228 | INFO | __main__:load_model:295 - Total model params: 4626.45M
2024-05-23 11:58:40.228 | INFO | __main__:init_components:349 - Train model with sft task
2024-05-23 11:58:40.228 | INFO | __main__:load_sft_dataset:315 - Loading data with UnifiedSFTDataset
2024-05-23 11:58:40.228 | INFO | component.dataset:__init__:19 - Loading data: ./data/dummy_data.jsonl
2024-05-23 11:58:40.232 | INFO | component.dataset:__init__:22 - Use template "qwen" for training
2024-05-23 11:58:40.232 | INFO | component.dataset:__init__:23 - There are 33 samples in the dataset
2024-05-23 11:58:40.276 | INFO | __main__:main:387 - *** starting training ***
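Finally, the "qwen" template named in the dataset logs is the ChatML-style format that Qwen chat models expect. A sketch of how one training sample could be rendered with it; the system text and the record's key names are assumptions:

```python
def build_qwen_prompt(system, conversations):
    # ChatML-style wrapping used by Qwen chat models
    text = f'<|im_start|>system\n{system}<|im_end|>\n'
    for turn in conversations:
        text += f'<|im_start|>user\n{turn["human"]}<|im_end|>\n'
        text += f'<|im_start|>assistant\n{turn["assistant"]}<|im_end|>\n'
    return text

print(build_qwen_prompt('You are a helpful assistant.',
                        [{'human': 'Hello!', 'assistant': 'Hi, how can I help?'}]))
```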