zetavg committed on
Commit
9279c83
·
unverified ·
1 Parent(s): 8788753
llama_lora/globals.py CHANGED
@@ -3,6 +3,9 @@ import subprocess
3
 
4
  from typing import Any, Dict, List, Optional, Tuple, Union
5
 
 
 
 
6
  from .lib.finetune import train
7
 
8
 
@@ -25,6 +28,12 @@ class Global:
25
  # Model related
26
  model_has_been_used = False
27
 
 
 
 
 
 
 
28
  # UI related
29
  ui_title: str = "LLaMA-LoRA"
30
  ui_emoji: str = "🦙🎛️"
@@ -60,3 +69,57 @@ commit_hash = get_git_commit_hash()
60
 
61
  if commit_hash:
62
  Global.version = commit_hash[:8]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  from typing import Any, Dict, List, Optional, Tuple, Union
5
 
6
+ from numba import cuda
7
+ import nvidia_smi
8
+
9
  from .lib.finetune import train
10
 
11
 
 
28
  # Model related
29
  model_has_been_used = False
30
 
31
+ # GPU Info
32
+ gpu_cc = None # GPU compute capability
33
+ gpu_sms = None # GPU total number of SMs
34
+ gpu_total_cores = None # GPU total cores
35
+ gpu_total_memory = None
36
+
37
  # UI related
38
  ui_title: str = "LLaMA-LoRA"
39
  ui_emoji: str = "🦙🎛️"
 
69
 
70
  if commit_hash:
71
  Global.version = commit_hash[:8]
72
+
73
+
74
def load_gpu_info():
    """Detect the current GPU and record its properties on ``Global``.

    Sets ``Global.gpu_cc``, ``Global.gpu_sms`` and ``Global.gpu_total_cores``
    from the current CUDA device, and ``Global.gpu_total_memory`` (in bytes)
    from NVML. The two queries are attempted independently, so a failure in
    one does not discard what the other obtained. Failures are reported with
    a printed notice rather than raised; the affected attributes are then
    left untouched.
    """
    # CUDA cores per streaming multiprocessor (SM), keyed by compute
    # capability as a (major, minor) tuple. The table needs to be extended as
    # new devices become available, and currently does not account for all
    # Jetson devices.
    cc_cores_per_SM_dict = {
        (2, 0): 32,
        (2, 1): 48,
        (3, 0): 192,
        (3, 5): 192,
        (3, 7): 192,
        (5, 0): 128,
        (5, 2): 128,
        (6, 0): 64,
        (6, 1): 128,
        (7, 0): 64,
        (7, 5): 64,
        (8, 0): 64,
        (8, 6): 128,
        (8, 9): 128,
        (9, 0): 128,
    }

    try:
        device = cuda.get_current_device()
        device_sms = device.MULTIPROCESSOR_COUNT
        device_cc = device.compute_capability
        # A compute capability missing from the table yields None. Keep the
        # core total as None in that case instead of failing on `None * int`,
        # so the capability and SM count are still recorded.
        cores_per_sm = cc_cores_per_SM_dict.get(device_cc)
        if cores_per_sm is None:
            total_cores = None
        else:
            total_cores = cores_per_sm * device_sms
        print("GPU compute capability: ", device_cc)
        print("GPU total number of SMs: ", device_sms)
        print("GPU total cores: ", total_cores)
        Global.gpu_cc = device_cc
        Global.gpu_sms = device_sms
        Global.gpu_total_cores = total_cores
    except Exception as e:
        # Best-effort probe: GPU info is optional, so report and carry on.
        print(f"Notice: cannot get GPU info: {e}")

    try:
        nvidia_smi.nvmlInit()
        # NOTE(review): NVML device index 0 is assumed to be the same GPU as
        # the current CUDA device -- confirm on multi-GPU hosts.
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        total_memory = info.total

        total_memory_mb = total_memory / (1024 ** 2)
        total_memory_gb = total_memory / (1024 ** 3)

        # Print the memory size
        print(
            f"GPU total memory: {total_memory} bytes ({total_memory_mb:.2f} MB) ({total_memory_gb:.2f} GB)")
        Global.gpu_total_memory = total_memory
    except Exception as e:
        print(f"Notice: cannot get GPU info: {e}")
123
+
124
+
125
+ load_gpu_info()
llama_lora/models.py CHANGED
@@ -102,6 +102,14 @@ def load_base_model():
102
  )
103
 
104
 
 
 
 
 
 
 
 
 
105
  def unload_models():
106
  del Global.loaded_base_model
107
  Global.loaded_base_model = None
@@ -109,11 +117,7 @@ def unload_models():
109
  del Global.loaded_tokenizer
110
  Global.loaded_tokenizer = None
111
 
112
- gc.collect()
113
-
114
- # if not shared.args.cpu: # will not be running on CPUs anyway
115
- with torch.no_grad():
116
- torch.cuda.empty_cache()
117
 
118
  Global.model_has_been_used = False
119
 
 
102
  )
103
 
104
 
105
def clear_cache():
    """Run a garbage-collection pass, then empty the CUDA memory cache."""
    gc.collect()

    # No CPU-only guard here (an earlier `if not shared.args.cpu` check was
    # dropped): this will not be running on CPUs anyway.
    with torch.no_grad():
        torch.cuda.empty_cache()
111
+
112
+
113
  def unload_models():
114
  del Global.loaded_base_model
115
  Global.loaded_base_model = None
 
117
  del Global.loaded_tokenizer
118
  Global.loaded_tokenizer = None
119
 
120
+ clear_cache()
 
 
 
 
121
 
122
  Global.model_has_been_used = False
123
 
llama_lora/ui/finetune_ui.py CHANGED
@@ -9,7 +9,9 @@ from random_word import RandomWords
9
  from transformers import TrainerCallback
10
 
11
  from ..globals import Global
12
- from ..models import get_base_model, get_tokenizer, unload_models_if_already_used
 
 
13
  from ..utils.data import (
14
  get_available_template_names,
15
  get_available_dataset_names,
@@ -238,6 +240,12 @@ def parse_plain_text_input(
238
  return result
239
 
240
 
 
 
 
 
 
 
241
  def do_train(
242
  # Dataset
243
  template,
@@ -258,9 +266,10 @@ def do_train(
258
  lora_alpha,
259
  lora_dropout,
260
  model_name,
261
- progress=gr.Progress(track_tqdm=True),
262
  ):
263
  try:
 
264
  # If model has been used in inference, we need to unload it first.
265
  # Otherwise, we'll get a 'Function MmBackward0 returned an invalid
266
  # gradient at index 1 - expected device meta but got cuda:0' error.
@@ -337,7 +346,8 @@ def do_train(
337
 
338
  progress(
339
  (i, 300),
340
- desc="(Simulate) " + get_progress_text(epoch, epochs, last_loss)
 
341
  )
342
 
343
  time.sleep(0.1)
@@ -401,12 +411,13 @@ Train data (first 10):
401
 
402
  # Do this again right before training to make sure the model is not used in inference.
403
  unload_models_if_already_used()
 
404
 
405
  base_model = get_base_model()
406
  tokenizer = get_tokenizer()
407
 
408
  # Do not let other tqdm iterations interfere the progress reporting after training starts.
409
- progress.track_tqdm = False
410
 
411
  results = Global.train_fn(
412
  base_model, # base_model
@@ -431,7 +442,8 @@ Train data (first 10):
431
  training_callbacks # callbacks
432
  )
433
 
434
- logs_str = "\n".join([json.dumps(log) for log in log_history]) or "None"
 
435
 
436
  result_message = f"Training ended:\n{str(results)}\n\nLogs:\n{logs_str}"
437
  print(result_message)
@@ -590,9 +602,18 @@ def finetune_ui():
590
  )
591
 
592
  with gr.Row():
 
 
 
 
 
 
 
 
 
593
  with gr.Column():
594
  micro_batch_size = gr.Slider(
595
- minimum=1, maximum=100, step=1, value=8,
596
  label="Micro Batch Size",
597
  info="The number of examples in each mini-batch for gradient computation. A smaller micro_batch_size reduces memory usage but may increase training time."
598
  )
 
9
  from transformers import TrainerCallback
10
 
11
  from ..globals import Global
12
+ from ..models import (
13
+ get_base_model, get_tokenizer,
14
+ clear_cache, unload_models_if_already_used)
15
  from ..utils.data import (
16
  get_available_template_names,
17
  get_available_dataset_names,
 
240
  return result
241
 
242
 
243
# Whether the training progress bar should track tqdm. Tracking is turned off
# only when the detected GPU has more than 2560 total cores; when the core
# count is unknown (None), tracking stays enabled.
should_training_progress_track_tqdm = (
    Global.gpu_total_cores is None or Global.gpu_total_cores <= 2560
)
247
+
248
+
249
  def do_train(
250
  # Dataset
251
  template,
 
266
  lora_alpha,
267
  lora_dropout,
268
  model_name,
269
+ progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
270
  ):
271
  try:
272
+ clear_cache()
273
  # If model has been used in inference, we need to unload it first.
274
  # Otherwise, we'll get a 'Function MmBackward0 returned an invalid
275
  # gradient at index 1 - expected device meta but got cuda:0' error.
 
346
 
347
  progress(
348
  (i, 300),
349
+ desc="(Simulate) " +
350
+ get_progress_text(epoch, epochs, last_loss)
351
  )
352
 
353
  time.sleep(0.1)
 
411
 
412
  # Do this again right before training to make sure the model is not used in inference.
413
  unload_models_if_already_used()
414
+ clear_cache()
415
 
416
  base_model = get_base_model()
417
  tokenizer = get_tokenizer()
418
 
419
  # Do not let other tqdm iterations interfere the progress reporting after training starts.
420
+ # progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
421
 
422
  results = Global.train_fn(
423
  base_model, # base_model
 
442
  training_callbacks # callbacks
443
  )
444
 
445
+ logs_str = "\n".join([json.dumps(log)
446
+ for log in log_history]) or "None"
447
 
448
  result_message = f"Training ended:\n{str(results)}\n\nLogs:\n{logs_str}"
449
  print(result_message)
 
602
  )
603
 
604
  with gr.Row():
605
+ micro_batch_size_default_value = 1
606
+
607
+ if Global.gpu_total_cores is not None and Global.gpu_total_memory is not None:
608
+ memory_per_core = Global.gpu_total_memory / Global.gpu_total_cores
609
+ if memory_per_core >= 6291456:
610
+ micro_batch_size_default_value = 8
611
+ elif memory_per_core >= 4000000: # ?
612
+ micro_batch_size_default_value = 4
613
+
614
  with gr.Column():
615
  micro_batch_size = gr.Slider(
616
+ minimum=1, maximum=100, step=1, value=micro_batch_size_default_value,
617
  label="Micro Batch Size",
618
  info="The number of examples in each mini-batch for gradient computation. A smaller micro_batch_size reduces memory usage but may increase training time."
619
  )
llama_lora/ui/inference_ui.py CHANGED
@@ -245,7 +245,7 @@ def inference_ui():
245
  preview_prompt = gr.Textbox(
246
  show_label=False, interactive=False, elem_id="inference_preview_prompt")
247
  update_prompt_preview_btn = gr.Button(
248
- "↻", elem_id="inference_update_prompt_preview_btn", full_width=False)
249
  update_prompt_preview_btn.style(size="sm")
250
 
251
  # with gr.Column():
 
245
  preview_prompt = gr.Textbox(
246
  show_label=False, interactive=False, elem_id="inference_preview_prompt")
247
  update_prompt_preview_btn = gr.Button(
248
+ "↻", elem_id="inference_update_prompt_preview_btn")
249
  update_prompt_preview_btn.style(size="sm")
250
 
251
  # with gr.Column():
requirements.txt CHANGED
@@ -7,6 +7,8 @@ datasets
7
  fire
8
  git+https://github.com/huggingface/peft.git
9
  git+https://github.com/huggingface/transformers.git
 
 
10
  gradio
11
  loralib
12
  sentencepiece
 
7
  fire
8
  git+https://github.com/huggingface/peft.git
9
  git+https://github.com/huggingface/transformers.git
10
+ numba
11
+ nvidia-ml-py3
12
  gradio
13
  loralib
14
  sentencepiece