Spaces:
Runtime error
Runtime error
zetavg
committed on
update
Browse files- llama_lora/globals.py +63 -0
- llama_lora/models.py +9 -5
- llama_lora/ui/finetune_ui.py +27 -6
- llama_lora/ui/inference_ui.py +1 -1
- requirements.txt +2 -0
llama_lora/globals.py
CHANGED
@@ -3,6 +3,9 @@ import subprocess
|
|
3 |
|
4 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
5 |
|
|
|
|
|
|
|
6 |
from .lib.finetune import train
|
7 |
|
8 |
|
@@ -25,6 +28,12 @@ class Global:
|
|
25 |
# Model related
|
26 |
model_has_been_used = False
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
# UI related
|
29 |
ui_title: str = "LLaMA-LoRA"
|
30 |
ui_emoji: str = "🦙🎛️"
|
@@ -60,3 +69,57 @@ commit_hash = get_git_commit_hash()
|
|
60 |
|
61 |
if commit_hash:
|
62 |
Global.version = commit_hash[:8]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
from typing import Any, Dict, List, Optional, Tuple, Union
|
5 |
|
6 |
+
from numba import cuda
|
7 |
+
import nvidia_smi
|
8 |
+
|
9 |
from .lib.finetune import train
|
10 |
|
11 |
|
|
|
28 |
# Model related
|
29 |
model_has_been_used = False
|
30 |
|
31 |
+
# GPU Info
|
32 |
+
gpu_cc = None # GPU compute capability
|
33 |
+
gpu_sms = None # GPU total number of SMs
|
34 |
+
gpu_total_cores = None # GPU total cores
|
35 |
+
gpu_total_memory = None
|
36 |
+
|
37 |
# UI related
|
38 |
ui_title: str = "LLaMA-LoRA"
|
39 |
ui_emoji: str = "🦙🎛️"
|
|
|
69 |
|
70 |
if commit_hash:
|
71 |
Global.version = commit_hash[:8]
|
72 |
+
|
73 |
+
|
74 |
+
def load_gpu_info():
    """Probe the GPU and record its specs on ``Global`` (best effort).

    Two independent queries are made:

    * numba's current CUDA device supplies the compute capability and the
      number of streaming multiprocessors (SMs); the total core count is
      derived from a per-architecture lookup table.
    * NVML supplies the total memory, in bytes, of the device at index 0.

    Results are stored in ``Global.gpu_cc``, ``Global.gpu_sms``,
    ``Global.gpu_total_cores`` and ``Global.gpu_total_memory``. Any failure
    is reported with a printed notice and leaves the affected attributes
    untouched; this function never raises.
    """
    # CUDA cores per SM, keyed by compute capability (major, minor).
    # A capability missing from this table yields ``None``. The table needs
    # to be extended as new devices become available, and currently does not
    # account for all Jetson devices.
    cc_cores_per_SM_dict = {
        (2, 0): 32,
        (2, 1): 48,
        (3, 0): 192,
        (3, 5): 192,
        (3, 7): 192,
        (5, 0): 128,
        (5, 2): 128,
        (6, 0): 64,
        (6, 1): 128,
        (7, 0): 64,
        (7, 5): 64,
        (8, 0): 64,
        (8, 6): 128,
        (8, 9): 128,
        (9, 0): 128,
    }

    try:
        device = cuda.get_current_device()
        device_sms = device.MULTIPROCESSOR_COUNT
        device_cc = device.compute_capability
        cores_per_sm = cc_cores_per_SM_dict.get(device_cc)
        # An unknown architecture must not abort the whole probe: keep the
        # compute capability and SM count and report the core count as None.
        if cores_per_sm is None:
            total_cores = None
        else:
            total_cores = cores_per_sm * device_sms
        print("GPU compute capability: ", device_cc)
        print("GPU total number of SMs: ", device_sms)
        print("GPU total cores: ", total_cores)
        Global.gpu_cc = device_cc
        Global.gpu_sms = device_sms
        Global.gpu_total_cores = total_cores
    except Exception as e:
        print(f"Notice: cannot get GPU info: {e}")

    # Queried separately so that a numba failure does not hide the memory
    # size (and vice versa).
    try:
        nvidia_smi.nvmlInit()
        try:
            # NOTE(review): index 0 is assumed to be the same device numba
            # reports as current - confirm for multi-GPU hosts.
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
            total_memory = nvidia_smi.nvmlDeviceGetMemoryInfo(handle).total
        finally:
            # Always release NVML once it has been initialised.
            nvidia_smi.nvmlShutdown()

        total_memory_mb = total_memory / (1024 ** 2)
        total_memory_gb = total_memory / (1024 ** 3)

        # Print the memory size
        print(
            f"GPU total memory: {total_memory} bytes ({total_memory_mb:.2f} MB) ({total_memory_gb:.2f} GB)")
        Global.gpu_total_memory = total_memory
    except Exception as e:
        print(f"Notice: cannot get GPU info: {e}")
|
123 |
+
|
124 |
+
|
125 |
+
load_gpu_info()
|
llama_lora/models.py
CHANGED
@@ -102,6 +102,14 @@ def load_base_model():
|
|
102 |
)
|
103 |
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
def unload_models():
|
106 |
del Global.loaded_base_model
|
107 |
Global.loaded_base_model = None
|
@@ -109,11 +117,7 @@ def unload_models():
|
|
109 |
del Global.loaded_tokenizer
|
110 |
Global.loaded_tokenizer = None
|
111 |
|
112 |
-
|
113 |
-
|
114 |
-
# if not shared.args.cpu: # will not be running on CPUs anyway
|
115 |
-
with torch.no_grad():
|
116 |
-
torch.cuda.empty_cache()
|
117 |
|
118 |
Global.model_has_been_used = False
|
119 |
|
|
|
102 |
)
|
103 |
|
104 |
|
105 |
+
def clear_cache():
    """Run a garbage-collection pass, then release cached CUDA memory.

    Collecting first drops unreachable Python objects so that
    ``torch.cuda.empty_cache()`` can then release their cached GPU blocks.
    """
    # Imported locally so the function works regardless of whether the
    # enclosing module already imports ``gc``.
    import gc

    gc.collect()

    # No CPU-only guard: the app is not expected to run on CPUs anyway.
    # ``empty_cache`` performs no tensor operations, so it does not need a
    # ``torch.no_grad()`` context.
    torch.cuda.empty_cache()
|
111 |
+
|
112 |
+
|
113 |
def unload_models():
|
114 |
del Global.loaded_base_model
|
115 |
Global.loaded_base_model = None
|
|
|
117 |
del Global.loaded_tokenizer
|
118 |
Global.loaded_tokenizer = None
|
119 |
|
120 |
+
clear_cache()
|
|
|
|
|
|
|
|
|
121 |
|
122 |
Global.model_has_been_used = False
|
123 |
|
llama_lora/ui/finetune_ui.py
CHANGED
@@ -9,7 +9,9 @@ from random_word import RandomWords
|
|
9 |
from transformers import TrainerCallback
|
10 |
|
11 |
from ..globals import Global
|
12 |
-
from ..models import
|
|
|
|
|
13 |
from ..utils.data import (
|
14 |
get_available_template_names,
|
15 |
get_available_dataset_names,
|
@@ -238,6 +240,12 @@ def parse_plain_text_input(
|
|
238 |
return result
|
239 |
|
240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
def do_train(
|
242 |
# Dataset
|
243 |
template,
|
@@ -258,9 +266,10 @@ def do_train(
|
|
258 |
lora_alpha,
|
259 |
lora_dropout,
|
260 |
model_name,
|
261 |
-
progress=gr.Progress(track_tqdm=
|
262 |
):
|
263 |
try:
|
|
|
264 |
# If model has been used in inference, we need to unload it first.
|
265 |
# Otherwise, we'll get a 'Function MmBackward0 returned an invalid
|
266 |
# gradient at index 1 - expected device meta but got cuda:0' error.
|
@@ -337,7 +346,8 @@ def do_train(
|
|
337 |
|
338 |
progress(
|
339 |
(i, 300),
|
340 |
-
desc="(Simulate) " +
|
|
|
341 |
)
|
342 |
|
343 |
time.sleep(0.1)
|
@@ -401,12 +411,13 @@ Train data (first 10):
|
|
401 |
|
402 |
# Do this again right before training to make sure the model is not used in inference.
|
403 |
unload_models_if_already_used()
|
|
|
404 |
|
405 |
base_model = get_base_model()
|
406 |
tokenizer = get_tokenizer()
|
407 |
|
408 |
# Do not let other tqdm iterations interfere with the progress reporting after training starts.
|
409 |
-
progress.track_tqdm = False
|
410 |
|
411 |
results = Global.train_fn(
|
412 |
base_model, # base_model
|
@@ -431,7 +442,8 @@ Train data (first 10):
|
|
431 |
training_callbacks # callbacks
|
432 |
)
|
433 |
|
434 |
-
logs_str = "\n".join([json.dumps(log)
|
|
|
435 |
|
436 |
result_message = f"Training ended:\n{str(results)}\n\nLogs:\n{logs_str}"
|
437 |
print(result_message)
|
@@ -590,9 +602,18 @@ def finetune_ui():
|
|
590 |
)
|
591 |
|
592 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
593 |
with gr.Column():
|
594 |
micro_batch_size = gr.Slider(
|
595 |
-
minimum=1, maximum=100, step=1, value=
|
596 |
label="Micro Batch Size",
|
597 |
info="The number of examples in each mini-batch for gradient computation. A smaller micro_batch_size reduces memory usage but may increase training time."
|
598 |
)
|
|
|
9 |
from transformers import TrainerCallback
|
10 |
|
11 |
from ..globals import Global
|
12 |
+
from ..models import (
|
13 |
+
get_base_model, get_tokenizer,
|
14 |
+
clear_cache, unload_models_if_already_used)
|
15 |
from ..utils.data import (
|
16 |
get_available_template_names,
|
17 |
get_available_dataset_names,
|
|
|
240 |
return result
|
241 |
|
242 |
|
243 |
+
# Decided once at import time: tqdm tracking stays on unless a GPU with more
# than 2560 total cores was detected. When the core count is unknown (None),
# tracking remains enabled.
should_training_progress_track_tqdm = not (
    Global.gpu_total_cores is not None and Global.gpu_total_cores > 2560
)
|
247 |
+
|
248 |
+
|
249 |
def do_train(
|
250 |
# Dataset
|
251 |
template,
|
|
|
266 |
lora_alpha,
|
267 |
lora_dropout,
|
268 |
model_name,
|
269 |
+
progress=gr.Progress(track_tqdm=should_training_progress_track_tqdm),
|
270 |
):
|
271 |
try:
|
272 |
+
clear_cache()
|
273 |
# If model has been used in inference, we need to unload it first.
|
274 |
# Otherwise, we'll get a 'Function MmBackward0 returned an invalid
|
275 |
# gradient at index 1 - expected device meta but got cuda:0' error.
|
|
|
346 |
|
347 |
progress(
|
348 |
(i, 300),
|
349 |
+
desc="(Simulate) " +
|
350 |
+
get_progress_text(epoch, epochs, last_loss)
|
351 |
)
|
352 |
|
353 |
time.sleep(0.1)
|
|
|
411 |
|
412 |
# Do this again right before training to make sure the model is not used in inference.
|
413 |
unload_models_if_already_used()
|
414 |
+
clear_cache()
|
415 |
|
416 |
base_model = get_base_model()
|
417 |
tokenizer = get_tokenizer()
|
418 |
|
419 |
# Do not let other tqdm iterations interfere with the progress reporting after training starts.
|
420 |
+
# progress.track_tqdm = False # setting this dynamically is not working, determining if track_tqdm should be enabled based on GPU cores at start instead.
|
421 |
|
422 |
results = Global.train_fn(
|
423 |
base_model, # base_model
|
|
|
442 |
training_callbacks # callbacks
|
443 |
)
|
444 |
|
445 |
+
logs_str = "\n".join([json.dumps(log)
|
446 |
+
for log in log_history]) or "None"
|
447 |
|
448 |
result_message = f"Training ended:\n{str(results)}\n\nLogs:\n{logs_str}"
|
449 |
print(result_message)
|
|
|
602 |
)
|
603 |
|
604 |
with gr.Row():
|
605 |
+
micro_batch_size_default_value = 1
|
606 |
+
|
607 |
+
if Global.gpu_total_cores is not None and Global.gpu_total_memory is not None:
|
608 |
+
memory_per_core = Global.gpu_total_memory / Global.gpu_total_cores
|
609 |
+
if memory_per_core >= 6291456:
|
610 |
+
micro_batch_size_default_value = 8
|
611 |
+
elif memory_per_core >= 4000000: # ?
|
612 |
+
micro_batch_size_default_value = 4
|
613 |
+
|
614 |
with gr.Column():
|
615 |
micro_batch_size = gr.Slider(
|
616 |
+
minimum=1, maximum=100, step=1, value=micro_batch_size_default_value,
|
617 |
label="Micro Batch Size",
|
618 |
info="The number of examples in each mini-batch for gradient computation. A smaller micro_batch_size reduces memory usage but may increase training time."
|
619 |
)
|
llama_lora/ui/inference_ui.py
CHANGED
@@ -245,7 +245,7 @@ def inference_ui():
|
|
245 |
preview_prompt = gr.Textbox(
|
246 |
show_label=False, interactive=False, elem_id="inference_preview_prompt")
|
247 |
update_prompt_preview_btn = gr.Button(
|
248 |
-
"↻", elem_id="inference_update_prompt_preview_btn"
|
249 |
update_prompt_preview_btn.style(size="sm")
|
250 |
|
251 |
# with gr.Column():
|
|
|
245 |
preview_prompt = gr.Textbox(
|
246 |
show_label=False, interactive=False, elem_id="inference_preview_prompt")
|
247 |
update_prompt_preview_btn = gr.Button(
|
248 |
+
"↻", elem_id="inference_update_prompt_preview_btn")
|
249 |
update_prompt_preview_btn.style(size="sm")
|
250 |
|
251 |
# with gr.Column():
|
requirements.txt
CHANGED
@@ -7,6 +7,8 @@ datasets
|
|
7 |
fire
|
8 |
git+https://github.com/huggingface/peft.git
|
9 |
git+https://github.com/huggingface/transformers.git
|
|
|
|
|
10 |
gradio
|
11 |
loralib
|
12 |
sentencepiece
|
|
|
7 |
fire
|
8 |
git+https://github.com/huggingface/peft.git
|
9 |
git+https://github.com/huggingface/transformers.git
|
10 |
+
numba
|
11 |
+
nvidia-ml-py3
|
12 |
gradio
|
13 |
loralib
|
14 |
sentencepiece
|