How to merge adapter into original llama?
HI,
it seems https://huggingface.co/timdettmers/guanaco-65b contains only LoRA weights. How did you apply these LoRA weights to the 65B base model? Is there any repo/script that I can refer to?
Thanks!
To apply a LoRA to a model, load the base model and then load the LoRA on top of it using PEFT's PeftModel.
# Some llama or alpaca model 65b
base_model = "decapoda-research/llama-65b-hf"
# The snippet passes `load_8bit` to the loader, so it has to be defined first.
# NOTE(review): False is assumed as the default here; set it to True if you
# want the base weights loaded in 8-bit.
load_8bit = False
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=load_8bit,
    torch_dtype=torch.float16,
)
# Load the LORA on top
lora_weights = "timdettmers/guanaco-65b"
model = PeftModel.from_pretrained(
    model,
    lora_weights,
    torch_dtype=torch.float16,
)
Then to save the lora applied model:
# `model` is a PeftModel at this point, and saving a PeftModel writes only the
# adapter weights. Fold the LoRA deltas into the base weights first so that the
# saved checkpoint is the full "LoRA applied" model.
# NOTE(review): merging may not be supported when the base model was loaded in
# 8-bit — confirm against the installed peft version.
model = model.merge_and_unload()
out_folder = args.output or Path("models/somename")
model.save_pretrained(out_folder, max_shard_size="2GB", safe_serialization=True)
# If you have to save the tokenizer too ...
tokenizer.save_pretrained(out_folder)
Here's the script I used specifically. Requires peft version 0.3 installed
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import os
import argparse
def get_args():
    """Parse the command-line options of the merge script.

    The three path options are required: without them the script would only
    fail later while loading, or would save into a directory literally named
    "None".

    Returns:
        argparse.Namespace: with the attributes ``base_model_name_or_path``,
        ``peft_model_path``, ``output_dir``, ``device`` (default ``"auto"``)
        and ``push_to_hub`` (default ``False``).
    """
    parser = argparse.ArgumentParser(
        description="Merge a PEFT/LoRA adapter into its base model.",
    )
    parser.add_argument(
        "--base_model_name_or_path",
        type=str,
        required=True,
        help="Hub id or local path of the base model.",
    )
    parser.add_argument(
        "--peft_model_path",
        type=str,
        required=True,
        help="Hub id or local path of the PEFT adapter.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help="Local output directory, or the Hub repo name with --push_to_hub.",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="auto",
        help='Device to load onto, or "auto" (default) for an automatic device map.',
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Push the result to the Hub instead of saving it locally.",
    )
    return parser.parse_args()
def main():
    """Merge a PEFT adapter into its base model, then save or upload the result.

    Reads its options from ``get_args()``. The base model is loaded in float16,
    the adapter is loaded on top of it and merged with ``merge_and_unload()``.
    The merged model and the base model's tokenizer are then either pushed to
    the Hub (``--push_to_hub``) or saved under ``--output_dir``.
    """
    args = get_args()

    # "auto" is passed through as the device map; any other value is mapped
    # under the "" key of an explicit device map.
    if args.device == 'auto':
        device_arg = {'device_map': 'auto'}
    else:
        device_arg = {'device_map': {"": args.device}}

    print(f"Loading base model: {args.base_model_name_or_path}")
    base_model = AutoModelForCausalLM.from_pretrained(
        args.base_model_name_or_path,
        return_dict=True,
        torch_dtype=torch.float16,
        **device_arg,
    )

    print(f"Loading PEFT: {args.peft_model_path}")
    model = PeftModel.from_pretrained(base_model, args.peft_model_path, **device_arg)
    print("Running merge_and_unload")
    model = model.merge_and_unload()

    # The tokenizer comes from the base model path, not from the adapter path.
    tokenizer = AutoTokenizer.from_pretrained(args.base_model_name_or_path)

    if args.push_to_hub:
        # With --push_to_hub, output_dir is used as the Hub repo name.
        print("Saving to hub ...")
        model.push_to_hub(f"{args.output_dir}", use_temp_dir=False)
        tokenizer.push_to_hub(f"{args.output_dir}", use_temp_dir=False)
    else:
        model.save_pretrained(f"{args.output_dir}")
        tokenizer.save_pretrained(f"{args.output_dir}")
        print(f"Model saved to {args.output_dir}")


if __name__ == "__main__":
    main()
YTMND, thank you so much for explaining the process / script!
@TheBloke
One thing I ran into trying to merge my LoRAs using your code is that merge_and_unload()
doesn't work if the base model (mostly safetensors) has no config file associated with it. You can change in the above code
model = model.merge_and_unload()
to model = model.base_model.model
and it should work (if it doesn't, you can manually set the config on the AutoModelForCausalLM object: model.config = AutoConfig.from_pretrained('some/model')).
If using text-generation-ui
If you want to merge a GPTQ lora created in text-generation-ui
with --monkey-patch
you can use the below code.
merge-lora.py
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from peft import PeftModel
import torch
import modules.shared as shared
import modules.monkey_patch_gptq_lora_export as monkeypatch
import os
import argparse
# your hf access token: read from the HF_TOKEN environment variable when it is
# set, so the real secret does not have to be written into this file; otherwise
# the placeholder below is used and must be replaced with a real token.
access_token = os.environ.get("HF_TOKEN", 'hf_deadbeef010101')
# your uploaded hf repo
hf_repo = 'myuser/myrepo-65b-GPTQ'
def main():
    """Load a GPTQ base model, attach the selected LoRA and push it to the Hub.

    The model name, model directory and LoRA path are read from
    ``shared.args``. The destination repo and token come from the module-level
    ``hf_repo`` and ``access_token``.
    """
    device_arg = {'device_map': 'auto'}

    print(f"Loading base model: {shared.args.model}")
    # Read the config from the model's config.json and attach it to the model
    # returned by the monkey-patch loader.
    config = AutoConfig.from_pretrained(f'{shared.args.model_dir}/{shared.args.model}/config.json')
    model, tokenizer = monkeypatch.load_model_llama(shared.args.model)
    model.config = config

    print(f"Loading PEFT: {shared.args.lora}")
    # Only the first entry of shared.args.lora is used.
    model = PeftModel.from_pretrained(model, shared.args.lora[0], **device_arg)
    # Take the wrapped model back out of the PeftModel; this is used here in
    # place of merge_and_unload().
    # NOTE(review): confirm the LoRA weights are actually part of the exported
    # model with this loader.
    model = model.base_model.model

    print("Saving to hub ...")
    # to push to HF
    model.push_to_hub(f"{hf_repo}", use_temp_dir=True, use_auth_token=access_token)
    tokenizer.push_to_hub(f"{hf_repo}", use_temp_dir=True, use_auth_token=access_token)
    # Or save locally
    # model.save_pretrained(f"my/output/dir")
    # tokenizer.save_pretrained(f"my/output/dir")


if __name__ == "__main__":
    main()
Place merge-lora.py
in the root of text-generation-ui and then run the below commands (adjusting your cuda visible devices).
# Date stamp used in the log file name (it contains spaces, hence the quoting below).
CURRENTDATEONLY=$(date +"%b %d %Y")
# Adjust to the GPU(s) you want to use.
export CUDA_VISIBLE_DEVICES=0
# Run the merge script and copy its output into a dated log file.
python merge-lora.py \
    --model 'TheBloke_guanaco-65B-GPTQ' \
    --lora '/media/nmitchko/SSD-PUT/text-generation-webui/loras/medguanaco/' \
    --wbits 4 \
    --monkey-patch \
    --listen \
    --listen-port 7890 \
    --chat \
    --extensions api google_translate | tee "export-${CURRENTDATEONLY}-start.log"
@TheBloke One thing I ran into trying to merge my LoRAs using your code is that
merge_and_unload()
doesn't work if the base model (mostly safetensors) has no config file associated with it. You can change in the above code
model = model.merge_and_unload()
to model = model.base_model.model
and it should work (if it doesn't you can manually set the Config on the AutoModelForCausalLM object
model.config = AutoConfig.from_pretrained('some/model')
).
If using text-generation-ui
If you want to merge a GPTQ lora created in
text-generation-ui
with --monkey-patch
you can use the below code.
merge-lora.py
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from peft import PeftModel
import torch
import modules.shared as shared
import modules.monkey_patch_gptq_lora_export as monkeypatch
import os
import argparse

# your hf access token: read from the HF_TOKEN environment variable when it is
# set, so the real secret does not have to be written into this file; otherwise
# the placeholder below is used and must be replaced with a real token.
access_token = os.environ.get("HF_TOKEN", 'hf_deadbeef010101')
# your uploaded hf repo
hf_repo = 'myuser/myrepo-65b-GPTQ'


def main():
    """Load a GPTQ base model, attach the selected LoRA and push it to the Hub.

    The model name, model directory and LoRA path are read from
    ``shared.args``. The destination repo and token come from the module-level
    ``hf_repo`` and ``access_token``.
    """
    device_arg = {'device_map': 'auto'}

    print(f"Loading base model: {shared.args.model}")
    # Read the config from the model's config.json and attach it to the model
    # returned by the monkey-patch loader.
    config = AutoConfig.from_pretrained(f'{shared.args.model_dir}/{shared.args.model}/config.json')
    model, tokenizer = monkeypatch.load_model_llama(shared.args.model)
    model.config = config

    print(f"Loading PEFT: {shared.args.lora}")
    # Only the first entry of shared.args.lora is used.
    model = PeftModel.from_pretrained(model, shared.args.lora[0], **device_arg)
    # Take the wrapped model back out of the PeftModel.
    model = model.base_model.model

    print("Saving to hub ...")
    # to push to HF
    model.push_to_hub(f"{hf_repo}", use_temp_dir=True, use_auth_token=access_token)
    tokenizer.push_to_hub(f"{hf_repo}", use_temp_dir=True, use_auth_token=access_token)
    # Or save locally
    # model.save_pretrained(f"my/output/dir")
    # tokenizer.save_pretrained(f"my/output/dir")


if __name__ == "__main__":
    main()
Place
merge-lora.py
in the root of text-generation-ui and then run the below commands (adjusting your cuda visible devices).
# Date stamp used in the log file name (it contains spaces, hence the quoting below).
CURRENTDATEONLY=`date +"%b %d %Y"`
# Adjust to the GPU(s) you want to use.
export CUDA_VISIBLE_DEVICES=0
# Each backslash must be the last character on its line to continue the command.
python merge-lora.py \
    --model 'TheBloke_guanaco-65B-GPTQ' \
    --lora '/media/nmitchko/SSD-PUT/text-generation-webui/loras/medguanaco/' \
    --wbits 4 \
    --monkey-patch \
    --listen \
    --listen-port 7890 \
    --chat \
    --extensions api google_translate | tee "export-${CURRENTDATEONLY}-start.log"
How did you manage to train a QLoRA in text-generation-webui? I kept getting errors every time I tried to train.
i even used --monkey-patch
i even used --monkey-patch
I no longer use this workflow with TG-webui because of various bugs. Now I just use Qlora and point it to the model directory where I want to start and end.
if someone has a working requirements.txt or pip freeze output for your environment, can you please share? :)
I keep getting hit with the below
Traceback (most recent call last):
File "/app/merge.py", line 70, in <module>
main()
File "/app/merge.py", line 38, in main
base_model = AutoModelForCausalLM.from_pretrained(
File "/app/env/lib/python3.9/site-packages/transformers/models/auto/auto_factory.py", line 434, in from_pretrained
config, kwargs = AutoConfig.from_pretrained(
File "/app/env/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 873, in from_pretrained
config_class = CONFIG_MAPPING[config_dict["model_type"]]
File "/app/env/lib/python3.9/site-packages/transformers/models/auto/configuration_auto.py", line 579, in __getitem__
raise KeyError(key)
KeyError: 'llama'