lvkaokao
support aqlm and gptq 2/3 bits.
b10d6d4
raw
history blame
9.33 kB
import json
import os
from datetime import datetime, timezone
import time
from huggingface_hub import ModelCard, snapshot_download
from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA, REPO, GIT_REQUESTS_PATH, GIT_STATUS_PATH, GLOBAL_COND
from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
from src.submission.check_validity import (
already_submitted_models,
check_model_card,
get_model_size,
get_quantized_model_parameters_memory,
is_model_on_hub,
is_gguf_on_hub,
user_submission_permission,
get_model_tags
)
# Module-level caches of previous submissions. Both start empty and are filled
# by add_new_eval() from already_submitted_models(GIT_STATUS_PATH) whenever
# REQUESTED_MODELS is still falsy.
REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = None, None
def _quantization_from_config(quantization_config):
    """Derive ``(quant_type, precision)`` from a model's ``quantization_config``.

    Args:
        quantization_config: Mapping read from the model config; it is queried
            with ``in`` and ``.get`` only.

    Returns:
        A ``(quant_type, precision)`` tuple. Either element is ``None`` when
        the config does not determine it (unknown ``quant_method``, or a
        bitsandbytes config with neither ``load_in_*`` flag enabled).

    Raises:
        ValueError: If an AQLM config lacks the fields needed to compute its
            bit-width, or ``in_group_size`` is zero.
    """
    quant_type = None
    precision = None
    quant_method = quantization_config.get("quant_method", None)

    if "bnb_4bit_quant_type" in quantization_config:
        quant_type = "bitsandbytes"
        # A missing flag means "not enabled". Defaulting to True (as before)
        # reported every config lacking an explicit load_in_8bit=False as 8bit.
        if quantization_config.get("load_in_4bit", False):
            precision = "4bit"
        if quantization_config.get("load_in_8bit", False):
            precision = "8bit"
    elif quant_method == "gptq":
        quant_type = "GPTQ"
        # Default is the bare number: the "bit" suffix is appended here, so a
        # default of "4bit" used to yield the bogus precision "4bitbit".
        precision = f"{quantization_config.get('bits', 4)}bit"
    elif quant_method == "awq":
        quant_type = "AWQ"
        precision = f"{quantization_config.get('bits', 4)}bit"
    elif quant_method == "aqlm":
        quant_type = "AQLM"
        nbits_per_codebook = quantization_config.get("nbits_per_codebook")
        num_codebooks = quantization_config.get("num_codebooks")
        in_group_size = quantization_config.get("in_group_size")
        if nbits_per_codebook is None or num_codebooks is None or not in_group_size:
            raise ValueError(
                "AQLM quantization_config must define nbits_per_codebook, "
                "num_codebooks and a non-zero in_group_size."
            )
        bits = int(nbits_per_codebook * num_codebooks / in_group_size)
        precision = f"{bits}bit"

    return quant_type, precision


def add_new_eval(
    model: str,
    revision: str,
    private: bool,
    compute_dtype: str = "float16",
    precision: str = "4bit",
    weight_dtype: str = "int4",
    gguf_ftype: str = "*Q4_0.gguf",
):
    """Validate a quantized-model submission and enqueue an evaluation request.

    The function checks the submitter's rate limit and the do-not-submit list,
    verifies that the model (or a matching GGUF file) exists on the Hub,
    derives the quantization type / precision / weight dtype, rejects
    duplicates, and then writes the request JSON to the dataset queue
    (``QUEUE_REPO``) and to the git requests/status directories.

    Args:
        model: Hub repo id, either ``"user/name"`` or a bare ``"name"``.
        revision: Revision to evaluate; an empty string means ``"main"``.
        private: Stored in the request and embedded in the request file name.
        compute_dtype: Stored in the request as-is.
        precision: Accepted for interface compatibility; the stored precision
            is re-derived from the model's quantization config (see NOTE in
            the body).
        weight_dtype: Fallback weight dtype, used when the derived precision
            is not one of 4bit / 3bit / 2bit.
        gguf_ftype: File pattern passed to ``is_gguf_on_hub``; replaced by the
            value that helper returns when it is not ``None``.

    Returns:
        The value produced by ``styled_error`` / ``styled_warning`` when the
        submission is rejected, otherwise by ``styled_message``.
    """
    global REQUESTED_MODELS
    global USERS_TO_SUBMISSION_DATES
    if not REQUESTED_MODELS:
        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(GIT_STATUS_PATH)

    quant_type = None
    user_name = ""
    model_path = model
    if "/" in model:
        name_parts = model.split("/")
        user_name = name_parts[0]
        model_path = name_parts[1]

    # NOTE(review): this normalised value is overwritten with "4bit" further
    # down, so the caller-supplied precision never reaches the request.
    precision = precision.split(" ")[0]
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Is the user rate limited?
    if user_name != "":
        user_can_submit, error_msg = user_submission_permission(
            user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
        )
        if not user_can_submit:
            return styled_error(error_msg)

    # Did the model authors forbid its submission to the leaderboard?
    if model in DO_NOT_SUBMIT_MODELS:
        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")

    # Does the model actually exist?
    if revision == "":
        revision = "main"

    architecture = "?"
    downloads = 0
    created_at = ""
    quantization_config = None

    gguf_on_hub, _gguf_error, gguf_files, new_gguf_ftype = is_gguf_on_hub(repo_id=model, filename=gguf_ftype)
    if new_gguf_ftype is not None:
        gguf_ftype = new_gguf_ftype
    model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)

    # Reject only when neither a loadable config nor a GGUF file was found.
    if (not model_on_hub or model_config is None) and (not gguf_on_hub or gguf_files is None):
        return styled_error(f'Model "{model}" {error}')

    if model_config is not None:
        architectures = getattr(model_config, "architectures", None)
        if architectures:
            architecture = ";".join(architectures)
        downloads = getattr(model_config, "downloads", 0)
        created_at = getattr(model_config, "created_at", "")
        quantization_config = getattr(model_config, "quantization_config", None)

    if gguf_files is not None:
        # A GGUF file takes priority: the request is treated as llama.cpp and
        # any quantization_config from the model config is ignored. The
        # architecture string gathered above is left as it is.
        downloads = 0
        created_at = ""
        quantization_config = None
        quant_type = "llama.cpp"

    # Is the model info correctly filled?
    try:
        model_info = API.model_info(repo_id=model, revision=revision)
    except Exception:
        return styled_error("Could not get your model information. Please fill it up properly.")

    # Were the model card and license filled?
    try:
        if model_info.cardData is None:
            model_license = "unknown"
        else:
            model_license = model_info.cardData.get("license", "unknown")
    except Exception:
        return styled_error("Please select a license for your model")

    # The model card check result is deliberately not enforced: a submission
    # without a valid model card is still accepted.
    _modelcard_ok, _modelcard_error, model_card = check_model_card(model)
    tags = get_model_tags(model_card, model)

    # Seems good, creating the eval
    print("Adding new eval")
    script = "ITREX"
    precision = "4bit"

    if quantization_config is not None:
        try:
            detected_type, detected_precision = _quantization_from_config(quantization_config)
        except ValueError as err:
            return styled_error(str(err))
        if detected_type is not None:
            quant_type = detected_type
        if detected_precision is not None:
            precision = detected_precision

    # Precisions other than 4/3/2 bit keep the caller-supplied weight_dtype.
    weight_dtype = {"4bit": "int4", "3bit": "int3", "2bit": "int2"}.get(precision, weight_dtype)

    if quant_type is None or quant_type == "":
        return styled_error("Please select a quantization model like GPTQ, AWQ etc.")

    model_params, model_size = get_quantized_model_parameters_memory(
        model_info,
        quant_method=quant_type.lower(),
        bits=precision,
    )

    # NOTE(review): hardware depends only on whether this is a llama.cpp
    # request; every other quantization type is scheduled on "gpu".
    if quant_type == "llama.cpp":
        hardware = "cpu"
        script = "llama_cpp"
        tags = "llama.cpp"
    else:
        hardware = "gpu"

    eval_entry = {
        "model": model,
        "revision": revision,
        "private": private,
        "params": model_size,
        "architectures": architecture,
        "quant_type": quant_type,
        "precision": precision,
        "model_params": model_params,
        "model_size": model_size,
        "weight_dtype": weight_dtype,
        "compute_dtype": compute_dtype,
        "gguf_ftype": gguf_ftype,
        "hardware": hardware,
        "status": "Pending",
        "submitted_time": current_time,
        "model_type": "quantization",
        "job_id": -1,
        "job_start_time": None,
        "scripts": script,
    }

    # NOTE(review): built but not consumed anywhere in this function.
    supplementary_info = {
        "likes": model_info.likes,
        "license": model_license,
        "still_on_hub": True,
        "tags": tags,
        "downloads": downloads,
        "created_at": created_at,
    }
    print(eval_entry)

    # ToDo: need open
    # Check for duplicate submission
    if f"{model}_{revision}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}" in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted.")

    # The same file name is used in the dataset queue and in both git folders.
    request_filename = (
        f"{model_path}_eval_request_{private}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}.json"
    )

    print("Creating huggingface/dataset eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    out_path = f"{OUT_DIR}/{request_filename}"
    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))

    print("Uploading eval file")
    API.upload_file(
        path_or_fileobj=out_path,
        path_in_repo=out_path.split("eval-queue/")[1],
        repo_id=QUEUE_REPO,
        repo_type="dataset",
        commit_message=f"Add {model} to eval queue",
    )

    print("Creating git eval file")
    OUT_DIR = f"{GIT_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    req_out_path = f"{OUT_DIR}/{request_filename}"
    # Path handed to the git index: the on-disk path minus its first component.
    req_git_path = "/".join(req_out_path.split("/")[1:])

    print("Creating status file")
    OUT_DIR = f"{GIT_STATUS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
    sta_out_path = f"{OUT_DIR}/{request_filename}"
    sta_git_path = "/".join(sta_out_path.split("/")[1:])

    print("Uploading eval file")
    print("git-push get lock..............")
    # Acquire outside the try so the lock is released exactly once, and only
    # if it was actually taken.
    GLOBAL_COND.acquire()
    try:
        branch = REPO.active_branch.name
        REPO.remotes.origin.pull(branch)
        REPO.index.remove("requests", False, r=True)
        with open(req_out_path, "w") as f:
            f.write(json.dumps(eval_entry, indent=4))
        with open(sta_out_path, "w") as f:
            f.write(json.dumps(eval_entry, indent=4))
        REPO.index.add([req_git_path, sta_git_path])
        REPO.index.commit(f"Add {model} to eval requests/status.")
        REPO.remotes.origin.push(branch)
        time.sleep(10)
        print("git-push release lock..............")
    except Exception as e:
        # NOTE(review): a failed git push is only logged; the dataset-queue
        # upload above has already succeeded, so the success message is still
        # returned below.
        print(str(e))
        print("git-push error........")
    finally:
        GLOBAL_COND.release()

    return styled_message(
        "Your request has been submitted to the evaluation queue!\nPlease wait for up to 3 hours for the model to show in the PENDING list."
    )