lvkaokao
committed on
Commit
·
b10d6d4
1
Parent(s):
228e920
Support AQLM and GPTQ 2-bit/3-bit quantization.
Browse files
- src/display/utils.py +15 -2
- src/leaderboard/read_evals.py +2 -3
- src/submission/check_validity.py +11 -4
- src/submission/submit.py +15 -0
src/display/utils.py
CHANGED
@@ -204,6 +204,7 @@ class WeightType(Enum):
|
|
204 |
|
205 |
class QuantType(Enum):
|
206 |
gptq = ModelDetails(name="GPTQ", symbol="🟢")
|
|
|
207 |
awq = ModelDetails(name="AWQ", symbol="🟩")
|
208 |
llama_cpp = ModelDetails(name="llama.cpp", symbol="🔶")
|
209 |
bnb = ModelDetails(name="bitsandbytes", symbol="💬")
|
@@ -216,6 +217,8 @@ class QuantType(Enum):
|
|
216 |
def from_str(quant_dtype):
|
217 |
if quant_dtype in ["GPTQ"]:
|
218 |
return QuantType.gptq
|
|
|
|
|
219 |
if quant_dtype in ["AWQ"]:
|
220 |
return QuantType.awq
|
221 |
if quant_dtype in ["llama.cpp"]:
|
@@ -228,6 +231,8 @@ class QuantType(Enum):
|
|
228 |
|
229 |
|
230 |
class WeightDtype(Enum):
|
|
|
|
|
231 |
int4 = ModelDetails("int4")
|
232 |
nf4 = ModelDetails("nf4")
|
233 |
fp4 = ModelDetails("fp4")
|
@@ -235,6 +240,10 @@ class WeightDtype(Enum):
|
|
235 |
Unknown = ModelDetails("?")
|
236 |
|
237 |
def from_str(weight_dtype):
|
|
|
|
|
|
|
|
|
238 |
if weight_dtype in ["int4"]:
|
239 |
return WeightDtype.int4
|
240 |
if weight_dtype in ["nf4"]:
|
@@ -290,6 +299,8 @@ class GroupDtype(Enum):
|
|
290 |
class Precision(Enum):
|
291 |
# float16 = ModelDetails("float16")
|
292 |
# bfloat16 = ModelDetails("bfloat16")
|
|
|
|
|
293 |
qt_4bit = ModelDetails("4bit")
|
294 |
# qt_8bit = ModelDetails("8bit")
|
295 |
# qt_GPTQ = ModelDetails("GPTQ")
|
@@ -300,8 +311,10 @@ class Precision(Enum):
|
|
300 |
# return Precision.float16
|
301 |
# if precision in ["torch.bfloat16", "bfloat16"]:
|
302 |
# return Precision.bfloat16
|
303 |
-
if precision in ["
|
304 |
-
return Precision.
|
|
|
|
|
305 |
if precision in ["4bit"]:
|
306 |
return Precision.qt_4bit
|
307 |
# if precision in ["GPTQ", "None"]:
|
|
|
204 |
|
205 |
class QuantType(Enum):
|
206 |
gptq = ModelDetails(name="GPTQ", symbol="🟢")
|
207 |
+
aqlm = ModelDetails(name="AQLM", symbol="⭐")
|
208 |
awq = ModelDetails(name="AWQ", symbol="🟩")
|
209 |
llama_cpp = ModelDetails(name="llama.cpp", symbol="🔶")
|
210 |
bnb = ModelDetails(name="bitsandbytes", symbol="💬")
|
|
|
217 |
def from_str(quant_dtype):
|
218 |
if quant_dtype in ["GPTQ"]:
|
219 |
return QuantType.gptq
|
220 |
+
if quant_dtype in ["AQLM"]:
|
221 |
+
return QuantType.aqlm
|
222 |
if quant_dtype in ["AWQ"]:
|
223 |
return QuantType.awq
|
224 |
if quant_dtype in ["llama.cpp"]:
|
|
|
231 |
|
232 |
|
233 |
class WeightDtype(Enum):
|
234 |
+
int2 = ModelDetails("int2")
|
235 |
+
int3 = ModelDetails("int3")
|
236 |
int4 = ModelDetails("int4")
|
237 |
nf4 = ModelDetails("nf4")
|
238 |
fp4 = ModelDetails("fp4")
|
|
|
240 |
Unknown = ModelDetails("?")
|
241 |
|
242 |
def from_str(weight_dtype):
|
243 |
+
if weight_dtype in ["int2"]:
|
244 |
+
return WeightDtype.int2
|
245 |
+
if weight_dtype in ["int3"]:
|
246 |
+
return WeightDtype.int3
|
247 |
if weight_dtype in ["int4"]:
|
248 |
return WeightDtype.int4
|
249 |
if weight_dtype in ["nf4"]:
|
|
|
299 |
class Precision(Enum):
|
300 |
# float16 = ModelDetails("float16")
|
301 |
# bfloat16 = ModelDetails("bfloat16")
|
302 |
+
qt_2bit = ModelDetails("2bit")
|
303 |
+
qt_3bit = ModelDetails("3bit")
|
304 |
qt_4bit = ModelDetails("4bit")
|
305 |
# qt_8bit = ModelDetails("8bit")
|
306 |
# qt_GPTQ = ModelDetails("GPTQ")
|
|
|
311 |
# return Precision.float16
|
312 |
# if precision in ["torch.bfloat16", "bfloat16"]:
|
313 |
# return Precision.bfloat16
|
314 |
+
if precision in ["2bit"]:
|
315 |
+
return Precision.qt_2bit
|
316 |
+
if precision in ["3bit"]:
|
317 |
+
return Precision.qt_3bit
|
318 |
if precision in ["4bit"]:
|
319 |
return Precision.qt_4bit
|
320 |
# if precision in ["GPTQ", "None"]:
|
src/leaderboard/read_evals.py
CHANGED
@@ -54,8 +54,7 @@ class EvalResult:
|
|
54 |
# Precision
|
55 |
precision = Precision.from_str(config.get("precision", "4bit"))
|
56 |
quant_type = QuantType.from_str(config.get("quant_type", "GPTQ"))
|
57 |
-
|
58 |
-
weight_dtype = WeightDtype.from_str(config.get("weight_dtype", "int4"))
|
59 |
compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
|
60 |
double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
|
61 |
model_params = config["model_params"]
|
@@ -243,7 +242,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
|
|
243 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
244 |
eval_result.update_with_request_file(requests_path)
|
245 |
if eval_result.full_model in dynamic_data:
|
246 |
-
eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
|
247 |
# Hardcoding because of gating problem
|
248 |
if "meta-llama" in eval_result.full_model:
|
249 |
eval_result.still_on_hub = True
|
|
|
54 |
# Precision
|
55 |
precision = Precision.from_str(config.get("precision", "4bit"))
|
56 |
quant_type = QuantType.from_str(config.get("quant_type", "GPTQ"))
|
57 |
+
weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
|
|
|
58 |
compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
|
59 |
double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
|
60 |
model_params = config["model_params"]
|
|
|
242 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
243 |
eval_result.update_with_request_file(requests_path)
|
244 |
if eval_result.full_model in dynamic_data:
|
245 |
+
# eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
|
246 |
# Hardcoding because of gating problem
|
247 |
if "meta-llama" in eval_result.full_model:
|
248 |
eval_result.still_on_hub = True
|
src/submission/check_validity.py
CHANGED
@@ -92,18 +92,22 @@ def get_model_size(model_info: ModelInfo, precision: str):
|
|
92 |
return model_size
|
93 |
|
94 |
KNOWN_SIZE_FACTOR = {
|
95 |
-
"gptq": {"4bit": 8, "8bit": 4},
|
96 |
"awq": {"4bit": 8},
|
97 |
-
"bitsandbytes": {"4bit": 2}
|
|
|
98 |
}
|
99 |
|
100 |
BYTES = {
|
101 |
"I32": 4,
|
|
|
|
|
102 |
"F16": 2,
|
103 |
"BF16": 2,
|
104 |
"F32": 4,
|
105 |
"U8": 1}
|
106 |
|
|
|
107 |
def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method="", bits="4bit"):
|
108 |
try:
|
109 |
safetensors = get_safetensors_metadata(model_info.id)
|
@@ -111,9 +115,12 @@ def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method=""
|
|
111 |
mem = 0
|
112 |
for key in safetensors.parameter_count:
|
113 |
mem += safetensors.parameter_count[key] * BYTES[key]
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
-
if key in ["I32", "U8"]:
|
116 |
-
num_parameters += safetensors.parameter_count[key] * KNOWN_SIZE_FACTOR[quant_method][bits]
|
117 |
params_b = round(num_parameters / 1e9, 2)
|
118 |
size_gb = round(mem / 1e9,2)
|
119 |
return params_b, size_gb
|
|
|
92 |
return model_size
|
93 |
|
94 |
KNOWN_SIZE_FACTOR = {
|
95 |
+
"gptq": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 12},
|
96 |
"awq": {"4bit": 8},
|
97 |
+
"bitsandbytes": {"4bit": 2},
|
98 |
+
"aqlm": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 6},
|
99 |
}
|
100 |
|
101 |
BYTES = {
|
102 |
"I32": 4,
|
103 |
+
"I16": 2,
|
104 |
+
"I8": 1,
|
105 |
"F16": 2,
|
106 |
"BF16": 2,
|
107 |
"F32": 4,
|
108 |
"U8": 1}
|
109 |
|
110 |
+
|
111 |
def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method="", bits="4bit"):
|
112 |
try:
|
113 |
safetensors = get_safetensors_metadata(model_info.id)
|
|
|
115 |
mem = 0
|
116 |
for key in safetensors.parameter_count:
|
117 |
mem += safetensors.parameter_count[key] * BYTES[key]
|
118 |
+
if key in ["I32", "U8", "I16", "I8"]:
|
119 |
+
param = safetensors.parameter_count[key] * KNOWN_SIZE_FACTOR[quant_method][bits]
|
120 |
+
if key == "I8":
|
121 |
+
param = param / 2
|
122 |
+
num_parameters += param
|
123 |
|
|
|
|
|
124 |
params_b = round(num_parameters / 1e9, 2)
|
125 |
size_gb = round(mem / 1e9,2)
|
126 |
return params_b, size_gb
|
src/submission/submit.py
CHANGED
@@ -140,6 +140,21 @@ def add_new_eval(
|
|
140 |
hardware = "gpu"
|
141 |
quant_type = "AWQ"
|
142 |
precision = f"{quantization_config.get('bits', '4bit')}bit"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
|
144 |
if quant_type is None or quant_type == "":
|
145 |
return styled_error("Please select a quantization model like GPTQ, AWQ etc.")
|
|
|
140 |
hardware = "gpu"
|
141 |
quant_type = "AWQ"
|
142 |
precision = f"{quantization_config.get('bits', '4bit')}bit"
|
143 |
+
if quant_method == "aqlm":
|
144 |
+
hardware = "gpu"
|
145 |
+
quant_type = "AQLM"
|
146 |
+
nbits_per_codebook = quantization_config.get('nbits_per_codebook')
|
147 |
+
num_codebooks = quantization_config.get('num_codebooks')
|
148 |
+
in_group_size = quantization_config.get('in_group_size')
|
149 |
+
bits = int(nbits_per_codebook * num_codebooks / in_group_size)
|
150 |
+
precision = f"{bits}bit"
|
151 |
+
|
152 |
+
if precision == "4bit":
|
153 |
+
weight_dtype = "int4"
|
154 |
+
elif precision == "3bit":
|
155 |
+
weight_dtype = "int3"
|
156 |
+
elif precision == "2bit":
|
157 |
+
weight_dtype = "int2"
|
158 |
|
159 |
if quant_type is None or quant_type == "":
|
160 |
return styled_error("Please select a quantization model like GPTQ, AWQ etc.")
|