lvkaokao committed on
Commit
b10d6d4
·
1 Parent(s): 228e920

Support AQLM and GPTQ 2/3-bit quantization.

Browse files
src/display/utils.py CHANGED
@@ -204,6 +204,7 @@ class WeightType(Enum):
204
 
205
  class QuantType(Enum):
206
  gptq = ModelDetails(name="GPTQ", symbol="🟢")
 
207
  awq = ModelDetails(name="AWQ", symbol="🟩")
208
  llama_cpp = ModelDetails(name="llama.cpp", symbol="🔶")
209
  bnb = ModelDetails(name="bitsandbytes", symbol="💬")
@@ -216,6 +217,8 @@ class QuantType(Enum):
216
  def from_str(quant_dtype):
217
  if quant_dtype in ["GPTQ"]:
218
  return QuantType.gptq
 
 
219
  if quant_dtype in ["AWQ"]:
220
  return QuantType.awq
221
  if quant_dtype in ["llama.cpp"]:
@@ -228,6 +231,8 @@ class QuantType(Enum):
228
 
229
 
230
  class WeightDtype(Enum):
 
 
231
  int4 = ModelDetails("int4")
232
  nf4 = ModelDetails("nf4")
233
  fp4 = ModelDetails("fp4")
@@ -235,6 +240,10 @@ class WeightDtype(Enum):
235
  Unknown = ModelDetails("?")
236
 
237
  def from_str(weight_dtype):
 
 
 
 
238
  if weight_dtype in ["int4"]:
239
  return WeightDtype.int4
240
  if weight_dtype in ["nf4"]:
@@ -290,6 +299,8 @@ class GroupDtype(Enum):
290
  class Precision(Enum):
291
  # float16 = ModelDetails("float16")
292
  # bfloat16 = ModelDetails("bfloat16")
 
 
293
  qt_4bit = ModelDetails("4bit")
294
  # qt_8bit = ModelDetails("8bit")
295
  # qt_GPTQ = ModelDetails("GPTQ")
@@ -300,8 +311,10 @@ class Precision(Enum):
300
  # return Precision.float16
301
  # if precision in ["torch.bfloat16", "bfloat16"]:
302
  # return Precision.bfloat16
303
- if precision in ["8bit"]:
304
- return Precision.qt_8bit
 
 
305
  if precision in ["4bit"]:
306
  return Precision.qt_4bit
307
  # if precision in ["GPTQ", "None"]:
 
204
 
205
  class QuantType(Enum):
206
  gptq = ModelDetails(name="GPTQ", symbol="🟢")
207
+ aqlm = ModelDetails(name="AQLM", symbol="⭐")
208
  awq = ModelDetails(name="AWQ", symbol="🟩")
209
  llama_cpp = ModelDetails(name="llama.cpp", symbol="🔶")
210
  bnb = ModelDetails(name="bitsandbytes", symbol="💬")
 
217
  def from_str(quant_dtype):
218
  if quant_dtype in ["GPTQ"]:
219
  return QuantType.gptq
220
+ if quant_dtype in ["AQLM"]:
221
+ return QuantType.aqlm
222
  if quant_dtype in ["AWQ"]:
223
  return QuantType.awq
224
  if quant_dtype in ["llama.cpp"]:
 
231
 
232
 
233
  class WeightDtype(Enum):
234
+ int2 = ModelDetails("int2")
235
+ int3 = ModelDetails("int3")
236
  int4 = ModelDetails("int4")
237
  nf4 = ModelDetails("nf4")
238
  fp4 = ModelDetails("fp4")
 
240
  Unknown = ModelDetails("?")
241
 
242
  def from_str(weight_dtype):
243
+ if weight_dtype in ["int2"]:
244
+ return WeightDtype.int2
245
+ if weight_dtype in ["int3"]:
246
+ return WeightDtype.int3
247
  if weight_dtype in ["int4"]:
248
  return WeightDtype.int4
249
  if weight_dtype in ["nf4"]:
 
299
  class Precision(Enum):
300
  # float16 = ModelDetails("float16")
301
  # bfloat16 = ModelDetails("bfloat16")
302
+ qt_2bit = ModelDetails("2bit")
303
+ qt_3bit = ModelDetails("3bit")
304
  qt_4bit = ModelDetails("4bit")
305
  # qt_8bit = ModelDetails("8bit")
306
  # qt_GPTQ = ModelDetails("GPTQ")
 
311
  # return Precision.float16
312
  # if precision in ["torch.bfloat16", "bfloat16"]:
313
  # return Precision.bfloat16
314
+ if precision in ["2bit"]:
315
+ return Precision.qt_2bit
316
+ if precision in ["3bit"]:
317
+ return Precision.qt_3bit
318
  if precision in ["4bit"]:
319
  return Precision.qt_4bit
320
  # if precision in ["GPTQ", "None"]:
src/leaderboard/read_evals.py CHANGED
@@ -54,8 +54,7 @@ class EvalResult:
54
  # Precision
55
  precision = Precision.from_str(config.get("precision", "4bit"))
56
  quant_type = QuantType.from_str(config.get("quant_type", "GPTQ"))
57
- # not use
58
- weight_dtype = WeightDtype.from_str(config.get("weight_dtype", "int4"))
59
  compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
60
  double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
61
  model_params = config["model_params"]
@@ -243,7 +242,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
243
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
244
  eval_result.update_with_request_file(requests_path)
245
  if eval_result.full_model in dynamic_data:
246
- eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
247
  # Hardcoding because of gating problem
248
  if "meta-llama" in eval_result.full_model:
249
  eval_result.still_on_hub = True
 
54
  # Precision
55
  precision = Precision.from_str(config.get("precision", "4bit"))
56
  quant_type = QuantType.from_str(config.get("quant_type", "GPTQ"))
57
+ weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
 
58
  compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))
59
  double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
60
  model_params = config["model_params"]
 
242
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
243
  eval_result.update_with_request_file(requests_path)
244
  if eval_result.full_model in dynamic_data:
245
+ # eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
246
  # Hardcoding because of gating problem
247
  if "meta-llama" in eval_result.full_model:
248
  eval_result.still_on_hub = True
src/submission/check_validity.py CHANGED
@@ -92,18 +92,22 @@ def get_model_size(model_info: ModelInfo, precision: str):
92
  return model_size
93
 
94
  KNOWN_SIZE_FACTOR = {
95
- "gptq": {"4bit": 8, "8bit": 4},
96
  "awq": {"4bit": 8},
97
- "bitsandbytes": {"4bit": 2}
 
98
  }
99
 
100
  BYTES = {
101
  "I32": 4,
 
 
102
  "F16": 2,
103
  "BF16": 2,
104
  "F32": 4,
105
  "U8": 1}
106
 
 
107
  def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method="", bits="4bit"):
108
  try:
109
  safetensors = get_safetensors_metadata(model_info.id)
@@ -111,9 +115,12 @@ def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method=""
111
  mem = 0
112
  for key in safetensors.parameter_count:
113
  mem += safetensors.parameter_count[key] * BYTES[key]
 
 
 
 
 
114
 
115
- if key in ["I32", "U8"]:
116
- num_parameters += safetensors.parameter_count[key] * KNOWN_SIZE_FACTOR[quant_method][bits]
117
  params_b = round(num_parameters / 1e9, 2)
118
  size_gb = round(mem / 1e9,2)
119
  return params_b, size_gb
 
92
  return model_size
93
 
94
  KNOWN_SIZE_FACTOR = {
95
+ "gptq": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 12},
96
  "awq": {"4bit": 8},
97
+ "bitsandbytes": {"4bit": 2},
98
+ "aqlm": {"4bit": 8, "8bit": 4, "2bit": 8, "3bit": 6},
99
  }
100
 
101
  BYTES = {
102
  "I32": 4,
103
+ "I16": 2,
104
+ "I8": 1,
105
  "F16": 2,
106
  "BF16": 2,
107
  "F32": 4,
108
  "U8": 1}
109
 
110
+
111
  def get_quantized_model_parameters_memory(model_info: ModelInfo, quant_method="", bits="4bit"):
112
  try:
113
  safetensors = get_safetensors_metadata(model_info.id)
 
115
  mem = 0
116
  for key in safetensors.parameter_count:
117
  mem += safetensors.parameter_count[key] * BYTES[key]
118
+ if key in ["I32", "U8", "I16", "I8"]:
119
+ param = safetensors.parameter_count[key] * KNOWN_SIZE_FACTOR[quant_method][bits]
120
+ if key == "I8":
121
+ param = param / 2
122
+ num_parameters += param
123
 
 
 
124
  params_b = round(num_parameters / 1e9, 2)
125
  size_gb = round(mem / 1e9,2)
126
  return params_b, size_gb
src/submission/submit.py CHANGED
@@ -140,6 +140,21 @@ def add_new_eval(
140
  hardware = "gpu"
141
  quant_type = "AWQ"
142
  precision = f"{quantization_config.get('bits', '4bit')}bit"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  if quant_type is None or quant_type == "":
145
  return styled_error("Please select a quantization model like GPTQ, AWQ etc.")
 
140
  hardware = "gpu"
141
  quant_type = "AWQ"
142
  precision = f"{quantization_config.get('bits', '4bit')}bit"
143
+ if quant_method == "aqlm":
144
+ hardware = "gpu"
145
+ quant_type = "AQLM"
146
+ nbits_per_codebook = quantization_config.get('nbits_per_codebook')
147
+ num_codebooks = quantization_config.get('num_codebooks')
148
+ in_group_size = quantization_config.get('in_group_size')
149
+ bits = int(nbits_per_codebook * num_codebooks / in_group_size)
150
+ precision = f"{bits}bit"
151
+
152
+ if precision == "4bit":
153
+ weight_dtype = "int4"
154
+ elif precision == "3bit":
155
+ weight_dtype = "int3"
156
+ elif precision == "2bit":
157
+ weight_dtype = "int2"
158
 
159
  if quant_type is None or quant_type == "":
160
  return styled_error("Please select a quantization model like GPTQ, AWQ etc.")