Clémentine
committed on
Commit
·
7689092
1
Parent(s):
816b1dc
added doc
Browse files
- custom_tasks.py +1 -1
- src/backend/manage_requests.py +10 -6
- src/backend/run_eval_suite_harness.py +17 -1
- src/backend/run_eval_suite_lighteval.py +16 -0
- src/backend/sort_queue.py +1 -1
custom_tasks.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
# ruff: noqa: F405, F403, F401
|
2 |
"""
|
3 |
-
Custom evaluation tasks for lighteval.
|
4 |
|
5 |
This file generally creates just a TASKS_TABLE and TASKS_GROUPS, which are then imported by LightEval.
|
6 |
|
|
|
1 |
# ruff: noqa: F405, F403, F401
|
2 |
"""
|
3 |
+
Custom evaluation tasks for lighteval. Complete this file with your own configuration if you want to use a custom lighteval task.
|
4 |
|
5 |
This file generally creates just a TASKS_TABLE and TASKS_GROUPS, which are then imported by LightEval.
|
6 |
|
src/backend/manage_requests.py
CHANGED
@@ -11,27 +11,32 @@ logger = setup_logger(__name__)
|
|
11 |
|
12 |
@dataclass
|
13 |
class EvalRequest:
|
|
|
|
|
14 |
model: str
|
15 |
-
private: bool
|
16 |
status: str
|
17 |
json_filepath: str
|
18 |
weight_type: str = "Original"
|
19 |
model_type: str = "" # pretrained, finetuned, with RL
|
20 |
precision: str = "" # float16, bfloat16
|
21 |
-
|
22 |
-
revision: str = "main" # commit
|
23 |
submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
|
24 |
-
model_type: Optional[str] = None
|
25 |
likes: Optional[int] = 0
|
26 |
params: Optional[int] = None
|
27 |
license: Optional[str] = ""
|
28 |
|
29 |
def get_model_args(self):
|
|
|
|
|
|
|
30 |
model_args = f"pretrained={self.model},revision={self.revision}"
|
31 |
|
32 |
if self.precision in ["float16", "bfloat16", "float32"]:
|
33 |
model_args += f",dtype={self.precision}"
|
|
|
34 |
# Quantized models need some added config, the install of bits and bytes, etc
|
|
|
35 |
#elif self.precision == "8bit":
|
36 |
# model_args += ",load_in_8bit=True"
|
37 |
#elif self.precision == "4bit":
|
@@ -39,7 +44,6 @@ class EvalRequest:
|
|
39 |
#elif self.precision == "GPTQ":
|
40 |
# A GPTQ model does not need dtype to be specified,
|
41 |
# it will be inferred from the config
|
42 |
-
pass
|
43 |
else:
|
44 |
raise Exception(f"Unknown precision {self.precision}.")
|
45 |
|
@@ -67,7 +71,7 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
|
|
67 |
|
68 |
|
69 |
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
|
70 |
-
"""
|
71 |
models appearing first, followed by public models sorted by the number of
|
72 |
likes.
|
73 |
|
|
|
11 |
|
12 |
@dataclass
|
13 |
class EvalRequest:
|
14 |
+
"""This class represents one evaluation request file.
|
15 |
+
"""
|
16 |
model: str
|
|
|
17 |
status: str
|
18 |
json_filepath: str
|
19 |
weight_type: str = "Original"
|
20 |
model_type: str = "" # pretrained, finetuned, with RL
|
21 |
precision: str = "" # float16, bfloat16
|
22 |
+
revision: str = "main" # commit hash
|
|
|
23 |
submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
|
24 |
+
model_type: Optional[str] = None # pretrained, fine-tuned, etc - define your own categories in
|
25 |
likes: Optional[int] = 0
|
26 |
params: Optional[int] = None
|
27 |
license: Optional[str] = ""
|
28 |
|
29 |
def get_model_args(self):
|
30 |
+
"""Edit this function if you want to manage more complex quantization issues. You'll need to map it to
|
31 |
+
the evaluation suite you chose.
|
32 |
+
"""
|
33 |
model_args = f"pretrained={self.model},revision={self.revision}"
|
34 |
|
35 |
if self.precision in ["float16", "bfloat16", "float32"]:
|
36 |
model_args += f",dtype={self.precision}"
|
37 |
+
|
38 |
# Quantized models need some added config, the install of bits and bytes, etc
|
39 |
+
|
40 |
#elif self.precision == "8bit":
|
41 |
# model_args += ",load_in_8bit=True"
|
42 |
#elif self.precision == "4bit":
|
|
|
44 |
#elif self.precision == "GPTQ":
|
45 |
# A GPTQ model does not need dtype to be specified,
|
46 |
# it will be inferred from the config
|
|
|
47 |
else:
|
48 |
raise Exception(f"Unknown precision {self.precision}.")
|
49 |
|
|
|
71 |
|
72 |
|
73 |
def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
|
74 |
+
"""Gets all pending evaluation requests and returns a list in which private
|
75 |
models appear first, followed by public models sorted by the number of
|
76 |
likes.
|
77 |
|
src/backend/run_eval_suite_harness.py
CHANGED
@@ -12,7 +12,23 @@ from src.logging import setup_logger
|
|
12 |
logging.getLogger("openai").setLevel(logging.WARNING)
|
13 |
logger = setup_logger(__name__)
|
14 |
|
15 |
-
def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
if limit:
|
17 |
logger.info(
|
18 |
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
|
|
12 |
logging.getLogger("openai").setLevel(logging.WARNING)
|
13 |
logger = setup_logger(__name__)
|
14 |
|
15 |
+
def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int, device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
|
16 |
+
"""Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
|
17 |
+
|
18 |
+
Args:
|
19 |
+
eval_request (EvalRequest): Input evaluation request file representation
|
20 |
+
task_names (list): Tasks to launch
|
21 |
+
num_fewshot (int): Number of few shots to use
|
22 |
+
batch_size (int): Selected batch size
|
23 |
+
device (str): "cpu" or "gpu:0", depending on what you assigned to the space
|
24 |
+
local_dir (str): Where to save the results locally
|
25 |
+
results_repo (str): To which repository to upload the results
|
26 |
+
no_cache (bool, optional): Whether to use a cache or not.
|
27 |
+
limit (int, optional): Number of samples to restrict the evaluation to - only for debugging
|
28 |
+
|
29 |
+
Returns:
|
30 |
+
_type_: _description_
|
31 |
+
"""
|
32 |
if limit:
|
33 |
logger.info(
|
34 |
"WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
|
src/backend/run_eval_suite_lighteval.py
CHANGED
@@ -13,6 +13,22 @@ logging.getLogger("openai").setLevel(logging.WARNING)
|
|
13 |
logger = setup_logger(__name__)
|
14 |
|
15 |
def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
if limit:
|
17 |
logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
|
18 |
|
|
|
13 |
logger = setup_logger(__name__)
|
14 |
|
15 |
def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
|
16 |
+
"""Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
|
17 |
+
|
18 |
+
Args:
|
19 |
+
eval_request (EvalRequest): Input evaluation request file representation
|
20 |
+
task_names (str): Tasks to launch
|
21 |
+
batch_size (int): Selected batch size
|
22 |
+
accelerator (str): Inference endpoint parameter for running the evaluation
|
23 |
+
region (str): Inference endpoint parameter for running the evaluation
|
24 |
+
vendor (str): Inference endpoint parameter for running the evaluation
|
25 |
+
instance_size (str): Inference endpoint parameter for running the evaluation
|
26 |
+
instance_type (str): Inference endpoint parameter for running the evaluation
|
27 |
+
local_dir (str): Where to save the results locally
|
28 |
+
no_cache (bool, optional): Whether to use a cache or not.
|
29 |
+
limit (int, optional): Number of samples to restrict the evaluation to - only for debugging
|
30 |
+
"""
|
31 |
+
|
32 |
if limit:
|
33 |
logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
|
34 |
|
src/backend/sort_queue.py
CHANGED
@@ -11,7 +11,7 @@ class ModelMetadata:
|
|
11 |
likes: int = 0
|
12 |
size: int = 15
|
13 |
|
14 |
-
|
15 |
def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
|
16 |
private_models = [model for model in models if model.private]
|
17 |
public_models = [model for model in models if not model.private]
|
|
|
11 |
likes: int = 0
|
12 |
size: int = 15
|
13 |
|
14 |
+
# All the functions below sort the models in the queue based on different parameters
|
15 |
def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
|
16 |
private_models = [model for model in models if model.private]
|
17 |
public_models = [model for model in models if not model.private]
|