Clémentine committed
Commit 7689092 · 1 Parent(s): 816b1dc

added doc

Files changed:
- custom_tasks.py +1 -1
- src/backend/manage_requests.py +10 -6
- src/backend/run_eval_suite_harness.py +17 -1
- src/backend/run_eval_suite_lighteval.py +16 -0
- src/backend/sort_queue.py +1 -1
custom_tasks.py CHANGED

@@ -1,6 +1,6 @@
 # ruff: noqa: F405, F403, F401
 """
-Custom evaluation tasks for lighteval.
+Custom evaluation tasks for lighteval. Complete this task with your own configuration if you want to use a custom lighteval task.
 
 This file generally create just a TASKS_TABLE and TASKS_GROUPS which are then imported by LightEval.
 
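The updated docstring tells integrators to fill this file with their own task configuration. As a rough illustration of the TASKS_TABLE / TASKS_GROUPS contract it mentions, a completed file tends to look like the sketch below. Treat it as an assumption to verify: LightevalTaskConfig field names and import paths differ across lighteval versions, and the task name, dataset repository, and prompt function here are hypothetical.

# Hypothetical sketch of a filled-in custom_tasks.py (not part of this commit).
# Import paths and config fields must be checked against the installed lighteval version.
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc


def prompt_fn(line, task_name: str = None):
    # Turn one dataset row into a lighteval Doc (query, candidate choices, gold index).
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=line["choices"],
        gold_index=line["label"],
    )


task = LightevalTaskConfig(
    name="my_custom_task",                 # hypothetical task name
    prompt_function=prompt_fn,
    suite=["community"],
    hf_repo="my-org/my-eval-dataset",      # hypothetical dataset repository
    hf_subset="default",
    hf_avail_splits=["test"],
    evaluation_splits=["test"],
    metric=["loglikelihood_acc"],          # metric spelling varies across lighteval versions
)

# The two module-level objects that LightEval imports, per the module docstring.
TASKS_TABLE = [task]
TASKS_GROUPS = {"custom": "my_custom_task"}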
src/backend/manage_requests.py CHANGED

@@ -11,27 +11,32 @@ logger = setup_logger(__name__)
 
 @dataclass
 class EvalRequest:
+    """This class represents one evaluation request file.
+    """
     model: str
-    private: bool
     status: str
     json_filepath: str
     weight_type: str = "Original"
     model_type: str = "" # pretrained, finetuned, with RL
     precision: str = "" # float16, bfloat16
-
-    revision: str = "main" # commit
+    revision: str = "main" # commit hash
     submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
-    model_type: Optional[str] = None
+    model_type: Optional[str] = None # pretrained, fine-tuned, etc - define your own categories in
     likes: Optional[int] = 0
     params: Optional[int] = None
     license: Optional[str] = ""
 
     def get_model_args(self):
+        """Edit this function if you want to manage more complex quantization issues. You'll need to map it to
+        the evaluation suite you chose.
+        """
         model_args = f"pretrained={self.model},revision={self.revision}"
 
         if self.precision in ["float16", "bfloat16", "float32"]:
             model_args += f",dtype={self.precision}"
+
         # Quantized models need some added config, the install of bits and bytes, etc
+
         #elif self.precision == "8bit":
         #    model_args += ",load_in_8bit=True"
         #elif self.precision == "4bit":
@@ -39,7 +44,6 @@ class EvalRequest:
         #elif self.precision == "GPTQ":
             # A GPTQ model does not need dtype to be specified,
             # it will be inferred from the config
-            pass
         else:
             raise Exception(f"Unknown precision {self.precision}.")
 
@@ -67,7 +71,7 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
 
 
 def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
-    """
+    """Gets all pending evaluation requests and return a list in which private
     models appearing first, followed by public models sorted by the number of
     likes.
 
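To make the newly documented dataclass concrete, here is a small illustrative snippet. The values are made up, and it assumes get_model_args returns the assembled string (its return statement falls outside this diff):

# Illustrative only - a hand-built request using the fields documented above.
request = EvalRequest(
    model="my-org/my-model",                                  # hypothetical model id
    status="PENDING",
    json_filepath="eval-queue/my-org/my-model_request.json",  # hypothetical path
    precision="float16",
    revision="main",
)

# Maps the request to harness-style model arguments, e.g.:
#   pretrained=my-org/my-model,revision=main,dtype=float16
model_args = request.get_model_args()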
src/backend/run_eval_suite_harness.py CHANGED

@@ -12,7 +12,23 @@ from src.logging import setup_logger
 logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
+def run_evaluation(eval_request: EvalRequest, task_names: list, num_fewshot: int, batch_size: int, device: str, local_dir: str, results_repo: str, no_cache: bool =True, limit: int =None):
+    """Runs one evaluation for the current evaluation request file, then pushes the results to the hub.
+
+    Args:
+        eval_request (EvalRequest): Input evaluation request file representation
+        task_names (list): Tasks to launch
+        num_fewshot (int): Number of few shots to use
+        batch_size (int): Selected batch size
+        device (str): "cpu" or "gpu:0", depending on what you assigned to the space
+        local_dir (str): Where to save the results locally
+        results_repo (str): To which repository to upload the results
+        no_cache (bool, optional): Whether to use a cache or not.
+        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
+
+    Returns:
+        _type_: _description_
+    """
     if limit:
         logger.info(
             "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
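Given the docstring above, a call to this function looks roughly like the following. Every value is a placeholder, and the return type is left unspecified by the docstring:

# Hypothetical invocation matching the documented signature; all values are placeholders.
run_evaluation(
    eval_request=request,               # an EvalRequest, as documented above
    task_names=["hellaswag"],           # tasks to launch
    num_fewshot=0,
    batch_size=1,
    device="cpu",                       # or the accelerator assigned to the Space
    local_dir="./eval-results",         # where results are written locally
    results_repo="my-org/results",      # hypothetical results repository on the Hub
    no_cache=True,
    limit=10,                           # debugging only, per the docstring warning
)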
src/backend/run_eval_suite_lighteval.py CHANGED

@@ -13,6 +13,22 @@ logging.getLogger("openai").setLevel(logging.WARNING)
 logger = setup_logger(__name__)
 
 def run_evaluation(eval_request: EvalRequest, task_names: str, batch_size: int, local_dir: str, accelerator: str, region: str, vendor: str, instance_size: str, instance_type: str, limit=None):
+    """Runs one evaluation for the current evaluation request file using lighteval, then pushes the results to the hub.
+
+    Args:
+        eval_request (EvalRequest): Input evaluation request file representation
+        task_names (list): Tasks to launch
+        batch_size (int): Selected batch size
+        accelerator (str): Inference endpoint parameter for running the evaluation
+        region (str): Inference endpoint parameter for running the evaluation
+        vendor (str): Inference endpoint parameter for running the evaluation
+        instance_size (str): Inference endpoint parameter for running the evaluation
+        instance_type (str): Inference endpoint parameter for running the evaluation
+        local_dir (str): Where to save the results locally
+        no_cache (bool, optional): Whether to use a cache or not.
+        limit (int, optional): Whether to use a number of samples only for the evaluation - only for debugging
+    """
+
     if limit:
         logger.info("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
 
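The lighteval variant is called the same way, with the extra inference-endpoint parameters documented above. Again, every value below is a placeholder, including the assumed lighteval-style task string:

# Hypothetical invocation; the task string, region, vendor and instance labels are placeholders.
run_evaluation(
    eval_request=request,
    task_names="community|my_custom_task|0|0",  # assumed lighteval-style task string
    batch_size=1,
    local_dir="./eval-results",
    accelerator="gpu",            # inference endpoint hardware
    region="us-east-1",           # inference endpoint region
    vendor="aws",                 # inference endpoint cloud vendor
    instance_size="x1",           # inference endpoint instance size (placeholder)
    instance_type="nvidia-a10g",  # inference endpoint instance type (placeholder)
    limit=10,                     # debugging only
)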
src/backend/sort_queue.py CHANGED

@@ -11,7 +11,7 @@ class ModelMetadata:
     likes: int = 0
     size: int = 15
 
-
+# All the functions below sort the models in the queue based on different parameters
 def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
     private_models = [model for model in models if model.private]
     public_models = [model for model in models if not model.private]