ljvmiranda921 committed on
Commit 814a536 · 1 Parent(s): e0bc74c

Apply formatter

app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download
 
 from src.about import (
@@ -20,11 +20,19 @@ from src.display.utils import (
     EVAL_TYPES,
     AutoEvalColumn,
     ModelType,
-    fields,
+    Precision,
     WeightType,
-    Precision
+    fields,
+)
+from src.envs import (
+    API,
+    EVAL_REQUESTS_PATH,
+    EVAL_RESULTS_PATH,
+    QUEUE_REPO,
+    REPO_ID,
+    RESULTS_REPO,
+    TOKEN,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
@@ -32,24 +40,37 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(
+    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS
+)
 
 (
     finished_eval_queue_df,
@@ -57,6 +78,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -64,15 +86,23 @@ def init_leaderboard(dataframe):
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
         select_columns=SelectColumns(
-            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            default_selection=[
+                c.name for c in fields(AutoEvalColumn) if c.displayed_by_default
+            ],
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.model_type.name,
+                type="checkboxgroup",
+                label="Model types",
+            ),
+            ColumnFilter(
+                AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"
+            ),
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
@@ -81,7 +111,10 @@ def init_leaderboard(dataframe):
                 label="Select the number of parameters (B)",
             ),
             ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+                AutoEvalColumn.still_on_hub.name,
+                type="boolean",
+                label="Deleted/incomplete",
+                default=True,
             ),
         ],
         bool_checkboxgroup_label="Hide models",
@@ -142,14 +175,20 @@ with demo:
                                 row_count=5,
                             )
             with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+                gr.Markdown(
+                    "# ✉️✨ Submit your model here!", elem_classes="markdown-text"
+                )
 
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    revision_name_textbox = gr.Textbox(
+                        label="Revision commit", placeholder="main"
+                    )
                     model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        choices=[
+                            t.to_str(" : ") for t in ModelType if t != ModelType.Unknown
+                        ],
                         label="Model type",
                         multiselect=False,
                         value=None,
@@ -158,7 +197,9 @@ with demo:
 
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        choices=[
+                            i.value.name for i in Precision if i != Precision.Unknown
+                        ],
                         label="Precision",
                         multiselect=False,
                         value="float16",
@@ -171,7 +212,9 @@ with demo:
                         value="Original",
                         interactive=True,
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+                    base_model_name_textbox = gr.Textbox(
+                        label="Base model (for delta or adapter weights)"
+                    )
 
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
@@ -201,4 +244,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
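
Most of the app.py hunks above are mechanical: isort regroups the imports alphabetically within each section, and black re-wraps any call that no longer fits once the line length drops from 119 to 88 characters. As a rough sketch (not part of this commit; it only assumes the black package is installed), the reflow can be reproduced with black's Python API:

import black

# The call text below is a stand-in copied from the diff; the names inside it are
# never executed -- black only reformats the string.
SRC = (
    "snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, "
    'repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)\n'
)

# At line_length=88 the call no longer fits, so black explodes it into one keyword
# argument per line with a trailing comma -- the same shape as the "+" lines above.
print(black.format_str(SRC, mode=black.Mode(line_length=88)))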
pyproject.toml CHANGED
@@ -2,12 +2,13 @@
 # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
 select = ["E", "F"]
 ignore = ["E501"] # line too long (black is taking care of this)
-line-length = 119
+line-length = 88
 fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
 
 [tool.isort]
 profile = "black"
-line_length = 119
+line_length = 88
+multi_line_output = 9
 
 [tool.black]
-line-length = 119
+line-length = 88
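
The pyproject.toml hunk lowers the shared line length from 119 to 88 (black's default) for the ruff-style lint settings, isort, and black, and pins an explicit isort multi_line_output style. A hedged sketch of how such a configuration is usually applied from the repository root, assuming isort, black, and ruff are installed and read their settings from pyproject.toml:

import subprocess

# Each tool picks up its [tool.*] section from pyproject.toml automatically.
for cmd in (
    ["isort", "."],          # sort imports using the configured profile/line_length/multi_line_output
    ["black", "."],          # reflow code to the configured line-length (88)
    ["ruff", "check", "."],  # lint with the select/ignore/fixable rules shown above
):
    subprocess.run(cmd, check=True)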
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -11,13 +12,13 @@ class Task:
 # Select your tasks here
 # ---------------------------------------------------
 class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("anli_r1", "acc", "ANLI")
     task1 = Task("logiqa", "acc_norm", "LogiQA")
 
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
 
+NUM_FEWSHOT = 0  # Change with your few shot
+# ---------------------------------------------------
 
 
 # Your leaderboard name
src/display/formatting.py CHANGED
@@ -16,7 +16,9 @@ def styled_warning(warn):
 
 
 def styled_message(message):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+    return (
+        f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+    )
 
 
 def has_no_nan_values(df, columns):
src/display/utils.py CHANGED
@@ -5,8 +5,11 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+    return [
+        v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
+    ]
 
 
 # These classes are for user facing column names,
@@ -20,29 +23,69 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(
+    [
+        "model_type_symbol",
+        ColumnContent,
+        ColumnContent("T", "str", True, never_hidden=True),
+    ]
+)
+auto_eval_column_dict.append(
+    [
+        "model",
+        ColumnContent,
+        ColumnContent("Model", "markdown", True, never_hidden=True),
+    ]
+)
+# Scores
+auto_eval_column_dict.append(
+    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)]
+)
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    auto_eval_column_dict.append(
+        [task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)]
+    )
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_column_dict.append(
+    ["model_type", ColumnContent, ColumnContent("Type", "str", False)]
+)
+auto_eval_column_dict.append(
+    ["architecture", ColumnContent, ColumnContent("Architecture", "str", False)]
+)
+auto_eval_column_dict.append(
+    ["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)]
+)
+auto_eval_column_dict.append(
+    ["precision", ColumnContent, ColumnContent("Precision", "str", False)]
+)
+auto_eval_column_dict.append(
+    ["license", ColumnContent, ColumnContent("Hub License", "str", False)]
+)
+auto_eval_column_dict.append(
+    ["params", ColumnContent, ColumnContent("#Params (B)", "number", False)]
+)
+auto_eval_column_dict.append(
+    ["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)]
+)
+auto_eval_column_dict.append(
+    [
+        "still_on_hub",
+        ColumnContent,
+        ColumnContent("Available on the hub", "bool", False),
+    ]
+)
+auto_eval_column_dict.append(
+    ["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)]
+)
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
@@ -53,12 +96,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +127,13 @@ class ModelType(Enum):
         return ModelType.IFT
     return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,6 +146,7 @@ class Precision(Enum):
         return Precision.bfloat16
     return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
@@ -107,4 +154,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
src/envs.py CHANGED
@@ -4,9 +4,9 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
@@ -14,7 +14,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
src/leaderboard/read_evals.py CHANGED
@@ -8,28 +8,28 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
@@ -58,7 +58,10 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model,
+            config.get("model_sha", "main"),
+            trust_remote_code=True,
+            test_tokenizer=False,
         )
         architecture = "?"
         if model_config is not None:
@@ -72,7 +75,13 @@ class EvalResult:
             task = task.value
 
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            accs = np.array(
+                [
+                    v.get(task.metric, None)
+                    for k, v in data["results"].items()
+                    if task.benchmark == k
+                ]
+            )
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
@@ -85,15 +94,17 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+        request_file = get_request_file_for_model(
+            requests_path, self.full_model, self.precision.value.name
+        )
 
        try:
            with open(request_file, "r") as f:
@@ -105,7 +116,9 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -165,7 +178,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
         # Sort the files by date
         try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+            files.sort(
+                key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7]
+            )
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
 
@@ -181,14 +196,16 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         # Store results of same eval together
         eval_name = eval_result.eval_name
         if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            eval_results[eval_name].results.update(
+                {k: v for k, v in eval_result.results.items() if v is not None}
+            )
         else:
             eval_results[eval_name] = eval_result
 
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
src/populate.py CHANGED
@@ -8,7 +8,9 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(
+    results_path: str, requests_path: str, cols: list, benchmark_cols: list
+) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
@@ -39,7 +41,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+            sub_entries = [
+                e
+                for e in os.listdir(f"{save_path}/{entry}")
+                if os.path.isfile(e) and not e.startswith(".")
+            ]
             for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
@@ -51,7 +57,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+    finished_list = [
+        e
+        for e in all_evals
+        if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"
+    ]
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
src/submission/check_validity.py CHANGED
@@ -10,12 +10,16 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
         card = ModelCard.load(repo_id)
     except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
+        return (
+            False,
+            "Please add a model card to your model to explain how you trained/fine-tuned it.",
+        )
 
     # Enforce license metadata
     if card.data.license is None:
@@ -31,28 +35,49 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+
+def is_model_on_hub(
+    model_name: str,
+    revision: str,
+    token: str = None,
+    trust_remote_code=False,
+    test_tokenizer=False,
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        config = AutoConfig.from_pretrained(
+            model_name,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            token=token,
+        )
         if test_tokenizer:
             try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+                tk = AutoTokenizer.from_pretrained(
+                    model_name,
+                    revision=revision,
+                    trust_remote_code=trust_remote_code,
+                    token=token,
+                )
             except ValueError as e:
                 return (
                     False,
                     f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
+                    None,
                 )
             except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
+                return (
+                    False,
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
+                )
         return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
         )
 
     except Exception as e:
@@ -64,16 +89,22 @@ def get_model_size(model_info: ModelInfo, precision: str):
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
-        return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+        return (
+            0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+        )
 
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    size_factor = (
+        8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    )
     model_size = size_factor * model_size
     return model_size
 
+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1
@@ -88,12 +119,16 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                     continue
                 with open(os.path.join(root, file), "r") as f:
                     info = json.load(f)
-                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
+                    file_names.append(
+                        f"{info['model']}_{info['revision']}_{info['precision']}"
+                    )
 
                     # Select organisation
                     if info["model"].count("/") == 0 or "submitted_time" not in info:
                         continue
                     organisation, _ = info["model"].split("/")
-                    users_to_submission_dates[organisation].append(info["submitted_time"])
+                    users_to_submission_dates[organisation].append(
+                        info["submitted_time"]
+                    )
 
     return set(file_names), users_to_submission_dates
src/submission/submit.py CHANGED
@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -14,6 +14,7 @@ from src.submission.check_validity import (
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
     model: str,
     base_model: str,
@@ -25,7 +26,9 @@ def add_new_eval(
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
+        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(
+            EVAL_REQUESTS_PATH
+        )
 
     user_name = ""
     model_path = model
@@ -45,12 +48,16 @@
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        base_model_on_hub, error, _ = is_model_on_hub(
+            model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True
+        )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
+        model_on_hub, error, _ = is_model_on_hub(
+            model_name=model, revision=revision, token=TOKEN, test_tokenizer=True
+        )
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 
@@ -58,7 +65,9 @@
     try:
         model_info = API.model_info(repo_id=model, revision=revision)
     except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
+        return styled_error(
+            "Could not get your model information. Please fill it up properly."
+        )
 
     model_size = get_model_size(model_info=model_info, precision=precision)
 
@@ -97,7 +106,9 @@
     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    out_path = (
+        f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    )
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))