Alex committed
Commit 15c92e9 · 1 Parent(s): b4d9db9
Files changed (3):
  1. app.py +55 -1
  2. src/populate.py +35 -0
  3. src/submission/submit.py +130 -1
app.py CHANGED
@@ -28,7 +28,7 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.submission.submit import add_new_eval, add_manual_results
 
 
 def restart_space():
@@ -190,6 +190,60 @@ with demo:
                 submission_result,
             )
 
+    # ----------------------------------------------------
+    # Manual metrics submission form
+    # ----------------------------------------------------
+    # Section header (gr.Markdown is a plain component, not a layout block)
+    gr.Markdown("## 📝 Submit metrics manually (advanced)")
+
+    with gr.Row():
+        with gr.Column():
+            model_name_metrics = gr.Textbox(label="Model name", placeholder="org/model")
+            revision_metrics = gr.Textbox(label="Revision commit", placeholder="main", value="main")
+            bleu_input = gr.Number(label="BLEU", value=0.5)
+            pass1_input = gr.Number(label="Pass@1", value=0.5, minimum=0.0, maximum=1.0)
+            pass5_input = gr.Number(label="Pass@5", value=0.5, minimum=0.0, maximum=1.0)
+            pass10_input = gr.Number(label="Pass@10", value=0.5, minimum=0.0, maximum=1.0)
+
+        with gr.Column():
+            # Subjective metrics sliders (0-5)
+            readability_slider = gr.Slider(0, 5, step=1, value=3, label="Readability")
+            relevance_slider = gr.Slider(0, 5, step=1, value=3, label="Relevance")
+            explanation_slider = gr.Slider(0, 5, step=1, value=3, label="Explanation clarity")
+            problem_slider = gr.Slider(0, 5, step=1, value=3, label="Problem identification")
+            actionability_slider = gr.Slider(0, 5, step=1, value=3, label="Actionability")
+            completeness_slider = gr.Slider(0, 5, step=1, value=3, label="Completeness")
+            specificity_slider = gr.Slider(0, 5, step=1, value=3, label="Specificity")
+            contextual_slider = gr.Slider(0, 5, step=1, value=3, label="Contextual adequacy")
+            consistency_slider = gr.Slider(0, 5, step=1, value=3, label="Consistency")
+            brevity_slider = gr.Slider(0, 5, step=1, value=3, label="Brevity")
+
+    submit_metrics_button = gr.Button("Submit Metrics")
+    metrics_submission_result = gr.Markdown()
+
+    submit_metrics_button.click(
+        add_manual_results,
+        [
+            model_name_metrics,
+            revision_metrics,
+            bleu_input,
+            readability_slider,
+            relevance_slider,
+            explanation_slider,
+            problem_slider,
+            actionability_slider,
+            completeness_slider,
+            specificity_slider,
+            contextual_slider,
+            consistency_slider,
+            brevity_slider,
+            pass1_input,
+            pass5_input,
+            pass10_input,
+        ],
+        metrics_submission_result,
+    )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
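The click handler above passes the Gradio components to `add_manual_results` positionally, so the order of the inputs list must match the function signature exactly (model, revision, BLEU, the ten subjective sliders, then pass@1/5/10). Below is a minimal, self-contained sketch of the same wiring pattern; the stub callback and component names are illustrative stand-ins, not the Space's actual code.

```python
import gradio as gr


# Hypothetical stand-in for add_manual_results; the real function lives in
# src/submission/submit.py and uploads a results file to the results dataset.
def fake_add_manual_results(model, revision, bleu, pass_at_1):
    if not model:
        return "Please specify a model name."
    return f"Received {model}@{revision}: BLEU={bleu}, pass@1={pass_at_1}"


with gr.Blocks() as demo:
    gr.Markdown("## Submit metrics manually (sketch)")
    model = gr.Textbox(label="Model name", placeholder="org/model")
    revision = gr.Textbox(label="Revision commit", value="main")
    bleu = gr.Number(label="BLEU", value=0.5)
    pass1 = gr.Number(label="Pass@1", value=0.5, minimum=0.0, maximum=1.0)
    out = gr.Markdown()

    # Inputs are forwarded to the callback positionally, in list order —
    # the same contract used by submit_metrics_button.click(...) above.
    gr.Button("Submit Metrics").click(
        fake_add_manual_results,
        [model, revision, bleu, pass1],
        out,
    )

if __name__ == "__main__":
    demo.launch()
```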
src/populate.py CHANGED
@@ -14,6 +14,35 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
+
+    # ------------------------------------------------------------------
+    # Fallback: if no evaluation results are found we populate the
+    # leaderboard with a single example model. This guarantees that a
+    # freshly deployed Space shows a non-empty leaderboard and it serves
+    # as a template for the expected columns/values.
+    # ------------------------------------------------------------------
+    if df.empty:
+        example_row = {}
+
+        # Populate benchmark metrics with the default value 0.5
+        for metric in benchmark_cols:
+            example_row[metric] = 0.5
+
+        # Minimal metadata so that the row displays nicely
+        example_row[AutoEvalColumn.model.name] = make_clickable_model("example/model")
+        example_row[AutoEvalColumn.average.name] = 0.5
+        example_row[AutoEvalColumn.model_type_symbol.name] = "🟢"
+        example_row[AutoEvalColumn.model_type.name] = "pretrained"
+        example_row[AutoEvalColumn.precision.name] = "float16"
+        example_row[AutoEvalColumn.weight_type.name] = "Original"
+        example_row[AutoEvalColumn.still_on_hub.name] = True
+        example_row[AutoEvalColumn.architecture.name] = "Transformer"
+        example_row[AutoEvalColumn.revision.name] = "main"
+        example_row[AutoEvalColumn.license.name] = "apache-2.0"
+
+        # Any missing columns will be created later in the function
+        df = pd.DataFrame([example_row])
+
     # Sort primarily by LLM exact-match Pass@1 metric; if not present, fall back to average
     preferred_cols = []
     if hasattr(AutoEvalColumn, "pass_at_1"):
@@ -24,6 +53,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         if col in df.columns:
            df = df.sort_values(by=[col], ascending=False)
            break
+
+    # Ensure all expected columns exist, add missing ones with NaN so selection does not fail
+    for expected in cols:
+        if expected not in df.columns:
+            df[expected] = pd.NA
+
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
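The second hunk pads the DataFrame before the `df[cols]` selection: any expected column that is missing gets created and filled with `pd.NA`, so a results file that omits a metric no longer raises a `KeyError`. A toy, self-contained illustration of that padding step (column names here are made up for the example):

```python
import pandas as pd

# Toy results row that is missing one expected column ("pass_at_10").
df = pd.DataFrame([{"model": "example/model", "pass_at_1": 0.5, "pass_at_5": 0.5}])
cols = ["model", "pass_at_1", "pass_at_5", "pass_at_10"]

# Without this padding, df[cols] would raise a KeyError for the missing column.
for expected in cols:
    if expected not in df.columns:
        df[expected] = pd.NA

# Selection and rounding now succeed; the padded column shows <NA>.
print(df[cols].round(decimals=2))
```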
src/submission/submit.py CHANGED
@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -117,3 +117,132 @@ def add_new_eval(
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
     )
+
+# --------------------------------------------------------
+# Manual metrics submission (bypass evaluation queue)
+# --------------------------------------------------------
+
+ALL_SUBJECTIVE_FIELDS = [
+    "readability",
+    "relevance",
+    "explanation_clarity",
+    "problem_identification",
+    "actionability",
+    "completeness",
+    "specificity",
+    "contextual_adequacy",
+    "consistency",
+    "brevity",
+]
+
+def _compute_multimetric(payload: dict) -> float:
+    """Average of the 10 subjective metrics."""
+    total = sum(float(payload[f]) for f in ALL_SUBJECTIVE_FIELDS)
+    return total / len(ALL_SUBJECTIVE_FIELDS)
+
+def add_manual_results(
+    model: str,
+    revision: str,
+    bleu: float,
+    readability: int,
+    relevance: int,
+    explanation_clarity: int,
+    problem_identification: int,
+    actionability: int,
+    completeness: int,
+    specificity: int,
+    contextual_adequacy: int,
+    consistency: int,
+    brevity: int,
+    pass_at_1: float,
+    pass_at_5: float,
+    pass_at_10: float,
+):
+    """Directly submit evaluation metrics for a model and push them to the results dataset."""
+
+    # Basic validation
+    if model == "":
+        return styled_error("Please specify a model name.")
+
+    if revision == "":
+        revision = "main"
+
+    if pass_at_5 < pass_at_1:
+        return styled_error("pass@5 must be greater than or equal to pass@1")
+    if pass_at_10 < pass_at_5:
+        return styled_error("pass@10 must be greater than or equal to pass@5")
+
+    # Prepare a dictionary in the same format used by read_evals.py
+    payload_dict = {
+        "model": model,
+        "revision": revision,
+        "bleu": bleu,
+        "readability": readability,
+        "relevance": relevance,
+        "explanation_clarity": explanation_clarity,
+        "problem_identification": problem_identification,
+        "actionability": actionability,
+        "completeness": completeness,
+        "specificity": specificity,
+        "contextual_adequacy": contextual_adequacy,
+        "consistency": consistency,
+        "brevity": brevity,
+        "pass_at_1": pass_at_1,
+        "pass_at_5": pass_at_5,
+        "pass_at_10": pass_at_10,
+    }
+
+    multimetric = _compute_multimetric(payload_dict)
+
+    # Compose the final results file (same structure as api_submit_results)
+    result_json = {
+        "config": {
+            "model_dtype": "unknown",
+            "model_name": model,
+            "model_sha": revision,
+        },
+        "results": {
+            "bleu": {"score": bleu},
+            "multimetric": {"score": multimetric},
+            "pass_at_1": {"score": pass_at_1},
+            "pass_at_5": {"score": pass_at_5},
+            "pass_at_10": {"score": pass_at_10},
+        },
+    }
+
+    # Add subjective metrics
+    for field in ALL_SUBJECTIVE_FIELDS:
+        result_json["results"][field] = {"score": payload_dict[field]}
+
+    # Write the file locally, then upload
+    try:
+        os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+    except Exception:
+        pass
+
+    # uuid is only needed here; datetime/timezone are already imported at module level
+    import uuid
+
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    unique_id = uuid.uuid4().hex[:8]
+    filename = f"results_{model.replace('/', '_')}_{ts}_{unique_id}.json"
+    local_path = os.path.join(EVAL_RESULTS_PATH, filename)
+
+    try:
+        with open(local_path, "w") as fp:
+            json.dump(result_json, fp)
+
+        API.upload_file(
+            path_or_fileobj=local_path,
+            path_in_repo=filename,
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add manual results for {model}",
+        )
+    except Exception as e:
+        return styled_error(f"Failed to upload results: {e}")
+    finally:
+        if os.path.exists(local_path):
+            os.remove(local_path)
+
+    return styled_message("Metrics successfully submitted! The leaderboard will refresh shortly.")
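For reference, the JSON that `add_manual_results` writes and uploads has one `{"score": ...}` entry per metric under `results`, with `multimetric` computed as the plain average of the ten subjective scores. The sketch below rebuilds that structure offline with made-up numbers and no Hub upload, so the values and the `org/model` name are illustrative only.

```python
import json

# Hypothetical scores for illustration; keys mirror ALL_SUBJECTIVE_FIELDS.
subjective = {
    "readability": 4, "relevance": 3, "explanation_clarity": 4,
    "problem_identification": 3, "actionability": 2, "completeness": 3,
    "specificity": 3, "contextual_adequacy": 4, "consistency": 3, "brevity": 5,
}

# multimetric is the plain average of the ten subjective scores (3.4 here).
multimetric = sum(subjective.values()) / len(subjective)

# Same overall layout as the file uploaded to RESULTS_REPO: config metadata
# plus one {"score": ...} entry per metric under "results".
result_json = {
    "config": {"model_dtype": "unknown", "model_name": "org/model", "model_sha": "main"},
    "results": {
        "bleu": {"score": 0.42},
        "multimetric": {"score": multimetric},
        "pass_at_1": {"score": 0.30},
        "pass_at_5": {"score": 0.45},
        "pass_at_10": {"score": 0.55},
        **{field: {"score": score} for field, score in subjective.items()},
    },
}

print(json.dumps(result_json, indent=2))
```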