Alex committed
Commit 15c92e9 · 1 Parent(s): b4d9db9
Commit message: zalupa
Files changed:
- app.py +55 -1
- src/populate.py +35 -0
- src/submission/submit.py +130 -1
app.py
CHANGED
@@ -28,7 +28,7 @@ from src.display.utils import (
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.submission.submit import add_new_eval, add_manual_results
 
 
 def restart_space():
@@ -190,6 +190,60 @@ with demo:
                 submission_result,
             )
 
+            # ----------------------------------------------------
+            # Manual metrics submission form
+            # ----------------------------------------------------
+            with gr.Markdown("## 📝 Submit metrics manually (advanced)"):
+                pass
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_metrics = gr.Textbox(label="Model name", placeholder="org/model")
+                    revision_metrics = gr.Textbox(label="Revision commit", placeholder="main", value="main")
+                    bleu_input = gr.Number(label="BLEU", value=0.5)
+                    pass1_input = gr.Number(label="Pass@1", value=0.5, minimum=0.0, maximum=1.0)
+                    pass5_input = gr.Number(label="Pass@5", value=0.5, minimum=0.0, maximum=1.0)
+                    pass10_input = gr.Number(label="Pass@10", value=0.5, minimum=0.0, maximum=1.0)
+
+                with gr.Column():
+                    # Subjective metrics sliders (0-5)
+                    readability_slider = gr.Slider(0, 5, step=1, value=3, label="Readability")
+                    relevance_slider = gr.Slider(0, 5, step=1, value=3, label="Relevance")
+                    explanation_slider = gr.Slider(0, 5, step=1, value=3, label="Explanation clarity")
+                    problem_slider = gr.Slider(0, 5, step=1, value=3, label="Problem identification")
+                    actionability_slider = gr.Slider(0, 5, step=1, value=3, label="Actionability")
+                    completeness_slider = gr.Slider(0, 5, step=1, value=3, label="Completeness")
+                    specificity_slider = gr.Slider(0, 5, step=1, value=3, label="Specificity")
+                    contextual_slider = gr.Slider(0, 5, step=1, value=3, label="Contextual adequacy")
+                    consistency_slider = gr.Slider(0, 5, step=1, value=3, label="Consistency")
+                    brevity_slider = gr.Slider(0, 5, step=1, value=3, label="Brevity")
+
+            submit_metrics_button = gr.Button("Submit Metrics")
+            metrics_submission_result = gr.Markdown()
+
+            submit_metrics_button.click(
+                add_manual_results,
+                [
+                    model_name_metrics,
+                    revision_metrics,
+                    bleu_input,
+                    readability_slider,
+                    relevance_slider,
+                    explanation_slider,
+                    problem_slider,
+                    actionability_slider,
+                    completeness_slider,
+                    specificity_slider,
+                    contextual_slider,
+                    consistency_slider,
+                    brevity_slider,
+                    pass1_input,
+                    pass5_input,
+                    pass10_input,
+                ],
+                metrics_submission_result,
+            )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
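The click handler receives the sixteen components positionally, so their order in the input list must match the parameter order of `add_manual_results` in src/submission/submit.py (model, revision, BLEU, the ten subjective sliders, then Pass@1/5/10). As a rough sanity check outside the UI, the handler can be called directly; a minimal sketch, assuming a local checkout of the Space with its environment variables configured, and keeping in mind that a successful call uploads a results file to RESULTS_REPO:

```python
# Minimal sketch: call the handler the same way the Gradio click event does,
# i.e. positionally and in the same order as the input component list above.
from src.submission.submit import add_manual_results

html = add_manual_results(
    "org/model", "main",            # model, revision
    0.42,                           # BLEU
    3, 4, 3, 4, 3, 4, 3, 3, 4, 3,   # the ten subjective scores (0-5 sliders)
    0.25, 0.40, 0.55,               # pass@1, pass@5, pass@10 (must be non-decreasing)
)
print(html)  # styled_message / styled_error markup shown in metrics_submission_result
```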
src/populate.py
CHANGED
@@ -14,6 +14,35 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
+
+    # ------------------------------------------------------------------
+    # Fallback: if no evaluation results are found we populate the
+    # leaderboard with a single example model. This guarantees that a
+    # freshly deployed Space shows a non-empty leaderboard and it serves
+    # as a template for the expected columns/values.
+    # ------------------------------------------------------------------
+    if df.empty:
+        example_row = {}
+
+        # Populate benchmark metrics with the default value 0.5
+        for metric in benchmark_cols:
+            example_row[metric] = 0.5
+
+        # Minimal metadata so that the row displays nicely
+        example_row[AutoEvalColumn.model.name] = make_clickable_model("example/model")
+        example_row[AutoEvalColumn.average.name] = 0.5
+        example_row[AutoEvalColumn.model_type_symbol.name] = "🟢"
+        example_row[AutoEvalColumn.model_type.name] = "pretrained"
+        example_row[AutoEvalColumn.precision.name] = "float16"
+        example_row[AutoEvalColumn.weight_type.name] = "Original"
+        example_row[AutoEvalColumn.still_on_hub.name] = True
+        example_row[AutoEvalColumn.architecture.name] = "Transformer"
+        example_row[AutoEvalColumn.revision.name] = "main"
+        example_row[AutoEvalColumn.license.name] = "apache-2.0"
+
+        # Any missing columns will be created later in the function
+        df = pd.DataFrame([example_row])
+
     # Sort primarily by LLM exact-match Pass@1 metric; if not present, fall back to average
     preferred_cols = []
     if hasattr(AutoEvalColumn, "pass_at_1"):
@@ -24,6 +53,12 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
         if col in df.columns:
             df = df.sort_values(by=[col], ascending=False)
             break
+
+    # Ensure all expected columns exist, add missing ones with NaN so selection does not fail
+    for expected in cols:
+        if expected not in df.columns:
+            df[expected] = pd.NA
+
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
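The first hunk seeds a placeholder row when no result files exist yet; the second guards the `df[cols]` selection by creating any missing expected column with `pd.NA`, so a partially filled results file no longer raises a `KeyError`. A self-contained sketch of that guard, with illustrative column names rather than the leaderboard's real ones:

```python
import pandas as pd

cols = ["model", "bleu", "pass_at_1"]                          # expected columns (illustrative)
df = pd.DataFrame([{"model": "example/model", "bleu": 0.5}])   # "pass_at_1" is missing

# Same pattern as in get_leaderboard_df: add missing columns with NA before selecting
for expected in cols:
    if expected not in df.columns:
        df[expected] = pd.NA

print(df[cols].round(decimals=2))   # without the guard, df[cols] would raise KeyError
```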
src/submission/submit.py
CHANGED
@@ -3,7 +3,7 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -117,3 +117,132 @@ def add_new_eval(
     return styled_message(
         "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
     )
+
+# --------------------------------------------------------
+# Manual metrics submission (bypass evaluation queue)
+# --------------------------------------------------------
+
+ALL_SUBJECTIVE_FIELDS = [
+    "readability",
+    "relevance",
+    "explanation_clarity",
+    "problem_identification",
+    "actionability",
+    "completeness",
+    "specificity",
+    "contextual_adequacy",
+    "consistency",
+    "brevity",
+]
+
+def _compute_multimetric(payload: dict) -> float:
+    """Average of the 10 subjective metrics."""
+    total = sum(float(payload[f]) for f in ALL_SUBJECTIVE_FIELDS)
+    return total / len(ALL_SUBJECTIVE_FIELDS)
+
+def add_manual_results(
+    model: str,
+    revision: str,
+    bleu: float,
+    readability: int,
+    relevance: int,
+    explanation_clarity: int,
+    problem_identification: int,
+    actionability: int,
+    completeness: int,
+    specificity: int,
+    contextual_adequacy: int,
+    consistency: int,
+    brevity: int,
+    pass_at_1: float,
+    pass_at_5: float,
+    pass_at_10: float,
+):
+    """Directly submit evaluation metrics for a model and push them to the results dataset."""
+
+    # Basic validation
+    if model == "":
+        return styled_error("Please specify a model name.")
+
+    if revision == "":
+        revision = "main"
+
+    if pass_at_5 < pass_at_1:
+        return styled_error("pass@5 must be greater or equal to pass@1")
+    if pass_at_10 < pass_at_5:
+        return styled_error("pass@10 must be greater or equal to pass@5")
+
+    # Prepare dictionary in the same format used by read_evals.py
+    payload_dict = {
+        "model": model,
+        "revision": revision,
+        "bleu": bleu,
+        "readability": readability,
+        "relevance": relevance,
+        "explanation_clarity": explanation_clarity,
+        "problem_identification": problem_identification,
+        "actionability": actionability,
+        "completeness": completeness,
+        "specificity": specificity,
+        "contextual_adequacy": contextual_adequacy,
+        "consistency": consistency,
+        "brevity": brevity,
+        "pass_at_1": pass_at_1,
+        "pass_at_5": pass_at_5,
+        "pass_at_10": pass_at_10,
+    }
+
+    multimetric = _compute_multimetric(payload_dict)
+
+    # Compose final results file (same structure as api_submit_results)
+    result_json = {
+        "config": {
+            "model_dtype": "unknown",
+            "model_name": model,
+            "model_sha": revision,
+        },
+        "results": {
+            "bleu": {"score": bleu},
+            "multimetric": {"score": multimetric},
+            "pass_at_1": {"score": pass_at_1},
+            "pass_at_5": {"score": pass_at_5},
+            "pass_at_10": {"score": pass_at_10},
+        },
+    }
+
+    # Add subjective metrics
+    for field in ALL_SUBJECTIVE_FIELDS:
+        result_json["results"][field] = {"score": payload_dict[field]}
+
+    # Write file locally then upload
+    try:
+        os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+    except Exception:
+        pass
+
+    from datetime import datetime, timezone
+    import uuid
+
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    unique_id = uuid.uuid4().hex[:8]
+    filename = f"results_{model.replace('/', '_')}_{ts}_{unique_id}.json"
+    local_path = os.path.join(EVAL_RESULTS_PATH, filename)
+
+    try:
+        with open(local_path, "w") as fp:
+            json.dump(result_json, fp)
+
+        API.upload_file(
+            path_or_fileobj=local_path,
+            path_in_repo=filename,
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add manual results for {model}",
+        )
+    except Exception as e:
+        return styled_error(f"Failed to upload results: {e}")
+    finally:
+        if os.path.exists(local_path):
+            os.remove(local_path)
+
+    return styled_message("Metrics successfully submitted! The leaderboard will refresh shortly.")
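`_compute_multimetric` is a plain arithmetic mean of the ten subjective scores, and the uploaded file nests every metric under a `{"score": ...}` key next to the config block. A minimal sketch of the resulting JSON shape, with illustrative values only:

```python
import json

# Illustrative subjective scores (0-5); the real ones come from the Gradio sliders.
scores = {
    "readability": 4, "relevance": 3, "explanation_clarity": 5,
    "problem_identification": 3, "actionability": 4, "completeness": 3,
    "specificity": 2, "contextual_adequacy": 4, "consistency": 3, "brevity": 4,
}
multimetric = sum(scores.values()) / len(scores)   # (4+3+5+3+4+3+2+4+3+4) / 10 = 3.5

# Same shape that add_manual_results writes locally and uploads to RESULTS_REPO.
result_json = {
    "config": {"model_dtype": "unknown", "model_name": "org/model", "model_sha": "main"},
    "results": {
        "bleu": {"score": 0.42},
        "multimetric": {"score": multimetric},
        "pass_at_1": {"score": 0.25},
        "pass_at_5": {"score": 0.40},
        "pass_at_10": {"score": 0.55},
        **{name: {"score": value} for name, value in scores.items()},
    },
}
print(json.dumps(result_json, indent=2))
```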