Alex committed on
Commit e7ea9f6 · 1 Parent(s): 1125184
app.py CHANGED
@@ -3,6 +3,8 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
  from apscheduler.schedulers.background import BackgroundScheduler
  from huggingface_hub import snapshot_download
+ from fastapi import FastAPI
+ from src.api_submit_results import router as submission_router

  from src.about import (
      CITATION_BUTTON_LABEL,
@@ -198,7 +200,24 @@ with demo:
          show_copy_button=True,
      )

+ # ------------------------------
+ # Start background scheduler
+ # ------------------------------
  scheduler = BackgroundScheduler()
  scheduler.add_job(restart_space, "interval", seconds=1800)
  scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
+
+ # ------------------------------
+ # Mount Gradio UI into FastAPI application
+ # ------------------------------
+ # Removed direct .launch(); Gradio UI will be served via the mounted FastAPI `app`.
+
+ # ------------------ FastAPI mounting ------------------
+ backend = FastAPI()
+ backend.include_router(submission_router)
+
+ # Enable queuing (same limit as before)
+ demo = demo.queue(default_concurrency_limit=40)
+
+ # Expose `app` for the HF Spaces runtime
+ app = gr.mount_gradio_app(backend, demo, path="/")
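With the direct `demo.launch()` removed, app.py now exposes an ASGI application. A minimal local-run sketch, assuming the file is saved as app.py and exposes `app` as in the diff above; the hostname and port 7860 are assumptions (the usual Spaces default), not part of this commit:

# run_local.py — hypothetical helper, not part of this commit.
# Serves the combined FastAPI + Gradio app without calling demo.launch().
import uvicorn

if __name__ == "__main__":
    # "app:app" = module app.py, attribute `app` exposed for the Spaces runtime.
    uvicorn.run("app:app", host="0.0.0.0", port=7860)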
main.py DELETED
@@ -1,166 +0,0 @@
- from typing import List
-
- import os
-
- import pandas as pd
- from fastapi import FastAPI, HTTPException
- from pydantic import BaseModel, Field, validator
- import gradio as gr
-
- # -----------------------------------------------------------------------------
- # Constants
- # -----------------------------------------------------------------------------
- CSV_PATH = os.getenv("LEADERBOARD_CSV", "leaderboard.csv")
- LEADERBOARD_COLUMNS = [
-     "model_name",
-     "bleu",
-     "multimetric",
-     # individual multimetric dimensions
-     "readability",
-     "relevance",
-     "explanation_clarity",
-     "problem_identification",
-     "actionability",
-     "completeness",
-     "specificity",
-     "contextual_adequacy",
-     "consistency",
-     "brevity",
-     # exact-match metrics
-     "pass_at_1",
-     "pass_at_5",
-     "pass_at_10",
- ]
-
-
- # -----------------------------------------------------------------------------
- # Pydantic schema for incoming submissions
- # -----------------------------------------------------------------------------
- class Submission(BaseModel):
-     model_name: str = Field(..., description="Arbitrary display name for the submission")
-
-     # automatic metric
-     bleu: float = Field(..., ge=0, description="BLEU score (0-100)")
-
-     # ten subjective dimensions
-     readability: int = Field(..., ge=0, le=5)
-     relevance: int = Field(..., ge=0, le=5)
-     explanation_clarity: int = Field(..., ge=0, le=5)
-     problem_identification: int = Field(..., ge=0, le=5)
-     actionability: int = Field(..., ge=0, le=5)
-     completeness: int = Field(..., ge=0, le=5)
-     specificity: int = Field(..., ge=0, le=5)
-     contextual_adequacy: int = Field(..., ge=0, le=5)
-     consistency: int = Field(..., ge=0, le=5)
-     brevity: int = Field(..., ge=0, le=5)
-
-     # exact-match pass@k
-     pass_at_1: float = Field(..., ge=0, le=1)
-     pass_at_5: float = Field(..., ge=0, le=1)
-     pass_at_10: float = Field(..., ge=0, le=1)
-
-     @validator("pass_at_5")
-     def pass5_ge_pass1(cls, v, values):
-         if "pass_at_1" in values and v < values["pass_at_1"]:
-             raise ValueError("pass@5 must be >= pass@1")
-         return v
-
-     @validator("pass_at_10")
-     def pass10_ge_pass5(cls, v, values):
-         if "pass_at_5" in values and v < values["pass_at_5"]:
-             raise ValueError("pass@10 must be >= pass@5")
-         return v
-
-     # computed property (not part of submission payload)
-     def compute_multimetric(self) -> float:
-         fields = [
-             self.readability,
-             self.relevance,
-             self.explanation_clarity,
-             self.problem_identification,
-             self.actionability,
-             self.completeness,
-             self.specificity,
-             self.contextual_adequacy,
-             self.consistency,
-             self.brevity,
-         ]
-         return float(sum(fields)) / len(fields)
-
-
- # -----------------------------------------------------------------------------
- # Helpers
- # -----------------------------------------------------------------------------
-
- def _init_storage(csv_path: str):
-     """Ensure the CSV exists with the correct header"""
-     if not os.path.exists(csv_path):
-         df = pd.DataFrame(columns=LEADERBOARD_COLUMNS)
-         df.to_csv(csv_path, index=False)
-
-
- def _load_leaderboard() -> pd.DataFrame:
-     _init_storage(CSV_PATH)
-     df = pd.read_csv(CSV_PATH)
-     # sort descending by Pass@1
-     if not df.empty and "pass_at_1" in df.columns:
-         df = df.sort_values("pass_at_1", ascending=False)
-     return df
-
-
- def _save_submission(sub: Submission):
-     _init_storage(CSV_PATH)
-     df = pd.read_csv(CSV_PATH)
-
-     # Remove previous entry for the same model (if any)
-     df = df[df["model_name"] != sub.model_name]
-
-     # Compose new row
-     record = sub.dict()
-     record["multimetric"] = sub.compute_multimetric()
-     df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
-
-     # keep ordering of columns
-     df = df[LEADERBOARD_COLUMNS]
-     df.to_csv(CSV_PATH, index=False)
-
-
- # -----------------------------------------------------------------------------
- # FastAPI backend
- # -----------------------------------------------------------------------------
- api = FastAPI(title="Leaderboard API", version="0.1.0")
-
-
- @api.post("/submit", tags=["submission"])
- async def submit_results(payload: Submission):
-     """Receive a new result entry and persist it."""
-     try:
-         _save_submission(payload)
-         return {"status": "ok", "detail": "Submission stored."}
-     except Exception as e:
-         raise HTTPException(status_code=400, detail=str(e))
-
-
- @api.get("/leaderboard", tags=["leaderboard"])
- async def get_leaderboard():
-     """Return the current leaderboard as JSON (sorted by Pass@1)."""
-     return _load_leaderboard().to_dict(orient="records")
-
-
- # -----------------------------------------------------------------------------
- # Gradio frontend
- # -----------------------------------------------------------------------------
-
- def _load_leaderboard_df():
-     return _load_leaderboard()
-
-
- with gr.Blocks(title="📊 Leaderboard") as demo:
-     gr.Markdown("# 📊 Leaderboard — sorted by **LLM-based exact-match Pass@1**")
-     df_component = gr.Dataframe(value=_load_leaderboard_df(), interactive=False, wrap=True)
-     refresh_btn = gr.Button("🔄 Refresh")
-     refresh_btn.click(lambda: _load_leaderboard_df(), outputs=df_component)
-
-
- # Mount gradio under "/"
- app = gr.mount_gradio_app(api, demo, path="/")
requirements.txt CHANGED
@@ -14,4 +14,5 @@ tqdm
  transformers
  tokenizers>=0.15.0
  sentencepiece
- fastapi
+ fastapi
+ uvicorn
src/about.py CHANGED
@@ -8,14 +8,33 @@ class Task:
      col_name: str


- # Select your tasks here
+ # Select your metrics here
  # ---------------------------------------------------
- class Tasks(Enum):
-     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-     task0 = Task("anli_r1", "acc", "ANLI")
-     task1 = Task("logiqa", "acc_norm", "LogiQA")
+ # Each entry: first argument is the key inside "results" dict in the result JSON,
+ # second is the metric key inside that sub-dict (we use "score" everywhere for uniformity),
+ # third is the column name displayed in the leaderboard.

- NUM_FEWSHOT = 0 # Change with your few shot
+ class Tasks(Enum):
+     bleu = Task("bleu", "score", "BLEU ⬆️")
+     multimetric = Task("multimetric", "score", "Multimetric ⬆️")
+
+     readability = Task("readability", "score", "Readability")
+     relevance = Task("relevance", "score", "Relevance")
+     explanation_clarity = Task("explanation_clarity", "score", "Explanation clarity")
+     problem_identification = Task("problem_identification", "score", "Problem identification")
+     actionability = Task("actionability", "score", "Actionability")
+     completeness = Task("completeness", "score", "Completeness")
+     specificity = Task("specificity", "score", "Specificity")
+     contextual_adequacy = Task("contextual_adequacy", "score", "Contextual adequacy")
+     consistency = Task("consistency", "score", "Consistency")
+     brevity = Task("brevity", "score", "Brevity")
+
+     pass_at_1 = Task("pass_at_1", "score", "Pass@1 ⬆️")
+     pass_at_5 = Task("pass_at_5", "score", "Pass@5")
+     pass_at_10 = Task("pass_at_10", "score", "Pass@10")
+
+
+ NUM_FEWSHOT = 0 # Not applicable here but kept for compatibility
  # ---------------------------------------------------


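To make the mapping concrete: each Tasks member above names the benchmark key looked up inside the "results" dict of a result file and the metric key inside that sub-dict. A minimal sketch of that lookup, using a hand-written result payload; the model name and all score values are illustrative only, and the snippet assumes it runs from the repo root so that src.about imports:

# Sketch: resolving Tasks entries against a result JSON (all values illustrative).
from src.about import Tasks

example_result = {
    "config": {"model_name": "org/model", "model_sha": "main", "model_dtype": "unknown"},
    "results": {
        "bleu": {"score": 27.4},
        "pass_at_1": {"score": 0.42},
    },
}

for task in Tasks:
    entry = example_result["results"].get(task.value.benchmark)
    if entry is not None:
        # task.value.metric is "score" for every entry defined above
        print(task.value.col_name, entry[task.value.metric])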
src/api_submit_results.py ADDED
@@ -0,0 +1,116 @@
+ from datetime import datetime, timezone
+ import json
+ import os
+ import uuid
+
+ from fastapi import APIRouter, HTTPException
+ from pydantic import BaseModel, Field, validator
+
+ from src.envs import API, RESULTS_REPO, EVAL_RESULTS_PATH, TOKEN
+
+ router = APIRouter(prefix="/api", tags=["submission"])
+
+ ALL_SUBJECTIVE_FIELDS = [
+     "readability",
+     "relevance",
+     "explanation_clarity",
+     "problem_identification",
+     "actionability",
+     "completeness",
+     "specificity",
+     "contextual_adequacy",
+     "consistency",
+     "brevity",
+ ]
+
+
+ class ResultPayload(BaseModel):
+     model: str = Field(..., description="Model id on the Hub (e.g. org/model)")
+     revision: str = Field("main", description="Commit sha or branch (default: main)")
+     bleu: float = Field(..., ge=0, description="BLEU score (0-100)")
+
+     # 10 subjective metrics 0-5
+     readability: int = Field(..., ge=0, le=5)
+     relevance: int = Field(..., ge=0, le=5)
+     explanation_clarity: int = Field(..., ge=0, le=5)
+     problem_identification: int = Field(..., ge=0, le=5)
+     actionability: int = Field(..., ge=0, le=5)
+     completeness: int = Field(..., ge=0, le=5)
+     specificity: int = Field(..., ge=0, le=5)
+     contextual_adequacy: int = Field(..., ge=0, le=5)
+     consistency: int = Field(..., ge=0, le=5)
+     brevity: int = Field(..., ge=0, le=5)
+
+     pass_at_1: float = Field(..., ge=0, le=1)
+     pass_at_5: float = Field(..., ge=0, le=1)
+     pass_at_10: float = Field(..., ge=0, le=1)
+
+     @validator("pass_at_5")
+     def _p5_ge_p1(cls, v, values):
+         if "pass_at_1" in values and v < values["pass_at_1"]:
+             raise ValueError("pass@5 must be >= pass@1")
+         return v
+
+     @validator("pass_at_10")
+     def _p10_ge_p5(cls, v, values):
+         if "pass_at_5" in values and v < values["pass_at_5"]:
+             raise ValueError("pass@10 must be >= pass@5")
+         return v
+
+     def multimetric(self) -> float:
+         total = sum(getattr(self, f) for f in ALL_SUBJECTIVE_FIELDS)
+         return float(total) / len(ALL_SUBJECTIVE_FIELDS)
+
+
+ @router.post("/submit", status_code=200)
+ async def submit_results(payload: ResultPayload):
+     """Accept new evaluation results and push them to the results dataset."""
+
+     # Prepare JSON in expected format (compatible with read_evals.py)
+     results_dict = {
+         "config": {
+             "model_dtype": "unknown",
+             "model_name": payload.model,
+             "model_sha": payload.revision,
+         },
+         "results": {},
+     }
+
+     # Primary metrics
+     results_dict["results"]["bleu"] = {"score": payload.bleu}
+     results_dict["results"]["multimetric"] = {"score": payload.multimetric()}
+
+     # Subjective metrics
+     for field in ALL_SUBJECTIVE_FIELDS:
+         results_dict["results"][field] = {"score": getattr(payload, field)}
+
+     # Pass@k metrics
+     results_dict["results"]["pass_at_1"] = {"score": payload.pass_at_1}
+     results_dict["results"]["pass_at_5"] = {"score": payload.pass_at_5}
+     results_dict["results"]["pass_at_10"] = {"score": payload.pass_at_10}
+
+     # File handling
+     os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+     ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+     unique_id = uuid.uuid4().hex[:8]
+     filename = f"results_{payload.model.replace('/', '_')}_{ts}_{unique_id}.json"
+     local_path = os.path.join(EVAL_RESULTS_PATH, filename)
+
+     with open(local_path, "w") as fp:
+         json.dump(results_dict, fp)
+
+     try:
+         API.upload_file(
+             path_or_fileobj=local_path,
+             path_in_repo=filename,
+             repo_id=RESULTS_REPO,
+             repo_type="dataset",
+             commit_message=f"Add results for {payload.model}",
+         )
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Failed to upload results: {e}")
+     finally:
+         if os.path.exists(local_path):
+             os.remove(local_path)
+
+     return {"status": "ok", "detail": "Results submitted."}
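For reference, a client-side submission to the new /api/submit route could look like the following sketch; the Space URL is a placeholder, every score value is illustrative, and the requests library is assumed to be installed on the client:

# Sketch: posting results to the /api/submit endpoint defined above.
import requests

payload = {
    "model": "org/model",          # placeholder Hub id
    "revision": "main",
    "bleu": 27.4,
    "readability": 4, "relevance": 4, "explanation_clarity": 3,
    "problem_identification": 4, "actionability": 3, "completeness": 4,
    "specificity": 3, "contextual_adequacy": 4, "consistency": 4, "brevity": 3,
    "pass_at_1": 0.42, "pass_at_5": 0.55, "pass_at_10": 0.61,
}

resp = requests.post("https://<space-url>/api/submit", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json())  # expected: {"status": "ok", "detail": "Results submitted."}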
src/display/utils.py CHANGED
@@ -25,10 +25,16 @@ auto_eval_column_dict = []
  # Init
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
- #Scores
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+ # Average kept but not displayed by default
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", False)])
+
+ # Determine which metrics are visible by default
+ _DEFAULT_VISIBLE = {"bleu", "multimetric", "pass_at_1", "pass_at_5", "pass_at_10"}
+
  for task in Tasks:
-     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+     show = task.name in _DEFAULT_VISIBLE
+     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", show)])
+
  # Model information
  auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
  auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
src/leaderboard/read_evals.py CHANGED
@@ -76,7 +76,10 @@ class EvalResult:
              if accs.size == 0 or any([acc is None for acc in accs]):
                  continue

-             mean_acc = np.mean(accs) * 100.0
+             if task.metric == "score":
+                 mean_acc = float(np.mean(accs))
+             else:
+                 mean_acc = float(np.mean(accs) * 100.0)
              results[task.benchmark] = mean_acc

          return self(
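The branch above keeps "score"-type metrics on their native scale while any other metric is still converted to a percentage; a tiny worked check with illustrative values:

# Sketch: native scale vs. percentage scaling (values are made up).
import numpy as np

accs = np.array([0.42, 0.44])
print(float(np.mean(accs)))          # 0.43 -> stored when task.metric == "score"
print(float(np.mean(accs) * 100.0))  # 43.0 -> stored for other metric types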
src/populate.py CHANGED
@@ -14,7 +14,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
      all_data_json = [v.to_dict() for v in raw_data]

      df = pd.DataFrame.from_records(all_data_json)
-     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+     # Sort primarily by LLM exact-match Pass@1 metric
+     sort_col = AutoEvalColumn.pass_at_1.name if hasattr(AutoEvalColumn, "pass_at_1") else AutoEvalColumn.average.name
+     df = df.sort_values(by=[sort_col], ascending=False)
      df = df[cols].round(decimals=2)

      # filter out if any of the benchmarks have not been produced