Commit e7ea9f6 · committed by Alex
1 parent: 1125184
Commit message: "zalupa"
Files changed:

- app.py +20 -1
- main.py +0 -166
- requirements.txt +2 -1
- src/about.py +25 -6
- src/api_submit_results.py +116 -0
- src/display/utils.py +9 -3
- src/leaderboard/read_evals.py +4 -1
- src/populate.py +3 -1
app.py
CHANGED
@@ -3,6 +3,8 @@ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+from fastapi import FastAPI
+from src.api_submit_results import router as submission_router
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -198,7 +200,24 @@ with demo:
                show_copy_button=True,
            )
 
+# ------------------------------
+# Start background scheduler
+# ------------------------------
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-
+
+# ------------------------------
+# Mount Gradio UI into FastAPI application
+# ------------------------------
+# Removed direct .launch(); Gradio UI will be served via the mounted FastAPI `app`.
+
+# ------------------ FastAPI mounting ------------------
+backend = FastAPI()
+backend.include_router(submission_router)
+
+# Enable queuing (same limit as before)
+demo = demo.queue(default_concurrency_limit=40)
+
+# Expose `app` for the HF Spaces runtime
+app = gr.mount_gradio_app(backend, demo, path="/")
main.py
DELETED
@@ -1,166 +0,0 @@
-from typing import List
-
-import os
-
-import pandas as pd
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel, Field, validator
-import gradio as gr
-
-# -----------------------------------------------------------------------------
-# Constants
-# -----------------------------------------------------------------------------
-CSV_PATH = os.getenv("LEADERBOARD_CSV", "leaderboard.csv")
-LEADERBOARD_COLUMNS = [
-    "model_name",
-    "bleu",
-    "multimetric",
-    # individual multimetric dimensions
-    "readability",
-    "relevance",
-    "explanation_clarity",
-    "problem_identification",
-    "actionability",
-    "completeness",
-    "specificity",
-    "contextual_adequacy",
-    "consistency",
-    "brevity",
-    # exact-match metrics
-    "pass_at_1",
-    "pass_at_5",
-    "pass_at_10",
-]
-
-
-# -----------------------------------------------------------------------------
-# Pydantic schema for incoming submissions
-# -----------------------------------------------------------------------------
-class Submission(BaseModel):
-    model_name: str = Field(..., description="Arbitrary display name for the submission")
-
-    # automatic metric
-    bleu: float = Field(..., ge=0, description="BLEU score (0-100)")
-
-    # ten subjective dimensions
-    readability: int = Field(..., ge=0, le=5)
-    relevance: int = Field(..., ge=0, le=5)
-    explanation_clarity: int = Field(..., ge=0, le=5)
-    problem_identification: int = Field(..., ge=0, le=5)
-    actionability: int = Field(..., ge=0, le=5)
-    completeness: int = Field(..., ge=0, le=5)
-    specificity: int = Field(..., ge=0, le=5)
-    contextual_adequacy: int = Field(..., ge=0, le=5)
-    consistency: int = Field(..., ge=0, le=5)
-    brevity: int = Field(..., ge=0, le=5)
-
-    # exact-match pass@k
-    pass_at_1: float = Field(..., ge=0, le=1)
-    pass_at_5: float = Field(..., ge=0, le=1)
-    pass_at_10: float = Field(..., ge=0, le=1)
-
-    @validator("pass_at_5")
-    def pass5_ge_pass1(cls, v, values):
-        if "pass_at_1" in values and v < values["pass_at_1"]:
-            raise ValueError("pass@5 must be >= pass@1")
-        return v
-
-    @validator("pass_at_10")
-    def pass10_ge_pass5(cls, v, values):
-        if "pass_at_5" in values and v < values["pass_at_5"]:
-            raise ValueError("pass@10 must be >= pass@5")
-        return v
-
-    # computed property (not part of submission payload)
-    def compute_multimetric(self) -> float:
-        fields = [
-            self.readability,
-            self.relevance,
-            self.explanation_clarity,
-            self.problem_identification,
-            self.actionability,
-            self.completeness,
-            self.specificity,
-            self.contextual_adequacy,
-            self.consistency,
-            self.brevity,
-        ]
-        return float(sum(fields)) / len(fields)
-
-
-# -----------------------------------------------------------------------------
-# Helpers
-# -----------------------------------------------------------------------------
-
-def _init_storage(csv_path: str):
-    """Ensure the CSV exists with the correct header"""
-    if not os.path.exists(csv_path):
-        df = pd.DataFrame(columns=LEADERBOARD_COLUMNS)
-        df.to_csv(csv_path, index=False)
-
-
-def _load_leaderboard() -> pd.DataFrame:
-    _init_storage(CSV_PATH)
-    df = pd.read_csv(CSV_PATH)
-    # sort descending by Pass@1
-    if not df.empty and "pass_at_1" in df.columns:
-        df = df.sort_values("pass_at_1", ascending=False)
-    return df
-
-
-def _save_submission(sub: Submission):
-    _init_storage(CSV_PATH)
-    df = pd.read_csv(CSV_PATH)
-
-    # Remove previous entry for the same model (if any)
-    df = df[df["model_name"] != sub.model_name]
-
-    # Compose new row
-    record = sub.dict()
-    record["multimetric"] = sub.compute_multimetric()
-    df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
-
-    # keep ordering of columns
-    df = df[LEADERBOARD_COLUMNS]
-    df.to_csv(CSV_PATH, index=False)
-
-
-# -----------------------------------------------------------------------------
-# FastAPI backend
-# -----------------------------------------------------------------------------
-api = FastAPI(title="Leaderboard API", version="0.1.0")
-
-
-@api.post("/submit", tags=["submission"])
-async def submit_results(payload: Submission):
-    """Receive a new result entry and persist it."""
-    try:
-        _save_submission(payload)
-        return {"status": "ok", "detail": "Submission stored."}
-    except Exception as e:
-        raise HTTPException(status_code=400, detail=str(e))
-
-
-@api.get("/leaderboard", tags=["leaderboard"])
-async def get_leaderboard():
-    """Return the current leaderboard as JSON (sorted by Pass@1)."""
-    return _load_leaderboard().to_dict(orient="records")
-
-
-# -----------------------------------------------------------------------------
-# Gradio frontend
-# -----------------------------------------------------------------------------
-
-def _load_leaderboard_df():
-    return _load_leaderboard()
-
-
-with gr.Blocks(title="📊 Leaderboard") as demo:
-    gr.Markdown("# 📊 Leaderboard — sorted by **LLM-based exact-match Pass@1**")
-    df_component = gr.Dataframe(value=_load_leaderboard_df(), interactive=False, wrap=True)
-    refresh_btn = gr.Button("🔄 Refresh")
-    refresh_btn.click(lambda: _load_leaderboard_df(), outputs=df_component)
-
-
-# Mount gradio under "/"
-app = gr.mount_gradio_app(api, demo, path="/")
requirements.txt
CHANGED
@@ -14,4 +14,5 @@ tqdm
 transformers
 tokenizers>=0.15.0
 sentencepiece
-fastapi
+fastapi
+uvicorn
src/about.py
CHANGED
@@ -8,14 +8,33 @@ class Task:
     col_name: str
 
 
-# Select your
+# Select your metrics here
 # ---------------------------------------------------
-
-
-
-task1 = Task("logiqa", "acc_norm", "LogiQA")
+# Each entry: first argument is the key inside "results" dict in the result JSON,
+# second is the metric key inside that sub-dict (we use "score" everywhere for uniformity),
+# third is the column name displayed in the leaderboard.
 
-
+class Tasks(Enum):
+    bleu = Task("bleu", "score", "BLEU ⬆️")
+    multimetric = Task("multimetric", "score", "Multimetric ⬆️")
+
+    readability = Task("readability", "score", "Readability")
+    relevance = Task("relevance", "score", "Relevance")
+    explanation_clarity = Task("explanation_clarity", "score", "Explanation clarity")
+    problem_identification = Task("problem_identification", "score", "Problem identification")
+    actionability = Task("actionability", "score", "Actionability")
+    completeness = Task("completeness", "score", "Completeness")
+    specificity = Task("specificity", "score", "Specificity")
+    contextual_adequacy = Task("contextual_adequacy", "score", "Contextual adequacy")
+    consistency = Task("consistency", "score", "Consistency")
+    brevity = Task("brevity", "score", "Brevity")
+
+    pass_at_1 = Task("pass_at_1", "score", "Pass@1 ⬆️")
+    pass_at_5 = Task("pass_at_5", "score", "Pass@5")
+    pass_at_10 = Task("pass_at_10", "score", "Pass@10")
+
+
+NUM_FEWSHOT = 0  # Not applicable here but kept for compatibility
 # ---------------------------------------------------
 
 
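Per the comment added above, each entry in the results dataset is expected to be a JSON file whose "results" dict maps every Task key to a sub-dict holding a single "score" value; this matches what src/api_submit_results.py (added below) writes. An illustrative shape, with invented numbers:

```python
# Illustrative result file structure; the metric values here are made up.
example_result = {
    "config": {
        "model_dtype": "unknown",
        "model_name": "org/model",
        "model_sha": "main",
    },
    "results": {
        "bleu": {"score": 27.4},
        "multimetric": {"score": 3.6},
        "readability": {"score": 4},
        # ... one entry per remaining subjective dimension ...
        "pass_at_1": {"score": 0.31},
        "pass_at_5": {"score": 0.52},
        "pass_at_10": {"score": 0.61},
    },
}
```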
src/api_submit_results.py
ADDED
@@ -0,0 +1,116 @@
+from datetime import datetime, timezone
+import json
+import os
+import uuid
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel, Field, validator
+
+from src.envs import API, RESULTS_REPO, EVAL_RESULTS_PATH, TOKEN
+
+router = APIRouter(prefix="/api", tags=["submission"])
+
+ALL_SUBJECTIVE_FIELDS = [
+    "readability",
+    "relevance",
+    "explanation_clarity",
+    "problem_identification",
+    "actionability",
+    "completeness",
+    "specificity",
+    "contextual_adequacy",
+    "consistency",
+    "brevity",
+]
+
+
+class ResultPayload(BaseModel):
+    model: str = Field(..., description="Model id on the Hub (e.g. org/model)")
+    revision: str = Field("main", description="Commit sha or branch (default: main)")
+    bleu: float = Field(..., ge=0, description="BLEU score (0-100)")
+
+    # 10 subjective metrics 0-5
+    readability: int = Field(..., ge=0, le=5)
+    relevance: int = Field(..., ge=0, le=5)
+    explanation_clarity: int = Field(..., ge=0, le=5)
+    problem_identification: int = Field(..., ge=0, le=5)
+    actionability: int = Field(..., ge=0, le=5)
+    completeness: int = Field(..., ge=0, le=5)
+    specificity: int = Field(..., ge=0, le=5)
+    contextual_adequacy: int = Field(..., ge=0, le=5)
+    consistency: int = Field(..., ge=0, le=5)
+    brevity: int = Field(..., ge=0, le=5)
+
+    pass_at_1: float = Field(..., ge=0, le=1)
+    pass_at_5: float = Field(..., ge=0, le=1)
+    pass_at_10: float = Field(..., ge=0, le=1)
+
+    @validator("pass_at_5")
+    def _p5_ge_p1(cls, v, values):
+        if "pass_at_1" in values and v < values["pass_at_1"]:
+            raise ValueError("pass@5 must be >= pass@1")
+        return v
+
+    @validator("pass_at_10")
+    def _p10_ge_p5(cls, v, values):
+        if "pass_at_5" in values and v < values["pass_at_5"]:
+            raise ValueError("pass@10 must be >= pass@5")
+        return v
+
+    def multimetric(self) -> float:
+        total = sum(getattr(self, f) for f in ALL_SUBJECTIVE_FIELDS)
+        return float(total) / len(ALL_SUBJECTIVE_FIELDS)
+
+
+@router.post("/submit", status_code=200)
+async def submit_results(payload: ResultPayload):
+    """Accept new evaluation results and push them to the results dataset."""
+
+    # Prepare JSON in expected format (compatible with read_evals.py)
+    results_dict = {
+        "config": {
+            "model_dtype": "unknown",
+            "model_name": payload.model,
+            "model_sha": payload.revision,
+        },
+        "results": {},
+    }
+
+    # Primary metrics
+    results_dict["results"]["bleu"] = {"score": payload.bleu}
+    results_dict["results"]["multimetric"] = {"score": payload.multimetric()}
+
+    # Subjective metrics
+    for field in ALL_SUBJECTIVE_FIELDS:
+        results_dict["results"][field] = {"score": getattr(payload, field)}
+
+    # Pass@k metrics
+    results_dict["results"]["pass_at_1"] = {"score": payload.pass_at_1}
+    results_dict["results"]["pass_at_5"] = {"score": payload.pass_at_5}
+    results_dict["results"]["pass_at_10"] = {"score": payload.pass_at_10}
+
+    # File handling
+    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
+    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    unique_id = uuid.uuid4().hex[:8]
+    filename = f"results_{payload.model.replace('/', '_')}_{ts}_{unique_id}.json"
+    local_path = os.path.join(EVAL_RESULTS_PATH, filename)
+
+    with open(local_path, "w") as fp:
+        json.dump(results_dict, fp)
+
+    try:
+        API.upload_file(
+            path_or_fileobj=local_path,
+            path_in_repo=filename,
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add results for {payload.model}",
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Failed to upload results: {e}")
+    finally:
+        if os.path.exists(local_path):
+            os.remove(local_path)
+
+    return {"status": "ok", "detail": "Results submitted."}
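For reference, this is how a client might call the new endpoint once the router is mounted under `/api` in app.py; the Space URL is a placeholder and the scores are invented:

```python
# Hypothetical client call to the new /api/submit endpoint; URL and values are placeholders.
import requests

payload = {
    "model": "org/model",
    "revision": "main",
    "bleu": 27.4,
    # ten subjective metrics, integers in [0, 5]
    "readability": 4, "relevance": 4, "explanation_clarity": 3,
    "problem_identification": 4, "actionability": 3, "completeness": 4,
    "specificity": 3, "contextual_adequacy": 4, "consistency": 4, "brevity": 3,
    # pass@k must be non-decreasing in k (enforced by the validators above)
    "pass_at_1": 0.31, "pass_at_5": 0.52, "pass_at_10": 0.61,
}

resp = requests.post("https://<your-space>.hf.space/api/submit", json=payload, timeout=30)
resp.raise_for_status()
print(resp.json())  # expected: {"status": "ok", "detail": "Results submitted."}
```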
src/display/utils.py
CHANGED
@@ -25,10 +25,16 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number",
+# Average kept but not displayed by default
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", False)])
+
+# Determine which metrics are visible by default
+_DEFAULT_VISIBLE = {"bleu", "multimetric", "pass_at_1", "pass_at_5", "pass_at_10"}
+
 for task in Tasks:
-
+    show = task.name in _DEFAULT_VISIBLE
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", show)])
+
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
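With this change, only the Task columns listed in `_DEFAULT_VISIBLE` are shown by default, while the individual subjective dimensions stay available behind the column selector. A quick check of what the visibility flag evaluates to, assuming the Tasks enum defined in src/about.py above:

```python
# Assumes the Tasks enum added to src/about.py in this commit.
from src.about import Tasks

_DEFAULT_VISIBLE = {"bleu", "multimetric", "pass_at_1", "pass_at_5", "pass_at_10"}
for task in Tasks:
    print(task.name, task.name in _DEFAULT_VISIBLE)
# bleu, multimetric and pass_at_* print True; the ten subjective metrics print False
```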
src/leaderboard/read_evals.py
CHANGED
@@ -76,7 +76,10 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
 
-
+            if task.metric == "score":
+                mean_acc = float(np.mean(accs))
+            else:
+                mean_acc = float(np.mean(accs) * 100.0)
             results[task.benchmark] = mean_acc
 
         return self(
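In other words, metrics declared with the "score" key are averaged and stored as-is, while any other metric keeps the original template behaviour of being scaled to a percentage. A tiny sketch of the two branches with example values:

```python
import numpy as np

accs = np.array([0.5, 0.7])          # example per-benchmark values
print(float(np.mean(accs)))          # "score" metrics: 0.6, stored unchanged
print(float(np.mean(accs) * 100.0))  # other metrics: 60.0, stored as a percentage
```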
src/populate.py
CHANGED
@@ -14,7 +14,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-
+    # Sort primarily by LLM exact-match Pass@1 metric
+    sort_col = AutoEvalColumn.pass_at_1.name if hasattr(AutoEvalColumn, "pass_at_1") else AutoEvalColumn.average.name
+    df = df.sort_values(by=[sort_col], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced