import gradio as gr
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import json
import os
from datetime import datetime, timezone
import time
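
# Dependencies (assumed installed): gradio, pandas, torch, transformers.
# Loading models with device_map="auto" below also requires the `accelerate` package.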

# --- Configuration ---
QA_FILE = "qa.txt"
RESULTS_FILE = "Eval_results.jsonl"
JUDGE_MODEL_REPO = "google/flan-t5-base" # A capable but relatively small model for judging

# --- Setup: Ensure files exist ---
if not os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, "w") as f:
        pass # Create an empty file if it doesn't exist

if not os.path.exists(QA_FILE):
    # Create a dummy qa.txt if it's missing, with a few example questions
    dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary
1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times."
2,Common Chat,"What is the capital of France?","The answer must be Paris."
3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection."
"""
    with open(QA_FILE, "w") as f:
        f.write(dummy_data)


# --- AI Judge Logic ---
def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer):
    """
    Uses the AI Judge model to give a verdict on the tested model's answer.
    """
    system_instruction = f"""
You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer.

A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary.
A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete.

---
User Question:
{question}

Expected Golden Answer Summary:
{golden_summary}

---
AI Model's Answer:
{ai_answer}
---

Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'.
"""
    try:
        response = judge_pipeline(system_instruction, max_new_tokens=5)
        # Extract the generated text and clean it up
        verdict = response[0]['generated_text'].strip()
        # Ensure the verdict is either '1' or '0'
        if '1' in verdict:
            return 1
        else:
            return 0
    except Exception:
        # If the judge fails for any reason, default to a failing grade
        return 0

# --- Core Evaluation Logic ---
def run_evaluation(model_repo, model_nickname, progress=gr.Progress()):
    """
    Loads a user-specified model, runs it against the benchmark, evaluates the answers
    using an AI judge, and saves the results.
    """
    if not model_repo or not model_nickname:
        gr.Warning("Model Repository and Nickname cannot be empty.")
        return pd.DataFrame(), None

    # Load benchmark questions
    try:
        questions_df = pd.read_csv(QA_FILE)
        # Use a small subset for quick demos if needed
        # questions_df = questions_df.head(3)
    except Exception as e:
        # gr.Error must be raised (not just constructed) to surface in the UI
        raise gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}")

    # --- Load Models ---
    progress(0, desc="Loading AI Judge Model...")
    try:
        judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16)
    except Exception as e:
        raise gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}")

    progress(0.1, desc=f"Loading test model: {model_repo}")
    try:
        model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo)
        model_to_test = AutoModelForCausalLM.from_pretrained(
            model_repo,
            device_map="auto",
            torch_dtype=torch.bfloat16 # bfloat16 is good for ZeroGPU
        )
        test_pipeline = pipeline(
            "text-generation",
            model=model_to_test,
            tokenizer=model_to_test_tokenizer,
            max_new_tokens=1024, # Set a reasonable limit for code generation
            do_sample=True,
            temperature=0.7,
            top_p=0.95
        )
    except Exception as e:
        raise gr.Error(f"Failed to load the specified test model '{model_repo}': {e}")
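
    # Note: the test pipeline above uses do_sample=True with a non-zero temperature,
    # so generation is stochastic and scores for the same model can vary between runs.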

    # --- Run Benchmark Loop ---
    detailed_results = []
    total_score = 0
    total_questions = len(questions_df)

    for i, row in enumerate(questions_df.itertuples()):
        progress_val = 0.1 + (0.8 * (i / total_questions))
        progress(progress_val, desc=f"Running question {i + 1}/{total_questions} (ID {row.ID})")

        # Generate answer from the model being tested
        try:
            prompt = f"Question: {row.Question}\n\nAnswer:"
            response = test_pipeline(prompt)
            ai_answer = response[0]['generated_text'].replace(prompt, "").strip()
        except Exception as e:
            ai_answer = f"Error during generation: {e}"

        # Get verdict from the AI Judge
        score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer)
        total_score += score

        detailed_results.append({
            "ID": row.ID,
            "Question": row.Question,
            "AI_Answer": ai_answer,
            "Score": score
        })
        time.sleep(0.1) # Small delay to allow UI to update

    # --- Finalize and Save Results ---
    progress(0.95, desc="Finalizing and saving...")
    final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0

    run_summary = {
        "model_nickname": model_nickname,
        "model_repo": model_repo,
        "score_percent": round(final_score_percent, 2),
        "timestamp": datetime.utcnow().isoformat(),
        "detailed_results": detailed_results
    }

    try:
        with open(RESULTS_FILE, "a") as f:
            f.write(json.dumps(run_summary) + "\n")
    except Exception as e:
        gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}")

    progress(1, desc="Evaluation Complete!")
    return pd.DataFrame(detailed_results), gr.Markdown(f"**Overall Score: {final_score_percent:.2f}%**")


# --- Leaderboard Logic ---
def load_leaderboard():
    """
    Loads and displays the leaderboard from the results file.
    """
    if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    results_data = []
    with open(RESULTS_FILE, "r") as f:
        for line in f:
            try:
                data = json.loads(line)
                results_data.append({
                    "Model Nickname": data.get("model_nickname"),
                    "Score (%)": data.get("score_percent"),
                    "Model Repo": data.get("model_repo"),
                    "Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S')
                })
            except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                # Skip corrupted or malformed lines
                continue
    
    if not results_data:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    leaderboard_df = pd.DataFrame(results_data)
    leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)
    leaderboard_df["Rank"] = leaderboard_df.index + 1
    
    # Reorder columns for display
    leaderboard_df = leaderboard_df[["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]]
    return leaderboard_df


# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo:
    gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark")

    with gr.Tabs():
        with gr.TabItem("Run Evaluation"):
            with gr.Row():
                with gr.Column(scale=2):
                    model_repo_input = gr.Textbox(
                        label="Hugging Face Model Repository",
                        placeholder="e.g., google/gemma-2b-it",
                        info="The model to be tested. Must be compatible with the text-generation pipeline."
                    )
                    model_nickname_input = gr.Textbox(
                        label="Model Nickname",
                        placeholder="e.g., Gemma-2B-v1",
                        info="A unique name to display on the leaderboard."
                    )
                    run_button = gr.Button("Start Evaluation", variant="primary")
                with gr.Column(scale=1):
                    final_score_output = gr.Markdown("**Overall Score: --**")

            gr.Markdown("---")
            gr.Markdown("### Detailed Run Results")
            results_output = gr.DataFrame(
                headers=["ID", "Question", "AI_Answer", "Score"],
                wrap=True,
                height=600
            )

        with gr.TabItem("Leaderboard"):
            leaderboard_refresh_button = gr.Button("Refresh Leaderboard")
            leaderboard_output = gr.DataFrame(
                headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"],
                wrap=True,
                height=700
            )

    # --- Event Handlers ---
    run_button.click(
        fn=run_evaluation,
        inputs=[model_repo_input, model_nickname_input],
        outputs=[results_output, final_score_output]
    )

    leaderboard_refresh_button.click(
        fn=load_leaderboard,
        inputs=[],
        outputs=[leaderboard_output]
    )
    
    # Load leaderboard once on startup
    demo.load(load_leaderboard, None, leaderboard_output)


if __name__ == "__main__":
    demo.launch(debug=True)