import gradio as gr
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import json
import os
from datetime import datetime, timezone
import time
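
# Dependencies (assumed installed): gradio, pandas, torch, transformers.
# Loading models with device_map="auto" below also requires the `accelerate` package.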

# --- Configuration ---
QA_FILE = "qa.txt"
RESULTS_FILE = "Eval_results.jsonl"
JUDGE_MODEL_REPO = "google/flan-t5-base" # A capable but relatively small model for judging

# --- Setup: Ensure files exist ---
if not os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, "w") as f:
        pass # Create an empty file if it doesn't exist

if not os.path.exists(QA_FILE):
    # Create a dummy qa.txt if it's missing, with a few example questions
    dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary
1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times."
2,Common Chat,"What is the capital of France?","The answer must be Paris."
3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection."
"""
    with open(QA_FILE, "w") as f:
        f.write(dummy_data)


# --- AI Judge Logic ---
def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer):
    """
    Uses the AI Judge model to give a verdict on the tested model's answer.
    """
    system_instruction = f"""
You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer.

A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary.
A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete.

---
User Question:
{question}

Expected Golden Answer Summary:
{golden_summary}

---
AI Model's Answer:
{ai_answer}
---

Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'.
"""
    try:
        response = judge_pipeline(system_instruction, max_new_tokens=5)
        # Extract the generated text and clean it up
        verdict = response[0]['generated_text'].strip()
        # Ensure the verdict is either '1' or '0'
        if '1' in verdict:
            return 1
        else:
            return 0
    except Exception:
        # If the judge fails for any reason, default to a failing grade
        return 0

# --- Core Evaluation Logic ---
def run_evaluation(model_repo, model_nickname, progress=gr.Progress()):
    """
    Loads a user-specified model, runs it against the benchmark, evaluates the answers
    using an AI judge, and saves the results.
    """
    if not model_repo or not model_nickname:
        gr.Warning("Model Repository and Nickname cannot be empty.")
        return pd.DataFrame(), None

    # Load benchmark questions
    try:
        questions_df = pd.read_csv(QA_FILE)
        # Use a small subset for quick demos if needed
        # questions_df = questions_df.head(3)
    except Exception as e:
        # gr.Error must be raised (not just constructed) to surface in the UI
        raise gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}")

    # --- Load Models ---
    progress(0, desc="Loading AI Judge Model...")
    try:
        judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16)
    except Exception as e:
        raise gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}")

    progress(0.1, desc=f"Loading test model: {model_repo}")
    try:
        model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo)
        model_to_test = AutoModelForCausalLM.from_pretrained(
            model_repo,
            device_map="auto",
            torch_dtype=torch.bfloat16 # bfloat16 is good for ZeroGPU
        )
        test_pipeline = pipeline(
            "text-generation",
            model=model_to_test,
            tokenizer=model_to_test_tokenizer,
            max_new_tokens=1024, # Set a reasonable limit for code generation
            do_sample=True,
            temperature=0.7,
            top_p=0.95
        )
    except Exception as e:
        raise gr.Error(f"Failed to load the specified test model '{model_repo}': {e}")
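
    # Note: the test pipeline above uses do_sample=True with a non-zero temperature,
    # so generation is stochastic and scores for the same model can vary between runs.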

    # --- Run Benchmark Loop ---
    detailed_results = []
    total_score = 0
    total_questions = len(questions_df)

    for i, row in enumerate(questions_df.itertuples()):
        progress_val = 0.1 + (0.8 * (i / total_questions))
        progress(progress_val, desc=f"Running question {i + 1}/{total_questions} (ID {row.ID})")

        # Generate answer from the model being tested
        try:
            prompt = f"Question: {row.Question}\n\nAnswer:"
            response = test_pipeline(prompt)
            ai_answer = response[0]['generated_text'].replace(prompt, "").strip()
        except Exception as e:
            ai_answer = f"Error during generation: {e}"

        # Get verdict from the AI Judge
        score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer)
        total_score += score

        detailed_results.append({
            "ID": row.ID,
            "Question": row.Question,
            "AI_Answer": ai_answer,
            "Score": score
        })
        time.sleep(0.1) # Small delay to allow UI to update

    # --- Finalize and Save Results ---
    progress(0.95, desc="Finalizing and saving...")
    final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0

    run_summary = {
        "model_nickname": model_nickname,
        "model_repo": model_repo,
        "score_percent": round(final_score_percent, 2),
        "timestamp": datetime.utcnow().isoformat(),
        "detailed_results": detailed_results
    }

    try:
        with open(RESULTS_FILE, "a") as f:
            f.write(json.dumps(run_summary) + "\n")
    except Exception as e:
        gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}")

    progress(1, desc="Evaluation Complete!")
    return pd.DataFrame(detailed_results), gr.Markdown(f"**Overall Score: {final_score_percent:.2f}%**")


# --- Leaderboard Logic ---
def load_leaderboard():
    """
    Loads and displays the leaderboard from the results file.
    """
    if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    results_data = []
    with open(RESULTS_FILE, "r") as f:
        for line in f:
            try:
                data = json.loads(line)
                results_data.append({
                    "Model Nickname": data.get("model_nickname"),
                    "Score (%)": data.get("score_percent"),
                    "Model Repo": data.get("model_repo"),
                    "Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S')
                })
            except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                # Skip corrupted or malformed lines
                continue
    
    if not results_data:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    leaderboard_df = pd.DataFrame(results_data)
    leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)
    leaderboard_df["Rank"] = leaderboard_df.index + 1
    
    # Reorder columns for display
    leaderboard_df = leaderboard_df[["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]]
    return leaderboard_df


# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo:
    gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark")

    with gr.Tabs():
        with gr.TabItem("Run Evaluation"):
            with gr.Row():
                with gr.Column(scale=2):
                    model_repo_input = gr.Textbox(
                        label="Hugging Face Model Repository",
                        placeholder="e.g., google/gemma-2b-it",
                        info="The model to be tested. Must be compatible with the text-generation pipeline."
                    )
                    model_nickname_input = gr.Textbox(
                        label="Model Nickname",
                        placeholder="e.g., Gemma-2B-v1",
                        info="A unique name to display on the leaderboard."
                    )
                    run_button = gr.Button("Start Evaluation", variant="primary")
                with gr.Column(scale=1):
                    final_score_output = gr.Markdown("**Overall Score: --**")

            gr.Markdown("---")
            gr.Markdown("### Detailed Run Results")
            results_output = gr.DataFrame(
                headers=["ID", "Question", "AI_Answer", "Score"],
                wrap=True,
                height=600
            )

        with gr.TabItem("Leaderboard"):
            leaderboard_refresh_button = gr.Button("Refresh Leaderboard")
            leaderboard_output = gr.DataFrame(
                headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"],
                wrap=True,
                height=700
            )

    # --- Event Handlers ---
    run_button.click(
        fn=run_evaluation,
        inputs=[model_repo_input, model_nickname_input],
        outputs=[results_output, final_score_output]
    )

    leaderboard_refresh_button.click(
        fn=load_leaderboard,
        inputs=[],
        outputs=[leaderboard_output]
    )
    
    # Load leaderboard once on startup
    demo.load(load_leaderboard, None, leaderboard_output)


if __name__ == "__main__":
    demo.launch(debug=True)