import gradio as gr
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import json
import os
from datetime import datetime
import time
# --- Configuration ---
QA_FILE = "qa.txt"
RESULTS_FILE = "Eval_results.jsonl"
JUDGE_MODEL_REPO = "google/flan-t5-base" # A capable but relatively small model for judging
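# NOTE: flan-t5 is an encoder-decoder (seq2seq) model, which is why the judge is loaded
# with the "text2text-generation" pipeline below, while the model under test is loaded
# with the causal "text-generation" pipeline.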
# --- Setup: Ensure files exist ---
if not os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, "w") as f:
        pass  # Create an empty file if it doesn't exist

if not os.path.exists(QA_FILE):
    # Create a dummy qa.txt if it's missing, with a few example questions
    dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary
1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times."
2,Common Chat,"What is the capital of France?","The answer must be Paris."
3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection."
"""
    with open(QA_FILE, "w") as f:
        f.write(dummy_data)

# --- AI Judge Logic ---
def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer):
    """
    Uses the AI Judge model to give a verdict on the tested model's answer.
    """
    system_instruction = f"""
You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer.
A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary.
A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete.
---
User Question:
{question}
Expected Golden Answer Summary:
{golden_summary}
---
AI Model's Answer:
{ai_answer}
---
Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'.
"""
    try:
        response = judge_pipeline(system_instruction, max_new_tokens=5)
        # Extract the generated text and clean it up
        verdict = response[0]['generated_text'].strip()
        # Ensure the verdict is mapped to either 1 or 0
        if '1' in verdict:
            return 1
        else:
            return 0
    except Exception:
        # If the judge fails for any reason, default to a failing grade
        return 0
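# Illustrative example (not executed): a text2text-generation pipeline returns a list of
# dicts such as [{'generated_text': '1'}], so get_ai_judge_verdict() maps any output
# containing '1' to the integer score 1, and everything else to 0.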

# --- Core Evaluation Logic ---
def run_evaluation(model_repo, model_nickname, progress=gr.Progress()):
    """
    Loads a user-specified model, runs it against the benchmark, evaluates the answers
    using an AI judge, and saves the results.
    """
    if not model_repo or not model_nickname:
        gr.Warning("Model Repository and Nickname cannot be empty.")
        return pd.DataFrame(), None

    # Load benchmark questions
    try:
        questions_df = pd.read_csv(QA_FILE)
        # Use a small subset for quick demos if needed
        # questions_df = questions_df.head(3)
    except Exception as e:
        # gr.Error must be raised (not just called) for Gradio to surface it in the UI
        raise gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}")
    # --- Load Models ---
    progress(0, desc="Loading AI Judge Model...")
    try:
        judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16)
    except Exception as e:
        raise gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}")

    progress(0.1, desc=f"Loading test model: {model_repo}")
    try:
        model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo)
        model_to_test = AutoModelForCausalLM.from_pretrained(
            model_repo,
            device_map="auto",
            torch_dtype=torch.bfloat16  # bfloat16 is good for ZeroGPU
        )
        test_pipeline = pipeline(
            "text-generation",
            model=model_to_test,
            tokenizer=model_to_test_tokenizer,
            max_new_tokens=1024,  # Set a reasonable limit for code generation
            do_sample=True,
            temperature=0.7,
            top_p=0.95
        )
    except Exception as e:
        raise gr.Error(f"Failed to load the specified test model '{model_repo}': {e}")
    # --- Run Benchmark Loop ---
    detailed_results = []
    total_score = 0
    total_questions = len(questions_df)

    for i, row in enumerate(questions_df.itertuples()):
        progress_val = 0.1 + (0.8 * (i / total_questions))
        progress(progress_val, desc=f"Running Q{row.ID}/{total_questions}")

        # Generate answer from the model being tested
        try:
            prompt = f"Question: {row.Question}\n\nAnswer:"
            response = test_pipeline(prompt)
            ai_answer = response[0]['generated_text'].replace(prompt, "").strip()
        except Exception as e:
            ai_answer = f"Error during generation: {e}"

        # Get verdict from the AI Judge
        score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer)
        total_score += score

        detailed_results.append({
            "ID": row.ID,
            "Question": row.Question,
            "AI_Answer": ai_answer,
            "Score": score
        })
        time.sleep(0.1)  # Small delay to allow UI to update

    # --- Finalize and Save Results ---
    progress(0.95, desc="Finalizing and saving...")
    final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0
    run_summary = {
        "model_nickname": model_nickname,
        "model_repo": model_repo,
        "score_percent": round(final_score_percent, 2),
        "timestamp": datetime.utcnow().isoformat(),
        "detailed_results": detailed_results
    }
    try:
        with open(RESULTS_FILE, "a") as f:
            f.write(json.dumps(run_summary) + "\n")
    except Exception as e:
        gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}")

    progress(1, desc="Evaluation Complete!")
    return pd.DataFrame(detailed_results), gr.Markdown(f"**Overall Score: {final_score_percent:.2f}%**")
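# Each evaluation run appends one JSON object per line to the results file, roughly:
# {"model_nickname": "Gemma-2B-v1", "model_repo": "google/gemma-2b-it",
#  "score_percent": 66.67, "timestamp": "2024-01-01T12:00:00",
#  "detailed_results": [{"ID": 1, "Question": "...", "AI_Answer": "...", "Score": 1}, ...]}
# (field names match run_summary above; the values shown here are illustrative)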

# --- Leaderboard Logic ---
def load_leaderboard():
    """
    Loads and displays the leaderboard from the results file.
    """
    if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    results_data = []
    with open(RESULTS_FILE, "r") as f:
        for line in f:
            try:
                data = json.loads(line)
                results_data.append({
                    "Model Nickname": data.get("model_nickname"),
                    "Score (%)": data.get("score_percent"),
                    "Model Repo": data.get("model_repo"),
                    "Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S')
                })
            except (json.JSONDecodeError, KeyError):
                # Skip corrupted or malformed lines
                continue

    if not results_data:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    leaderboard_df = pd.DataFrame(results_data)
    leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)
    leaderboard_df["Rank"] = leaderboard_df.index + 1
    # Reorder columns for display
    leaderboard_df = leaderboard_df[["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]]
    return leaderboard_df
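# Because run_evaluation() appends to the results file and no de-duplication is done here,
# repeated runs of the same model each appear as a separate leaderboard row.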

# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo:
    gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark")

    with gr.Tabs():
        with gr.TabItem("Run Evaluation"):
            with gr.Row():
                with gr.Column(scale=2):
                    model_repo_input = gr.Textbox(
                        label="Hugging Face Model Repository",
                        placeholder="e.g., google/gemma-2b-it",
                        info="The model to be tested. Must be compatible with the text-generation pipeline."
                    )
                    model_nickname_input = gr.Textbox(
                        label="Model Nickname",
                        placeholder="e.g., Gemma-2B-v1",
                        info="A unique name to display on the leaderboard."
                    )
                    run_button = gr.Button("Start Evaluation", variant="primary")
                with gr.Column(scale=1):
                    final_score_output = gr.Markdown("**Overall Score: --**")
            gr.Markdown("---")
            gr.Markdown("### Detailed Run Results")
            results_output = gr.DataFrame(
                headers=["ID", "Question", "AI_Answer", "Score"],
                wrap=True,
                height=600
            )
        with gr.TabItem("Leaderboard"):
            leaderboard_refresh_button = gr.Button("Refresh Leaderboard")
            leaderboard_output = gr.DataFrame(
                headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"],
                wrap=True,
                height=700
            )

    # --- Event Handlers ---
    run_button.click(
        fn=run_evaluation,
        inputs=[model_repo_input, model_nickname_input],
        outputs=[results_output, final_score_output]
    )
    leaderboard_refresh_button.click(
        fn=load_leaderboard,
        inputs=[],
        outputs=[leaderboard_output]
    )
    # Load leaderboard once on startup
    demo.load(load_leaderboard, None, leaderboard_output)

if __name__ == "__main__":
    demo.launch(debug=True)
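# Running outside of Spaces (a sketch, assuming a GPU-capable environment and that this
# file is saved as app.py): the script needs roughly
#   pip install gradio pandas torch transformers accelerate
# (accelerate is required for device_map="auto"; some tokenizers may also need
# sentencepiece), then launch with `python app.py`.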