import gradio as gr import pandas as pd import torch from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM import json import os from datetime import datetime import time # --- Configuration --- QA_FILE = "qa.txt" RESULTS_FILE = "Eval_results.jsonl" JUDGE_MODEL_REPO = "google/flan-t5-base" # A capable but relatively small model for judging # --- Setup: Ensure files exist --- if not os.path.exists(RESULTS_FILE): with open(RESULTS_FILE, "w") as f: pass # Create an empty file if it doesn't exist if not os.path.exists(QA_FILE): # Create a dummy qa.txt if it's missing, with a few example questions dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary 1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times." 2,Common Chat,"What is the capital of France?","The answer must be Paris." 3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection." """ with open(QA_FILE, "w") as f: f.write(dummy_data) # --- AI Judge Logic --- def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer): """ Uses the AI Judge model to give a verdict on the tested model's answer. """ system_instruction = f""" You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer. A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary. A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete. --- User Question: {question} Expected Golden Answer Summary: {golden_summary} --- AI Model's Answer: {ai_answer} --- Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'. """ try: response = judge_pipeline(system_instruction, max_new_tokens=5) # Extract the generated text and clean it up verdict = response[0]['generated_text'].strip() # Ensure the verdict is either '1' or '0' if '1' in verdict: return 1 else: return 0 except Exception: # If the judge fails for any reason, default to a failing grade return 0 # --- Core Evaluation Logic --- def run_evaluation(model_repo, model_nickname, progress=gr.Progress()): """ Loads a user-specified model, runs it against the benchmark, evaluates the answers using an AI judge, and saves the results. """ if not model_repo or not model_nickname: gr.Warning("Model Repository and Nickname cannot be empty.") return pd.DataFrame(), None # Load benchmark questions try: questions_df = pd.read_csv(QA_FILE) # Use a small subset for quick demos if needed # questions_df = questions_df.head(3) except Exception as e: gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}") return pd.DataFrame(), None # --- Load Models --- progress(0, desc="Loading AI Judge Model...") try: judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16) except Exception as e: gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}") return pd.DataFrame(), None progress(0.1, desc=f"Loading test model: {model_repo}") try: model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo) model_to_test = AutoModelForCausalLM.from_pretrained( model_repo, device_map="auto", torch_dtype=torch.bfloat16 # bfloat16 is good for ZeroGPU ) test_pipeline = pipeline( "text-generation", model=model_to_test, tokenizer=model_to_test_tokenizer, max_new_tokens=1024, # Set a reasonable limit for code generation do_sample=True, temperature=0.7, top_p=0.95 ) except Exception as e: gr.Error(f"Failed to load the specified test model '{model_repo}': {e}") return pd.DataFrame(), None # --- Run Benchmark Loop --- detailed_results = [] total_score = 0 total_questions = len(questions_df) for i, row in enumerate(questions_df.itertuples()): progress_val = 0.1 + (0.8 * (i / total_questions)) progress(progress_val, desc=f"Running Q{row.ID}/{total_questions}") # Generate answer from the model being tested try: prompt = f"Question: {row.Question}\n\nAnswer:" response = test_pipeline(prompt) ai_answer = response[0]['generated_text'].replace(prompt, "").strip() except Exception as e: ai_answer = f"Error during generation: {e}" # Get verdict from the AI Judge score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer) total_score += score detailed_results.append({ "ID": row.ID, "Question": row.Question, "AI_Answer": ai_answer, "Score": score }) time.sleep(0.1) # Small delay to allow UI to update # --- Finalize and Save Results --- progress(0.95, desc="Finalizing and saving...") final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0 run_summary = { "model_nickname": model_nickname, "model_repo": model_repo, "score_percent": round(final_score_percent, 2), "timestamp": datetime.utcnow().isoformat(), "detailed_results": detailed_results } try: with open(RESULTS_FILE, "a") as f: f.write(json.dumps(run_summary) + "\n") except Exception as e: gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}") progress(1, desc="Evaluation Complete!") return pd.DataFrame(detailed_results), gr.Markdown(f"**Overall Score: {final_score_percent:.2f}%**") # --- Leaderboard Logic --- def load_leaderboard(): """ Loads and displays the leaderboard from the results file. """ if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0: return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"]) results_data = [] with open(RESULTS_FILE, "r") as f: for line in f: try: data = json.loads(line) results_data.append({ "Model Nickname": data.get("model_nickname"), "Score (%)": data.get("score_percent"), "Model Repo": data.get("model_repo"), "Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S') }) except (json.JSONDecodeError, KeyError): # Skip corrupted or malformed lines continue if not results_data: return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"]) leaderboard_df = pd.DataFrame(results_data) leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True) leaderboard_df["Rank"] = leaderboard_df.index + 1 # Reorder columns for display leaderboard_df = leaderboard_df[["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]] return leaderboard_df # --- Gradio UI --- with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo: gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark") with gr.Tabs(): with gr.TabItem("Run Evaluation"): with gr.Row(): with gr.Column(scale=2): model_repo_input = gr.Textbox( label="Hugging Face Model Repository", placeholder="e.g., google/gemma-2b-it", info="The model to be tested. Must be compatible with the text-generation pipeline." ) model_nickname_input = gr.Textbox( label="Model Nickname", placeholder="e.g., Gemma-2B-v1", info="A unique name to display on the leaderboard." ) run_button = gr.Button("Start Evaluation", variant="primary") with gr.Column(scale=1): final_score_output = gr.Markdown("**Overall Score: --**") gr.Markdown("---") gr.Markdown("### Detailed Run Results") results_output = gr.DataFrame( headers=["ID", "Question", "AI_Answer", "Score"], wrap=True, height=600 ) with gr.TabItem("Leaderboard"): leaderboard_refresh_button = gr.Button("Refresh Leaderboard") leaderboard_output = gr.DataFrame( headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"], wrap=True, height=700 ) # --- Event Handlers --- run_button.click( fn=run_evaluation, inputs=[model_repo_input, model_nickname_input], outputs=[results_output, final_score_output] ) leaderboard_refresh_button.click( fn=load_leaderboard, inputs=[], outputs=[leaderboard_output] ) # Load leaderboard once on startup demo.load(load_leaderboard, None, leaderboard_output) if __name__ == "__main__": demo.launch(debug=True)