Enderchef committed on
Commit aae1544 · verified · 1 Parent(s): 678a72a

Update app.py

Files changed (1)
  1. app.py +175 -308
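The added lines below rework run_evaluation from a function that returns a single dictionary of component updates into a generator that yields a dictionary of gr.update(...) values after each step, driving a custom HTML progress bar (make_progress_html, progress_box) in place of gr.Progress. As a rough illustration of that Gradio pattern, here is a minimal sketch; the component names and the long_task handler are invented for the example and do not appear in app.py.

import time
import gradio as gr

with gr.Blocks() as demo:
    start = gr.Button("Start")
    progress_md = gr.Markdown(visible=False)  # stands in for the app's progress panel
    result_md = gr.Markdown(visible=False)    # stands in for the app's result panel

    def long_task():
        # Each yield is pushed to the browser immediately; the dict keys must be
        # components listed in `outputs` below.
        yield {progress_md: gr.update(value="Working... 0%", visible=True),
               result_md: gr.update(visible=False)}
        for pct in (25, 50, 75, 100):
            time.sleep(0.5)  # stand-in for evaluating one batch of samples
            yield {progress_md: gr.update(value=f"Working... {pct}%")}
        yield {progress_md: gr.update(visible=False),
               result_md: gr.update(value="Done.", visible=True)}

    start.click(fn=long_task, outputs=[progress_md, result_md])

if __name__ == "__main__":
    demo.launch()

The new run_evaluation follows the same shape, with the evaluation loop in place of time.sleep and a gr.HTML component rendered by make_progress_html as the progress bar.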
app.py CHANGED
@@ -14,7 +14,7 @@ from datetime import datetime
14
  # It's good practice to ensure the cache directory exists.
15
  CACHE_DIR = "evaluation_cache"
16
  os.makedirs(CACHE_DIR, exist_ok=True)
17
- EVAL_FILE = os.path.join(CACHE_DIR, "eval.jsonl")
18
 
19
  # Cache to avoid reloading models and dataset configs
20
  model_cache = {}
@@ -25,14 +25,12 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
25
 
26
  # --- Constants for Benchmarks ---
27
  MMLU_DATASET = "cais/mmlu"
28
- # Temporarily remove MMLU-Pro references
29
- # MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
30
  BENCHMARK_MAP = {
31
  "MMLU": MMLU_DATASET,
32
- # "MMLU-Pro": MMLU_PRO_DATASET # Temporarily removed
33
  }
34
 
35
  # --- Data Loading and Preparation ---
 
36
  def get_all_benchmark_options():
37
  """
38
  Fetches and caches the available subjects (configs) for each benchmark dataset.
@@ -41,16 +39,13 @@ def get_all_benchmark_options():
41
  if benchmark_subject_cache:
42
  return benchmark_subject_cache
43
  print("Fetching benchmark configurations for the first time...")
44
-
45
- # Only iterate over the allowed benchmarks (MMLU)
46
  for key, dataset_id in BENCHMARK_MAP.items():
47
  try:
48
- # Fetching dataset configurations requires authentication if the dataset is private
49
  subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
50
- benchmark_subject_cache[key] = ["ALL"] + sorted([s for s in subjects if s != 'all']) # Sort subjects
51
  except Exception as e:
52
  print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
53
- benchmark_subject_cache[key] = ["ALL"] # Provide a default
54
  print("Benchmark configurations cached.")
55
  return benchmark_subject_cache
56
 
@@ -65,39 +60,34 @@ def load_model(model_id):
65
  """
66
  if not model_id:
67
  raise ValueError("Model ID cannot be empty.")
68
- gr.Info(f"Attempting to load model: {model_id}...")
69
  if model_id in model_cache:
70
  gr.Info(f"Model '{model_id}' found in cache.")
71
  return model_cache[model_id]
72
  try:
73
- # Use bfloat16 for better performance on modern GPUs
74
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
75
-
76
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
77
  model = AutoModelForCausalLM.from_pretrained(
78
  model_id,
79
  token=HF_TOKEN,
80
  torch_dtype=dtype,
81
  trust_remote_code=True,
82
- low_cpu_mem_usage=True, # Optimization for large models
83
  ).to("cuda" if torch.cuda.is_available() else "cpu")
84
-
85
- # Create the pipeline for text generation
86
  generator = pipeline(
87
- "text-generation",
88
- model=model,
89
- tokenizer=tokenizer,
90
  device=0 if torch.cuda.is_available() else -1
91
  )
92
-
93
  model_cache[model_id] = generator
94
  gr.Info(f"Model '{model_id}' loaded successfully.")
95
  return generator
96
  except Exception as e:
97
- # Raise a more specific error to be caught by the main evaluation function
98
- raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}")
99
 
100
  # --- Evaluation Logic ---
 
101
  def format_prompt(item):
102
  """Formats the MMLU question and choices into a standardized prompt."""
103
  prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
@@ -108,125 +98,123 @@ def get_choice_letter(index):
108
  return chr(ord('A') + index) if 0 <= index <= 3 else None
109
 
110
  def extract_predicted_letter(output_text):
111
- """
112
- Extracts the predicted letter from the model's output.
113
- It looks for a letter (A, B, C, D) immediately following 'Answer:'.
114
- """
115
- # Look for "Answer: X" and capture X
116
  match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
117
  if match:
118
  return match.group(1).upper()
119
-
120
- # Fallback: if the model just outputs a letter
121
  match = re.search(r"^\s*([ABCD])\b", output_text.strip())
122
  if match:
123
  return match.group(1).upper()
124
  return None
125
 
126
- def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
127
  """
128
- Evaluates a model on a specific subject from a dataset.
129
- """
130
- gr.Info(f"Loading dataset: {dataset_id} ({subject})...")
131
- try:
132
- # Load the 'test' split as it's standard for MMLU evaluation
133
- dataset = load_dataset(dataset_id, subject, token=HF_TOKEN, split="test")
134
- except Exception as e:
135
- raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}")
136
-
137
- # Shuffle and select a subset of samples for evaluation
138
- num_samples = min(sample_count, len(dataset))
139
- dataset = dataset.shuffle(seed=42).select(range(num_samples))
140
-
141
- correct_predictions = 0
142
- results_details = []
143
-
144
- for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
145
- prompt, correct_answer_idx = format_prompt(item)
146
- expected_letter = get_choice_letter(correct_answer_idx)
147
-
148
- # The generated text is often just after the prompt. We need to slice it.
149
- full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
150
-
151
- # Generate a short response, aiming for a single letter answer.
152
- # do_sample=False (greedy decoding) is crucial for reproducibility.
153
- raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
154
-
155
- # Isolate the newly generated part
156
- generated_text_only = raw_output[len(full_prompt_text):].strip()
157
- predicted_letter = extract_predicted_letter(generated_text_only)
158
- is_correct = (predicted_letter == expected_letter)
159
-
160
- if is_correct:
161
- correct_predictions += 1
162
-
163
- results_details.append({
164
- "Question": item['question'],
165
- "Correct": "βœ…" if is_correct else "❌",
166
- "Expected": expected_letter,
167
- "Predicted": predicted_letter or "N/A",
168
- "Model Output": generated_text_only
169
- })
170
- accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
171
- return accuracy, results_details
172
 
173
  @spaces.GPU()
174
- def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
175
  """
176
- Main function to orchestrate the entire evaluation process.
177
- Handles single subject or 'ALL' subjects evaluation.
178
- Returns a dictionary of Gradio updates.
179
  """
180
  try:
181
- gr.Info("Starting evaluation...")
182
  generator = load_model(model_id)
183
-
184
  dataset_id = BENCHMARK_MAP.get(benchmark_category)
185
  if not dataset_id:
186
  raise ValueError(f"Invalid benchmark category: {benchmark_category}")
187
 
188
- all_results_details = []
189
- summary_lines = []
190
- total_correct = 0
191
- total_samples = 0
192
-
193
  subjects_to_run = []
194
  if subject_name == "ALL":
195
- # Exclude the "ALL" placeholder from the list of subjects to run
196
  subjects_to_run = [s for s in ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []) if s != "ALL"]
197
  else:
198
  subjects_to_run = [subject_name]
199
 
200
  if not subjects_to_run:
201
  gr.Warning(f"No subjects found for '{benchmark_category}'.")
202
- # Return an empty but valid structure
203
- return {
204
- result_summary_output: gr.update(value="No subjects found to evaluate.", visible=True),
205
- error_box: gr.update(visible=False),
206
- details_box: gr.update(visible=False),
207
- }
208
209
  for i, subject in enumerate(subjects_to_run):
210
- gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
211
  try:
212
- accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
213
-
214
- all_results_details.extend(subject_details)
215
- num_correct = sum(1 for d in subject_details if d['Correct'] == "✅")
216
- num_evaluated = len(subject_details)
217
- total_correct += num_correct
218
- total_samples += num_evaluated
219
- summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
220
221
  except Exception as e:
222
  error_trace = traceback.format_exc()
223
  gr.Error(f"Skipping {subject} due to an error: {e}")
224
  summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
225
  continue
226
-
 
227
  overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
228
-
229
- # --- Prepare Outputs ---
230
  if subject_name == "ALL":
231
  result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
232
  result_summary += f"across {total_samples:,} total samples from {len(subjects_to_run)} subjects.\n\n---\n\n**Breakdown by Subject:**\n"
@@ -234,44 +222,49 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
234
  else:
235
  result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
236
  result_summary += f"({total_correct:,}/{total_samples:,} correct)"
237
-
238
- # Save results for leaderboard
239
  record = {
240
  "model_id": model_id,
241
  "benchmark": benchmark_category,
242
  "accuracy": overall_accuracy,
243
- "subject": subject_name, # Record if it was an 'ALL' run
244
  "sample_count": total_samples,
245
  "timestamp": datetime.now().isoformat()
246
  }
247
  with open(EVAL_FILE, "a") as f:
248
  f.write(json.dumps(record) + "\n")
249
-
250
  gr.Info("Evaluation completed successfully!")
251
-
252
  df_details = pd.DataFrame(all_results_details)
253
-
254
- # Return a dictionary of component updates
255
- return {
256
- result_summary_output: gr.update(value=result_summary, visible=True),
257
- error_box: gr.update(visible=False),
 
258
  details_box: gr.update(visible=True),
259
- detailed_results_df: gr.update(value=df_details)
 
260
  }
 
261
  except Exception as e:
262
- error_message = f"An unexpected error occurred during setup: {e}"
263
  error_details = traceback.format_exc()
264
  gr.Error(error_message)
265
-
266
- return {
267
- result_summary_output: gr.update(visible=False),
 
 
 
268
  error_box: gr.update(visible=True),
269
  error_output: gr.update(value=error_message),
270
  error_details_output: gr.update(value=error_details),
271
- details_box: gr.update(visible=False)
272
  }
273
 
274
  # --- UI Helper Functions ---
 
275
  def update_subject_dropdown(benchmark_category):
276
  """Updates the subject dropdown choices based on the selected benchmark."""
277
  choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
@@ -281,46 +274,41 @@ def update_subject_dropdown(benchmark_category):
281
  def load_leaderboard(benchmark_filter, progress=gr.Progress()):
282
  """
283
  Loads and processes evaluation data to display on the leaderboard.
284
- It now correctly averages scores for models that were evaluated on 'ALL' subjects.
285
  """
286
  progress(0, desc="Loading Leaderboard...")
287
  try:
288
  if not os.path.exists(EVAL_FILE):
289
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
290
-
291
  df = pd.read_json(EVAL_FILE, lines=True)
292
  if df.empty:
293
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
294
-
295
- # Coerce accuracy to numeric and filter valid entries
296
  df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
297
  df.dropna(subset=['accuracy'], inplace=True)
298
-
299
- # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
300
  df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
301
-
302
  if df_filtered.empty:
303
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
304
-
305
- # Find the latest evaluation for each model
306
  df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
307
  latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
308
-
309
  leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
310
-
311
- # Add Rank
312
  leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
313
- # Rename and format columns
314
  leaderboard_df.rename(columns={
315
  'model_id': 'Model ID',
316
  'accuracy': 'Avg. Accuracy (%)',
317
  'sample_count': 'Total Samples',
318
  'timestamp': 'Date'
319
  }, inplace=True)
320
-
321
  leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
322
  leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
323
-
324
  progress(1, desc="Done.")
325
  return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
326
  except Exception as e:
@@ -329,174 +317,60 @@ def load_leaderboard(benchmark_filter, progress=gr.Progress()):
329
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
330
 
331
  # --- Gradio Interface Definition ---
332
- # Black/Orange Theme and bigger to fit screen
333
  custom_css = """
334
  /* --- Global & Layout (Bigger to fit screen) --- */
335
  body { font-family: 'Inter', sans-serif; background-color: #1a1a1a; color: #f0f0f0; } /* Dark background, light text */
336
  .gradio-container { max-width: 95% !important; margin: auto; padding: 20px; } /* Wider container */
337
- .gr-group {
338
- border-radius: 12px !important;
339
- box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; /* Darker shadow */
340
- border: 1px solid #333 !important; /* Darker border */
341
- background-color: #2a2a2a; /* Darker group background */
342
- }
343
- .gr-panel {
344
- border-radius: 12px !important;
345
- box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important;
346
- border: 1px solid #333 !important;
347
- background-color: #2a2a2a;
348
- }
349
-
350
  /* --- Typography (Orange Hues) --- */
351
  h1 { text-align: center; font-size: 3rem !important; font-weight: 800; color: #ff8c00; margin-bottom: 0.5rem; letter-spacing: -1.5px; } /* Orange title */
352
  h3, h4 { color: #ffa500; } /* Orange headings */
353
  .subtitle { text-align: center; color: #cccccc; font-size: 1.2rem; margin-bottom: 2.5rem; max-width: 900px; margin-left: auto; margin-right: auto;}
354
  label { color: #f0f0f0 !important; } /* Label text color */
355
-
 
 
356
  /* --- Tabs --- */
357
  .gradio-tabs { background-color: #2a2a2a; border-radius: 12px; }
358
- .gradio-tab-item { color: #f0f0f0; }
359
- .gradio-tabs button {
360
- background-color: #3a3a3a !important;
361
- color: #f0f0f0 !important;
362
- border-radius: 8px 8px 0 0 !important;
363
- transition: all 0.3s ease;
364
- }
365
- .gradio-tabs button.selected {
366
- background-color: #ff8c00 !important; /* Orange selected tab */
367
- color: #1a1a1a !important; /* Dark text on orange */
368
- font-weight: 700;
369
- }
370
- .gradio-tabs button:hover { background-color: #555 !important; }
371
-
372
  /* --- Inputs --- */
373
- .gr-textbox, .gr-dropdown, .gr-slider {
374
- background-color: #3a3a3a !important;
375
- color: #f0f0f0 !important;
376
- border: 1px solid #555 !important;
377
- border-radius: 8px !important;
378
- }
379
- .gr-textbox textarea, .gr-textbox input, .gr-dropdown input {
380
- color: #f0f0f0 !important;
381
- }
382
- .gr-textbox.gr-text-input:focus-within {
383
- border-color: #ff8c00 !important; /* Orange focus border */
384
- box-shadow: 0 0 0 2px rgba(255, 140, 0, 0.5) !important;
385
- }
386
-
387
-
388
  /* --- Buttons --- */
389
- .gr-button { font-weight: 600 !important; transition: all 0.2s ease; border-radius: 8px !important; }
390
- .gr-button-primary {
391
- background-color: #ff8c00 !important; /* Orange primary button */
392
- color: #1a1a1a !important;
393
- box-shadow: 0 4px 10px rgba(255, 140, 0, 0.3);
394
- border: none;
395
- }
396
- .gr-button-primary:hover {
397
- transform: translateY(-2px);
398
- box-shadow: 0 6px 15px rgba(255, 140, 0, 0.5);
399
- background-color: #ffa500 !important; /* Slightly lighter orange on hover */
400
- }
401
- .gr-button-secondary {
402
- background-color: #444 !important;
403
- color: #f0f0f0 !important;
404
- border: 1px solid #555 !important;
405
- }
406
- .gr-button-secondary:hover {
407
- background-color: #555 !important;
408
- }
409
-
410
- /* --- Custom Radio Buttons (Segmented Control) --- */
411
- #leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
412
- #leaderboard-toggle { background-color: #3a3a3a; padding: 5px; border-radius: 10px; display: inline-flex; border: 1px solid #555; }
413
- #leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
414
- #leaderboard-toggle input[type='radio'] { display: none; }
415
- #leaderboard-toggle label {
416
- padding: 8px 16px;
417
- border-radius: 8px;
418
- cursor: pointer;
419
- transition: all 0.3s ease;
420
- font-weight: 500;
421
- color: #f0f0f0;
422
- background: transparent;
423
- border: none;
424
- box-shadow: none;
425
- }
426
- #leaderboard-toggle input[type='radio']:checked + label {
427
- background-color: #ff8c00; /* Orange selected */
428
- color: #1a1a1a;
429
- font-weight: 600;
430
- box-shadow: 0 2px 5px rgba(255, 140, 0, 0.3);
431
- }
432
- #leaderboard-toggle label:hover {
433
- background-color: #555;
434
- }
435
-
436
  /* --- Dataframe / Table Styling --- */
437
- .leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
438
- .leaderboard-table .gr-dataframe thead th {
439
- background-color: #3a3a3a !important;
440
- color: #ffa500 !important; /* Orange headers */
441
- font-weight: 600 !important;
442
- text-align: left;
443
- padding: 12px 15px;
444
- border-bottom: 2px solid #555;
445
- }
446
- .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #2f2f2f; } /* Alternating row color */
447
- .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #4a4a4a; } /* Hover effect */
448
- .leaderboard-table .gr-dataframe tbody td {
449
- padding: 12px 15px;
450
- border-bottom: 1px solid #3a3a3a;
451
- color: #f0f0f0;
452
- }
453
- .leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #ffcc99; } /* Lighter orange for rank */
454
-
455
  /* --- Error & Result Panes --- */
456
- #error-display-box {
457
- background-color: #4a1e1e !important; /* Dark red for error */
458
- border-color: #8c2f2f !important;
459
- color: #ffc9c9 !important; /* Lighter red text */
460
- }
461
- #result-summary-box {
462
- background-color: #1e3a2a !important; /* Dark green for success */
463
- border-color: #2f8c4a !important;
464
- color: #c9ffc9 !important; /* Lighter green text */
465
- }
466
- .gr-markdown p { color: #f0f0f0 !important; } /* Ensure markdown paragraph text is visible */
467
- .gr-markdown strong { color: #ffa500 !important; } /* Strong text in orange */
468
- .gradio-message { background-color: #ff8c00 !important; color: #1a1a1a !important; border: 1px solid #ff8c00 !important; } /* Gradio Info messages */
469
  """
470
 
471
  with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
472
  gr.Markdown("<h1>πŸ† SuperBench Eval: Evaluate models and view leaderboards πŸ†</h1>")
473
  gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
474
-
475
  with gr.Tabs() as tabs:
476
  # --- Leaderboard Tab ---
477
  with gr.TabItem("πŸ“Š Leaderboard", id=0):
478
  with gr.Column():
479
- with gr.Row(elem_id="leaderboard-toggle-group"):
480
- # Temporarily remove MMLU-Pro from radio options
481
  leaderboard_type_toggle = gr.Radio(
482
- ["MMLU"],
483
- label="Select Benchmark",
484
- value="MMLU",
485
- interactive=True,
486
- elem_id="leaderboard-toggle",
487
- container=False,
488
- show_label=False,
489
  )
490
  refresh_button = gr.Button("πŸ”„ Refresh", size="sm")
491
  leaderboard_table_output = gr.DataFrame(
492
  headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
493
- interactive=False,
494
- datatype=["number", "str", "str", "number", "str"],
495
- row_count=15, # Adjusted for more rows
496
- elem_classes="leaderboard-table",
497
- # Removed col_count to allow dynamic width
498
  )
499
-
500
  # --- Evaluation Tab ---
501
  with gr.TabItem("πŸš€ Run Evaluation", id=1):
502
  with gr.Row(variant='panel'):
@@ -504,77 +378,71 @@ with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
504
  with gr.Group():
505
  gr.Markdown("### 1. Configure Evaluation")
506
  model_id_input = gr.Textbox(
507
- label="Hugging Face Model ID",
508
- placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
509
- interactive=True,
510
- scale=2 # Increased scale for textbox
511
  )
512
- # Temporarily remove MMLU-Pro from radio options
513
  benchmark_selection_radio = gr.Radio(
514
- ["MMLU"],
515
- label="Benchmark",
516
- value="MMLU",
517
- interactive=True,
518
  )
519
  with gr.Row():
520
  benchmark_subject_dropdown = gr.Dropdown(
521
- label="Subject",
522
- # Ensure only MMLU subjects are fetched
523
- choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
524
- value="ALL",
525
- interactive=True
526
  )
527
  sample_count_slider = gr.Slider(
528
- label="Samples per Subject",
529
- minimum=5, maximum=100, value=25, step=5, interactive=True
530
  )
531
  run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
532
-
533
  with gr.Column(scale=3):
534
  gr.Markdown("### 2. View Results")
535
-
536
  # Panel for displaying the summary of results
537
  with gr.Group(visible=False) as result_summary_box:
538
  result_summary_output = gr.Markdown(elem_id="result-summary-box")
539
-
540
  # Panel for displaying errors
541
  with gr.Group(visible=False) as error_box:
542
  error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
543
  error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
544
-
545
  # Panel for detailed, row-by-row results
546
  with gr.Group(visible=False) as details_box:
547
  gr.Markdown("#### Detailed Evaluation Log")
548
  detailed_results_df = gr.DataFrame(
549
  headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
550
  datatype=["str", "str", "str", "str", "str"],
551
- interactive=False,
552
- row_count=10, # Adjusted for more rows
553
- # Removed col_count to allow dynamic width
554
- wrap=True,
555
  )
556
 
557
- # --- Event Handlers & Logic ---
558
- # Update subject dropdown when benchmark type changes
559
  benchmark_selection_radio.change(
560
  fn=update_subject_dropdown,
561
  inputs=[benchmark_selection_radio],
562
  outputs=[benchmark_subject_dropdown]
563
  )
564
-
565
- # Main evaluation trigger
566
  run_button.click(
567
  fn=run_evaluation,
568
  inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
569
- outputs=[result_summary_output, error_box, error_output, error_details_output, details_box, detailed_results_df]
570
- ).then(
571
- # After evaluation, switch to the leaderboard tab and refresh it
572
- lambda: gr.update(selected=0), outputs=[tabs]
 
 
573
  ).then(
 
574
  load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]
575
  )
576
-
577
- # Leaderboard loading logic
578
  demo.load(
579
  fn=load_leaderboard,
580
  inputs=[leaderboard_type_toggle],
@@ -593,6 +461,5 @@ with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
593
  show_progress='full'
594
  )
595
 
596
- # Launch the Gradio app
597
  if __name__ == "__main__":
598
- demo.launch(debug=True)
 
14
  # It's good practice to ensure the cache directory exists.
15
  CACHE_DIR = "evaluation_cache"
16
  os.makedirs(CACHE_DIR, exist_ok=True)
17
+ EVAL_FILE = os.path.join(CACHE_DIR, "evals.jsonl")
18
 
19
  # Cache to avoid reloading models and dataset configs
20
  model_cache = {}
 
25
 
26
  # --- Constants for Benchmarks ---
27
  MMLU_DATASET = "cais/mmlu"
 
 
28
  BENCHMARK_MAP = {
29
  "MMLU": MMLU_DATASET,
 
30
  }
31
 
32
  # --- Data Loading and Preparation ---
33
+
34
  def get_all_benchmark_options():
35
  """
36
  Fetches and caches the available subjects (configs) for each benchmark dataset.
 
39
  if benchmark_subject_cache:
40
  return benchmark_subject_cache
41
  print("Fetching benchmark configurations for the first time...")
 
 
42
  for key, dataset_id in BENCHMARK_MAP.items():
43
  try:
 
44
  subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
45
+ benchmark_subject_cache[key] = ["ALL"] + sorted([s for s in subjects if s != 'all'])
46
  except Exception as e:
47
  print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
48
+ benchmark_subject_cache[key] = ["ALL"]
49
  print("Benchmark configurations cached.")
50
  return benchmark_subject_cache
51
 
 
60
  """
61
  if not model_id:
62
  raise ValueError("Model ID cannot be empty.")
63
+ gr.Info(f"Attempting to load model: {model_id}...")
64
  if model_id in model_cache:
65
  gr.Info(f"Model '{model_id}' found in cache.")
66
  return model_cache[model_id]
67
  try:
 
68
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
 
69
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
70
  model = AutoModelForCausalLM.from_pretrained(
71
  model_id,
72
  token=HF_TOKEN,
73
  torch_dtype=dtype,
74
  trust_remote_code=True,
75
+ low_cpu_mem_usage=True,
76
  ).to("cuda" if torch.cuda.is_available() else "cpu")
 
 
77
  generator = pipeline(
78
+ "text-generation",
79
+ model=model,
80
+ tokenizer=tokenizer,
81
  device=0 if torch.cuda.is_available() else -1
82
  )
 
83
  model_cache[model_id] = generator
84
  gr.Info(f"Model '{model_id}' loaded successfully.")
85
  return generator
86
  except Exception as e:
87
+ raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token. Error: {e}")
 
88
 
89
  # --- Evaluation Logic ---
90
+
91
  def format_prompt(item):
92
  """Formats the MMLU question and choices into a standardized prompt."""
93
  prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
 
98
  return chr(ord('A') + index) if 0 <= index <= 3 else None
99
 
100
  def extract_predicted_letter(output_text):
101
+ """Extracts the predicted letter from the model's output."""
 
 
 
 
102
  match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
103
  if match:
104
  return match.group(1).upper()
 
 
105
  match = re.search(r"^\s*([ABCD])\b", output_text.strip())
106
  if match:
107
  return match.group(1).upper()
108
  return None
109
 
110
+ def make_progress_html(text, percentage):
111
+ """Helper function to create the HTML for the progress bar."""
112
+ return f"""
113
+ <div class="progress-container">
114
+ <div class="progress-bar" style="width: {percentage}%;">
115
+ {text}
116
+ </div>
117
+ </div>
118
  """
119
 
120
  @spaces.GPU()
121
+ def run_evaluation(model_id, benchmark_category, subject_name, sample_count):
122
  """
123
+ Main generator function to orchestrate the evaluation, yielding progress updates.
 
 
124
  """
125
  try:
126
+ # 1. Initial yield to set up the UI for loading state
127
+ yield {
128
+ progress_box: gr.update(visible=True),
129
+ progress_text_output: gr.update(value=f"Preparing evaluation for **{model_id}**..."),
130
+ progress_bar_output: gr.update(value=make_progress_html("Loading Model...", 0)),
131
+ result_summary_box: gr.update(visible=False),
132
+ details_box: gr.update(visible=False),
133
+ error_box: gr.update(visible=False),
134
+ }
135
+
136
  generator = load_model(model_id)
 
137
  dataset_id = BENCHMARK_MAP.get(benchmark_category)
138
  if not dataset_id:
139
  raise ValueError(f"Invalid benchmark category: {benchmark_category}")
140
141
  subjects_to_run = []
142
  if subject_name == "ALL":
 
143
  subjects_to_run = [s for s in ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []) if s != "ALL"]
144
  else:
145
  subjects_to_run = [subject_name]
146
 
147
  if not subjects_to_run:
148
  gr.Warning(f"No subjects found for '{benchmark_category}'.")
149
+ yield { progress_box: gr.update(visible=False) }
150
+ return
 
 
 
 
151
 
152
+ all_results_details = []
153
+ summary_lines = []
154
+ total_correct = 0
155
+ total_samples = 0
156
+
157
+ # 2. Main evaluation loop
158
  for i, subject in enumerate(subjects_to_run):
159
+ overall_progress_text = f"**Overall Progress ({i+1}/{len(subjects_to_run)} subjects)**"
160
+ yield {
161
+ progress_text_output: gr.update(value=f"{overall_progress_text}\n\nLoading dataset for **{subject}**...")
162
+ }
163
+
164
  try:
165
+ # Load dataset for the current subject
166
+ dataset = load_dataset(dataset_id, subject, token=HF_TOKEN, split="test")
167
+ num_samples = min(sample_count, len(dataset))
168
+ dataset = dataset.shuffle(seed=42).select(range(num_samples))
169
+
170
+ correct_predictions_subject = 0
171
+ subject_details = []
172
+
173
+ # Loop over samples within the subject
174
+ for j, item in enumerate(dataset):
175
+ prompt, correct_answer_idx = format_prompt(item)
176
+ expected_letter = get_choice_letter(correct_answer_idx)
177
+
178
+ full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
179
+ raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
180
+ generated_text_only = raw_output[len(full_prompt_text):].strip()
181
+ predicted_letter = extract_predicted_letter(generated_text_only)
182
+
183
+ is_correct = (predicted_letter == expected_letter)
184
+ if is_correct:
185
+ correct_predictions_subject += 1
186
 
187
+ subject_details.append({
188
+ "Question": item['question'],
189
+ "Correct": "βœ…" if is_correct else "❌",
190
+ "Expected": expected_letter,
191
+ "Predicted": predicted_letter or "N/A",
192
+ "Model Output": generated_text_only
193
+ })
194
+
195
+ # Yield progress update for each sample
196
+ percentage = ((j + 1) / num_samples) * 100
197
+ progress_bar_text = f"Evaluating: {subject} ({j+1}/{num_samples})"
198
+ yield {
199
+ progress_bar_output: gr.update(value=make_progress_html(f"{percentage:.1f}%", percentage)),
200
+ progress_text_output: gr.update(value=f"{overall_progress_text}\n\n{progress_bar_text}")
201
+ }
202
+
203
+ accuracy = (correct_predictions_subject / num_samples) * 100 if num_samples > 0 else 0
204
+ all_results_details.extend(subject_details)
205
+ total_correct += correct_predictions_subject
206
+ total_samples += num_samples
207
+ summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({correct_predictions_subject}/{num_samples})")
208
+
209
  except Exception as e:
210
  error_trace = traceback.format_exc()
211
  gr.Error(f"Skipping {subject} due to an error: {e}")
212
  summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
213
  continue
214
+
215
+ # 3. Final processing and result preparation
216
  overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
217
+
 
218
  if subject_name == "ALL":
219
  result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
220
  result_summary += f"across {total_samples:,} total samples from {len(subjects_to_run)} subjects.\n\n---\n\n**Breakdown by Subject:**\n"
 
222
  else:
223
  result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
224
  result_summary += f"({total_correct:,}/{total_samples:,} correct)"
225
+
226
+ # Write final result to the JSONL file
227
  record = {
228
  "model_id": model_id,
229
  "benchmark": benchmark_category,
230
  "accuracy": overall_accuracy,
231
+ "subject": subject_name,
232
  "sample_count": total_samples,
233
  "timestamp": datetime.now().isoformat()
234
  }
235
  with open(EVAL_FILE, "a") as f:
236
  f.write(json.dumps(record) + "\n")
237
+
238
  gr.Info("Evaluation completed successfully!")
 
239
  df_details = pd.DataFrame(all_results_details)
240
+
241
+ # 4. Final yield to show results and hide progress UI
242
+ yield {
243
+ progress_box: gr.update(visible=False),
244
+ result_summary_box: gr.update(visible=True),
245
+ result_summary_output: gr.update(value=result_summary),
246
  details_box: gr.update(visible=True),
247
+ detailed_results_df: gr.update(value=df_details),
248
+ error_box: gr.update(visible=False)
249
  }
250
+
251
  except Exception as e:
252
+ error_message = f"An unexpected error occurred: {e}"
253
  error_details = traceback.format_exc()
254
  gr.Error(error_message)
255
+
256
+ # Yield to show error message and hide progress UI
257
+ yield {
258
+ progress_box: gr.update(visible=False),
259
+ result_summary_box: gr.update(visible=False),
260
+ details_box: gr.update(visible=False),
261
  error_box: gr.update(visible=True),
262
  error_output: gr.update(value=error_message),
263
  error_details_output: gr.update(value=error_details),
 
264
  }
265
 
266
  # --- UI Helper Functions ---
267
+
268
  def update_subject_dropdown(benchmark_category):
269
  """Updates the subject dropdown choices based on the selected benchmark."""
270
  choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
 
274
  def load_leaderboard(benchmark_filter, progress=gr.Progress()):
275
  """
276
  Loads and processes evaluation data to display on the leaderboard.
 
277
  """
278
  progress(0, desc="Loading Leaderboard...")
279
  try:
280
  if not os.path.exists(EVAL_FILE):
281
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
282
+
283
  df = pd.read_json(EVAL_FILE, lines=True)
284
  if df.empty:
285
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
286
+
 
287
  df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
288
  df.dropna(subset=['accuracy'], inplace=True)
289
+
290
+ # Filter for 'ALL' subject runs for the selected benchmark
291
  df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
292
+
293
  if df_filtered.empty:
294
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
295
+
 
296
  df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
297
  latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
298
+
299
  leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
300
+
 
301
  leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
 
302
  leaderboard_df.rename(columns={
303
  'model_id': 'Model ID',
304
  'accuracy': 'Avg. Accuracy (%)',
305
  'sample_count': 'Total Samples',
306
  'timestamp': 'Date'
307
  }, inplace=True)
308
+
309
  leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
310
  leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
311
+
312
  progress(1, desc="Done.")
313
  return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
314
  except Exception as e:
 
317
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
318
 
319
  # --- Gradio Interface Definition ---
 
320
  custom_css = """
321
  /* --- Global & Layout (Bigger to fit screen) --- */
322
  body { font-family: 'Inter', sans-serif; background-color: #1a1a1a; color: #f0f0f0; } /* Dark background, light text */
323
  .gradio-container { max-width: 95% !important; margin: auto; padding: 20px; } /* Wider container */
324
+ .gr-group { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; border: 1px solid #333 !important; background-color: #2a2a2a; }
325
+ .gr-panel { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; border: 1px solid #333 !important; background-color: #2a2a2a; }
326
  /* --- Typography (Orange Hues) --- */
327
  h1 { text-align: center; font-size: 3rem !important; font-weight: 800; color: #ff8c00; margin-bottom: 0.5rem; letter-spacing: -1.5px; } /* Orange title */
328
  h3, h4 { color: #ffa500; } /* Orange headings */
329
  .subtitle { text-align: center; color: #cccccc; font-size: 1.2rem; margin-bottom: 2.5rem; max-width: 900px; margin-left: auto; margin-right: auto;}
330
  label { color: #f0f0f0 !important; } /* Label text color */
331
+ /* --- Progress Bar --- */
332
+ .progress-container { background-color: #3a3a3a; border-radius: 8px; overflow: hidden; border: 1px solid #555; height: 28px; padding: 4px; }
333
+ .progress-bar { background: linear-gradient(90deg, #ff8c00, #ffa500); height: 100%; border-radius: 5px; transition: width 0.3s ease-in-out; display: flex; align-items: center; justify-content: center; color: #1a1a1a; font-weight: 600; font-size: 0.9rem; }
334
  /* --- Tabs --- */
335
  .gradio-tabs { background-color: #2a2a2a; border-radius: 12px; }
336
+ .gradio-tabs button { background-color: #3a3a3a !important; color: #f0f0f0 !important; border-radius: 8px 8px 0 0 !important; transition: all 0.3s ease; }
337
+ .gradio-tabs button.selected { background-color: #ff8c00 !important; color: #1a1a1a !important; font-weight: 700; }
338
  /* --- Inputs --- */
339
+ .gr-textbox, .gr-dropdown, .gr-slider { background-color: #3a3a3a !important; color: #f0f0f0 !important; border: 1px solid #555 !important; border-radius: 8px !important; }
340
  /* --- Buttons --- */
341
+ .gr-button-primary { background-color: #ff8c00 !important; color: #1a1a1a !important; box-shadow: 0 4px 10px rgba(255, 140, 0, 0.3); border: none; }
342
+ .gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(255, 140, 0, 0.5); background-color: #ffa500 !important; }
343
  /* --- Dataframe / Table Styling --- */
344
+ .leaderboard-table .gr-dataframe thead th { background-color: #3a3a3a !important; color: #ffa500 !important; font-weight: 600 !important; text-align: left; padding: 12px 15px; border-bottom: 2px solid #555; }
345
+ .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #2f2f2f; }
346
+ .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #4a4a4a; }
347
+ .leaderboard-table .gr-dataframe tbody td { padding: 12px 15px; border-bottom: 1px solid #3a3a3a; color: #f0f0f0; }
348
  /* --- Error & Result Panes --- */
349
+ #error-display-box { background-color: #4a1e1e !important; border-color: #8c2f2f !important; color: #ffc9c9 !important; }
350
+ #result-summary-box { background-color: #1e3a2a !important; border-color: #2f8c4a !important; color: #c9ffc9 !important; }
351
+ .gr-markdown p { color: #f0f0f0 !important; } .gr-markdown strong { color: #ffa500 !important; }
352
+ .gradio-message { background-color: #ff8c00 !important; color: #1a1a1a !important; border: 1px solid #ff8c00 !important; }
353
  """
354
 
355
  with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
356
  gr.Markdown("<h1>πŸ† SuperBench Eval: Evaluate models and view leaderboards πŸ†</h1>")
357
  gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
358
+
359
  with gr.Tabs() as tabs:
360
  # --- Leaderboard Tab ---
361
  with gr.TabItem("πŸ“Š Leaderboard", id=0):
362
  with gr.Column():
363
+ with gr.Row():
 
364
  leaderboard_type_toggle = gr.Radio(
365
+ ["MMLU"], label="Select Benchmark", value="MMLU", interactive=True
366
  )
367
  refresh_button = gr.Button("πŸ”„ Refresh", size="sm")
368
  leaderboard_table_output = gr.DataFrame(
369
  headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
370
+ interactive=False, datatype=["number", "str", "str", "number", "str"],
371
+ row_count=15, elem_classes="leaderboard-table",
 
 
 
372
  )
373
+
374
  # --- Evaluation Tab ---
375
  with gr.TabItem("πŸš€ Run Evaluation", id=1):
376
  with gr.Row(variant='panel'):
 
378
  with gr.Group():
379
  gr.Markdown("### 1. Configure Evaluation")
380
  model_id_input = gr.Textbox(
381
+ label="Hugging Face Model ID", placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
382
+ interactive=True, scale=2
 
 
383
  )
 
384
  benchmark_selection_radio = gr.Radio(
385
+ ["MMLU"], label="Benchmark", value="MMLU", interactive=True
 
 
 
386
  )
387
  with gr.Row():
388
  benchmark_subject_dropdown = gr.Dropdown(
389
+ label="Subject", choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
390
+ value="ALL", interactive=True
 
 
 
391
  )
392
  sample_count_slider = gr.Slider(
393
+ label="Samples per Subject", minimum=5, maximum=100, value=10, step=5, interactive=True
 
394
  )
395
  run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
396
+
397
  with gr.Column(scale=3):
398
  gr.Markdown("### 2. View Results")
399
+
400
+ # NEW: Progress Bar UI
401
+ with gr.Group(visible=False) as progress_box:
402
+ progress_text_output = gr.Markdown("Starting...")
403
+ progress_bar_output = gr.HTML(make_progress_html("Waiting...", 0))
404
+
405
  # Panel for displaying the summary of results
406
  with gr.Group(visible=False) as result_summary_box:
407
  result_summary_output = gr.Markdown(elem_id="result-summary-box")
408
+
409
  # Panel for displaying errors
410
  with gr.Group(visible=False) as error_box:
411
  error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
412
  error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
413
+
414
  # Panel for detailed, row-by-row results
415
  with gr.Group(visible=False) as details_box:
416
  gr.Markdown("#### Detailed Evaluation Log")
417
  detailed_results_df = gr.DataFrame(
418
  headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
419
  datatype=["str", "str", "str", "str", "str"],
420
+ interactive=False, row_count=10, wrap=True,
 
 
 
421
  )
422
 
423
+ # --- Event Handlers & Logic ---
 
424
  benchmark_selection_radio.change(
425
  fn=update_subject_dropdown,
426
  inputs=[benchmark_selection_radio],
427
  outputs=[benchmark_subject_dropdown]
428
  )
429
+
430
+ # Main evaluation trigger, now handles a generator for progress updates
431
  run_button.click(
432
  fn=run_evaluation,
433
  inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
434
+ outputs=[
435
+ progress_box, progress_text_output, progress_bar_output,
436
+ result_summary_box, result_summary_output,
437
+ error_box, error_output, error_details_output,
438
+ details_box, detailed_results_df
439
+ ]
440
  ).then(
441
+ # After evaluation, refresh the leaderboard
442
  load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]
443
  )
444
+
445
+ # --- Leaderboard Loading Logic ---
446
  demo.load(
447
  fn=load_leaderboard,
448
  inputs=[leaderboard_type_toggle],
 
461
  show_progress='full'
462
  )
463
 
 
464
  if __name__ == "__main__":
465
+ demo.launch(debug=True)
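For reference, the scoring path that is unchanged across both versions formats each MMLU item into a Question / Choices / Answer: prompt and recovers the model's pick with two regular expressions: an explicit "Answer: X" first, then a bare leading letter as a fallback. Below is a self-contained sketch of that extraction step; the regexes are the ones used by extract_predicted_letter, while the wrapper script and the sample outputs are made up for illustration.

import re

def extract_predicted_letter(output_text: str):
    text = output_text.strip()
    # Preferred form: the model echoes "Answer: X".
    match = re.search(r"Answer:\s*([ABCD])", text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    # Fallback: the model replies with just a leading letter.
    match = re.search(r"^\s*([ABCD])\b", text)
    if match:
        return match.group(1).upper()
    return None

if __name__ == "__main__":
    for sample in ["Answer: C", " B) because ...", "The answer is D"]:
        print(repr(sample), "->", extract_predicted_letter(sample))
    # Prints C, B, then None: the last phrasing is caught by neither pattern.
    # app.py mitigates this by ending its prompt with "Answer:" and generating
    # only a few tokens (max_new_tokens=5, greedy decoding).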