"""Gradio app for the Open Agent Leaderboard.

Builds three leaderboard tabs (overall math, per-dataset math detail,
multi-modal), each backed by a pandas table with checkbox-driven filtering,
plus a citation accordion.  Each filter callback returns a fresh read-only
``gr.DataFrame`` component that replaces the displayed table.
"""
import abc
import gradio as gr
import os
import pandas as pd

from gen_table import *
from meta_data import *

# Directory where the flattened result tables are exported as CSV.
# Created up front so the to_csv calls below cannot fail on a fresh checkout.
OUTPUT_DIR = os.path.join(os.getcwd(), 'src')
os.makedirs(OUTPUT_DIR, exist_ok=True)


def ordered_headers(headers, df):
    """Return the subset of `headers` present in `df`, ordered as in df.columns.

    Silently drops requested headers that are not actual columns, so callers
    never index the DataFrame with a missing column name.
    """
    cols = df.columns.tolist()
    return sorted((h for h in headers if h in cols), key=cols.index)


def build_table_component(df, headers, type_map):
    """Create a read-only pandas-backed gr.DataFrame showing `headers` of `df`.

    Columns missing from `type_map` fall back to the 'str' datatype instead
    of raising KeyError (matches the multi-modal tab's original behavior).
    """
    return gr.components.DataFrame(
        value=df[headers],
        type='pandas',
        datatype=[type_map.get(h, 'str') for h in headers],
        interactive=False,
        wrap=True,
        visible=True,
    )


with gr.Blocks(title="Open Agent Leaderboard") as demo:
    struct = load_results(OVERALL_MATH_SCORE_FILE)
    EVAL_TIME = format_timestamp(struct['time'])
    results = struct['results']
    # Summary counts (currently informational only).
    N_MODEL = len(results)
    N_DATA = len(results['IO'])
    DATASETS = list(results['IO'])
    # 'META' is bookkeeping inside the results dict, not a benchmark dataset.
    if 'META' in DATASETS:
        DATASETS.remove('META')

    # Build the overall table before the UI so llm_options can be derived.
    # NOTE: each tab keeps uniquely-named metadata (check_box_overall /
    # check_box_detail / check_box_mm).  The filter callbacks below read these
    # names at call time, so reusing one name across tabs would make every
    # callback see the *last* tab's metadata (late-binding bug in the
    # original version of this file).
    check_box_overall = BUILD_L1_DF(results, DEFAULT_MATH_BENCH)
    overall_table = generate_table(results, DEFAULT_MATH_BENCH)

    # Export the complete overall table as CSV.
    csv_path_overall = os.path.join(OUTPUT_DIR, 'overall_results.csv')
    overall_table.to_csv(csv_path_overall, index=False)
    print(f"Overall results saved to {csv_path_overall}")

    # All distinct LLMs appearing in the overall table.
    llm_options = list(set(
        row.LLM for row in overall_table.itertuples() if hasattr(row, 'LLM')
    ))

    gr.Markdown(LEADERBORAD_INTRODUCTION.format(EVAL_TIME))

    with gr.Tabs(elem_classes='tab-buttons') as tabs:

        # ------------------------------------------------------------------
        # Tab 1: overall math leaderboard
        # ------------------------------------------------------------------
        with gr.Tab(label='🏅 Open Agent Overall Math Leaderboard'):
            gr.Markdown(LEADERBOARD_MD['MATH_MAIN'])

            type_map_overall = check_box_overall['type_map']
            type_map_overall['Rank'] = 'number'

            checkbox_group = gr.CheckboxGroup(
                choices=check_box_overall['all'],
                value=check_box_overall['required'],
                label='Evaluation Dimension',
                interactive=True,
            )
            # Filters over the two categorical axes of the overall table.
            algo_name = gr.CheckboxGroup(
                choices=ALGORITHMS,
                value=ALGORITHMS,
                label='Algorithm',
                interactive=True,
            )
            llm_name = gr.CheckboxGroup(
                choices=llm_options,
                value=llm_options,
                label='LLM',
                interactive=True,
            )

            initial_headers = ordered_headers(
                ['Rank'] + check_box_overall['essential'] + checkbox_group.value,
                overall_table,
            )
            data_component = build_table_component(
                overall_table, initial_headers, type_map_overall
            )

            def filter_df(fields, algos, llms, *args):
                """Re-render the overall table for the selected fields/algos/LLMs."""
                df = overall_table[
                    overall_table['Algorithm'].isin(algos)
                    & overall_table['LLM'].isin(llms)
                ].copy()
                shown = ordered_headers(
                    ['Rank'] + check_box_overall['essential'] + fields, df
                )
                if not shown:
                    # Fall back to the always-present columns.
                    shown = ['Rank'] + check_box_overall['essential']
                return build_table_component(df, shown, type_map_overall)

            # Every filter control re-renders through the same callback.
            for _comp in (checkbox_group, algo_name, llm_name):
                _comp.change(
                    fn=filter_df,
                    inputs=[checkbox_group, algo_name, llm_name],
                    outputs=data_component,
                )

        # ------------------------------------------------------------------
        # Tab 2: per-dataset math detail leaderboard
        # ------------------------------------------------------------------
        with gr.Tab(label='🏅 Open Agent Detail Math Leaderboard'):
            gr.Markdown(LEADERBOARD_MD['MATH_DETAIL'])

            struct_detail = load_results(DETAIL_MATH_SCORE_FILE)
            EVAL_TIME_DETAIL = format_timestamp(struct_detail['time'])
            results_detail = struct_detail['results']
            detail_table, check_box_detail = BUILD_L2_DF(
                results_detail, DEFAULT_MATH_BENCH
            )

            # Export the complete detail table as CSV.
            csv_path_detail = os.path.join(OUTPUT_DIR, 'detail_results.csv')
            detail_table.to_csv(csv_path_detail, index=False)
            print(f"Detail results saved to {csv_path_detail}")

            type_map_detail = check_box_detail['type_map']
            type_map_detail['Rank'] = 'number'

            checkbox_group_detail = gr.CheckboxGroup(
                choices=check_box_detail['all'],
                value=check_box_detail['required'],
                label='Evaluation Dimension',
                interactive=True,
            )
            detail_headers = ['Rank'] + checkbox_group_detail.value

            with gr.Row():
                algo_name_detail = gr.CheckboxGroup(
                    choices=ALGORITHMS,
                    value=ALGORITHMS,
                    label='Algorithm',
                    interactive=True,
                )
                dataset_name = gr.CheckboxGroup(
                    choices=DATASETS,
                    value=DATASETS,
                    label='Datasets',
                    interactive=True,
                )
                llm_name_detail = gr.CheckboxGroup(
                    choices=check_box_detail['LLM_options'],
                    value=check_box_detail['LLM_options'],
                    label='LLM',
                    interactive=True,
                )

            data_component_detail = build_table_component(
                detail_table, detail_headers, type_map_detail
            )

            def filter_df2(fields, algos, datasets, llms):
                """Re-render the detail table; Rank is recomputed per dataset."""
                df = detail_table[
                    detail_table['Algorithm'].isin(algos)
                    & detail_table['Dataset'].isin(datasets)
                    & detail_table['LLM'].isin(llms)
                ].copy()
                # Re-rank within each dataset after filtering so ranks stay
                # dense (1..k) for the rows that remain visible.
                if 'Score' in df.columns:
                    df['Rank'] = (
                        df.groupby('Dataset')['Score']
                        .rank(method='first', ascending=False)
                        .astype(int)
                    )
                shown = ordered_headers(['Rank'] + fields, df)
                return build_table_component(df, shown, type_map_detail)

            for _comp in (checkbox_group_detail, algo_name_detail,
                          dataset_name, llm_name_detail):
                _comp.change(
                    fn=filter_df2,
                    inputs=[checkbox_group_detail, algo_name_detail,
                            dataset_name, llm_name_detail],
                    outputs=data_component_detail,
                )

        # ------------------------------------------------------------------
        # Tab 3: multi-modal leaderboard
        # ------------------------------------------------------------------
        with gr.Tab(label='🏅 Open Agent Multi-Modal Leaderboard'):
            gr.Markdown(LEADERBOARD_MD['MULTI_MODAL_MAIN'])

            struct_multi_modal = load_results(MULTIMODAL_SCORE_FILE)
            EVAL_TIME_MM = format_timestamp(struct_multi_modal['time'])
            # BUILD_L3_DF consumes the raw list of multi-modal results.
            table_mm, check_box_mm = BUILD_L3_DF(
                struct_multi_modal['multi_modal_results'],
                DEFAULT_MULTI_MODAL_BENCH,
            )

            # Export the complete multi-modal table as CSV.
            csv_path_multi_modal = os.path.join(
                OUTPUT_DIR, 'multi_modal_results.csv'
            )
            table_mm.to_csv(csv_path_multi_modal, index=False)
            print(f"Multi-modal results saved to {csv_path_multi_modal}")

            type_map_mm = check_box_mm['type_map']

            checkbox_group_mm = gr.CheckboxGroup(
                choices=check_box_mm['all'],
                value=check_box_mm['required'],
                label='Evaluation Dimension',
                interactive=True,
            )
            # Deduplicated, whitespace-stripped filter choices.
            unique_agents = sorted(
                table_mm['Agent'].drop_duplicates().str.strip().tolist()
            )
            unique_vlms = sorted(
                table_mm['VLMs'].drop_duplicates().str.strip().tolist()
            )
            agent_name_mm = gr.CheckboxGroup(
                choices=unique_agents,
                value=unique_agents,
                label='Agent',
                interactive=True,
            )
            vlm_name_mm = gr.CheckboxGroup(
                choices=unique_vlms,
                value=unique_vlms,
                label='VLMs',
                interactive=True,
            )

            headers_mm = [
                h for h in ['Rank'] + checkbox_group_mm.value
                if h in table_mm.columns
            ]
            data_component_mm = build_table_component(
                table_mm, headers_mm, type_map_mm
            )

            def filter_df_mm(fields, agents, vlms, *args):
                """Re-render the multi-modal table; empty selections mean 'all'."""
                df = table_mm.copy()
                # An empty checkbox selection is treated as "no filter".
                if not agents:
                    agents = df['Agent'].unique().tolist()
                if not vlms:
                    vlms = df['VLMs'].unique().tolist()
                df = df[
                    df['Agent'].isin(agents) & df['VLMs'].isin(vlms)
                ].copy()
                shown = [h for h in ['Rank'] + fields if h in df.columns]
                if not shown:
                    shown = ['Rank'] + check_box_mm['essential']
                return build_table_component(df, shown, type_map_mm)

            for _comp in (checkbox_group_mm, agent_name_mm, vlm_name_mm):
                _comp.change(
                    fn=filter_df_mm,
                    inputs=[checkbox_group_mm, agent_name_mm, vlm_name_mm],
                    outputs=data_component_mm,
                )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

if __name__ == '__main__':
    # Bind to all interfaces so the app is reachable from outside a container.
    demo.launch(server_name='0.0.0.0')