Spaces:
Running
Running
Yuxuan-Zhang-Dexter
commited on
Commit
·
6d4c755
1
Parent(s):
865dbef
update leaderboard with new agentic leaderboard layout
Browse files- app.py +162 -98
- assets/model_color.json +27 -27
- data_visualization.py +340 -20
- generate_normalized_cache.py +57 -0
- leaderboard_utils.py +81 -18
- rank_data_03_25_2025.json +94 -94
app.py
CHANGED
@@ -38,11 +38,11 @@ TIME_POINTS = {
|
|
38 |
}
|
39 |
|
40 |
# Load the initial JSON file with rank data
|
41 |
-
with open(TIME_POINTS["03/25/2025"], "r") as f:
|
42 |
rank_data = json.load(f)
|
43 |
|
44 |
# Load the model leaderboard data
|
45 |
-
with open("rank_single_model_03_25_2025.json", "r") as f:
|
46 |
model_rank_data = json.load(f)
|
47 |
|
48 |
# Add leaderboard state at the top level
|
@@ -72,17 +72,17 @@ leaderboard_state = {
|
|
72 |
|
73 |
|
74 |
# Load video links and news data
|
75 |
-
with open('assets/game_video_link.json', 'r') as f:
|
76 |
VIDEO_LINKS = json.load(f)
|
77 |
|
78 |
-
with open('assets/news.json', 'r') as f:
|
79 |
NEWS_DATA = json.load(f)
|
80 |
|
81 |
def load_rank_data(time_point):
|
82 |
"""Load rank data for a specific time point"""
|
83 |
if time_point in TIME_POINTS:
|
84 |
try:
|
85 |
-
with open(TIME_POINTS[time_point], "r") as f:
|
86 |
return json.load(f)
|
87 |
except FileNotFoundError:
|
88 |
return None
|
@@ -105,7 +105,7 @@ def prepare_dataframe_for_display(df, for_game=None):
|
|
105 |
|
106 |
# Replace '_' with '-' for better display
|
107 |
for col in display_df.columns:
|
108 |
-
if col.endswith(' Score'):
|
109 |
display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
|
110 |
|
111 |
# If we're in detailed view, sort by score
|
@@ -120,36 +120,47 @@ def prepare_dataframe_for_display(df, for_game=None):
|
|
120 |
# Filter out models that didn't participate
|
121 |
display_df = display_df[~display_df[score_col].isna()]
|
122 |
else:
|
123 |
-
# For overall view, sort by average
|
124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
125 |
|
126 |
-
#
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
|
132 |
-
|
133 |
-
# Calculate average of the game scores (use mean of ranks from utils for actual ranking logic if different)
|
134 |
-
# For display sorting, let's use a simple average of available scores.
|
135 |
-
# The actual ranking for 'Average Rank' in leaderboard_utils uses mean of ranks, which is more robust.
|
136 |
-
# Here we just need a consistent sort order.
|
137 |
-
|
138 |
-
# Create a temporary column for sorting
|
139 |
-
temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
|
140 |
-
|
141 |
-
# Sort by this temporary average score (higher is better for scores)
|
142 |
-
# and then by Player name as a tie-breaker
|
143 |
-
display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
|
144 |
|
145 |
# Add line breaks to column headers
|
146 |
new_columns = {}
|
147 |
for col in display_df.columns:
|
148 |
-
if col.endswith(' Score'):
|
149 |
# Replace 'Game Name Score' with 'Game Name\nScore'
|
150 |
game_name = col.replace(' Score', '')
|
151 |
new_col = f"{game_name}\nScore"
|
152 |
new_columns[col] = new_col
|
|
|
|
|
|
|
153 |
|
154 |
# Rename columns with new line breaks
|
155 |
if new_columns:
|
@@ -164,8 +175,14 @@ def update_df_with_height(df):
|
|
164 |
col_widths = ["40px"] # Row number column width
|
165 |
col_widths.append("230px") # Player column - reduced by 20px
|
166 |
col_widths.append("120px") # Organization column
|
|
|
|
|
|
|
|
|
|
|
167 |
# Add game score columns
|
168 |
-
|
|
|
169 |
col_widths.append("120px")
|
170 |
|
171 |
return gr.update(value=df,
|
@@ -184,7 +201,7 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
|
|
184 |
# tetris_overall, tetris_details, # Commented out
|
185 |
tetris_plan_overall, tetris_plan_details,
|
186 |
ace_attorney_overall, ace_attorney_details,
|
187 |
-
top_n=
|
188 |
data_source=None):
|
189 |
global leaderboard_state
|
190 |
|
@@ -304,21 +321,22 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
|
|
304 |
|
305 |
# Get the appropriate DataFrame and charts based on current state
|
306 |
if leaderboard_state["current_game"]:
|
307 |
-
# For detailed view
|
|
|
308 |
# if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
|
309 |
# df = get_mario_leaderboard(data)
|
310 |
if leaderboard_state["current_game"] == "Super Mario Bros":
|
311 |
-
df = get_mario_planning_leaderboard(data)
|
312 |
elif leaderboard_state["current_game"] == "Sokoban":
|
313 |
-
df = get_sokoban_leaderboard(data)
|
314 |
elif leaderboard_state["current_game"] == "2048":
|
315 |
-
df = get_2048_leaderboard(data)
|
316 |
elif leaderboard_state["current_game"] == "Candy Crush":
|
317 |
-
df = get_candy_leaderboard(data)
|
318 |
elif leaderboard_state["current_game"] == "Tetris":
|
319 |
-
df = get_tetris_planning_leaderboard(data)
|
320 |
elif leaderboard_state["current_game"] == "Ace Attorney":
|
321 |
-
df = get_ace_attorney_leaderboard(data)
|
322 |
else: # Should not happen if current_game is one of the known games
|
323 |
df = pd.DataFrame() # Empty df
|
324 |
|
@@ -327,10 +345,12 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
|
|
327 |
radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
|
328 |
group_bar_chart = chart
|
329 |
else:
|
330 |
-
# For overall view
|
331 |
-
|
|
|
332 |
display_df = prepare_dataframe_for_display(df)
|
333 |
-
|
|
|
334 |
chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
|
335 |
|
336 |
# Return values, including all four plot placeholders
|
@@ -405,7 +425,7 @@ def get_initial_state():
|
|
405 |
}
|
406 |
}
|
407 |
|
408 |
-
def clear_filters(top_n=
|
409 |
global leaderboard_state
|
410 |
|
411 |
# Use provided data source or default to rank_data
|
@@ -420,9 +440,12 @@ def clear_filters(top_n=10, data_source=None):
|
|
420 |
"Ace Attorney": True
|
421 |
}
|
422 |
|
423 |
-
|
|
|
|
|
424 |
display_df = prepare_dataframe_for_display(df)
|
425 |
-
|
|
|
426 |
|
427 |
leaderboard_state = get_initial_state()
|
428 |
|
@@ -675,9 +698,18 @@ def build_app():
|
|
675 |
max-width: 140px !important;
|
676 |
}
|
677 |
|
678 |
-
/*
|
679 |
-
.table-container th:nth-child(
|
680 |
-
.table-container td:nth-child(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
681 |
width: 120px !important;
|
682 |
min-width: 100px !important;
|
683 |
max-width: 140px !important;
|
@@ -743,6 +775,27 @@ def build_app():
|
|
743 |
width: 100% !important;
|
744 |
margin-top: 40px !important;
|
745 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
746 |
""") as demo:
|
747 |
gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
|
748 |
|
@@ -875,6 +928,14 @@ def build_app():
|
|
875 |
with gr.Tabs():
|
876 |
with gr.Tab("🏆 Agent Leaderboard"):
|
877 |
# Visualization section
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
878 |
with gr.Row():
|
879 |
gr.Markdown("### 📊 Data Visualization")
|
880 |
|
@@ -884,6 +945,19 @@ def build_app():
|
|
884 |
visible=False,
|
885 |
elem_classes="visualization-container"
|
886 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
887 |
|
888 |
with gr.Column(visible=True) as overall_visualizations:
|
889 |
with gr.Tabs():
|
@@ -894,45 +968,32 @@ def build_app():
|
|
894 |
elem_classes="visualization-container"
|
895 |
)
|
896 |
gr.Markdown(
|
897 |
-
"*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison
|
898 |
elem_classes="radar-tip"
|
899 |
)
|
900 |
-
# Comment out the Group Bar Chart tab
|
901 |
with gr.Tab("📊 Group Bar Chart"):
|
902 |
-
with gr.Row():
|
903 |
-
# Calculate dynamic maximum based on total models
|
904 |
-
agent_max_models = get_total_model_count(rank_data)
|
905 |
-
top_n_slider = gr.Slider(
|
906 |
-
minimum=1,
|
907 |
-
maximum=agent_max_models,
|
908 |
-
step=1,
|
909 |
-
value=min(10, agent_max_models),
|
910 |
-
label=f"Number of Top Models to Display (max: {agent_max_models})",
|
911 |
-
elem_classes="top-n-slider"
|
912 |
-
)
|
913 |
group_bar_visualization = gr.Plot(
|
914 |
label="Comparative Analysis (Group Bar Chart)",
|
915 |
elem_classes="visualization-container"
|
916 |
)
|
917 |
gr.Markdown(
|
918 |
-
"*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison
|
919 |
elem_classes="radar-tip"
|
920 |
)
|
921 |
-
|
922 |
|
923 |
# Hidden placeholder for group bar visualization (to maintain code references)
|
924 |
# group_bar_visualization = gr.Plot(visible=False)
|
925 |
|
926 |
# Game selection section
|
927 |
with gr.Row():
|
928 |
-
gr.Markdown("###
|
929 |
with gr.Row():
|
930 |
# with gr.Column(): # Commented out Super Mario BrosUI
|
931 |
# gr.Markdown("**🎮 Super Mario Bros**")
|
932 |
# mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
|
933 |
# mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
|
934 |
with gr.Column(): # Added Super Mario BrosUI
|
935 |
-
gr.Markdown("
|
936 |
mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
|
937 |
mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
|
938 |
with gr.Column(): # Sokoban is now after mario_plan
|
@@ -972,12 +1033,16 @@ def build_app():
|
|
972 |
# Leaderboard table
|
973 |
with gr.Row():
|
974 |
gr.Markdown("### 📋 Detailed Results")
|
|
|
|
|
|
|
|
|
975 |
|
976 |
# Add reference to Jupyter notebook
|
977 |
with gr.Row():
|
978 |
gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
|
979 |
|
980 |
-
# Get initial leaderboard dataframe
|
981 |
initial_df = get_combined_leaderboard(rank_data, {
|
982 |
# "Super Mario Bros": True, # Commented out
|
983 |
"Super Mario Bros": True,
|
@@ -987,7 +1052,7 @@ def build_app():
|
|
987 |
# "Tetris(complete)": True, # Commented out
|
988 |
"Tetris": True,
|
989 |
"Ace Attorney": True
|
990 |
-
})
|
991 |
|
992 |
# Format the DataFrame for display
|
993 |
initial_display_df = prepare_dataframe_for_display(initial_df)
|
@@ -996,8 +1061,14 @@ def build_app():
|
|
996 |
col_widths = ["40px"] # Row number column width
|
997 |
col_widths.append("230px") # Player column - reduced by 20px
|
998 |
col_widths.append("120px") # Organization column
|
|
|
|
|
|
|
|
|
|
|
999 |
# Add game score columns
|
1000 |
-
|
|
|
1001 |
col_widths.append("120px")
|
1002 |
|
1003 |
# Create a standard DataFrame component with enhanced styling
|
@@ -1062,8 +1133,8 @@ def build_app():
|
|
1062 |
# Update leaderboard and visualizations when checkboxes change
|
1063 |
for checkbox in checkbox_list:
|
1064 |
checkbox.change(
|
1065 |
-
lambda *args: update_leaderboard(*args, data_source=rank_data),
|
1066 |
-
inputs=checkbox_list
|
1067 |
outputs=[
|
1068 |
leaderboard_df,
|
1069 |
detailed_visualization,
|
@@ -1072,22 +1143,10 @@ def build_app():
|
|
1072 |
] + checkbox_list
|
1073 |
)
|
1074 |
|
1075 |
-
# Update when top_n_slider changes
|
1076 |
-
top_n_slider.change(
|
1077 |
-
lambda *args: update_leaderboard(*args, data_source=rank_data),
|
1078 |
-
inputs=checkbox_list + [top_n_slider],
|
1079 |
-
outputs=[
|
1080 |
-
leaderboard_df,
|
1081 |
-
detailed_visualization,
|
1082 |
-
radar_visualization,
|
1083 |
-
group_bar_visualization
|
1084 |
-
] + checkbox_list
|
1085 |
-
)
|
1086 |
-
|
1087 |
# Update when clear button is clicked
|
1088 |
clear_btn.click(
|
1089 |
-
lambda
|
1090 |
-
inputs=[
|
1091 |
outputs=[
|
1092 |
leaderboard_df,
|
1093 |
detailed_visualization,
|
@@ -1096,7 +1155,7 @@ def build_app():
|
|
1096 |
] + checkbox_list
|
1097 |
)
|
1098 |
|
1099 |
-
# Initialize the
|
1100 |
demo.load(
|
1101 |
lambda: clear_filters(data_source=rank_data),
|
1102 |
inputs=[],
|
@@ -1119,6 +1178,20 @@ def build_app():
|
|
1119 |
visible=False,
|
1120 |
elem_classes="visualization-container"
|
1121 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1122 |
|
1123 |
with gr.Column(visible=True) as model_overall_visualizations:
|
1124 |
with gr.Tabs():
|
@@ -1132,17 +1205,6 @@ def build_app():
|
|
1132 |
elem_classes="radar-tip"
|
1133 |
)
|
1134 |
with gr.Tab("📊 Group Bar Chart"):
|
1135 |
-
with gr.Row():
|
1136 |
-
# Calculate dynamic maximum based on total models
|
1137 |
-
model_max_models = get_total_model_count(model_rank_data)
|
1138 |
-
model_top_n_slider = gr.Slider(
|
1139 |
-
minimum=1,
|
1140 |
-
maximum=model_max_models,
|
1141 |
-
step=1,
|
1142 |
-
value=min(10, model_max_models),
|
1143 |
-
label=f"Number of Top Models to Display (max: {model_max_models})",
|
1144 |
-
elem_classes="top-n-slider"
|
1145 |
-
)
|
1146 |
model_group_bar_visualization = gr.Plot(
|
1147 |
label="Comparative Analysis (Group Bar Chart)",
|
1148 |
elem_classes="visualization-container"
|
@@ -1154,10 +1216,10 @@ def build_app():
|
|
1154 |
|
1155 |
# Game selection section
|
1156 |
with gr.Row():
|
1157 |
-
gr.Markdown("###
|
1158 |
with gr.Row():
|
1159 |
with gr.Column():
|
1160 |
-
gr.Markdown("
|
1161 |
model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
|
1162 |
model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
|
1163 |
with gr.Column():
|
@@ -1193,8 +1255,10 @@ def build_app():
|
|
1193 |
# Leaderboard table
|
1194 |
with gr.Row():
|
1195 |
gr.Markdown("### 📋 Detailed Results")
|
|
|
|
|
1196 |
|
1197 |
-
# Get initial leaderboard dataframe
|
1198 |
model_initial_df = get_combined_leaderboard(model_rank_data, {
|
1199 |
"Super Mario Bros": True,
|
1200 |
"Sokoban": True,
|
@@ -1202,7 +1266,7 @@ def build_app():
|
|
1202 |
"Candy Crush": True,
|
1203 |
"Tetris": True,
|
1204 |
"Ace Attorney": True
|
1205 |
-
})
|
1206 |
|
1207 |
# Format the DataFrame for display
|
1208 |
model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
|
@@ -1300,7 +1364,7 @@ def build_app():
|
|
1300 |
] + model_checkbox_list
|
1301 |
)
|
1302 |
|
1303 |
-
# Initialize the model leaderboard
|
1304 |
demo.load(
|
1305 |
lambda: clear_filters(data_source=model_rank_data),
|
1306 |
inputs=[],
|
|
|
38 |
}
|
39 |
|
40 |
# Load the initial JSON file with rank data
|
41 |
+
with open(TIME_POINTS["03/25/2025"], "r", encoding='utf-8') as f:
|
42 |
rank_data = json.load(f)
|
43 |
|
44 |
# Load the model leaderboard data
|
45 |
+
with open("rank_single_model_03_25_2025.json", "r", encoding='utf-8') as f:
|
46 |
model_rank_data = json.load(f)
|
47 |
|
48 |
# Add leaderboard state at the top level
|
|
|
72 |
|
73 |
|
74 |
# Load video links and news data
|
75 |
+
with open('assets/game_video_link.json', 'r', encoding='utf-8') as f:
|
76 |
VIDEO_LINKS = json.load(f)
|
77 |
|
78 |
+
with open('assets/news.json', 'r', encoding='utf-8') as f:
|
79 |
NEWS_DATA = json.load(f)
|
80 |
|
81 |
def load_rank_data(time_point):
|
82 |
"""Load rank data for a specific time point"""
|
83 |
if time_point in TIME_POINTS:
|
84 |
try:
|
85 |
+
with open(TIME_POINTS[time_point], "r", encoding='utf-8') as f:
|
86 |
return json.load(f)
|
87 |
except FileNotFoundError:
|
88 |
return None
|
|
|
105 |
|
106 |
# Replace '_' with '-' for better display
|
107 |
for col in display_df.columns:
|
108 |
+
if col.endswith(' Score') and col != 'Avg Normalized Score':
|
109 |
display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
|
110 |
|
111 |
# If we're in detailed view, sort by score
|
|
|
120 |
# Filter out models that didn't participate
|
121 |
display_df = display_df[~display_df[score_col].isna()]
|
122 |
else:
|
123 |
+
# For overall view, sort by average normalized score if available, otherwise fallback to average scores
|
124 |
+
if 'Avg Normalized Score' in display_df.columns:
|
125 |
+
# Sort by average normalized score (already calculated in leaderboard_utils)
|
126 |
+
display_df = display_df.sort_values(by='Avg Normalized Score', ascending=False)
|
127 |
+
else:
|
128 |
+
# Calculate an internal sorting key based on average scores, but don't add it to the display_df
|
129 |
+
score_cols = [col for col in display_df.columns if col.endswith(' Score')]
|
130 |
+
if score_cols:
|
131 |
+
temp_sort_df = display_df.copy()
|
132 |
+
for col in score_cols:
|
133 |
+
temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
|
134 |
+
|
135 |
+
# Create a temporary column for sorting
|
136 |
+
temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
|
137 |
+
|
138 |
+
# Sort by this temporary average score (higher is better for scores)
|
139 |
+
# and then by Player name as a tie-breaker
|
140 |
+
display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
|
141 |
+
|
142 |
+
# Add medal emojis for top 3 performers
|
143 |
+
if len(display_df) > 0 and 'Player' in display_df.columns:
|
144 |
+
# Reset index to get proper ranking after sorting
|
145 |
+
display_df = display_df.reset_index(drop=True)
|
146 |
|
147 |
+
# Add medal emojis to Player names for top 3
|
148 |
+
medal_emojis = ['🥇', '🥈', '🥉']
|
149 |
+
for i in range(min(3, len(display_df))):
|
150 |
+
original_name = display_df.loc[i, 'Player']
|
151 |
+
display_df.loc[i, 'Player'] = f"{medal_emojis[i]} {original_name}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
|
153 |
# Add line breaks to column headers
|
154 |
new_columns = {}
|
155 |
for col in display_df.columns:
|
156 |
+
if col.endswith(' Score') and col != 'Avg Normalized Score':
|
157 |
# Replace 'Game Name Score' with 'Game Name\nScore'
|
158 |
game_name = col.replace(' Score', '')
|
159 |
new_col = f"{game_name}\nScore"
|
160 |
new_columns[col] = new_col
|
161 |
+
elif col == 'Avg Normalized Score':
|
162 |
+
# Add line break to Avg Normalized Score column
|
163 |
+
new_columns[col] = "Avg Normalized\nScore"
|
164 |
|
165 |
# Rename columns with new line breaks
|
166 |
if new_columns:
|
|
|
175 |
col_widths = ["40px"] # Row number column width
|
176 |
col_widths.append("230px") # Player column - reduced by 20px
|
177 |
col_widths.append("120px") # Organization column
|
178 |
+
|
179 |
+
# Check if there's an Avg Normalized Score column
|
180 |
+
if any('Avg Normalized' in col for col in df.columns):
|
181 |
+
col_widths.append("140px") # Avg Normalized Score column - slightly wider
|
182 |
+
|
183 |
# Add game score columns
|
184 |
+
remaining_cols = len(df.columns) - len(col_widths) + 1 # +1 because we subtracted row number column
|
185 |
+
for _ in range(remaining_cols):
|
186 |
col_widths.append("120px")
|
187 |
|
188 |
return gr.update(value=df,
|
|
|
201 |
# tetris_overall, tetris_details, # Commented out
|
202 |
tetris_plan_overall, tetris_plan_details,
|
203 |
ace_attorney_overall, ace_attorney_details,
|
204 |
+
top_n=3,
|
205 |
data_source=None):
|
206 |
global leaderboard_state
|
207 |
|
|
|
321 |
|
322 |
# Get the appropriate DataFrame and charts based on current state
|
323 |
if leaderboard_state["current_game"]:
|
324 |
+
# For detailed view - use slider value for both leaderboards
|
325 |
+
limit = top_n
|
326 |
# if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
|
327 |
# df = get_mario_leaderboard(data)
|
328 |
if leaderboard_state["current_game"] == "Super Mario Bros":
|
329 |
+
df = get_mario_planning_leaderboard(data, limit)
|
330 |
elif leaderboard_state["current_game"] == "Sokoban":
|
331 |
+
df = get_sokoban_leaderboard(data, limit)
|
332 |
elif leaderboard_state["current_game"] == "2048":
|
333 |
+
df = get_2048_leaderboard(data, limit)
|
334 |
elif leaderboard_state["current_game"] == "Candy Crush":
|
335 |
+
df = get_candy_leaderboard(data, limit)
|
336 |
elif leaderboard_state["current_game"] == "Tetris":
|
337 |
+
df = get_tetris_planning_leaderboard(data, limit)
|
338 |
elif leaderboard_state["current_game"] == "Ace Attorney":
|
339 |
+
df = get_ace_attorney_leaderboard(data, limit)
|
340 |
else: # Should not happen if current_game is one of the known games
|
341 |
df = pd.DataFrame() # Empty df
|
342 |
|
|
|
345 |
radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
|
346 |
group_bar_chart = chart
|
347 |
else:
|
348 |
+
# For overall view - use slider value for both leaderboards
|
349 |
+
limit = top_n
|
350 |
+
df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n, limit)
|
351 |
display_df = prepare_dataframe_for_display(df)
|
352 |
+
# Pass appropriate title and top_n based on data source
|
353 |
+
_, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games, limit_to_top_n=limit, top_n=top_n)
|
354 |
chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
|
355 |
|
356 |
# Return values, including all four plot placeholders
|
|
|
425 |
}
|
426 |
}
|
427 |
|
428 |
+
def clear_filters(top_n=3, data_source=None):
|
429 |
global leaderboard_state
|
430 |
|
431 |
# Use provided data source or default to rank_data
|
|
|
440 |
"Ace Attorney": True
|
441 |
}
|
442 |
|
443 |
+
# Use slider value for both leaderboards
|
444 |
+
limit = top_n
|
445 |
+
df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n, limit)
|
446 |
display_df = prepare_dataframe_for_display(df)
|
447 |
+
# Pass top_n parameter for consistent titles
|
448 |
+
_, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games, limit_to_top_n=limit, top_n=top_n)
|
449 |
|
450 |
leaderboard_state = get_initial_state()
|
451 |
|
|
|
698 |
max-width: 140px !important;
|
699 |
}
|
700 |
|
701 |
+
/* Avg Normalized Score column (4th column) */
|
702 |
+
.table-container th:nth-child(4),
|
703 |
+
.table-container td:nth-child(4) {
|
704 |
+
width: 140px !important;
|
705 |
+
min-width: 120px !important;
|
706 |
+
max-width: 160px !important;
|
707 |
+
text-align: center !important;
|
708 |
+
}
|
709 |
+
|
710 |
+
/* Game score columns (5th column onwards) */
|
711 |
+
.table-container th:nth-child(n+5),
|
712 |
+
.table-container td:nth-child(n+5) {
|
713 |
width: 120px !important;
|
714 |
min-width: 100px !important;
|
715 |
max-width: 140px !important;
|
|
|
775 |
width: 100% !important;
|
776 |
margin-top: 40px !important;
|
777 |
}
|
778 |
+
|
779 |
+
.welcome-message {
|
780 |
+
background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
|
781 |
+
color: #333;
|
782 |
+
padding: 20px;
|
783 |
+
border-radius: 10px;
|
784 |
+
margin: 20px 0;
|
785 |
+
text-align: center;
|
786 |
+
box-shadow: 0 4px 15px rgba(0,0,0,0.05);
|
787 |
+
}
|
788 |
+
|
789 |
+
.welcome-message h3 {
|
790 |
+
margin: 0 0 10px 0;
|
791 |
+
font-size: 1.3em;
|
792 |
+
}
|
793 |
+
|
794 |
+
.welcome-message p {
|
795 |
+
margin: 0;
|
796 |
+
font-size: 1.1em;
|
797 |
+
line-height: 1.5;
|
798 |
+
}
|
799 |
""") as demo:
|
800 |
gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
|
801 |
|
|
|
928 |
with gr.Tabs():
|
929 |
with gr.Tab("🏆 Agent Leaderboard"):
|
930 |
# Visualization section
|
931 |
+
|
932 |
+
with gr.Row():
|
933 |
+
gr.Markdown("""
|
934 |
+
**🎮 Welcome to LMGame Bench!**
|
935 |
+
|
936 |
+
We welcome everyone to implement their own gaming agents by replacing our baseAgent in `customer_runner.py` and test them on our benchmark. Join the competition and see how your agent performs!
|
937 |
+
""", elem_classes="welcome-message")
|
938 |
+
|
939 |
with gr.Row():
|
940 |
gr.Markdown("### 📊 Data Visualization")
|
941 |
|
|
|
945 |
visible=False,
|
946 |
elem_classes="visualization-container"
|
947 |
)
|
948 |
+
# with gr.Row():
|
949 |
+
# # Calculate dynamic maximum based on total models
|
950 |
+
# agent_max_models = get_total_model_count(rank_data)
|
951 |
+
# top_n_slider = gr.Slider(
|
952 |
+
# minimum=1,
|
953 |
+
# maximum=agent_max_models,
|
954 |
+
# step=1,
|
955 |
+
# value=min(3, agent_max_models),
|
956 |
+
# label=f"Number of Top Models to Display in All Views (max: {agent_max_models})",
|
957 |
+
# elem_classes="top-n-slider"
|
958 |
+
# )
|
959 |
+
|
960 |
+
|
961 |
|
962 |
with gr.Column(visible=True) as overall_visualizations:
|
963 |
with gr.Tabs():
|
|
|
968 |
elem_classes="visualization-container"
|
969 |
)
|
970 |
gr.Markdown(
|
971 |
+
"*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*⚔️ - Model with our gaming agent*",
|
972 |
elem_classes="radar-tip"
|
973 |
)
|
|
|
974 |
with gr.Tab("📊 Group Bar Chart"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
975 |
group_bar_visualization = gr.Plot(
|
976 |
label="Comparative Analysis (Group Bar Chart)",
|
977 |
elem_classes="visualization-container"
|
978 |
)
|
979 |
gr.Markdown(
|
980 |
+
"*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*⚔️ - Model with our gaming agent*",
|
981 |
elem_classes="radar-tip"
|
982 |
)
|
|
|
983 |
|
984 |
# Hidden placeholder for group bar visualization (to maintain code references)
|
985 |
# group_bar_visualization = gr.Plot(visible=False)
|
986 |
|
987 |
# Game selection section
|
988 |
with gr.Row():
|
989 |
+
gr.Markdown("### 🕹️ Game Selection")
|
990 |
with gr.Row():
|
991 |
# with gr.Column(): # Commented out Super Mario BrosUI
|
992 |
# gr.Markdown("**🎮 Super Mario Bros**")
|
993 |
# mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
|
994 |
# mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
|
995 |
with gr.Column(): # Added Super Mario BrosUI
|
996 |
+
gr.Markdown("**🍄 Super Mario Bros**")
|
997 |
mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
|
998 |
mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
|
999 |
with gr.Column(): # Sokoban is now after mario_plan
|
|
|
1033 |
# Leaderboard table
|
1034 |
with gr.Row():
|
1035 |
gr.Markdown("### 📋 Detailed Results")
|
1036 |
+
with gr.Row():
|
1037 |
+
gr.Markdown("*⚔️ - Model with our gaming agent*", elem_classes="radar-tip")
|
1038 |
+
|
1039 |
+
# Welcome message for custom gaming agents
|
1040 |
|
1041 |
# Add reference to Jupyter notebook
|
1042 |
with gr.Row():
|
1043 |
gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
|
1044 |
|
1045 |
+
# Get initial leaderboard dataframe (limited by default slider value for agent leaderboard)
|
1046 |
initial_df = get_combined_leaderboard(rank_data, {
|
1047 |
# "Super Mario Bros": True, # Commented out
|
1048 |
"Super Mario Bros": True,
|
|
|
1052 |
# "Tetris(complete)": True, # Commented out
|
1053 |
"Tetris": True,
|
1054 |
"Ace Attorney": True
|
1055 |
+
}, limit_to_top_n=min(3, get_total_model_count(rank_data)))
|
1056 |
|
1057 |
# Format the DataFrame for display
|
1058 |
initial_display_df = prepare_dataframe_for_display(initial_df)
|
|
|
1061 |
col_widths = ["40px"] # Row number column width
|
1062 |
col_widths.append("230px") # Player column - reduced by 20px
|
1063 |
col_widths.append("120px") # Organization column
|
1064 |
+
|
1065 |
+
# Check if there's an Avg Normalized Score column
|
1066 |
+
if any('Avg Normalized' in col for col in initial_display_df.columns):
|
1067 |
+
col_widths.append("140px") # Avg Normalized Score column - slightly wider
|
1068 |
+
|
1069 |
# Add game score columns
|
1070 |
+
remaining_cols = len(initial_display_df.columns) - len(col_widths) + 1 # +1 because we subtracted row number column
|
1071 |
+
for _ in range(remaining_cols):
|
1072 |
col_widths.append("120px")
|
1073 |
|
1074 |
# Create a standard DataFrame component with enhanced styling
|
|
|
1133 |
# Update leaderboard and visualizations when checkboxes change
|
1134 |
for checkbox in checkbox_list:
|
1135 |
checkbox.change(
|
1136 |
+
lambda *args: update_leaderboard(*args, top_n=3, data_source=rank_data),
|
1137 |
+
inputs=checkbox_list,
|
1138 |
outputs=[
|
1139 |
leaderboard_df,
|
1140 |
detailed_visualization,
|
|
|
1143 |
] + checkbox_list
|
1144 |
)
|
1145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1146 |
# Update when clear button is clicked
|
1147 |
clear_btn.click(
|
1148 |
+
lambda: clear_filters(top_n=3, data_source=rank_data),
|
1149 |
+
inputs=[],
|
1150 |
outputs=[
|
1151 |
leaderboard_df,
|
1152 |
detailed_visualization,
|
|
|
1155 |
] + checkbox_list
|
1156 |
)
|
1157 |
|
1158 |
+
# Initialize the agent leaderboard (with top 5 limit)
|
1159 |
demo.load(
|
1160 |
lambda: clear_filters(data_source=rank_data),
|
1161 |
inputs=[],
|
|
|
1178 |
visible=False,
|
1179 |
elem_classes="visualization-container"
|
1180 |
)
|
1181 |
+
|
1182 |
+
with gr.Row():
|
1183 |
+
# Calculate dynamic maximum based on total models
|
1184 |
+
model_max_models = get_total_model_count(model_rank_data)
|
1185 |
+
model_top_n_slider = gr.Slider(
|
1186 |
+
minimum=1,
|
1187 |
+
maximum=model_max_models,
|
1188 |
+
step=1,
|
1189 |
+
value=min(5, model_max_models),
|
1190 |
+
label=f"Number of Top Models to Display in All Views (max: {model_max_models})",
|
1191 |
+
elem_classes="top-n-slider"
|
1192 |
+
)
|
1193 |
+
|
1194 |
+
|
1195 |
|
1196 |
with gr.Column(visible=True) as model_overall_visualizations:
|
1197 |
with gr.Tabs():
|
|
|
1205 |
elem_classes="radar-tip"
|
1206 |
)
|
1207 |
with gr.Tab("📊 Group Bar Chart"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1208 |
model_group_bar_visualization = gr.Plot(
|
1209 |
label="Comparative Analysis (Group Bar Chart)",
|
1210 |
elem_classes="visualization-container"
|
|
|
1216 |
|
1217 |
# Game selection section
|
1218 |
with gr.Row():
|
1219 |
+
gr.Markdown("### 🕹️ Game Selection")
|
1220 |
with gr.Row():
|
1221 |
with gr.Column():
|
1222 |
+
gr.Markdown("**🍄 Super Mario Bros**")
|
1223 |
model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
|
1224 |
model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
|
1225 |
with gr.Column():
|
|
|
1255 |
# Leaderboard table
|
1256 |
with gr.Row():
|
1257 |
gr.Markdown("### 📋 Detailed Results")
|
1258 |
+
with gr.Row():
|
1259 |
+
gr.Markdown("*💡 The slider above controls how many top models are shown in the radar chart, bar chart, and data table.*", elem_classes="radar-tip")
|
1260 |
|
1261 |
+
# Get initial leaderboard dataframe (limited by default slider value for model leaderboard)
|
1262 |
model_initial_df = get_combined_leaderboard(model_rank_data, {
|
1263 |
"Super Mario Bros": True,
|
1264 |
"Sokoban": True,
|
|
|
1266 |
"Candy Crush": True,
|
1267 |
"Tetris": True,
|
1268 |
"Ace Attorney": True
|
1269 |
+
}, limit_to_top_n=min(5, get_total_model_count(model_rank_data)))
|
1270 |
|
1271 |
# Format the DataFrame for display
|
1272 |
model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
|
|
|
1364 |
] + model_checkbox_list
|
1365 |
)
|
1366 |
|
1367 |
+
# Initialize the model leaderboard (with default slider limit)
|
1368 |
demo.load(
|
1369 |
lambda: clear_filters(data_source=model_rank_data),
|
1370 |
inputs=[],
|
assets/model_color.json
CHANGED
@@ -27,31 +27,31 @@
|
|
27 |
"llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
|
28 |
"qwen3-235B-A22B-fp8": "#6A1B9A",
|
29 |
"random (x30)": "#9E9E9E",
|
30 |
-
"
|
31 |
-
"
|
32 |
-
"
|
33 |
-
"
|
34 |
-
"
|
35 |
-
"
|
36 |
-
"
|
37 |
-
"
|
38 |
-
"
|
39 |
-
"
|
40 |
-
"
|
41 |
-
"
|
42 |
-
"
|
43 |
-
"
|
44 |
-
"
|
45 |
-
"
|
46 |
-
"
|
47 |
-
"
|
48 |
-
"
|
49 |
-
"
|
50 |
-
"
|
51 |
-
"
|
52 |
-
"
|
53 |
-
"
|
54 |
-
"
|
55 |
-
"
|
56 |
-
"
|
57 |
}
|
|
|
27 |
"llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
|
28 |
"qwen3-235B-A22B-fp8": "#6A1B9A",
|
29 |
"random (x30)": "#9E9E9E",
|
30 |
+
"claude-3-7-sonnet-20250219 (⚔️)": "#4A90E2",
|
31 |
+
"claude-3-5-haiku-20241022 (⚔️)": "#7FB5E6",
|
32 |
+
"claude-3-5-sonnet-20241022 (⚔️)": "#1A4C7C",
|
33 |
+
"claude-opus-4-20250514 (⚔️)": "#3A80D2",
|
34 |
+
"claude-sonnet-4-20250514 (⚔️)": "#5A9FE2",
|
35 |
+
"gemini-2.0-flash (⚔️)": "#FF4081",
|
36 |
+
"gemini-2.0-flash-thinking-exp-1219 (⚔️)": "#C2185B",
|
37 |
+
"gemini-2.5-pro-exp-03-25 (⚔️)": "#FF80AB",
|
38 |
+
"gemini-2.5-flash-preview-04-17 (⚔️)": "#F06292",
|
39 |
+
"gemini-2.5-flash-preview-05-20 (⚔️)": "#F8BBD9",
|
40 |
+
"gemini-2.5-pro-preview-05-06 (⚔️)": "#AD1457",
|
41 |
+
"gemini-2.5-pro-preview-06-05 (⚔️)": "#EC407A",
|
42 |
+
"gpt-4o-2024-11-20 (⚔️)": "#00BFA5",
|
43 |
+
"gpt-4.5-preview-2025-02-27 (⚔️)": "#00796B",
|
44 |
+
"gpt-4.1-2025-04-14 (⚔️)": "#00897B",
|
45 |
+
"o1-2024-12-17 (⚔️)": "#4DB6AC",
|
46 |
+
"o1-mini-2024-09-12 (⚔️)": "#26A69A",
|
47 |
+
"o3-mini-2025-01-31(medium) (⚔️)": "#80CBC4",
|
48 |
+
"o3-2025-04-16 (⚔️)": "#26C6DA",
|
49 |
+
"o4-mini-2025-04-16 (⚔️)": "#00ACC1",
|
50 |
+
"grok-3-beta (⚔️)": "#FF7043",
|
51 |
+
"grok-3-mini-beta (⚔️)": "#FF8A65",
|
52 |
+
"deepseek-v3 (⚔️)": "#FFC107",
|
53 |
+
"deepseek-r1-0120 (⚔️)": "#FFA000",
|
54 |
+
"deepseek-r1-0528 (⚔️)": "#FFB300",
|
55 |
+
"llama-4-maverick-17b-128e-instruct-fp8 (⚔️)": "#8E24AA",
|
56 |
+
"qwen3-235B-A22B-fp8 (⚔️)": "#6A1B9A"
|
57 |
}
|
data_visualization.py
CHANGED
@@ -2,13 +2,15 @@ import plotly.graph_objects as go
|
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
import json
|
|
|
|
|
5 |
from leaderboard_utils import (
|
6 |
get_combined_leaderboard,
|
7 |
GAME_ORDER
|
8 |
)
|
9 |
|
10 |
# Load model colors
|
11 |
-
with open('assets/model_color.json', 'r') as f:
|
12 |
MODEL_COLORS = json.load(f)
|
13 |
|
14 |
GAME_SCORE_COLUMNS = {
|
@@ -126,7 +128,7 @@ def create_radar_charts(df):
|
|
126 |
categories = [c.replace(" Score", "") for c in game_cols]
|
127 |
|
128 |
for col in game_cols:
|
129 |
-
vals = df[col].replace("n/a", 0).astype(float)
|
130 |
mean, std = vals.mean(), vals.std()
|
131 |
df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
132 |
|
@@ -179,7 +181,7 @@ def get_combined_leaderboard_with_radar(rank_data, selected_games):
|
|
179 |
df_viz = df.copy()
|
180 |
return df, create_radar_charts(df_viz)
|
181 |
|
182 |
-
def create_group_bar_chart(df, top_n=
|
183 |
game_cols = {}
|
184 |
for game in GAME_ORDER:
|
185 |
col = f"{game} Score"
|
@@ -330,8 +332,8 @@ def create_group_bar_chart(df, top_n=10):
|
|
330 |
|
331 |
|
332 |
|
333 |
-
def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=
|
334 |
-
df = get_combined_leaderboard(rank_data, selected_games)
|
335 |
# Create a copy for visualization to avoid modifying the original
|
336 |
df_viz = df.copy()
|
337 |
return df, create_group_bar_chart(df_viz, top_n)
|
@@ -344,7 +346,7 @@ def hex_to_rgba(hex_color, alpha=0.2):
|
|
344 |
return f'rgba({r}, {g}, {b}, {alpha})'
|
345 |
|
346 |
|
347 |
-
def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
348 |
if selected_games is None:
|
349 |
selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
|
350 |
|
@@ -359,11 +361,25 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
359 |
game_cols = [f"{game} Score" for game in selected_games]
|
360 |
categories = formatted_games
|
361 |
|
362 |
-
#
|
|
|
|
|
|
|
|
|
363 |
for col in game_cols:
|
364 |
-
|
365 |
-
|
366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
|
368 |
# Group players by prefix and sort alphabetically
|
369 |
model_groups = {}
|
@@ -411,12 +427,23 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
411 |
hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
|
412 |
))
|
413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
414 |
fig.update_layout(
|
415 |
autosize=True,
|
416 |
height=550, # Reduced height for better proportion with legend
|
417 |
margin=dict(l=400, r=100, t=20, b=20),
|
418 |
title=dict(
|
419 |
-
text=
|
420 |
x=0.5,
|
421 |
xanchor='center',
|
422 |
yanchor='top',
|
@@ -462,12 +489,20 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
|
|
462 |
|
463 |
return fig
|
464 |
|
465 |
-
def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None):
|
466 |
-
|
|
|
|
|
|
|
|
|
|
|
467 |
selected_game_names = [g for g, sel in selected_games.items() if sel]
|
468 |
-
|
|
|
469 |
df_viz = df.copy()
|
470 |
-
|
|
|
|
|
471 |
|
472 |
def create_organization_radar_chart(rank_data):
|
473 |
df = get_combined_leaderboard(rank_data, {g: True for g in GAME_ORDER})
|
@@ -477,7 +512,7 @@ def create_organization_radar_chart(rank_data):
|
|
477 |
|
478 |
avg_df = pd.DataFrame([
|
479 |
{
|
480 |
-
**{col: df[df["Organization"] == org][col].
|
481 |
"Organization": org
|
482 |
}
|
483 |
for org in orgs
|
@@ -533,7 +568,10 @@ def create_top_players_radar_chart(rank_data, n=5):
|
|
533 |
|
534 |
for col in game_cols:
|
535 |
# Replace "n/a" with 0 and handle downcasting properly
|
536 |
-
|
|
|
|
|
|
|
537 |
mean, std = vals.mean(), vals.std()
|
538 |
top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
539 |
|
@@ -589,8 +627,15 @@ def create_player_radar_chart(rank_data, player_name):
|
|
589 |
|
590 |
for col in game_cols:
|
591 |
# Replace "n/a" with 0 and handle downcasting properly
|
592 |
-
|
593 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
594 |
player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
595 |
|
596 |
fig = go.Figure()
|
@@ -628,6 +673,281 @@ def create_player_radar_chart(rank_data, player_name):
|
|
628 |
)
|
629 |
return fig
|
630 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
631 |
|
632 |
def save_visualization(fig, filename):
|
633 |
-
fig.write_image(filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
import json
|
5 |
+
import os
|
6 |
+
from datetime import datetime
|
7 |
from leaderboard_utils import (
|
8 |
get_combined_leaderboard,
|
9 |
GAME_ORDER
|
10 |
)
|
11 |
|
12 |
# Load model colors
|
13 |
+
with open('assets/model_color.json', 'r', encoding='utf-8') as f:
|
14 |
MODEL_COLORS = json.load(f)
|
15 |
|
16 |
GAME_SCORE_COLUMNS = {
|
|
|
128 |
categories = [c.replace(" Score", "") for c in game_cols]
|
129 |
|
130 |
for col in game_cols:
|
131 |
+
vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
|
132 |
mean, std = vals.mean(), vals.std()
|
133 |
df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
134 |
|
|
|
181 |
df_viz = df.copy()
|
182 |
return df, create_radar_charts(df_viz)
|
183 |
|
184 |
+
def create_group_bar_chart(df, top_n=5):
|
185 |
game_cols = {}
|
186 |
for game in GAME_ORDER:
|
187 |
col = f"{game} Score"
|
|
|
332 |
|
333 |
|
334 |
|
335 |
+
def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=5, limit_to_top_n=None):
    """
    Build the combined leaderboard together with its grouped bar chart.

    Args:
        rank_data (dict): Raw rank data, forwarded to get_combined_leaderboard
        selected_games (dict): Game name -> selection flag, forwarded unchanged
        top_n (int): Forwarded to create_group_bar_chart
        limit_to_top_n (int, optional): Forwarded to get_combined_leaderboard

    Returns:
        tuple: (leaderboard DataFrame, figure returned by create_group_bar_chart)
    """
    leaderboard = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n)
    # The chart builder receives a copy so the frame handed back to the caller
    # is never modified by the plotting code.
    chart = create_group_bar_chart(leaderboard.copy(), top_n)
    return leaderboard, chart
|
|
|
346 |
return f'rgba({r}, {g}, {b}, {alpha})'
|
347 |
|
348 |
|
349 |
+
def create_single_radar_chart(df, selected_games=None, highlight_models=None, chart_title=None, top_n=None, full_df=None):
|
350 |
if selected_games is None:
|
351 |
selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
|
352 |
|
|
|
361 |
game_cols = [f"{game} Score" for game in selected_games]
|
362 |
categories = formatted_games
|
363 |
|
364 |
+
# Use full dataset for normalization to keep consistent scale
|
365 |
+
# If full_df is not provided, use the current df (fallback for backward compatibility)
|
366 |
+
normalization_df = full_df if full_df is not None else df
|
367 |
+
|
368 |
+
# Normalize using the full dataset but apply to the limited df
|
369 |
for col in game_cols:
|
370 |
+
# Get normalization parameters from full dataset
|
371 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
372 |
+
full_series = normalization_df[col].copy()
|
373 |
+
full_series = full_series.where(full_series != "n/a", 0)
|
374 |
+
full_vals = full_series.astype(float)
|
375 |
+
mean, std = full_vals.mean(), full_vals.std()
|
376 |
+
|
377 |
+
# Apply normalization to the limited df
|
378 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
379 |
+
limited_series = df[col].copy()
|
380 |
+
limited_series = limited_series.where(limited_series != "n/a", 0)
|
381 |
+
limited_vals = limited_series.astype(float)
|
382 |
+
df[f"norm_{col}"] = normalize_values(limited_vals, mean, std)
|
383 |
|
384 |
# Group players by prefix and sort alphabetically
|
385 |
model_groups = {}
|
|
|
427 |
hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
|
428 |
))
|
429 |
|
430 |
+
# Dynamic title based on the data source and top_n
|
431 |
+
if chart_title is None:
|
432 |
+
if top_n is not None:
|
433 |
+
chart_title = f"Radar Chart - Top {top_n} Performers by Game"
|
434 |
+
else:
|
435 |
+
# Fallback title
|
436 |
+
if len(df) <= 10:
|
437 |
+
chart_title = "🎮 Agent Performance Across Games"
|
438 |
+
else:
|
439 |
+
chart_title = "🤖 Model Performance Across Games"
|
440 |
+
|
441 |
fig.update_layout(
|
442 |
autosize=True,
|
443 |
height=550, # Reduced height for better proportion with legend
|
444 |
margin=dict(l=400, r=100, t=20, b=20),
|
445 |
title=dict(
|
446 |
+
text=chart_title,
|
447 |
x=0.5,
|
448 |
xanchor='center',
|
449 |
yanchor='top',
|
|
|
489 |
|
490 |
return fig
|
491 |
|
492 |
+
def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None, limit_to_top_n=None, chart_title=None, top_n=None):
    """
    Build the combined leaderboard together with a single radar chart.

    The radar chart is drawn for the (optionally limited) leaderboard, while
    the unlimited leaderboard is passed along as ``full_df`` so that
    create_single_radar_chart can normalize against the complete dataset.

    Args:
        rank_data (dict): Raw rank data
        selected_games (dict): Game name -> selection flag
        highlight_models (list, optional): Forwarded to create_single_radar_chart
        limit_to_top_n (int, optional): Limit the displayed leaderboard. None means no limit.
        chart_title (str, optional): Forwarded to create_single_radar_chart
        top_n (int, optional): Forwarded to create_single_radar_chart

    Returns:
        tuple: (displayed leaderboard DataFrame, radar chart figure)
    """
    # Full dataset, used only as the normalization reference
    full_df = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None)

    if limit_to_top_n is None:
        # Without a limit the second call would receive exactly the same
        # arguments as the first one, so reuse the result instead of
        # rebuilding it.
        # NOTE(review): assumes get_combined_leaderboard is deterministic for
        # identical arguments - confirm.
        df = full_df
    else:
        # Limited dataset for display
        df = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n)

    selected_game_names = [g for g, sel in selected_games.items() if sel]

    # Hand copies to the chart builder: it writes norm_* columns into the
    # frame it is given, and the frame returned to the caller must stay intact.
    fig = create_single_radar_chart(
        df.copy(),
        selected_game_names,
        highlight_models,
        chart_title,
        top_n,
        full_df.copy(),
    )
    return df, fig
|
506 |
|
507 |
def create_organization_radar_chart(rank_data):
|
508 |
df = get_combined_leaderboard(rank_data, {g: True for g in GAME_ORDER})
|
|
|
512 |
|
513 |
avg_df = pd.DataFrame([
|
514 |
{
|
515 |
+
**{col: df[df["Organization"] == org][col].where(df[df["Organization"] == org][col] != "n/a", 0).astype(float).mean() for col in game_cols},
|
516 |
"Organization": org
|
517 |
}
|
518 |
for org in orgs
|
|
|
568 |
|
569 |
for col in game_cols:
|
570 |
# Replace "n/a" with 0 and handle downcasting properly
|
571 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
572 |
+
series = top_df[col].copy()
|
573 |
+
series = series.where(series != "n/a", 0)
|
574 |
+
vals = series.astype(float)
|
575 |
mean, std = vals.mean(), vals.std()
|
576 |
top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
577 |
|
|
|
627 |
|
628 |
for col in game_cols:
|
629 |
# Replace "n/a" with 0 and handle downcasting properly
|
630 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
631 |
+
player_series = player_df[col].copy()
|
632 |
+
player_series = player_series.where(player_series != "n/a", 0)
|
633 |
+
vals = player_series.astype(float)
|
634 |
+
|
635 |
+
df_series = df[col].copy()
|
636 |
+
df_series = df_series.where(df_series != "n/a", 0)
|
637 |
+
df_vals = df_series.astype(float)
|
638 |
+
mean, std = df_vals.mean(), df_vals.std()
|
639 |
player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
|
640 |
|
641 |
fig = go.Figure()
|
|
|
673 |
)
|
674 |
return fig
|
675 |
|
676 |
+
def save_normalized_data(df, selected_games, filename="normalized_data.json"):
    """
    Save normalized data to a JSON file for caching

    The file is written to ``cache/<filename>`` (the directory is created if
    needed) and contains a timestamp, the selected games, per-game
    normalization parameters and per-player raw/normalized scores.

    Args:
        df (pd.DataFrame): DataFrame with raw scores; must have a "Player"
            column and may have an "Organization" column
        selected_games (dict): Dictionary of selected games
        filename (str): Output filename (relative to the ``cache`` directory)

    Returns:
        str: Path of the written file
    """
    game_cols = [f"{game} Score" for game in GAME_ORDER if f"{game} Score" in df.columns]

    normalization_data = {
        "timestamp": datetime.now().isoformat(),
        "selected_games": selected_games,
        "games": {},
        "players": {}
    }

    # Per-game normalization parameters, computed once and reused for every
    # player row below. "n/a" entries count as 0 when computing mean/std.
    game_stats = {}
    for col in game_cols:
        game_name = col.replace(" Score", "")
        vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
        mean, std = vals.mean(), vals.std()
        game_stats[col] = (game_name, mean, std)

        normalization_data["games"][game_name] = {
            "mean": mean,
            "std": std,
            # Keyed by the DataFrame index labels
            "raw_scores": vals.to_dict()
        }

    # Normalized scores per player
    for _, row in df.iterrows():
        player = row["Player"]
        player_data = {"organization": row.get("Organization", "unknown")}

        for col, (game_name, mean, std) in game_stats.items():
            raw_score = row[col]

            # Treat NaN like "n/a" (same rule as get_combined_leaderboard):
            # a missing score gets a normalized value of 0 rather than NaN.
            if raw_score == "n/a" or pd.isna(raw_score):
                raw_score = "n/a"
                normalized = 0
            else:
                raw_score = float(raw_score)
                normalized = normalize_values([raw_score], mean, std)[0]

            player_data[f"{game_name}_raw"] = raw_score
            player_data[f"{game_name}_normalized"] = normalized

        normalization_data["players"][player] = player_data

    # Save to file
    os.makedirs("cache", exist_ok=True)
    filepath = os.path.join("cache", filename)

    # Explicit encoding, consistent with every other JSON open() in the project
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(normalization_data, f, indent=2)

    print(f"Normalized data saved to {filepath}")
    return filepath
|
739 |
+
|
740 |
+
def load_normalized_data(filename="normalized_data.json"):
    """
    Load normalized data from a JSON file

    Args:
        filename (str): Input filename (relative to the ``cache`` directory)

    Returns:
        dict: Normalized data, or None if the file doesn't exist or cannot
        be read/parsed
    """
    filepath = os.path.join("cache", filename)

    try:
        # Explicit UTF-8: player names contain non-ASCII characters, so the
        # platform default encoding must not be relied upon.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing cache is an expected situation, not an error
        return None
    except (OSError, ValueError) as e:
        # OSError: unreadable path; ValueError covers json.JSONDecodeError
        # and UnicodeDecodeError
        print(f"Error loading normalized data: {e}")
        return None

    print(f"Normalized data loaded from {filepath}")
    return data
|
763 |
+
|
764 |
+
def get_normalized_scores_from_cache(players, games, cache_data):
    """
    Extract normalized scores from cached data

    Players that are not present in the cache are omitted from the result.
    For a cached player, a game without a cached raw score is reported as
    "n/a" with a normalized score of 0.

    Args:
        players (list): List of player names
        games (list): List of game names
        cache_data (dict): Cached normalization data (expects a "players" mapping)

    Returns:
        pd.DataFrame: One row per cached player with the columns "Player",
        "<game> Score" and "norm_<game> Score". The columns are present even
        when no player matched, so callers can still merge on "Player".
    """
    cached_players = cache_data.get("players", {})
    data = []

    for player in players:
        player_cache = cached_players.get(player)
        if player_cache is None:
            continue

        player_data = {"Player": player}
        for game in games:
            raw_key = f"{game}_raw"
            norm_key = f"{game}_normalized"

            if raw_key in player_cache:
                player_data[f"{game} Score"] = player_cache[raw_key]
                # A raw score without a normalized counterpart falls back to 0
                # instead of raising KeyError.
                player_data[f"norm_{game} Score"] = player_cache.get(norm_key, 0)
            else:
                player_data[f"{game} Score"] = "n/a"
                player_data[f"norm_{game} Score"] = 0

        data.append(player_data)

    if not data:
        # Keep the schema stable for the empty case
        columns = ["Player"]
        for game in games:
            columns.extend([f"{game} Score", f"norm_{game} Score"])
        return pd.DataFrame(columns=list(dict.fromkeys(columns)))

    return pd.DataFrame(data)
|
797 |
|
798 |
def save_visualization(fig, filename):
    """Write ``fig`` to ``filename`` by delegating to ``fig.write_image``."""
    fig.write_image(filename)
|
800 |
+
|
801 |
+
def generate_and_save_normalized_data(rank_data, filename="normalized_data.json"):
    """
    Build the combined leaderboard for every game and cache its normalized form

    Args:
        rank_data (dict): Raw rank data
        filename (str): Output filename, forwarded to save_normalized_data

    Returns:
        str: Path to saved file
    """
    # Every game in GAME_ORDER is marked as selected
    every_game_selected = dict.fromkeys(GAME_ORDER, True)
    combined = get_combined_leaderboard(rank_data, every_game_selected)
    return save_normalized_data(combined, every_game_selected, filename)
|
820 |
+
|
821 |
+
def create_single_radar_chart_with_cache(df, selected_games=None, highlight_models=None, use_cache=True, cache_filename="normalized_data.json"):
    """
    Create radar chart with optional caching support

    When ``use_cache`` is true and a cache file can be loaded, normalized
    scores are taken from it; players missing from the cache are not drawn.
    If the cache is unavailable, or contains none of the requested players,
    scores are normalized on the fly from ``df``.

    Args:
        df (pd.DataFrame): Leaderboard with "Player", "Organization" and
            "<game> Score" columns
        selected_games (list, optional): Game names to plot; defaults to
            Super Mario Bros, 2048, Candy Crush, Sokoban and Ace Attorney
        highlight_models (list, optional): Player names drawn in red with
            thicker lines
        use_cache (bool): Whether to try the normalized-data cache first
        cache_filename (str): Cache filename passed to load_normalized_data

    Returns:
        plotly.graph_objects.Figure: The radar chart
    """
    if selected_games is None:
        selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']

    # Try to load from cache first
    cached_data = load_normalized_data(cache_filename) if use_cache else None

    df_normalized = None
    if cached_data:
        players = df["Player"].tolist()
        from_cache = get_normalized_scores_from_cache(players, selected_games, cached_data)
        # An empty result means the cache knows none of these players; merging
        # it on "Player" would fail, so fall through to on-the-fly normalization.
        if not from_cache.empty:
            # Merge with original df to get Organization info
            df_normalized = from_cache.merge(df[["Player", "Organization"]], on="Player", how="left")

    if df_normalized is None:
        # Fall back to on-the-fly normalization
        df_normalized = df.copy()
        for game in selected_games:
            col = f"{game} Score"
            vals = df_normalized[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
            mean, std = vals.mean(), vals.std()
            df_normalized[f"norm_{col}"] = normalize_values(vals, mean, std)

    # Axis labels: only Super Mario Bros is abbreviated
    categories = ['SMB' if game == 'Super Mario Bros' else game for game in selected_games]

    # Group players by prefix and sort alphabetically (case-insensitive),
    # both across prefixes and within each prefix
    model_groups = {}
    for player in df_normalized["Player"]:
        model_groups.setdefault(get_model_prefix(player), []).append(player)

    grouped_players = []
    for prefix in sorted(model_groups, key=str.lower):
        grouped_players.extend(sorted(model_groups[prefix], key=str.lower))

    fig = go.Figure()

    for player in grouped_players:
        row = df_normalized[df_normalized["Player"] == player]
        if row.empty:
            continue
        row = row.iloc[0]

        is_highlighted = bool(highlight_models) and player in highlight_models
        color = 'red' if is_highlighted else MODEL_COLORS.get(player, '#808080')
        fillcolor = 'rgba(255, 0, 0, 0.4)' if is_highlighted else hex_to_rgba(color, 0.2)

        # Normalized values live in the same columns for both data sources
        r = [row[f"norm_{game} Score"] for game in selected_games]

        display_name = player.lower()

        fig.add_trace(go.Scatterpolar(
            # Repeat the first point to close the polygon; slicing keeps this
            # safe when no games are selected.
            r=r + r[:1],
            theta=categories + categories[:1],
            mode='lines+markers',
            fill='toself',
            name=display_name,
            line=dict(color=color, width=6 if is_highlighted else 2),
            marker=dict(color=color, size=10 if is_highlighted else 6),
            fillcolor=fillcolor,
            opacity=1.0 if is_highlighted else 0.7,
            hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
        ))

    fig.update_layout(
        autosize=True,
        height=550,
        margin=dict(l=400, r=100, t=20, b=20),
        title=dict(
            text="AI Normalized Performance Across Games",
            x=0.5,
            xanchor='center',
            yanchor='top',
            y=0.95,
            font=dict(size=20),
            pad=dict(b=20)
        ),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                tickangle=45,
                tickfont=dict(size=12),
                gridcolor='lightgray',
                gridwidth=1,
                angle=45
            ),
            angularaxis=dict(
                tickfont=dict(size=14, weight='bold'),
                tickangle=0
            )
        ),
        legend=dict(
            font=dict(size=12),
            title="Choose your model 💡 (click / double-click)",
            itemsizing='trace',
            x=-1.4,
            y=0.8,
            yanchor='top',
            xanchor='left',
            bgcolor='rgba(255,255,255,0.6)',
            bordercolor='gray',
            borderwidth=1,
            itemclick="toggleothers",
            itemdoubleclick="toggle"
        )
    )

    return fig
|
generate_normalized_cache.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Script to generate normalized data cache for faster visualization loading.
|
4 |
+
|
5 |
+
Usage:
|
6 |
+
python generate_normalized_cache.py [input_file] [output_file]
|
7 |
+
|
8 |
+
Example:
|
9 |
+
python generate_normalized_cache.py data/rank_data.json normalized_data.json
|
10 |
+
"""
|
11 |
+
|
12 |
+
import sys
|
13 |
+
import json
|
14 |
+
from data_visualization import generate_and_save_normalized_data, load_normalized_data
|
15 |
+
|
16 |
+
def main():
    """
    Generate the normalized data cache from a rank data JSON file.

    The input path and the output filename default to
    "data/rank_data.json" and "normalized_data.json" and can be overridden by
    the first and second command line argument respectively.

    Returns:
        int: 0 when the cache was generated and verified, 1 otherwise
    """
    # Default files
    input_file = "data/rank_data.json"  # Update this path as needed
    output_file = "normalized_data.json"

    # Handle command line arguments
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]

    try:
        # Load rank data. Explicit UTF-8: player names contain non-ASCII
        # characters, so the platform default encoding is not reliable.
        print(f"Loading rank data from {input_file}...")
        with open(input_file, 'r', encoding='utf-8') as f:
            rank_data = json.load(f)

        # Generate and save normalized data
        print("Generating normalized data...")
        saved_path = generate_and_save_normalized_data(rank_data, output_file)

        # Verify the saved data
        print("Verifying saved data...")
        cached_data = load_normalized_data(output_file)

        if cached_data:
            print("✅ Successfully generated normalized data cache!")
            print(f"📁 Saved to: {saved_path}")
            print(f"🎮 Games included: {list(cached_data['games'].keys())}")
            print(f"👥 Players included: {len(cached_data['players'])}")
            print(f"📅 Generated at: {cached_data['timestamp']}")
            return 0

        print("❌ Failed to verify cached data")
        return 1

    except FileNotFoundError:
        print(f"❌ Error: Could not find input file '{input_file}'")
        print("Please check the file path and try again.")
        return 1
    except Exception as e:
        # Top-level boundary of the script: report any other failure and
        # signal it through the exit status instead of a traceback.
        print(f"❌ Error: {str(e)}")
        return 1


if __name__ == "__main__":
    # Propagate success/failure to the shell
    sys.exit(main())
|
leaderboard_utils.py
CHANGED
@@ -32,7 +32,7 @@ def get_organization(model_name):
|
|
32 |
return "unknown"
|
33 |
|
34 |
|
35 |
-
def get_sokoban_leaderboard(rank_data):
|
36 |
data = rank_data.get("Sokoban", {}).get("results", [])
|
37 |
df = pd.DataFrame(data)
|
38 |
df = df.rename(columns={
|
@@ -53,9 +53,12 @@ def get_sokoban_leaderboard(rank_data):
|
|
53 |
if "Score" in df.columns:
|
54 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
55 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
56 |
return df
|
57 |
|
58 |
-
def get_2048_leaderboard(rank_data):
|
59 |
data = rank_data.get("2048", {}).get("results", [])
|
60 |
# --- Diagnostic Print Removed ---
|
61 |
# if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
|
@@ -108,9 +111,12 @@ def get_2048_leaderboard(rank_data):
|
|
108 |
if "Score" in df.columns:
|
109 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
110 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
111 |
return df
|
112 |
|
113 |
-
def get_candy_leaderboard(rank_data):
|
114 |
data = rank_data.get("Candy Crush", {}).get("results", [])
|
115 |
df = pd.DataFrame(data)
|
116 |
df = df.rename(columns={
|
@@ -127,9 +133,12 @@ def get_candy_leaderboard(rank_data):
|
|
127 |
if "Score" in df.columns:
|
128 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
129 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
130 |
return df
|
131 |
|
132 |
-
def get_tetris_planning_leaderboard(rank_data):
|
133 |
data = rank_data.get("Tetris", {}).get("results", [])
|
134 |
df = pd.DataFrame(data)
|
135 |
df = df.rename(columns={
|
@@ -147,9 +156,12 @@ def get_tetris_planning_leaderboard(rank_data):
|
|
147 |
if "Score" in df.columns:
|
148 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
149 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
150 |
return df
|
151 |
|
152 |
-
def get_ace_attorney_leaderboard(rank_data):
|
153 |
data = rank_data.get("Ace Attorney", {}).get("results", [])
|
154 |
df = pd.DataFrame(data)
|
155 |
df = df.rename(columns={
|
@@ -168,9 +180,12 @@ def get_ace_attorney_leaderboard(rank_data):
|
|
168 |
if "Score" in df.columns:
|
169 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
170 |
df = df.sort_values("Score", ascending=False) # Higher score is better
|
|
|
|
|
|
|
171 |
return df
|
172 |
|
173 |
-
def get_mario_planning_leaderboard(rank_data):
|
174 |
data = rank_data.get("Super Mario Bros", {}).get("results", [])
|
175 |
df = pd.DataFrame(data)
|
176 |
df = df.rename(columns={
|
@@ -188,6 +203,9 @@ def get_mario_planning_leaderboard(rank_data):
|
|
188 |
if "Score" in df.columns:
|
189 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
190 |
df = df.sort_values("Score", ascending=False)
|
|
|
|
|
|
|
191 |
return df
|
192 |
|
193 |
def calculate_rank_and_completeness(rank_data, selected_games):
|
@@ -285,13 +303,14 @@ def calculate_rank_and_completeness(rank_data, selected_games):
|
|
285 |
|
286 |
return df_results
|
287 |
|
288 |
-
def get_combined_leaderboard(rank_data, selected_games):
|
289 |
"""
|
290 |
Get combined leaderboard for selected games
|
291 |
|
292 |
Args:
|
293 |
rank_data (dict): Dictionary containing rank data
|
294 |
selected_games (dict): Dictionary of game names and their selection status
|
|
|
295 |
|
296 |
Returns:
|
297 |
pd.DataFrame: Combined leaderboard DataFrame
|
@@ -358,20 +377,64 @@ def get_combined_leaderboard(rank_data, selected_games):
|
|
358 |
# Create DataFrame
|
359 |
df_results = pd.DataFrame(results)
|
360 |
|
361 |
-
#
|
362 |
if not df_results.empty:
|
363 |
-
#
|
364 |
-
|
|
|
|
|
|
|
365 |
for game in GAME_ORDER:
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
|
371 |
-
# Sort by
|
372 |
-
df_results = df_results.sort_values("
|
373 |
|
374 |
-
#
|
375 |
-
|
|
|
376 |
|
377 |
return df_results
|
|
|
32 |
return "unknown"
|
33 |
|
34 |
|
35 |
+
def get_sokoban_leaderboard(rank_data, limit_to_top_n=None):
|
36 |
data = rank_data.get("Sokoban", {}).get("results", [])
|
37 |
df = pd.DataFrame(data)
|
38 |
df = df.rename(columns={
|
|
|
53 |
if "Score" in df.columns:
|
54 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
55 |
df = df.sort_values("Score", ascending=False)
|
56 |
+
# Apply limit if specified
|
57 |
+
if limit_to_top_n is not None:
|
58 |
+
df = df.head(limit_to_top_n)
|
59 |
return df
|
60 |
|
61 |
+
def get_2048_leaderboard(rank_data, limit_to_top_n=None):
|
62 |
data = rank_data.get("2048", {}).get("results", [])
|
63 |
# --- Diagnostic Print Removed ---
|
64 |
# if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
|
|
|
111 |
if "Score" in df.columns:
|
112 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
113 |
df = df.sort_values("Score", ascending=False)
|
114 |
+
# Apply limit if specified
|
115 |
+
if limit_to_top_n is not None:
|
116 |
+
df = df.head(limit_to_top_n)
|
117 |
return df
|
118 |
|
119 |
+
def get_candy_leaderboard(rank_data, limit_to_top_n=None):
|
120 |
data = rank_data.get("Candy Crush", {}).get("results", [])
|
121 |
df = pd.DataFrame(data)
|
122 |
df = df.rename(columns={
|
|
|
133 |
if "Score" in df.columns:
|
134 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
135 |
df = df.sort_values("Score", ascending=False)
|
136 |
+
# Apply limit if specified
|
137 |
+
if limit_to_top_n is not None:
|
138 |
+
df = df.head(limit_to_top_n)
|
139 |
return df
|
140 |
|
141 |
+
def get_tetris_planning_leaderboard(rank_data, limit_to_top_n=None):
|
142 |
data = rank_data.get("Tetris", {}).get("results", [])
|
143 |
df = pd.DataFrame(data)
|
144 |
df = df.rename(columns={
|
|
|
156 |
if "Score" in df.columns:
|
157 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
158 |
df = df.sort_values("Score", ascending=False)
|
159 |
+
# Apply limit if specified
|
160 |
+
if limit_to_top_n is not None:
|
161 |
+
df = df.head(limit_to_top_n)
|
162 |
return df
|
163 |
|
164 |
+
def get_ace_attorney_leaderboard(rank_data, limit_to_top_n=None):
|
165 |
data = rank_data.get("Ace Attorney", {}).get("results", [])
|
166 |
df = pd.DataFrame(data)
|
167 |
df = df.rename(columns={
|
|
|
180 |
if "Score" in df.columns:
|
181 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
182 |
df = df.sort_values("Score", ascending=False) # Higher score is better
|
183 |
+
# Apply limit if specified
|
184 |
+
if limit_to_top_n is not None:
|
185 |
+
df = df.head(limit_to_top_n)
|
186 |
return df
|
187 |
|
188 |
+
def get_mario_planning_leaderboard(rank_data, limit_to_top_n=None):
|
189 |
data = rank_data.get("Super Mario Bros", {}).get("results", [])
|
190 |
df = pd.DataFrame(data)
|
191 |
df = df.rename(columns={
|
|
|
203 |
if "Score" in df.columns:
|
204 |
df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
|
205 |
df = df.sort_values("Score", ascending=False)
|
206 |
+
# Apply limit if specified
|
207 |
+
if limit_to_top_n is not None:
|
208 |
+
df = df.head(limit_to_top_n)
|
209 |
return df
|
210 |
|
211 |
def calculate_rank_and_completeness(rank_data, selected_games):
|
|
|
303 |
|
304 |
return df_results
|
305 |
|
306 |
+
def get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None):
|
307 |
"""
|
308 |
Get combined leaderboard for selected games
|
309 |
|
310 |
Args:
|
311 |
rank_data (dict): Dictionary containing rank data
|
312 |
selected_games (dict): Dictionary of game names and their selection status
|
313 |
+
limit_to_top_n (int, optional): Limit results to top N entries. None means no limit.
|
314 |
|
315 |
Returns:
|
316 |
pd.DataFrame: Combined leaderboard DataFrame
|
|
|
377 |
# Create DataFrame
|
378 |
df_results = pd.DataFrame(results)
|
379 |
|
380 |
+
# Calculate normalized scores and average normalized score
|
381 |
if not df_results.empty:
|
382 |
+
# Import the normalize_values function from data_visualization
|
383 |
+
from data_visualization import normalize_values
|
384 |
+
|
385 |
+
# Calculate normalized scores for each game
|
386 |
+
game_score_columns = []
|
387 |
for game in GAME_ORDER:
|
388 |
+
score_col = f"{game} Score"
|
389 |
+
if score_col in df_results.columns:
|
390 |
+
game_score_columns.append(score_col)
|
391 |
+
# Get numeric values, replacing 'n/a' with NaN
|
392 |
+
# Use where() to avoid FutureWarning about downcasting in replace()
|
393 |
+
series = df_results[score_col].copy()
|
394 |
+
series = series.where(series != 'n/a', np.nan)
|
395 |
+
numeric_scores = pd.to_numeric(series, errors='coerce')
|
396 |
+
|
397 |
+
# Skip games where all scores are NaN or 0
|
398 |
+
valid_scores = numeric_scores.dropna()
|
399 |
+
if len(valid_scores) > 0 and valid_scores.sum() > 0:
|
400 |
+
mean = valid_scores.mean()
|
401 |
+
std = valid_scores.std() if len(valid_scores) > 1 else 0
|
402 |
+
|
403 |
+
# Calculate normalized scores for all players
|
404 |
+
normalized_scores = []
|
405 |
+
for _, row in df_results.iterrows():
|
406 |
+
score = row[score_col]
|
407 |
+
if score == 'n/a' or pd.isna(score):
|
408 |
+
normalized_scores.append(0)
|
409 |
+
else:
|
410 |
+
normalized_scores.append(normalize_values([float(score)], mean, std)[0])
|
411 |
+
|
412 |
+
df_results[f"norm_{score_col}"] = normalized_scores
|
413 |
+
else:
|
414 |
+
# If no valid scores, set all normalized scores to 0
|
415 |
+
df_results[f"norm_{score_col}"] = 0
|
416 |
+
|
417 |
+
# Calculate average normalized score across games
|
418 |
+
normalized_columns = [f"norm_{col}" for col in game_score_columns if f"norm_{col}" in df_results.columns]
|
419 |
+
if normalized_columns:
|
420 |
+
df_results["Avg Normalized Score"] = df_results[normalized_columns].mean(axis=1).round(2)
|
421 |
+
else:
|
422 |
+
df_results["Avg Normalized Score"] = 0.0
|
423 |
+
|
424 |
+
# Reorder columns to put Avg Normalized Score after Organization
|
425 |
+
base_columns = ["Player", "Organization", "Avg Normalized Score"]
|
426 |
+
game_columns = [col for col in df_results.columns if col.endswith(" Score") and not col.startswith("norm_") and col != "Avg Normalized Score"]
|
427 |
+
other_columns = [col for col in df_results.columns if col not in base_columns + game_columns and not col.startswith("norm_")]
|
428 |
+
|
429 |
+
# Create final column order
|
430 |
+
final_columns = base_columns + game_columns + other_columns
|
431 |
+
df_results = df_results[final_columns]
|
432 |
|
433 |
+
# Sort by average normalized score in descending order
|
434 |
+
df_results = df_results.sort_values("Avg Normalized Score", ascending=False)
|
435 |
|
436 |
+
# Apply limit if specified
|
437 |
+
if limit_to_top_n is not None:
|
438 |
+
df_results = df_results.head(limit_to_top_n)
|
439 |
|
440 |
return df_results
|
rank_data_03_25_2025.json
CHANGED
@@ -3,61 +3,61 @@
|
|
3 |
"runs": 3,
|
4 |
"results": [
|
5 |
{
|
6 |
-
"model": "
|
7 |
"score": 1267.7,
|
8 |
"detail_data": "709,1532,1562",
|
9 |
"progress": "1-1"
|
10 |
},
|
11 |
{
|
12 |
-
"model": "
|
13 |
"score": 1418.7,
|
14 |
"detail_data": "2015,709,1532",
|
15 |
"progress": "1-1"
|
16 |
},
|
17 |
{
|
18 |
-
"model": "
|
19 |
"score": 1385.0,
|
20 |
"detail_data": "1672,1266,1247",
|
21 |
"progress": "1-1"
|
22 |
},
|
23 |
{
|
24 |
-
"model": "
|
25 |
"score": 1498.3,
|
26 |
"detail_data": "1561,1271,1663",
|
27 |
"progress": "1-1"
|
28 |
},
|
29 |
{
|
30 |
-
"model": "
|
31 |
"score": 1468.7,
|
32 |
"detail_data": "898,2008,1500",
|
33 |
"progress": "1-1"
|
34 |
},
|
35 |
{
|
36 |
-
"model": "
|
37 |
"score": 2126.3,
|
38 |
"detail_data": "1531,722,4126",
|
39 |
"progress": "1-1"
|
40 |
},
|
41 |
{
|
42 |
-
"model": "
|
43 |
"score": 2047.3,
|
44 |
"detail_data": "2017,2590,1535",
|
45 |
"progress": "1-1"
|
46 |
},
|
47 |
{
|
48 |
-
"model": "
|
49 |
"score": 855,
|
50 |
"detail_data": "855",
|
51 |
"progress": "1-1"
|
52 |
},
|
53 |
{
|
54 |
-
"model": "
|
55 |
"score": 3445,
|
56 |
"detail_data": "3445",
|
57 |
"progress": "1-1"
|
58 |
},
|
59 |
{
|
60 |
-
"model": "
|
61 |
"score": 1448.0,
|
62 |
"detail_data": "1525,1263,1556",
|
63 |
"progress": "1-1"
|
@@ -74,79 +74,79 @@
|
|
74 |
"runs": 3,
|
75 |
"results": [
|
76 |
{
|
77 |
-
"model": "
|
78 |
"score": 1914.67,
|
79 |
"details": "1352,2860,1532",
|
80 |
"highest_tail": 256
|
81 |
},
|
82 |
{
|
83 |
-
"model": "
|
84 |
"score": 2624,
|
85 |
"details": "2560,3224,2088",
|
86 |
"highest_tail": 256
|
87 |
},
|
88 |
{
|
89 |
-
"model": "
|
90 |
"score": 1873.33,
|
91 |
"details": "700,1240,3680",
|
92 |
"highest_tail": 256
|
93 |
},
|
94 |
{
|
95 |
-
"model": "
|
96 |
"score": 1697.33,
|
97 |
"details": "1304,1316,2472",
|
98 |
"highest_tail": 256
|
99 |
},
|
100 |
{
|
101 |
-
"model": "
|
102 |
"score": 3586.67,
|
103 |
"details": "5300,2400,3060",
|
104 |
"highest_tail": 512
|
105 |
},
|
106 |
{
|
107 |
-
"model": "
|
108 |
"score": 4036,
|
109 |
"details": "6412,2492,3204",
|
110 |
"highest_tail": 512
|
111 |
},
|
112 |
{
|
113 |
-
"model": "
|
114 |
"score": 1586.67,
|
115 |
"details": "1404,1272,2084",
|
116 |
"highest_tail": 128
|
117 |
},
|
118 |
{
|
119 |
-
"model": "
|
120 |
"score": 1656,
|
121 |
"details": "1156,2664,1148",
|
122 |
"highest_tail": 256
|
123 |
},
|
124 |
{
|
125 |
-
"model": "
|
126 |
"score": 1656,
|
127 |
"details": "1604,1284,2080",
|
128 |
"highest_tail": 256
|
129 |
},
|
130 |
{
|
131 |
-
"model": "
|
132 |
"score": 7580,
|
133 |
"details": "7580",
|
134 |
"highest_tail": 512
|
135 |
},
|
136 |
{
|
137 |
-
"model": "
|
138 |
"score": 2757.33,
|
139 |
"details": "3132,2004,3136",
|
140 |
"highest_tail": 256
|
141 |
},
|
142 |
{
|
143 |
-
"model": "
|
144 |
"score": 7120,
|
145 |
"details": "7120",
|
146 |
"highest_tail": 512
|
147 |
},
|
148 |
{
|
149 |
-
"model": "
|
150 |
"score": 4432.0,
|
151 |
"details": "4928,5456,2912",
|
152 |
"highest_tail": 512
|
@@ -158,25 +158,25 @@
|
|
158 |
"highest_tail": 128
|
159 |
},
|
160 |
{
|
161 |
-
"model": "
|
162 |
"score": 3036.0,
|
163 |
"details": "3036.0",
|
164 |
"highest_tail": 256
|
165 |
},
|
166 |
{
|
167 |
-
"model": "
|
168 |
"score": 3136,
|
169 |
"details": "2148,2360,4900",
|
170 |
"highest_tail": 256
|
171 |
},
|
172 |
{
|
173 |
-
"model": "
|
174 |
"score": 3330.0,
|
175 |
"details": "3260,3400",
|
176 |
"highest_tail": 256
|
177 |
},
|
178 |
{
|
179 |
-
"model": "
|
180 |
"score": 2144.0,
|
181 |
"details": "1436,2556,2440",
|
182 |
"highest_tail": 256
|
@@ -187,67 +187,67 @@
|
|
187 |
"runs": 3,
|
188 |
"results": [
|
189 |
{
|
190 |
-
"model": "
|
191 |
"score": 14.7,
|
192 |
"details": "16,14,14"
|
193 |
},
|
194 |
{
|
195 |
-
"model": "
|
196 |
"score": 16.3,
|
197 |
"details": "19,15,15"
|
198 |
},
|
199 |
{
|
200 |
-
"model": "
|
201 |
"score": 14.3,
|
202 |
"details": "15,14,14"
|
203 |
},
|
204 |
{
|
205 |
-
"model": "
|
206 |
"score": 16.3,
|
207 |
"details": "20,14,15"
|
208 |
},
|
209 |
{
|
210 |
-
"model": "
|
211 |
"score": 23.3,
|
212 |
"details": "23,23,24"
|
213 |
},
|
214 |
{
|
215 |
-
"model": "
|
216 |
"score": 21.3,
|
217 |
"details": "20,15,29"
|
218 |
},
|
219 |
{
|
220 |
-
"model": "
|
221 |
"score": 10.3,
|
222 |
"details": "9,10,12"
|
223 |
},
|
224 |
{
|
225 |
-
"model": "
|
226 |
"score": 13.7,
|
227 |
"details": "13,14,14"
|
228 |
},
|
229 |
{
|
230 |
-
"model": "
|
231 |
"score": 14,
|
232 |
"details": "18,11,13"
|
233 |
},
|
234 |
{
|
235 |
-
"model": "
|
236 |
"score": 35,
|
237 |
"details": "35"
|
238 |
},
|
239 |
{
|
240 |
-
"model": "
|
241 |
"score": 11.7,
|
242 |
"details": "11,11,13"
|
243 |
},
|
244 |
{
|
245 |
-
"model": "
|
246 |
"score": 42,
|
247 |
"details": "42"
|
248 |
},
|
249 |
{
|
250 |
-
"model": "
|
251 |
"score": 25.3,
|
252 |
"details": "22,35,19"
|
253 |
},
|
@@ -257,22 +257,22 @@
|
|
257 |
"details": ""
|
258 |
},
|
259 |
{
|
260 |
-
"model": "
|
261 |
"score": 20,
|
262 |
"details": "17,18,25"
|
263 |
},
|
264 |
{
|
265 |
-
"model": "
|
266 |
"score": 19.33,
|
267 |
"details": "20,17,21"
|
268 |
},
|
269 |
{
|
270 |
-
"model": "
|
271 |
"score": 33.67,
|
272 |
"details": "26,34,41"
|
273 |
},
|
274 |
{
|
275 |
-
"model": "
|
276 |
"score": 11.67,
|
277 |
"details": "13,14,8"
|
278 |
}
|
@@ -282,67 +282,67 @@
|
|
282 |
"runs": 3,
|
283 |
"results": [
|
284 |
{
|
285 |
-
"model": "
|
286 |
"score": 106,
|
287 |
"details": "92,165,61"
|
288 |
},
|
289 |
{
|
290 |
-
"model": "
|
291 |
"score": 484,
|
292 |
"details": "535,428,489"
|
293 |
},
|
294 |
{
|
295 |
-
"model": "
|
296 |
"score": 447.3,
|
297 |
"details": "409,436,497"
|
298 |
},
|
299 |
{
|
300 |
-
"model": "
|
301 |
"score": 334.7,
|
302 |
"details": "259,372,373"
|
303 |
},
|
304 |
{
|
305 |
-
"model": "
|
306 |
"score": 416.3,
|
307 |
"details": "411,414,424"
|
308 |
},
|
309 |
{
|
310 |
-
"model": "
|
311 |
"score": 254,
|
312 |
"details": "299,332,131"
|
313 |
},
|
314 |
{
|
315 |
-
"model": "
|
316 |
"score": 128.7,
|
317 |
"details": "67,139,180"
|
318 |
},
|
319 |
{
|
320 |
-
"model": "
|
321 |
"score": 182,
|
322 |
"details": "163,215,168"
|
323 |
},
|
324 |
{
|
325 |
-
"model": "
|
326 |
"score": 147.3,
|
327 |
"details": "131,104,207"
|
328 |
},
|
329 |
{
|
330 |
-
"model": "
|
331 |
"score": 159,
|
332 |
"details": "159"
|
333 |
},
|
334 |
{
|
335 |
-
"model": "
|
336 |
"score": 48,
|
337 |
"details": "21,86,37"
|
338 |
},
|
339 |
{
|
340 |
-
"model": "
|
341 |
"score": 647,
|
342 |
"details": "647"
|
343 |
},
|
344 |
{
|
345 |
-
"model": "
|
346 |
"score": 487.3,
|
347 |
"details": "259,591,612"
|
348 |
},
|
@@ -352,22 +352,22 @@
|
|
352 |
"details": ""
|
353 |
},
|
354 |
{
|
355 |
-
"model": "
|
356 |
"score": 464,
|
357 |
"details": "593,406,393"
|
358 |
},
|
359 |
{
|
360 |
-
"model": "
|
361 |
"score": 478.33,
|
362 |
"details": "545,468,422"
|
363 |
},
|
364 |
{
|
365 |
-
"model": "
|
366 |
"score": 491.67,
|
367 |
"details": "464,463,548"
|
368 |
},
|
369 |
{
|
370 |
-
"model": "
|
371 |
"score": 363.33,
|
372 |
"details": "365,372,353"
|
373 |
}
|
@@ -377,79 +377,79 @@
|
|
377 |
"runs": 3,
|
378 |
"results": [
|
379 |
{
|
380 |
-
"model": "
|
381 |
"score": 0,
|
382 |
"detail_box_on_target": "0,0,0",
|
383 |
"cracked_levels": "0,0,0"
|
384 |
},
|
385 |
{
|
386 |
-
"model": "
|
387 |
"score": 2.33,
|
388 |
"detail_box_on_target": "2,4,1",
|
389 |
"cracked_levels": "1,2,0"
|
390 |
},
|
391 |
{
|
392 |
-
"model": "
|
393 |
"score": 1.33,
|
394 |
"detail_box_on_target": "2,0,2",
|
395 |
"cracked_levels": "1,0,1"
|
396 |
},
|
397 |
{
|
398 |
-
"model": "
|
399 |
"score": 1.67,
|
400 |
"detail_box_on_target": "3,0,2",
|
401 |
"cracked_levels": "2,0,1"
|
402 |
},
|
403 |
{
|
404 |
-
"model": "
|
405 |
"score": 4.33,
|
406 |
"detail_box_on_target": "4,4,5",
|
407 |
"cracked_levels": "2,2,3"
|
408 |
},
|
409 |
{
|
410 |
-
"model": "
|
411 |
"score": 5.67,
|
412 |
"detail_box_on_target": "5,6,6",
|
413 |
"cracked_levels": "3,3,3"
|
414 |
},
|
415 |
{
|
416 |
-
"model": "
|
417 |
"score": 0,
|
418 |
"detail_box_on_target": "0,0,0",
|
419 |
"cracked_levels": "0,0,0"
|
420 |
},
|
421 |
{
|
422 |
-
"model": "
|
423 |
"score": 0,
|
424 |
"detail_box_on_target": "0,0,0",
|
425 |
"cracked_levels": "0,0,0"
|
426 |
},
|
427 |
{
|
428 |
-
"model": "
|
429 |
"score": 0,
|
430 |
"detail_box_on_target": "0,0,0",
|
431 |
"cracked_levels": "0,0,0"
|
432 |
},
|
433 |
{
|
434 |
-
"model": "
|
435 |
"score": 2.33,
|
436 |
"detail_box_on_target": "2,2,3",
|
437 |
"cracked_levels": "1,1,2"
|
438 |
},
|
439 |
{
|
440 |
-
"model": "
|
441 |
"score": 1.33,
|
442 |
"detail_box_on_target": "1,2,1",
|
443 |
"cracked_levels": "0,1,0"
|
444 |
},
|
445 |
{
|
446 |
-
"model": "
|
447 |
"score": 8,
|
448 |
"detail_box_on_target": "10,6",
|
449 |
"cracked_levels": "5,3"
|
450 |
},
|
451 |
{
|
452 |
-
"model": "
|
453 |
"score": 5.33,
|
454 |
"detail_box_on_target": "4,6,6",
|
455 |
"cracked_levels": "2,2,3"
|
@@ -461,22 +461,22 @@
|
|
461 |
"cracked_levels": "0,0,0"
|
462 |
},
|
463 |
{
|
464 |
-
"model": "
|
465 |
"score": 4,
|
466 |
"details": "4,4,4"
|
467 |
},
|
468 |
{
|
469 |
-
"model": "
|
470 |
"score": 3,
|
471 |
"details": "2,2,5"
|
472 |
},
|
473 |
{
|
474 |
-
"model": "
|
475 |
"score": 4.67,
|
476 |
"details": "4,4,6"
|
477 |
},
|
478 |
{
|
479 |
-
"model": "
|
480 |
"score": 2.33,
|
481 |
"details": "1,2,4"
|
482 |
}
|
@@ -486,79 +486,79 @@
|
|
486 |
"runs": 1,
|
487 |
"results": [
|
488 |
{
|
489 |
-
"model": "
|
490 |
"score": 2,
|
491 |
"progress": "1:2/5",
|
492 |
"evaluator result": "1/3"
|
493 |
},
|
494 |
{
|
495 |
-
"model": "
|
496 |
"score": 7,
|
497 |
"progress": "2:2/9",
|
498 |
"evaluator result": "5/11"
|
499 |
},
|
500 |
{
|
501 |
-
"model": "
|
502 |
"score": 0,
|
503 |
"progress": "0",
|
504 |
"evaluator result": "1/5"
|
505 |
},
|
506 |
{
|
507 |
-
"model": "
|
508 |
"score": 4,
|
509 |
"progress": "1:4/5",
|
510 |
"evaluator result": "1/7"
|
511 |
},
|
512 |
{
|
513 |
-
"model": "
|
514 |
"score": 7,
|
515 |
"progress": "2:2/9",
|
516 |
"evaluator result": "2/3"
|
517 |
},
|
518 |
{
|
519 |
-
"model": "
|
520 |
"score": 0,
|
521 |
"progress": "0",
|
522 |
"evaluator result": "0"
|
523 |
},
|
524 |
{
|
525 |
-
"model": "
|
526 |
"score": 0,
|
527 |
"progress": "0",
|
528 |
"evaluator result": "0"
|
529 |
},
|
530 |
{
|
531 |
-
"model": "
|
532 |
"score": 2,
|
533 |
"progress": "1:2/5",
|
534 |
"evaluator result": "2/3"
|
535 |
},
|
536 |
{
|
537 |
-
"model": "
|
538 |
"score": 0,
|
539 |
"progress": "0",
|
540 |
"evaluator result": "0"
|
541 |
},
|
542 |
{
|
543 |
-
"model": "
|
544 |
"score": 16,
|
545 |
"progress": "3: 2/8",
|
546 |
"evaluator result": "6/11"
|
547 |
},
|
548 |
{
|
549 |
-
"model": "
|
550 |
"score": 0,
|
551 |
"progress": "0",
|
552 |
"evaluator result": "1/5"
|
553 |
},
|
554 |
{
|
555 |
-
"model": "
|
556 |
"score": 16,
|
557 |
"progress": "3: 2/8",
|
558 |
"evaluator result": "1/2"
|
559 |
},
|
560 |
{
|
561 |
-
"model": "
|
562 |
"score": 4,
|
563 |
"progress": "1:4/5",
|
564 |
"evaluator result": "2/5"
|
@@ -570,17 +570,17 @@
|
|
570 |
"evaluator result": "0"
|
571 |
},
|
572 |
{
|
573 |
-
"model": "
|
574 |
"score": 6,
|
575 |
"details": "6"
|
576 |
},
|
577 |
{
|
578 |
-
"model": "
|
579 |
"score": 3.67,
|
580 |
"details": "3,4,4"
|
581 |
},
|
582 |
{
|
583 |
-
"model": "
|
584 |
"score": 4.33,
|
585 |
"details": "3,4,6"
|
586 |
}
|
|
|
3 |
"runs": 3,
|
4 |
"results": [
|
5 |
{
|
6 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
7 |
"score": 1267.7,
|
8 |
"detail_data": "709,1532,1562",
|
9 |
"progress": "1-1"
|
10 |
},
|
11 |
{
|
12 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
13 |
"score": 1418.7,
|
14 |
"detail_data": "2015,709,1532",
|
15 |
"progress": "1-1"
|
16 |
},
|
17 |
{
|
18 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
19 |
"score": 1385.0,
|
20 |
"detail_data": "1672,1266,1247",
|
21 |
"progress": "1-1"
|
22 |
},
|
23 |
{
|
24 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
25 |
"score": 1498.3,
|
26 |
"detail_data": "1561,1271,1663",
|
27 |
"progress": "1-1"
|
28 |
},
|
29 |
{
|
30 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
31 |
"score": 1468.7,
|
32 |
"detail_data": "898,2008,1500",
|
33 |
"progress": "1-1"
|
34 |
},
|
35 |
{
|
36 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
37 |
"score": 2126.3,
|
38 |
"detail_data": "1531,722,4126",
|
39 |
"progress": "1-1"
|
40 |
},
|
41 |
{
|
42 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
43 |
"score": 2047.3,
|
44 |
"detail_data": "2017,2590,1535",
|
45 |
"progress": "1-1"
|
46 |
},
|
47 |
{
|
48 |
+
"model": "o1-2024-12-17 (⚔️)",
|
49 |
"score": 855,
|
50 |
"detail_data": "855",
|
51 |
"progress": "1-1"
|
52 |
},
|
53 |
{
|
54 |
+
"model": "o3-2025-04-16 (⚔️)",
|
55 |
"score": 3445,
|
56 |
"detail_data": "3445",
|
57 |
"progress": "1-1"
|
58 |
},
|
59 |
{
|
60 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
61 |
"score": 1448.0,
|
62 |
"detail_data": "1525,1263,1556",
|
63 |
"progress": "1-1"
|
|
|
74 |
"runs": 3,
|
75 |
"results": [
|
76 |
{
|
77 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
78 |
"score": 1914.67,
|
79 |
"details": "1352,2860,1532",
|
80 |
"highest_tail": 256
|
81 |
},
|
82 |
{
|
83 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
84 |
"score": 2624,
|
85 |
"details": "2560,3224,2088",
|
86 |
"highest_tail": 256
|
87 |
},
|
88 |
{
|
89 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
90 |
"score": 1873.33,
|
91 |
"details": "700,1240,3680",
|
92 |
"highest_tail": 256
|
93 |
},
|
94 |
{
|
95 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
96 |
"score": 1697.33,
|
97 |
"details": "1304,1316,2472",
|
98 |
"highest_tail": 256
|
99 |
},
|
100 |
{
|
101 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
102 |
"score": 3586.67,
|
103 |
"details": "5300,2400,3060",
|
104 |
"highest_tail": 512
|
105 |
},
|
106 |
{
|
107 |
+
"model": "grok-3-mini-beta (⚔️)",
|
108 |
"score": 4036,
|
109 |
"details": "6412,2492,3204",
|
110 |
"highest_tail": 512
|
111 |
},
|
112 |
{
|
113 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
114 |
"score": 1586.67,
|
115 |
"details": "1404,1272,2084",
|
116 |
"highest_tail": 128
|
117 |
},
|
118 |
{
|
119 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
120 |
"score": 1656,
|
121 |
"details": "1156,2664,1148",
|
122 |
"highest_tail": 256
|
123 |
},
|
124 |
{
|
125 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
126 |
"score": 1656,
|
127 |
"details": "1604,1284,2080",
|
128 |
"highest_tail": 256
|
129 |
},
|
130 |
{
|
131 |
+
"model": "o1-2024-12-17 (⚔️)",
|
132 |
"score": 7580,
|
133 |
"details": "7580",
|
134 |
"highest_tail": 512
|
135 |
},
|
136 |
{
|
137 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
138 |
"score": 2757.33,
|
139 |
"details": "3132,2004,3136",
|
140 |
"highest_tail": 256
|
141 |
},
|
142 |
{
|
143 |
+
"model": "o3-2025-04-16 (⚔️)",
|
144 |
"score": 7120,
|
145 |
"details": "7120",
|
146 |
"highest_tail": 512
|
147 |
},
|
148 |
{
|
149 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
150 |
"score": 4432.0,
|
151 |
"details": "4928,5456,2912",
|
152 |
"highest_tail": 512
|
|
|
158 |
"highest_tail": 128
|
159 |
},
|
160 |
{
|
161 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
162 |
"score": 3036.0,
|
163 |
"details": "3036.0",
|
164 |
"highest_tail": 256
|
165 |
},
|
166 |
{
|
167 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
168 |
"score": 3136,
|
169 |
"details": "2148,2360,4900",
|
170 |
"highest_tail": 256
|
171 |
},
|
172 |
{
|
173 |
+
"model": "deepseek-r1-0528 (⚔️)",
|
174 |
"score": 3330.0,
|
175 |
"details": "3260,3400",
|
176 |
"highest_tail": 256
|
177 |
},
|
178 |
{
|
179 |
+
"model": "qwen3-235B-A22B-fp8 (⚔️)",
|
180 |
"score": 2144.0,
|
181 |
"details": "1436,2556,2440",
|
182 |
"highest_tail": 256
|
|
|
187 |
"runs": 3,
|
188 |
"results": [
|
189 |
{
|
190 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
191 |
"score": 14.7,
|
192 |
"details": "16,14,14"
|
193 |
},
|
194 |
{
|
195 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
196 |
"score": 16.3,
|
197 |
"details": "19,15,15"
|
198 |
},
|
199 |
{
|
200 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
201 |
"score": 14.3,
|
202 |
"details": "15,14,14"
|
203 |
},
|
204 |
{
|
205 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
206 |
"score": 16.3,
|
207 |
"details": "20,14,15"
|
208 |
},
|
209 |
{
|
210 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
211 |
"score": 23.3,
|
212 |
"details": "23,23,24"
|
213 |
},
|
214 |
{
|
215 |
+
"model": "grok-3-mini-beta (⚔️)",
|
216 |
"score": 21.3,
|
217 |
"details": "20,15,29"
|
218 |
},
|
219 |
{
|
220 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
221 |
"score": 10.3,
|
222 |
"details": "9,10,12"
|
223 |
},
|
224 |
{
|
225 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
226 |
"score": 13.7,
|
227 |
"details": "13,14,14"
|
228 |
},
|
229 |
{
|
230 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
231 |
"score": 14,
|
232 |
"details": "18,11,13"
|
233 |
},
|
234 |
{
|
235 |
+
"model": "o1-2024-12-17 (⚔️)",
|
236 |
"score": 35,
|
237 |
"details": "35"
|
238 |
},
|
239 |
{
|
240 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
241 |
"score": 11.7,
|
242 |
"details": "11,11,13"
|
243 |
},
|
244 |
{
|
245 |
+
"model": "o3-2025-04-16 (⚔️)",
|
246 |
"score": 42,
|
247 |
"details": "42"
|
248 |
},
|
249 |
{
|
250 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
251 |
"score": 25.3,
|
252 |
"details": "22,35,19"
|
253 |
},
|
|
|
257 |
"details": ""
|
258 |
},
|
259 |
{
|
260 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
261 |
"score": 20,
|
262 |
"details": "17,18,25"
|
263 |
},
|
264 |
{
|
265 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
266 |
"score": 19.33,
|
267 |
"details": "20,17,21"
|
268 |
},
|
269 |
{
|
270 |
+
"model": "deepseek-r1-0528 (⚔️)",
|
271 |
"score": 33.67,
|
272 |
"details": "26,34,41"
|
273 |
},
|
274 |
{
|
275 |
+
"model": "qwen3-235B-A22B-fp8 (⚔️)",
|
276 |
"score": 11.67,
|
277 |
"details": "13,14,8"
|
278 |
}
|
|
|
282 |
"runs": 3,
|
283 |
"results": [
|
284 |
{
|
285 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
286 |
"score": 106,
|
287 |
"details": "92,165,61"
|
288 |
},
|
289 |
{
|
290 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
291 |
"score": 484,
|
292 |
"details": "535,428,489"
|
293 |
},
|
294 |
{
|
295 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
296 |
"score": 447.3,
|
297 |
"details": "409,436,497"
|
298 |
},
|
299 |
{
|
300 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
301 |
"score": 334.7,
|
302 |
"details": "259,372,373"
|
303 |
},
|
304 |
{
|
305 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
306 |
"score": 416.3,
|
307 |
"details": "411,414,424"
|
308 |
},
|
309 |
{
|
310 |
+
"model": "grok-3-mini-beta (⚔️)",
|
311 |
"score": 254,
|
312 |
"details": "299,332,131"
|
313 |
},
|
314 |
{
|
315 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
316 |
"score": 128.7,
|
317 |
"details": "67,139,180"
|
318 |
},
|
319 |
{
|
320 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
321 |
"score": 182,
|
322 |
"details": "163,215,168"
|
323 |
},
|
324 |
{
|
325 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
326 |
"score": 147.3,
|
327 |
"details": "131,104,207"
|
328 |
},
|
329 |
{
|
330 |
+
"model": "o1-2024-12-17 (⚔️)",
|
331 |
"score": 159,
|
332 |
"details": "159"
|
333 |
},
|
334 |
{
|
335 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
336 |
"score": 48,
|
337 |
"details": "21,86,37"
|
338 |
},
|
339 |
{
|
340 |
+
"model": "o3-2025-04-16 (⚔️)",
|
341 |
"score": 647,
|
342 |
"details": "647"
|
343 |
},
|
344 |
{
|
345 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
346 |
"score": 487.3,
|
347 |
"details": "259,591,612"
|
348 |
},
|
|
|
352 |
"details": ""
|
353 |
},
|
354 |
{
|
355 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
356 |
"score": 464,
|
357 |
"details": "593,406,393"
|
358 |
},
|
359 |
{
|
360 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
361 |
"score": 478.33,
|
362 |
"details": "545,468,422"
|
363 |
},
|
364 |
{
|
365 |
+
"model": "deepseek-r1-0528 (⚔️)",
|
366 |
"score": 491.67,
|
367 |
"details": "464,463,548"
|
368 |
},
|
369 |
{
|
370 |
+
"model": "qwen3-235B-A22B-fp8 (⚔️)",
|
371 |
"score": 363.33,
|
372 |
"details": "365,372,353"
|
373 |
}
|
|
|
377 |
"runs": 3,
|
378 |
"results": [
|
379 |
{
|
380 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
381 |
"score": 0,
|
382 |
"detail_box_on_target": "0,0,0",
|
383 |
"cracked_levels": "0,0,0"
|
384 |
},
|
385 |
{
|
386 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
387 |
"score": 2.33,
|
388 |
"detail_box_on_target": "2,4,1",
|
389 |
"cracked_levels": "1,2,0"
|
390 |
},
|
391 |
{
|
392 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
393 |
"score": 1.33,
|
394 |
"detail_box_on_target": "2,0,2",
|
395 |
"cracked_levels": "1,0,1"
|
396 |
},
|
397 |
{
|
398 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
399 |
"score": 1.67,
|
400 |
"detail_box_on_target": "3,0,2",
|
401 |
"cracked_levels": "2,0,1"
|
402 |
},
|
403 |
{
|
404 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
405 |
"score": 4.33,
|
406 |
"detail_box_on_target": "4,4,5",
|
407 |
"cracked_levels": "2,2,3"
|
408 |
},
|
409 |
{
|
410 |
+
"model": "grok-3-mini-beta (⚔️)",
|
411 |
"score": 5.67,
|
412 |
"detail_box_on_target": "5,6,6",
|
413 |
"cracked_levels": "3,3,3"
|
414 |
},
|
415 |
{
|
416 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
417 |
"score": 0,
|
418 |
"detail_box_on_target": "0,0,0",
|
419 |
"cracked_levels": "0,0,0"
|
420 |
},
|
421 |
{
|
422 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
423 |
"score": 0,
|
424 |
"detail_box_on_target": "0,0,0",
|
425 |
"cracked_levels": "0,0,0"
|
426 |
},
|
427 |
{
|
428 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
429 |
"score": 0,
|
430 |
"detail_box_on_target": "0,0,0",
|
431 |
"cracked_levels": "0,0,0"
|
432 |
},
|
433 |
{
|
434 |
+
"model": "o1-2024-12-17 (⚔️)",
|
435 |
"score": 2.33,
|
436 |
"detail_box_on_target": "2,2,3",
|
437 |
"cracked_levels": "1,1,2"
|
438 |
},
|
439 |
{
|
440 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
441 |
"score": 1.33,
|
442 |
"detail_box_on_target": "1,2,1",
|
443 |
"cracked_levels": "0,1,0"
|
444 |
},
|
445 |
{
|
446 |
+
"model": "o3-2025-04-16 (⚔️)",
|
447 |
"score": 8,
|
448 |
"detail_box_on_target": "10,6",
|
449 |
"cracked_levels": "5,3"
|
450 |
},
|
451 |
{
|
452 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
453 |
"score": 5.33,
|
454 |
"detail_box_on_target": "4,6,6",
|
455 |
"cracked_levels": "2,2,3"
|
|
|
461 |
"cracked_levels": "0,0,0"
|
462 |
},
|
463 |
{
|
464 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
465 |
"score": 4,
|
466 |
"details": "4,4,4"
|
467 |
},
|
468 |
{
|
469 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
470 |
"score": 3,
|
471 |
"details": "2,2,5"
|
472 |
},
|
473 |
{
|
474 |
+
"model": "deepseek-r1-0528 (⚔️)",
|
475 |
"score": 4.67,
|
476 |
"details": "4,4,6"
|
477 |
},
|
478 |
{
|
479 |
+
"model": "qwen3-235B-A22B-fp8 (⚔️)",
|
480 |
"score": 2.33,
|
481 |
"details": "1,2,4"
|
482 |
}
|
|
|
486 |
"runs": 1,
|
487 |
"results": [
|
488 |
{
|
489 |
+
"model": "claude-3-5-sonnet-20241022 (⚔️)",
|
490 |
"score": 2,
|
491 |
"progress": "1:2/5",
|
492 |
"evaluator result": "1/3"
|
493 |
},
|
494 |
{
|
495 |
+
"model": "claude-3-7-sonnet-20250219 (⚔️)",
|
496 |
"score": 7,
|
497 |
"progress": "2:2/9",
|
498 |
"evaluator result": "5/11"
|
499 |
},
|
500 |
{
|
501 |
+
"model": "deepseek-r1-0120 (⚔️)",
|
502 |
"score": 0,
|
503 |
"progress": "0",
|
504 |
"evaluator result": "1/5"
|
505 |
},
|
506 |
{
|
507 |
+
"model": "gemini-2.5-flash-preview-04-17 (⚔️)",
|
508 |
"score": 4,
|
509 |
"progress": "1:4/5",
|
510 |
"evaluator result": "1/7"
|
511 |
},
|
512 |
{
|
513 |
+
"model": "gemini-2.5-pro-preview-05-06 (⚔️)",
|
514 |
"score": 7,
|
515 |
"progress": "2:2/9",
|
516 |
"evaluator result": "2/3"
|
517 |
},
|
518 |
{
|
519 |
+
"model": "grok-3-mini-beta (⚔️)",
|
520 |
"score": 0,
|
521 |
"progress": "0",
|
522 |
"evaluator result": "0"
|
523 |
},
|
524 |
{
|
525 |
+
"model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
|
526 |
"score": 0,
|
527 |
"progress": "0",
|
528 |
"evaluator result": "0"
|
529 |
},
|
530 |
{
|
531 |
+
"model": "gpt-4.1-2025-04-14 (⚔️)",
|
532 |
"score": 2,
|
533 |
"progress": "1:2/5",
|
534 |
"evaluator result": "2/3"
|
535 |
},
|
536 |
{
|
537 |
+
"model": "gpt-4o-2024-11-20 (⚔️)",
|
538 |
"score": 0,
|
539 |
"progress": "0",
|
540 |
"evaluator result": "0"
|
541 |
},
|
542 |
{
|
543 |
+
"model": "o1-2024-12-17 (⚔️)",
|
544 |
"score": 16,
|
545 |
"progress": "3: 2/8",
|
546 |
"evaluator result": "6/11"
|
547 |
},
|
548 |
{
|
549 |
+
"model": "o1-mini-2024-09-12 (⚔️)",
|
550 |
"score": 0,
|
551 |
"progress": "0",
|
552 |
"evaluator result": "1/5"
|
553 |
},
|
554 |
{
|
555 |
+
"model": "o3-2025-04-16 (⚔️)",
|
556 |
"score": 16,
|
557 |
"progress": "3: 2/8",
|
558 |
"evaluator result": "1/2"
|
559 |
},
|
560 |
{
|
561 |
+
"model": "o4-mini-2025-04-16 (⚔️)",
|
562 |
"score": 4,
|
563 |
"progress": "1:4/5",
|
564 |
"evaluator result": "2/5"
|
|
|
570 |
"evaluator result": "0"
|
571 |
},
|
572 |
{
|
573 |
+
"model": "claude-opus-4-20250514 (⚔️)",
|
574 |
"score": 6,
|
575 |
"details": "6"
|
576 |
},
|
577 |
{
|
578 |
+
"model": "claude-sonnet-4-20250514 (⚔️)",
|
579 |
"score": 3.67,
|
580 |
"details": "3,4,4"
|
581 |
},
|
582 |
{
|
583 |
+
"model": "gemini-2.5-flash-preview-05-20 (⚔️)",
|
584 |
"score": 4.33,
|
585 |
"details": "3,4,6"
|
586 |
}
|