Spaces:

lmgame
/

lmgame_bench

Running

lmgame_bench / leaderboard_utils.py

Yuxuan-Zhang-Dexter

update leaderboard with new agentic leaderboard layout

6d4c755 3 days ago

19.2 kB

	import pandas as pd
	import json
	import numpy as np

	# Define game order
	GAME_ORDER = [
	# "Super Mario Bros", # Commented out
	"Super Mario Bros",
	"Sokoban",
	"2048",
	"Candy Crush",
	# "Tetris (complete)", # Commented out
	"Tetris",
	"Ace Attorney"
	]

	def get_organization(model_name):
	m = model_name.lower()
	if "claude" in m:
	return "anthropic"
	elif "gemini" in m:
	return "google"
	elif "o1" in m or "gpt" in m or "o3" in m or "o4" in m:
	return "openai"
	elif "deepseek" in m:
	return "deepseek"
	elif "llama" in m:
	return "meta"
	elif "grok" in m:
	return "xai"
	else:
	return "unknown"


	def get_sokoban_leaderboard(rank_data, limit_to_top_n=None):
	data = rank_data.get("Sokoban", {}).get("results", [])
	df = pd.DataFrame(data)
	df = df.rename(columns={
	"model": "Player",
	"score": "Score",
	"steps": "Steps",
	"detail_box_on_target": "Detail Box On Target",
	"cracked_levels": "Levels Cracked"
	})
	df["Organization"] = df["Player"].apply(get_organization)

	# Define columns to keep, ensuring 'Score' is present
	columns_to_keep = ["Player", "Organization", "Score", "Levels Cracked", "Detail Box On Target", "Steps"]
	# Filter to only columns that actually exist in the DataFrame after renaming
	df_columns = [col for col in columns_to_keep if col in df.columns]
	df = df[df_columns]

	if "Score" in df.columns:
	df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
	df = df.sort_values("Score", ascending=False)
	# Apply limit if specified
	if limit_to_top_n is not None:
	df = df.head(limit_to_top_n)
	return df

	def get_2048_leaderboard(rank_data, limit_to_top_n=None):
	data = rank_data.get("2048", {}).get("results", [])
	# --- Diagnostic Print Removed ---
	# if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
	# print(f"DEBUG_UTILS: Keys in first item of raw data for 2048: {list(data[0].keys())}")
	# elif not data:
	# print("DEBUG_UTILS: Raw data for 2048 is empty.")
	# else:
	# print("DEBUG_UTILS: Raw data for 2048 is not in the expected list of dicts format.")
	# --- End Diagnostic Print Removed ---
	df = pd.DataFrame(data)
	# print(f"DEBUG_UTILS: Columns after pd.DataFrame(data): {df.columns.tolist()}") # REMOVED

	df = df.rename(columns={
	"model": "Player",
	"score": "Score", # From new JSON structure
	"details": "Details", # From new JSON structure
	"highest_tail": "Highest Tail" # Added new column
	# Old fields like "steps", "time", "rank" are removed
	})
	# print(f"DEBUG_UTILS: Columns after rename: {df.columns.tolist()}") # REMOVED

	# Ensure 'Player' column exists before applying get_organization
	if "Player" in df.columns:
	df["Organization"] = df["Player"].apply(get_organization)
	else:
	# Handle case where 'Player' column might be missing after rename (should not happen with current logic)
	# print("DEBUG_UTILS: 'Player' column not found after rename, skipping Organization.") # REMOVED
	df["Organization"] = "unknown" # Fallback

	columns_to_keep = ["Player", "Organization", "Score", "Highest Tail", "Details"] # Added "Highest Tail"

	# Defensive check for 'Highest Tail' before filtering - REMOVED
	# if 'highest_tail' in df.columns and 'Highest Tail' not in df.columns:
	# print("DEBUG_UTILS: 'highest_tail' (lowercase) found, but 'Highest Tail' (capitalized) not. This indicates a rename issue.")
	# elif 'Highest Tail' not in df.columns and 'highest_tail' not in df.columns:
	# print("DEBUG_UTILS: Neither 'Highest Tail' nor 'highest_tail' found in columns before filtering.")

	# df_columns = [col for col in columns_to_keep if col in df.columns] # REMOVED logic that used df_columns
	# print(f"DEBUG_UTILS: df_columns selected (columns that are in columns_to_keep AND in df.columns): {df_columns}") # REMOVED

	# Ensure all columns in columns_to_keep exist in df, fill with np.nan if not
	for col_k in columns_to_keep:
	if col_k not in df.columns:
	# print(f"DEBUG_UTILS: Column '{col_k}' from columns_to_keep not found in DataFrame. Adding it with NaN values.") # REMOVED
	df[col_k] = np.nan # Or some other default like 'n/a' if appropriate

	df = df[columns_to_keep] # Use columns_to_keep directly after ensuring they exist
	# print(f"DEBUG_UTILS: Columns after final selection: {df.columns.tolist()}") # REMOVED

	if "Score" in df.columns:
	df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
	df = df.sort_values("Score", ascending=False)
	# Apply limit if specified
	if limit_to_top_n is not None:
	df = df.head(limit_to_top_n)
	return df

	def get_candy_leaderboard(rank_data, limit_to_top_n=None):
	data = rank_data.get("Candy Crush", {}).get("results", [])
	df = pd.DataFrame(data)
	df = df.rename(columns={
	"model": "Player",
	"score": "Score",
	"details": "Details"
	})
	df["Organization"] = df["Player"].apply(get_organization)

	columns_to_keep = ["Player", "Organization", "Score", "Details"]
	df_columns = [col for col in columns_to_keep if col in df.columns]
	df = df[df_columns]

	if "Score" in df.columns:
	df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
	df = df.sort_values("Score", ascending=False)
	# Apply limit if specified
	if limit_to_top_n is not None:
	df = df.head(limit_to_top_n)
	return df

	def get_tetris_planning_leaderboard(rank_data, limit_to_top_n=None):
	data = rank_data.get("Tetris", {}).get("results", [])
	df = pd.DataFrame(data)
	df = df.rename(columns={
	"model": "Player",
	"score": "Score", # From new JSON structure
	"details": "Details" # From new JSON structure
	# Old fields like "steps_blocks", "rank" are removed
	})
	df["Organization"] = df["Player"].apply(get_organization)

	columns_to_keep = ["Player", "Organization", "Score", "Details"]
	df_columns = [col for col in columns_to_keep if col in df.columns]
	df = df[df_columns]

	if "Score" in df.columns:
	df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
	df = df.sort_values("Score", ascending=False)
	# Apply limit if specified
	if limit_to_top_n is not None:
	df = df.head(limit_to_top_n)
	return df

	def get_ace_attorney_leaderboard(rank_data, limit_to_top_n=None):
	data = rank_data.get("Ace Attorney", {}).get("results", [])
	df = pd.DataFrame(data)
	df = df.rename(columns={
	"model": "Player",
	"score": "Score",
	"progress": "Progress"
	})
	df["Organization"] = df["Player"].apply(get_organization)

	# Define columns to keep
	columns_to_keep = ["Player", "Organization", "Score", "Progress"]
	# Filter to only columns that actually exist in the DataFrame after renaming
	df_columns = [col for col in columns_to_keep if col in df.columns]
	df = df[df_columns]

	if "Score" in df.columns:
	df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
	df = df.sort_values("Score", ascending=False) # Higher score is better
	# Apply limit if specified
	if limit_to_top_n is not None:
	df = df.head(limit_to_top_n)
	return df

	def get_mario_planning_leaderboard(rank_data, limit_to_top_n=None):
	data = rank_data.get("Super Mario Bros", {}).get("results", [])
	df = pd.DataFrame(data)
	df = df.rename(columns={
	"model": "Player",
	"score": "Score",
	"detail_data": "Detail Data",
	"progress": "Progress"
	})
	df["Organization"] = df["Player"].apply(get_organization)
	# Define columns to keep
	columns_to_keep = ["Player", "Organization", "Score", "Progress", "Detail Data"]
	df_columns = [col for col in columns_to_keep if col in df.columns]
	df = df[df_columns]

	if "Score" in df.columns:
	df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
	df = df.sort_values("Score", ascending=False)
	# Apply limit if specified
	if limit_to_top_n is not None:
	df = df.head(limit_to_top_n)
	return df

	def calculate_rank_and_completeness(rank_data, selected_games):
	# Dictionary to store DataFrames for each game
	game_dfs = {}

	# Get DataFrames for selected games
	# if selected_games.get("Super Mario Bros"): # Commented out
	# game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
	if selected_games.get("Super Mario Bros"):
	game_dfs["Super Mario Bros"] = get_mario_planning_leaderboard(rank_data)
	if selected_games.get("Sokoban"):
	game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
	if selected_games.get("2048"):
	game_dfs["2048"] = get_2048_leaderboard(rank_data)
	if selected_games.get("Candy Crush"):
	game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
	# if selected_games.get("Tetris (complete)"): # Commented out
	# game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
	if selected_games.get("Tetris"):
	game_dfs["Tetris"] = get_tetris_planning_leaderboard(rank_data)
	if selected_games.get("Ace Attorney"):
	game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)

	# Get all unique players
	all_players = set()
	for df in game_dfs.values():
	all_players.update(df["Player"].unique())
	all_players = sorted(list(all_players))

	# Create results DataFrame
	results = []
	for player in all_players:
	player_data = {
	"Player": player,
	"Organization": get_organization(player)
	}
	ranks = []
	games_played = 0

	# Calculate rank and completeness for each game
	for game in GAME_ORDER:
	if game in game_dfs:
	df = game_dfs[game]
	if player in df["Player"].values:
	games_played += 1
	# Get player's score based on game type
	# if game == "Super Mario Bros": # Commented out
	# player_score = df[df["Player"] == player]["Score"].iloc[0]
	# rank = len(df[df["Score"] > player_score]) + 1
	if game == "Super Mario Bros":
	player_score = df[df["Player"] == player]["Score"].iloc[0]
	rank = len(df[df["Score"] > player_score]) + 1
	elif game == "Sokoban":
	player_score = df[df["Player"] == player]["Score"].iloc[0]
	rank = len(df[df["Score"] > player_score]) + 1
	elif game == "2048":
	player_score = df[df["Player"] == player]["Score"].iloc[0]
	rank = len(df[df["Score"] > player_score]) + 1
	elif game == "Candy Crush":
	player_score = df[df["Player"] == player]["Score"].iloc[0]
	rank = len(df[df["Score"] > player_score]) + 1
	elif game in ["Tetris"]:
	player_score = df[df["Player"] == player]["Score"].iloc[0]
	rank = len(df[df["Score"] > player_score]) + 1
	elif game == "Ace Attorney":
	player_score = df[df["Player"] == player]["Score"].iloc[0]
	rank = len(df[df["Score"] > player_score]) + 1

	ranks.append(rank)
	player_data[f"{game} Score"] = player_score
	else:
	player_data[f"{game} Score"] = 'n/a'

	# Calculate average rank and completeness for sorting
	if ranks:
	player_data["Average Rank"] = round(np.mean(ranks), 2)
	player_data["Games Played"] = games_played
	else:
	player_data["Average Rank"] = float('inf')
	player_data["Games Played"] = 0

	results.append(player_data)

	# Create DataFrame and sort by average rank and completeness
	df_results = pd.DataFrame(results)
	if not df_results.empty:
	# Sort by average rank (ascending) and games played (descending)
	df_results = df_results.sort_values(
	by=["Average Rank", "Games Played"],
	ascending=[True, False]
	)
	# Drop the sorting columns
	df_results = df_results.drop(["Average Rank", "Games Played"], axis=1)

	return df_results

	def get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None):
	"""
	Get combined leaderboard for selected games

	Args:
	rank_data (dict): Dictionary containing rank data
	selected_games (dict): Dictionary of game names and their selection status
	limit_to_top_n (int, optional): Limit results to top N entries. None means no limit.

	Returns:
	pd.DataFrame: Combined leaderboard DataFrame
	"""
	# Dictionary to store DataFrames for each game
	game_dfs = {}

	# Get DataFrames for selected games
	# if selected_games.get("Super Mario Bros"): # Commented out
	# game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
	if selected_games.get("Super Mario Bros"):
	game_dfs["Super Mario Bros"] = get_mario_planning_leaderboard(rank_data)
	if selected_games.get("Sokoban"):
	game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
	if selected_games.get("2048"):
	game_dfs["2048"] = get_2048_leaderboard(rank_data)
	if selected_games.get("Candy Crush"):
	game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
	# if selected_games.get("Tetris (complete)"): # Commented out
	# game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
	if selected_games.get("Tetris"):
	game_dfs["Tetris"] = get_tetris_planning_leaderboard(rank_data)
	if selected_games.get("Ace Attorney"):
	game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)

	# Get all unique players
	all_players = set()
	for df in game_dfs.values():
	all_players.update(df["Player"].unique())
	all_players = sorted(list(all_players))

	# Create results DataFrame
	results = []
	for player in all_players:
	player_data = {
	"Player": player,
	"Organization": get_organization(player)
	}

	# Add scores for each game
	for game in GAME_ORDER:
	if game in game_dfs:
	df = game_dfs[game]
	if player in df["Player"].values:
	# if game == "Super Mario Bros": # Commented out
	# player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
	if game == "Super Mario Bros":
	player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
	elif game == "Sokoban":
	player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
	elif game == "2048":
	player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
	elif game == "Candy Crush":
	player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
	elif game in ["Tetris"]:
	player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
	elif game == "Ace Attorney":
	player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
	else:
	player_data[f"{game} Score"] = 'n/a'

	results.append(player_data)

	# Create DataFrame
	df_results = pd.DataFrame(results)

	# Calculate normalized scores and average normalized score
	if not df_results.empty:
	# Import the normalize_values function from data_visualization
	from data_visualization import normalize_values

	# Calculate normalized scores for each game
	game_score_columns = []
	for game in GAME_ORDER:
	score_col = f"{game} Score"
	if score_col in df_results.columns:
	game_score_columns.append(score_col)
	# Get numeric values, replacing 'n/a' with NaN
	# Use where() to avoid FutureWarning about downcasting in replace()
	series = df_results[score_col].copy()
	series = series.where(series != 'n/a', np.nan)
	numeric_scores = pd.to_numeric(series, errors='coerce')

	# Skip games where all scores are NaN or 0
	valid_scores = numeric_scores.dropna()
	if len(valid_scores) > 0 and valid_scores.sum() > 0:
	mean = valid_scores.mean()
	std = valid_scores.std() if len(valid_scores) > 1 else 0

	# Calculate normalized scores for all players
	normalized_scores = []
	for _, row in df_results.iterrows():
	score = row[score_col]
	if score == 'n/a' or pd.isna(score):
	normalized_scores.append(0)
	else:
	normalized_scores.append(normalize_values([float(score)], mean, std)[0])

	df_results[f"norm_{score_col}"] = normalized_scores
	else:
	# If no valid scores, set all normalized scores to 0
	df_results[f"norm_{score_col}"] = 0

	# Calculate average normalized score across games
	normalized_columns = [f"norm_{col}" for col in game_score_columns if f"norm_{col}" in df_results.columns]
	if normalized_columns:
	df_results["Avg Normalized Score"] = df_results[normalized_columns].mean(axis=1).round(2)
	else:
	df_results["Avg Normalized Score"] = 0.0

	# Reorder columns to put Avg Normalized Score after Organization
	base_columns = ["Player", "Organization", "Avg Normalized Score"]
	game_columns = [col for col in df_results.columns if col.endswith(" Score") and not col.startswith("norm_") and col != "Avg Normalized Score"]
	other_columns = [col for col in df_results.columns if col not in base_columns + game_columns and not col.startswith("norm_")]

	# Create final column order
	final_columns = base_columns + game_columns + other_columns
	df_results = df_results[final_columns]

	# Sort by average normalized score in descending order
	df_results = df_results.sort_values("Avg Normalized Score", ascending=False)

	# Apply limit if specified
	if limit_to_top_n is not None:
	df_results = df_results.head(limit_to_top_n)

	return df_results