|
|
|
""" |
|
Interactive Benchmark Explorer |
|
A comprehensive web application for exploring OpenThoughts benchmark correlations and model performance |
|
""" |
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
from plotly.subplots import make_subplots |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
from scipy.stats import pearsonr, kendalltau |
|
from scipy.optimize import minimize |
|
import ast |
|
import io |
|
import base64 |
|
from itertools import combinations |
|
import warnings |
|
import time |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.ensemble import RandomForestRegressor |
|
from sklearn.cluster import KMeans |
|
from sklearn.metrics import r2_score, mean_absolute_error |
|
from scipy.optimize import curve_fit |
|
import re |
|
warnings.filterwarnings('ignore') |
|
|
|
|
|
st.set_page_config( |
|
page_title="OpenThoughts Evalchemy Benchmark Explorer", |
|
page_icon="📊", |
|
layout="wide", |
|
initial_sidebar_state="expanded" |
|
) |
|
|
|
|
|
st.markdown(""" |
|
<style> |
|
.main-header { |
|
font-size: 2.5rem; |
|
font-weight: bold; |
|
color: #1f77b4; |
|
text-align: center; |
|
margin-bottom: 2rem; |
|
} |
|
.metric-card { |
|
background-color: #f8f9fa; |
|
padding: 1rem; |
|
border-radius: 0.5rem; |
|
border-left: 4px solid #1f77b4; |
|
margin: 0.5rem 0; |
|
} |
|
.correlation-high { color: #d73027; font-weight: bold; } |
|
.correlation-medium { color: #fdae61; font-weight: bold; } |
|
.correlation-low { color: #4575b4; font-weight: bold; } |
|
.category-math { color: #d73027; font-weight: bold; } |
|
.category-code { color: #1f78b4; font-weight: bold; } |
|
.category-science { color: #33a02c; font-weight: bold; } |
|
.category-general { color: #ff7f00; font-weight: bold; } |
|
</style> |
|
""", unsafe_allow_html=True) |
|
|
|
@st.cache_data |
|
def load_trusted_models(): |
|
"""Load and parse trusted models from CSV file""" |
|
try: |
|
df = pd.read_csv('trusted_models.csv') |
|
trusted_models = [] |
|
experiment_codes = [] |
|
|
|
for idx, row in df.iterrows(): |
|
model_name = str(row['Model Name']).strip() |
|
if model_name and model_name != 'nan' and model_name not in ['LOW PRIORITY EVALS', 'Experiment Name']: |
|
trusted_models.append(model_name) |
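                # Names of the form "<letter><digits>_..." are experiment codes and
                # are tracked separately so they can be used for prefix matching later.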
|
|
|
if re.match(r'^[a-z]\d+_', model_name): |
|
experiment_codes.append(model_name) |
|
|
|
print(f"Loaded {len(trusted_models)} trusted models ({len(experiment_codes)} experiment codes)") |
|
return trusted_models, experiment_codes |
|
except Exception as e: |
|
print(f"Error loading trusted models: {e}") |
|
return [], [] |
|
|
|
def extract_experiment_pattern(model_name): |
|
"""Extract experiment pattern from mlfoundations-dev model names""" |
|
if not model_name.startswith('mlfoundations-dev/'): |
|
return None |
|
|
|
suffix = model_name[len('mlfoundations-dev/'):] |
|
|
|
|
|
    # First pattern: experiment code followed by one or more '_'-separated tokens
    # (tokens may contain any non-underscore characters).
    match = re.match(r'^([a-z]\d+_[^_]+(?:_[^_]+)*)', suffix)
    if match:
        return match.group(1)

    # Fallback: experiment code followed by letters and underscores only.
    match = re.match(r'^([a-z]\d+_[a-zA-Z_]+)', suffix)
    if match:
        return match.group(1)
|
|
|
return None |
|
|
|
def filter_trusted_models(df, trusted_models_data): |
|
"""Filter dataframe to only include trusted models with enhanced experiment matching""" |
|
if not trusted_models_data: |
|
return df |
|
|
|
|
|
if isinstance(trusted_models_data, tuple): |
|
trusted_models, experiment_codes = trusted_models_data |
|
else: |
|
|
|
trusted_models = trusted_models_data |
|
experiment_codes = [m for m in trusted_models if re.match(r'^[a-z]\d+_', m)] |
|
|
|
trusted_set = set(trusted_models) |
|
experiment_set = set(experiment_codes) |
|
|
|
def is_trusted_model(model_name): |
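        # Matching is deliberately permissive: exact match, HF-style "/" <-> "__"
        # name conversion, case-insensitive substring overlap, and mlfoundations-dev
        # experiment-code prefixes are all accepted.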
|
|
|
if model_name in trusted_set: |
|
return True |
|
|
|
|
|
|
|
model_name_converted = model_name.replace('/', '__') |
|
if model_name_converted in trusted_set: |
|
return True |
|
|
|
|
|
for trusted in trusted_models: |
|
trusted_converted = trusted.replace('__', '/') |
|
if model_name == trusted_converted: |
|
return True |
|
|
|
|
|
model_lower = model_name.lower() |
|
for trusted in trusted_models: |
|
trusted_lower = trusted.lower() |
|
|
|
|
|
trusted_converted_lower = trusted.replace('__', '/').lower() |
|
model_converted_lower = model_name.replace('/', '__').lower() |
|
|
|
|
|
if len(trusted_lower) >= 5 and trusted_lower in model_lower: |
|
return True |
|
if len(model_lower) >= 5 and model_lower in trusted_lower: |
|
return True |
|
if len(trusted_converted_lower) >= 5 and trusted_converted_lower in model_lower: |
|
return True |
|
if len(model_converted_lower) >= 5 and model_converted_lower in trusted_lower: |
|
return True |
|
|
|
|
|
if ('/' in model_name or '__' in model_name) and ('/' in trusted or '__' in trusted): |
|
|
|
model_core = model_name.replace('__', '/').split('/')[-1].lower() |
|
trusted_core = trusted.replace('__', '/').split('/')[-1].lower() |
|
if len(model_core) >= 3 and len(trusted_core) >= 3: |
|
if model_core in trusted_core or trusted_core in model_core: |
|
return True |
|
|
|
|
|
if model_name.startswith('mlfoundations-dev/'): |
|
pattern = extract_experiment_pattern(model_name) |
|
if pattern: |
|
|
|
if pattern in experiment_set: |
|
return True |
|
|
|
|
|
for exp_code in experiment_codes: |
|
if pattern.startswith(exp_code) or exp_code.startswith(pattern): |
|
return True |
|
|
|
return False |
|
|
|
|
|
trusted_indices = [idx for idx in df.index if is_trusted_model(idx)] |
|
filtered_df = df.loc[trusted_indices] |
|
|
|
return filtered_df if len(filtered_df) > 0 else df |
|
|
|
@st.cache_data |
|
def load_comprehensive_data(use_trusted_filter=True): |
|
"""Load and clean the comprehensive benchmark data.""" |
|
try: |
|
|
|
df = pd.read_csv("max_comprehensive_benchmark_scores.csv", index_col=0, encoding='utf-8') |
|
|
|
|
|
|
|
total_cols = len(df.columns) |
|
|
|
if total_cols > 20: |
|
|
|
progress_text = st.empty() |
|
progress_bar = st.progress(0) |
|
|
|
for i, col in enumerate(df.columns): |
|
if total_cols > 20: |
|
progress_text.text(f"Processing column {i+1}/{total_cols}: {col}") |
|
progress_bar.progress((i+1) / total_cols) |
|
|
|
def extract_value(x): |
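                    # Cells may hold stringified lists such as "[0.83]"; take the first
                    # element, otherwise fall back to a plain float cast (NaN on failure).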
|
if pd.isna(x): |
|
return np.nan |
|
if isinstance(x, str) and x.startswith('['): |
|
try: |
|
parsed = ast.literal_eval(x) |
|
if isinstance(parsed, list) and len(parsed) > 0: |
|
return float(parsed[0]) |
|
else: |
|
return np.nan |
|
except (ValueError, SyntaxError): |
|
return np.nan |
|
try: |
|
return float(x) |
|
except (ValueError, TypeError): |
|
return np.nan |
|
|
|
df[col] = df[col].apply(extract_value) |
|
df[col] = pd.to_numeric(df[col], errors='coerce') |
|
|
|
if total_cols > 20: |
|
progress_text.empty() |
|
progress_bar.empty() |
|
|
|
|
|
if use_trusted_filter: |
|
trusted_models_data = load_trusted_models() |
|
df = filter_trusted_models(df, trusted_models_data) |
|
|
|
|
|
min_benchmarks = 3 |
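        # Keep only models with at least `min_benchmarks` non-missing benchmark scores.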
|
df_filtered = df.dropna(thresh=min_benchmarks, axis=0) |
|
|
|
|
|
if len(df_filtered) == 0: |
|
st.error("No models found with sufficient benchmark data.") |
|
return pd.DataFrame() |
|
|
|
return df_filtered |
|
|
|
except FileNotFoundError: |
|
st.error("Could not find max_comprehensive_benchmark_scores.csv. Please ensure the data file exists.") |
|
return pd.DataFrame() |
|
except Exception as e: |
|
st.error(f"Error loading data: {str(e)}") |
|
return pd.DataFrame() |
|
|
|
@st.cache_data |
|
def load_stderr_data(use_trusted_filter=True): |
|
"""Load and clean standard error data.""" |
|
try: |
|
stderr_df = pd.read_csv("max_benchmark_standard_errors.csv", index_col=0, encoding='utf-8') |
|
|
|
|
|
for col in stderr_df.columns: |
|
def extract_value(x): |
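                # Same parsing as in load_comprehensive_data: stringified lists take
                # their first element, everything else is cast to float (NaN on failure).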
|
if pd.isna(x): |
|
return np.nan |
|
if isinstance(x, str) and x.startswith('['): |
|
try: |
|
parsed = ast.literal_eval(x) |
|
if isinstance(parsed, list) and len(parsed) > 0: |
|
return float(parsed[0]) |
|
else: |
|
return np.nan |
|
except (ValueError, SyntaxError): |
|
return np.nan |
|
try: |
|
return float(x) |
|
except (ValueError, TypeError): |
|
return np.nan |
|
|
|
stderr_df[col] = stderr_df[col].apply(extract_value) |
|
stderr_df[col] = pd.to_numeric(stderr_df[col], errors='coerce') |
|
|
|
|
|
if use_trusted_filter: |
|
trusted_models_data = load_trusted_models() |
|
stderr_df = filter_trusted_models(stderr_df, trusted_models_data) |
|
|
|
return stderr_df |
|
|
|
except FileNotFoundError: |
|
st.warning("Could not find max_benchmark_standard_errors.csv. Standard error analysis will be limited.") |
|
return pd.DataFrame() |
|
except Exception as e: |
|
st.warning(f"Error loading standard error data: {str(e)}") |
|
return pd.DataFrame() |
|
|
|
def clean_benchmark_name(name): |
|
"""Clean benchmark names for consistent display.""" |
|
return (name.replace("LiveCodeBench_accuracy_avg", "LiveCodeBenchv2") |
|
.replace('_accuracy_avg', '') |
|
.replace('_accuracy', '') |
|
.replace('LiveCodeBench', 'LCB') |
|
.replace('GPQADiamond', 'GPQAD') |
|
) |
|
|
|
def get_focused_benchmark_mapping(): |
|
"""Define the target benchmarks and categories.""" |
|
target_benchmarks = { |
|
|
|
'AIME24': 'AIME24_accuracy_avg', |
|
'AIME25': 'AIME25_accuracy_avg', |
|
'AMC23': 'AMC23_accuracy_avg', |
|
'MATH500': 'MATH500_accuracy', |
|
|
|
|
|
'CodeElo': 'CodeElo_accuracy_avg', |
|
'CodeForces': 'CodeForces_accuracy_avg', |
|
'LCBv2': 'LiveCodeBench_accuracy_avg', |
|
'LCBv5': 'LiveCodeBenchv5_accuracy_avg', |
|
|
|
|
|
'GPQADiamond': 'GPQADiamond_accuracy_avg', |
|
'JEEBench': 'JEEBench_accuracy_avg', |
|
|
|
|
|
'MMLUPro': 'MMLUPro_accuracy_avg', |
|
'HLE': 'HLE_accuracy_avg' |
|
} |
|
|
|
benchmark_categories = { |
|
'Math': ['AIME24', 'AIME25', 'AMC23', 'MATH500'], |
|
'Code': ['CodeElo', 'CodeForces', 'LCBv2', 'LCBv5'], |
|
'Science': ['GPQADiamond', 'JEEBench'], |
|
'General': ['MMLUPro', 'HLE'] |
|
} |
|
|
|
colors = {'Math': '#d73027', 'Code': '#1f78b4', 'Science': '#33a02c', 'General': '#ff7f00'} |
|
|
|
|
|
col_to_category = {} |
|
for category, bench_list in benchmark_categories.items(): |
|
for bench_name in bench_list: |
|
actual_name = target_benchmarks.get(bench_name) |
|
if actual_name: |
|
col_to_category[actual_name] = category |
|
|
|
return target_benchmarks, benchmark_categories, colors, col_to_category |
|
|
|
def compute_correlations(df, method='kendall'): |
|
"""Compute correlation matrix using specified method.""" |
|
if method == 'pearson': |
|
return df.corr(method='pearson') |
|
elif method == 'kendall': |
|
return df.corr(method='kendall') |
|
else: |
|
raise ValueError(f"Unsupported correlation method: {method}") |
|
|
|
def create_interactive_heatmap(corr_matrix, title="Correlation Heatmap"): |
|
"""Create an interactive correlation heatmap using Plotly.""" |
|
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping() |
|
|
|
|
|
clean_names = [clean_benchmark_name(name) for name in corr_matrix.columns] |
|
|
|
|
|
corr_matrix_pct = (corr_matrix * 100).round(1) |
|
|
|
|
|
hover_text = [] |
|
for i, bench1 in enumerate(corr_matrix.columns): |
|
hover_row = [] |
|
for j, bench2 in enumerate(corr_matrix.columns): |
|
if i == j: |
|
hover_row.append(f"{clean_names[i]}<br>Correlation: 100%") |
|
else: |
|
corr_val = corr_matrix_pct.iloc[i, j] |
|
if pd.isna(corr_val): |
|
hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>No data") |
|
else: |
|
hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>Correlation: {corr_val:.1f}%") |
|
hover_text.append(hover_row) |
|
|
|
|
|
fig = go.Figure(data=go.Heatmap( |
|
z=corr_matrix.values, |
|
x=clean_names, |
|
y=clean_names, |
|
colorscale='RdBu_r', |
|
zmid=0, |
|
text=corr_matrix_pct.values, |
|
texttemplate="%{text}", |
|
textfont={"size": 10}, |
|
hoverinfo='text', |
|
hovertext=hover_text, |
|
colorbar=dict(title="Correlation", tickformat=".2f") |
|
)) |
|
|
|
|
|
fig.update_layout( |
|
title=title, |
|
xaxis_title="", |
|
yaxis_title="", |
|
width=800, |
|
height=800, |
|
font=dict(size=12) |
|
) |
|
|
|
|
|
    # Note: per-benchmark category colors are available from get_focused_benchmark_mapping(),
    # but they are not applied to the axis tick labels here; the heatmap relies on
    # cell annotations and hover text instead.

    return fig
|
|
|
def create_scatter_plot(df, x_bench, y_bench, stderr_df=None): |
|
"""Create an interactive scatter plot between two benchmarks.""" |
|
if x_bench not in df.columns or y_bench not in df.columns: |
|
return None |
|
|
|
|
|
common_data = df[[x_bench, y_bench]].dropna() |
|
|
|
if len(common_data) < 3: |
|
return None |
|
|
|
x_vals = common_data[x_bench] |
|
y_vals = common_data[y_bench] |
|
|
|
|
|
corr, p_val = pearsonr(x_vals, y_vals) |
|
|
|
|
|
fig = go.Figure() |
|
|
|
|
|
fig.add_trace(go.Scatter( |
|
x=x_vals, |
|
y=y_vals, |
|
mode='markers', |
|
text=common_data.index, |
|
hovertemplate=( |
|
"<b>%{text}</b><br>" + |
|
f"{clean_benchmark_name(x_bench)}: %{{x:.3f}}<br>" + |
|
f"{clean_benchmark_name(y_bench)}: %{{y:.3f}}<br>" + |
|
"<extra></extra>" |
|
), |
|
marker=dict(size=8, opacity=0.7, color='steelblue') |
|
)) |
|
|
|
|
|
z = np.polyfit(x_vals, y_vals, 1) |
|
p = np.poly1d(z) |
|
x_line = np.linspace(x_vals.min(), x_vals.max(), 100) |
|
|
|
|
|
if p_val < 0.001: |
|
p_str = f"p < 0.001" |
|
else: |
|
p_str = f"p = {p_val:.3f}" |
|
|
|
fig.add_trace(go.Scatter( |
|
x=x_line, |
|
y=p(x_line), |
|
mode='lines', |
|
name=f'r = {corr:.3f}, {p_str}', |
|
line=dict(color='red', dash='dash') |
|
)) |
|
|
|
|
|
fig.update_layout( |
|
title=f"{clean_benchmark_name(y_bench)} vs {clean_benchmark_name(x_bench)}", |
|
xaxis_title=clean_benchmark_name(x_bench), |
|
yaxis_title=clean_benchmark_name(y_bench), |
|
showlegend=True, |
|
width=600, |
|
height=500 |
|
) |
|
|
|
return fig |
|
|
|
def filter_target_benchmarks(df): |
|
"""Filter dataframe to only include target benchmarks.""" |
|
target_benchmarks, _, _, _ = get_focused_benchmark_mapping() |
|
|
|
available_benchmarks = [] |
|
for display_name, actual_name in target_benchmarks.items(): |
|
if actual_name in df.columns: |
|
available_benchmarks.append(actual_name) |
|
|
|
return df[available_benchmarks].copy() |
|
|
|
@st.cache_data |
|
def estimate_missing_ranks(df, method='kendall', min_corr=0.1, min_benchmarks=2, _version="v2_fixed_ranking"): |
|
""" |
|
Estimate missing ranks using rank correlation. |
|
Now ensures ALL missing values are filled. |
|
|
|
Parameters: |
|
----------- |
|
df: DataFrame |
|
Input data with missing values |
|
method: Rank correlation method ('kendall') |
|
min_corr: float |
|
Minimum correlation threshold for using a benchmark (lowered to 0.1) |
|
min_benchmarks: int |
|
Minimum number of benchmarks needed for estimation (lowered to 2) |
|
_version: str |
|
Version parameter to force cache invalidation when ranking logic changes |
|
""" |
|
|
|
df_ranks = df.rank(ascending=False) |
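    # Rank models within each benchmark column; rank 1 = highest score.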
|
|
|
|
|
if method == 'kendall': |
|
rank_corr_matrix = df_ranks.corr(method='kendall') |
|
else: |
|
raise ValueError(f"Unsupported correlation method: {method}") |
|
|
|
|
|
valid_correlations = {} |
|
for benchmark in df.columns: |
|
valid_correlations[benchmark] = [] |
|
for other_bench in df.columns: |
|
if benchmark != other_bench: |
|
corr_val = rank_corr_matrix.loc[benchmark, other_bench] |
|
if not pd.isna(corr_val) and abs(corr_val) >= min_corr: |
|
valid_correlations[benchmark].append((other_bench, abs(corr_val))) |
|
|
|
valid_correlations[benchmark].sort(key=lambda x: x[1], reverse=True) |
|
|
|
|
|
missing_count = 0 |
|
total_missing = df_ranks.isna().sum().sum() |
|
|
|
for model_idx in df.index: |
|
available_benchmarks = df_ranks.columns[df_ranks.loc[model_idx].notna()].tolist() |
|
|
|
if len(available_benchmarks) >= min_benchmarks: |
|
for benchmark in df.columns: |
|
if pd.isna(df_ranks.loc[model_idx, benchmark]): |
|
|
|
valid_pairs = valid_correlations[benchmark] |
|
|
|
correlations = [] |
|
ranks = [] |
|
|
|
|
|
for other_bench, corr_strength in valid_pairs: |
|
if other_bench in available_benchmarks: |
|
correlations.append(corr_strength) |
|
ranks.append(df_ranks.loc[model_idx, other_bench]) |
|
|
|
|
|
if len(correlations) >= 8: |
|
break |
|
|
|
|
|
if len(correlations) == 0: |
|
for other_bench in available_benchmarks: |
|
if other_bench != benchmark: |
|
corr_val = rank_corr_matrix.loc[benchmark, other_bench] |
|
if not pd.isna(corr_val): |
|
correlations.append(max(0.01, abs(corr_val))) |
|
ranks.append(df_ranks.loc[model_idx, other_bench]) |
|
|
|
|
|
if len(correlations) == 0: |
|
available_ranks = [df_ranks.loc[model_idx, bench] for bench in available_benchmarks] |
|
if available_ranks: |
|
estimated_rank = np.mean(available_ranks) |
|
df_ranks.loc[model_idx, benchmark] = estimated_rank |
|
missing_count += 1 |
|
else: |
|
|
|
correlations = np.array(correlations) |
|
ranks = np.array(ranks) |
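                            # Estimate the missing rank as a weighted average of the model's
                            # available ranks, weighted by |rank correlation| with this benchmark.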
|
|
|
|
|
weights = correlations / correlations.sum() |
|
estimated_rank = np.average(ranks, weights=weights) |
|
|
|
df_ranks.loc[model_idx, benchmark] = estimated_rank |
|
missing_count += 1 |
|
|
|
return df_ranks |
|
|
|
@st.cache_data |
|
def create_consensus_ranking(df, method='kendall', use_rank_imputation=True, min_benchmarks_for_ranking=6): |
|
""" |
|
Create a consensus ranking using rank correlation-based estimation. |
|
|
|
Parameters: |
|
----------- |
|
df: DataFrame |
|
Input data with models as rows and benchmarks as columns |
|
method: str |
|
Correlation method for rank imputation ('kendall') |
|
use_rank_imputation: bool |
|
Whether to use rank imputation for missing values |
|
min_benchmarks_for_ranking: int |
|
Minimum number of original benchmarks required for a model to be included in ranking |
|
|
|
Returns: |
|
tuple: (ranking_df, rank_matrix, metadata) |
|
""" |
|
|
|
original_coverage = df.notna().sum(axis=1) |
|
models_with_sufficient_data = original_coverage[original_coverage >= min_benchmarks_for_ranking].index |
|
|
|
if len(models_with_sufficient_data) == 0: |
|
|
|
min_benchmarks_for_ranking = max(1, original_coverage.max() // 2) |
|
models_with_sufficient_data = original_coverage[original_coverage >= min_benchmarks_for_ranking].index |
|
|
|
|
|
df_filtered = df.loc[models_with_sufficient_data] |
|
|
|
if use_rank_imputation: |
|
|
|
df_ranks = estimate_missing_ranks(df_filtered, method) |
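        # Consensus rank = median of each model's per-benchmark ranks (observed + imputed);
        # the median is robust to a single unusually good or bad benchmark.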
|
|
|
|
|
consensus_ranks = df_ranks.median(axis=1, skipna=True) |
|
|
|
|
|
original_coverage_filtered = df_filtered.notna().sum(axis=1) |
|
imputed_coverage = df_ranks.notna().sum(axis=1) |
|
estimated_count = imputed_coverage - original_coverage_filtered |
|
|
|
|
|
ranking_data = [] |
|
for model in df_filtered.index: |
|
ranking_data.append({ |
|
'Model': model.split('/')[-1] if '/' in model else model, |
|
'Full_Model_Name': model, |
|
'Consensus_Rank': float(consensus_ranks[model]), |
|
'Original_Benchmarks': int(original_coverage_filtered[model]), |
|
'Total_Benchmarks': len(df_filtered.columns), |
|
'Estimated_Ranks': int(estimated_count[model]), |
|
'Coverage_Pct': float(original_coverage_filtered[model] / len(df_filtered.columns) * 100) |
|
}) |
|
|
|
ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True) |
|
|
|
metadata = { |
|
'method': method, |
|
'imputation_used': True, |
|
'total_estimates': int(estimated_count.sum()), |
|
'models_with_estimates': int((estimated_count > 0).sum()), |
|
'ranking_method': 'consensus_rank', |
|
'min_benchmarks_required': min_benchmarks_for_ranking, |
|
'models_filtered_out': len(df) - len(df_filtered), |
|
'total_benchmarks': len(df_filtered.columns) |
|
} |
|
|
|
else: |
|
|
|
df_ranks = df_filtered.rank(method='min', ascending=False, na_option='keep') |
|
median_ranks = df_ranks.median(axis=1, skipna=True) |
|
|
|
        ranking_data = []
        n_available = df_filtered.notna().sum(axis=1)
        for model in df_filtered.index:
            ranking_data.append({
                'Model': model.split('/')[-1] if '/' in model else model,
                'Full_Model_Name': model,
                'Consensus_Rank': float(median_ranks[model]),
                'Original_Benchmarks': int(n_available[model]),
                'Total_Benchmarks': int(n_available[model]),
                'Estimated_Ranks': 0,
                'Coverage_Pct': float(n_available[model] / len(df_filtered.columns) * 100)
            })
|
|
|
ranking_df = pd.DataFrame(ranking_data).sort_values('Consensus_Rank', ascending=True) |
|
|
|
metadata = { |
|
'method': 'none', |
|
'imputation_used': False, |
|
'total_estimates': 0, |
|
'models_with_estimates': 0, |
|
'ranking_method': 'median_rank', |
|
'min_benchmarks_required': min_benchmarks_for_ranking, |
|
'models_filtered_out': len(df) - len(df_filtered), |
|
'total_benchmarks': len(df_filtered.columns) |
|
} |
|
|
|
return ranking_df, df_ranks, metadata |
|
|
|
@st.cache_data |
|
def create_optimized_radar_chart(df_display, selected_models, selected_benchmarks_for_radar): |
|
"""Create an optimized radar chart for the selected models and benchmarks.""" |
|
if not selected_benchmarks_for_radar or not selected_models: |
|
return None |
|
|
|
|
|
filtered_data = df_display.loc[selected_models, selected_benchmarks_for_radar] |
|
clean_benchmark_names = [clean_benchmark_name(b) for b in selected_benchmarks_for_radar] |
|
|
|
|
|
colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', |
|
'#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] |
|
|
|
fig = go.Figure() |
|
|
|
|
|
all_values = filtered_data.values.flatten() |
|
all_values = all_values[~pd.isna(all_values)] |
|
|
|
if len(all_values) > 0: |
|
min_val = float(np.min(all_values)) |
|
max_val = float(np.max(all_values)) |
|
|
|
range_padding = (max_val - min_val) * 0.1 |
|
radar_min = max(0, min_val - range_padding) |
|
radar_max = min(1, max_val + range_padding) |
|
else: |
|
radar_min, radar_max = 0, 1 |
|
|
|
for i, model in enumerate(selected_models): |
|
|
|
model_scores = [] |
|
for benchmark in selected_benchmarks_for_radar: |
|
score = filtered_data.loc[model, benchmark] |
|
|
|
model_scores.append(0.0 if pd.isna(score) else float(score)) |
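        # Missing scores are plotted as 0.0; the first point is repeated below to close the polygon.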
|
|
|
|
|
radar_values = model_scores + [model_scores[0]] |
|
radar_benchmarks = clean_benchmark_names + [clean_benchmark_names[0]] |
|
|
|
|
|
model_display_name = model.split('/')[-1] if '/' in model else model |
|
|
|
|
|
model_color = colors_list[i % len(colors_list)] |
|
|
|
fig.add_trace(go.Scatterpolar( |
|
r=radar_values, |
|
theta=radar_benchmarks, |
|
fill='toself', |
|
name=model_display_name, |
|
line_color=model_color, |
|
hovertemplate='<b>%{theta}</b><br>Score: %{r:.3f}<extra></extra>' |
|
)) |
|
|
|
|
|
chart_height = 600 if len(selected_models) <= 3 else 700 |
|
|
|
fig.update_layout( |
|
polar=dict( |
|
radialaxis=dict( |
|
visible=True, |
|
range=[radar_min, radar_max], |
|
tickformat='.2f' |
|
)), |
|
showlegend=True, |
|
title=f"Model Performance Radar Chart ({len(selected_benchmarks_for_radar)} benchmarks, {len(selected_models)} models)", |
|
width=700, |
|
height=chart_height |
|
) |
|
|
|
return fig |
|
|
|
def weighted_correlation(x, y, weights): |
|
"""Compute weighted Pearson correlation coefficient.""" |
|
|
|
valid_mask = ~(np.isnan(x) | np.isnan(y) | np.isnan(weights)) |
|
if valid_mask.sum() < 3: |
|
return np.nan, np.nan |
|
|
|
x_clean = x[valid_mask] |
|
y_clean = y[valid_mask] |
|
w_clean = weights[valid_mask] |
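    # Weighted Pearson correlation: all moments (means, covariance, variances)
    # are computed with np.average(..., weights=w_clean).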
|
|
|
|
|
x_mean = np.average(x_clean, weights=w_clean) |
|
y_mean = np.average(y_clean, weights=w_clean) |
|
|
|
|
|
cov = np.average((x_clean - x_mean) * (y_clean - y_mean), weights=w_clean) |
|
var_x = np.average((x_clean - x_mean)**2, weights=w_clean) |
|
var_y = np.average((y_clean - y_mean)**2, weights=w_clean) |
|
|
|
|
|
if var_x == 0 or var_y == 0: |
|
return np.nan, np.nan |
|
|
|
corr = cov / np.sqrt(var_x * var_y) |
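    # Approximate significance test using the effective sample size for weighted data
    # (Kish approximation): n_eff = (sum of weights)**2 / (sum of squared weights).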
|
|
|
|
|
|
|
sum_w = np.sum(w_clean) |
|
sum_w2 = np.sum(w_clean**2) |
|
eff_n = sum_w**2 / sum_w2 |
|
|
|
|
|
if eff_n > 3: |
|
from scipy.stats import t |
|
se_corr = np.sqrt((1 - corr**2) / (eff_n - 2)) |
|
t_stat = corr / se_corr |
|
p_value = 2 * (1 - t.cdf(abs(t_stat), eff_n - 2)) |
|
else: |
|
p_value = np.nan |
|
|
|
return corr, p_value |
|
|
|
def match_scores_with_stderr(scores_df, stderr_df, target_benchmarks): |
|
"""Match score columns with their corresponding stderr columns.""" |
|
target_benchmarks_dict, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping() |
|
|
|
score_to_stderr_mapping = {} |
|
|
|
|
|
for col in target_benchmarks: |
|
stderr_col = None |
|
|
|
|
|
potential_stderr_names = [ |
|
f"{col}_std_err", |
|
f"{col.replace('_accuracy', '_accuracy_std_err')}", |
|
f"{col.replace('_accuracy_avg', '_accuracy_std_err')}", |
|
] |
|
|
|
|
|
if col == 'MATH500_accuracy': |
|
potential_stderr_names.extend([ |
|
'MATH500x2_accuracy_std_err', |
|
'MATH500_accuracy_std_err' |
|
]) |
|
|
|
|
|
base_name = col.replace('_accuracy_avg', '').replace('_accuracy', '') |
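        # Some runs store standard errors under an "x2" repeat suffix
        # (e.g. "<benchmark>x2_accuracy_std_err"), so try those column names as well.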
|
potential_stderr_names.extend([ |
|
f"{base_name}x2_accuracy_std_err", |
|
f"{base_name}_accuracy_std_err" |
|
]) |
|
|
|
|
|
for stderr_name in potential_stderr_names: |
|
if stderr_name in stderr_df.columns: |
|
|
|
non_null_count = stderr_df[stderr_name].notna().sum() |
|
if non_null_count >= 10: |
|
stderr_col = stderr_name |
|
break |
|
|
|
if stderr_col: |
|
score_to_stderr_mapping[col] = stderr_col |
|
|
|
return score_to_stderr_mapping |
|
|
|
def create_uncertainty_aware_correlation_matrix(scores_df, stderr_df, score_to_stderr_mapping): |
|
"""Create correlation matrix accounting for measurement uncertainties.""" |
|
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping() |
|
|
|
benchmarks = list(score_to_stderr_mapping.keys()) |
|
n_benchmarks = len(benchmarks) |
|
|
|
|
|
corr_matrix = np.full((n_benchmarks, n_benchmarks), np.nan) |
|
pvalue_matrix = np.full((n_benchmarks, n_benchmarks), np.nan) |
|
weighted_corr_matrix = np.full((n_benchmarks, n_benchmarks), np.nan) |
|
weighted_pvalue_matrix = np.full((n_benchmarks, n_benchmarks), np.nan) |
|
|
|
for i, bench1 in enumerate(benchmarks): |
|
for j, bench2 in enumerate(benchmarks): |
|
if i == j: |
|
|
|
stderr_col = score_to_stderr_mapping[bench1] |
|
|
|
|
|
|
|
scores = scores_df[bench1].dropna() |
|
stderrs = stderr_df[stderr_col].dropna() |
|
|
|
|
|
common_idx = scores.index.intersection(stderrs.index) |
|
if len(common_idx) >= 3: |
|
aligned_scores = scores.loc[common_idx] |
|
aligned_stderrs = stderrs.loc[common_idx] |
|
|
|
|
|
total_variance = aligned_scores.var() |
|
|
|
|
|
mean_error_variance = (aligned_stderrs**2).mean() |
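                        # Reliability = 1 - (mean measurement-error variance / total score variance):
                        # the estimated fraction of observed variance that is true signal.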
|
|
|
|
|
if total_variance > 0: |
|
reliability = max(0, 1 - (mean_error_variance / total_variance)) |
|
|
|
|
|
corr_matrix[i, j] = 1.0 |
|
pvalue_matrix[i, j] = 0.0 |
|
|
|
|
|
weighted_corr_matrix[i, j] = reliability |
|
weighted_pvalue_matrix[i, j] = 0.0 |
|
else: |
|
corr_matrix[i, j] = 1.0 |
|
weighted_corr_matrix[i, j] = 0.0 |
|
pvalue_matrix[i, j] = 0.0 |
|
weighted_pvalue_matrix[i, j] = 0.0 |
|
else: |
|
|
|
corr_matrix[i, j] = 1.0 |
|
weighted_corr_matrix[i, j] = np.nan |
|
pvalue_matrix[i, j] = 0.0 |
|
weighted_pvalue_matrix[i, j] = np.nan |
|
continue |
|
|
|
|
|
|
|
common_idx = scores_df.index.intersection(stderr_df.index) |
|
|
|
x = scores_df.loc[common_idx, bench1].values |
|
y = scores_df.loc[common_idx, bench2].values |
|
|
|
|
|
stderr1_col = score_to_stderr_mapping[bench1] |
|
stderr2_col = score_to_stderr_mapping[bench2] |
|
|
|
|
|
valid_mask = ~(np.isnan(x) | np.isnan(y)) |
|
if valid_mask.sum() >= 3: |
|
corr, p_val = pearsonr(x[valid_mask], y[valid_mask]) |
|
corr_matrix[i, j] = corr |
|
pvalue_matrix[i, j] = p_val |
|
|
|
|
|
stderr1 = stderr_df.loc[common_idx, stderr1_col].values |
|
stderr2 = stderr_df.loc[common_idx, stderr2_col].values |
|
|
|
|
|
|
|
valid_stderr_mask = ~(np.isnan(stderr1) | np.isnan(stderr2)) & valid_mask |
|
if valid_stderr_mask.sum() >= 3: |
|
combined_variance = stderr1[valid_stderr_mask]**2 + stderr2[valid_stderr_mask]**2 |
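                    # Inverse-variance weights: model pairs measured more precisely
                    # (smaller combined standard error) get more weight in the correlation.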
|
|
|
weights = np.where(combined_variance > 0, 1.0 / combined_variance, 0) |
|
|
|
if weights.sum() > 0: |
|
w_corr, w_p_val = weighted_correlation( |
|
x[valid_stderr_mask], |
|
y[valid_stderr_mask], |
|
weights |
|
) |
|
weighted_corr_matrix[i, j] = w_corr |
|
weighted_pvalue_matrix[i, j] = w_p_val |
|
else: |
|
|
|
if valid_mask.sum() >= 3: |
|
weighted_corr_matrix[i, j] = corr_matrix[i, j] |
|
weighted_pvalue_matrix[i, j] = pvalue_matrix[i, j] |
|
|
|
|
|
corr_df = pd.DataFrame(corr_matrix, index=benchmarks, columns=benchmarks) |
|
pvalue_df = pd.DataFrame(pvalue_matrix, index=benchmarks, columns=benchmarks) |
|
weighted_corr_df = pd.DataFrame(weighted_corr_matrix, index=benchmarks, columns=benchmarks) |
|
weighted_pvalue_df = pd.DataFrame(weighted_pvalue_matrix, index=benchmarks, columns=benchmarks) |
|
|
|
return corr_df, pvalue_df, weighted_corr_df, weighted_pvalue_df |
|
|
|
def create_uncertainty_weighted_heatmap_plotly(weighted_corr_df, title_prefix="Uncertainty-Weighted Correlation Analysis"): |
|
"""Create a single uncertainty-weighted heatmap using Plotly.""" |
|
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping() |
|
|
|
|
|
clean_names = [clean_benchmark_name(name) for name in weighted_corr_df.columns] |
|
|
|
|
|
weighted_corr_pct = (weighted_corr_df * 100).round(1) |
|
|
|
|
|
hover_text_weighted = [] |
|
for i, bench1 in enumerate(weighted_corr_df.columns): |
|
hover_row = [] |
|
for j, bench2 in enumerate(weighted_corr_df.columns): |
|
if i == j: |
|
reliability = weighted_corr_df.iloc[i, j] |
|
if pd.isna(reliability): |
|
hover_row.append(f"{clean_names[i]}<br>Reliability: Unknown") |
|
else: |
|
hover_row.append(f"{clean_names[i]}<br>Reliability: {reliability*100:.1f}%") |
|
else: |
|
corr_val = weighted_corr_pct.iloc[i, j] |
|
if pd.isna(corr_val): |
|
hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>No weighted data") |
|
else: |
|
hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>Weighted correlation: {corr_val:.1f}%") |
|
hover_text_weighted.append(hover_row) |
|
|
|
|
|
fig = go.Figure(data=go.Heatmap( |
|
z=weighted_corr_df.values, |
|
x=clean_names, |
|
y=clean_names, |
|
colorscale='RdBu_r', |
|
zmid=0, |
|
text=weighted_corr_pct.values, |
|
texttemplate="%{text}", |
|
textfont={"size": 10}, |
|
hoverinfo='text', |
|
hovertext=hover_text_weighted, |
|
colorbar=dict(title="Correlation") |
|
)) |
|
|
|
|
|
fig.update_layout( |
|
title=f"{title_prefix}<br><sub>Diagonal shows reliability coefficients (signal-to-noise ratios)</sub>", |
|
width=800, |
|
height=700, |
|
font=dict(size=12), |
|
xaxis=dict(tickangle=45), |
|
yaxis=dict(tickangle=0) |
|
) |
|
|
|
return fig |
|
|
|
def create_uncertainty_aware_heatmap_plotly(corr_df, weighted_corr_df, title_prefix="Correlation Analysis"): |
|
"""Create side-by-side interactive heatmaps comparing regular vs weighted correlations using Plotly.""" |
|
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping() |
|
|
|
|
|
clean_names = [clean_benchmark_name(name) for name in corr_df.columns] |
|
|
|
|
|
fig = make_subplots( |
|
rows=1, cols=2, |
|
subplot_titles=('Regular Correlation Matrix<br>(Equal weighting)', |
|
'Uncertainty-Weighted Correlation Matrix<br>(Inverse variance weighting)'), |
|
horizontal_spacing=0.15 |
|
) |
|
|
|
|
|
corr_matrix_pct = (corr_df * 100).round(1) |
|
|
|
|
|
hover_text_regular = [] |
|
for i, bench1 in enumerate(corr_df.columns): |
|
hover_row = [] |
|
for j, bench2 in enumerate(corr_df.columns): |
|
if i == j: |
|
hover_row.append(f"{clean_names[i]}<br>Self-correlation: 100%") |
|
else: |
|
corr_val = corr_matrix_pct.iloc[i, j] |
|
if pd.isna(corr_val): |
|
hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>No data") |
|
else: |
|
hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>Correlation: {corr_val:.1f}%") |
|
hover_text_regular.append(hover_row) |
|
|
|
fig.add_trace(go.Heatmap( |
|
z=corr_df.values, |
|
x=clean_names, |
|
y=clean_names, |
|
colorscale='RdBu_r', |
|
zmid=0, |
|
text=corr_matrix_pct.values, |
|
texttemplate="%{text}", |
|
textfont={"size": 8}, |
|
hoverinfo='text', |
|
hovertext=hover_text_regular, |
|
showscale=False, |
|
name="Regular" |
|
), row=1, col=1) |
|
|
|
|
|
weighted_corr_pct = (weighted_corr_df * 100).round(1) |
|
|
|
|
|
hover_text_weighted = [] |
|
for i, bench1 in enumerate(weighted_corr_df.columns): |
|
hover_row = [] |
|
for j, bench2 in enumerate(weighted_corr_df.columns): |
|
if i == j: |
|
reliability = weighted_corr_df.iloc[i, j] |
|
if pd.isna(reliability): |
|
hover_row.append(f"{clean_names[i]}<br>Reliability: Unknown") |
|
else: |
|
hover_row.append(f"{clean_names[i]}<br>Reliability: {reliability*100:.1f}%") |
|
else: |
|
corr_val = weighted_corr_pct.iloc[i, j] |
|
if pd.isna(corr_val): |
|
hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>No weighted data") |
|
else: |
|
hover_row.append(f"{clean_names[i]} vs {clean_names[j]}<br>Weighted correlation: {corr_val:.1f}%") |
|
hover_text_weighted.append(hover_row) |
|
|
|
fig.add_trace(go.Heatmap( |
|
z=weighted_corr_df.values, |
|
x=clean_names, |
|
y=clean_names, |
|
colorscale='RdBu_r', |
|
zmid=0, |
|
text=weighted_corr_pct.values, |
|
texttemplate="%{text}", |
|
textfont={"size": 8}, |
|
hoverinfo='text', |
|
hovertext=hover_text_weighted, |
|
showscale=True, |
|
colorbar=dict(title="Correlation", x=1.02), |
|
name="Weighted" |
|
), row=1, col=2) |
|
|
|
|
|
fig.update_layout( |
|
title=f"{title_prefix}<br><sub>Diagonal shows reliability coefficients for weighted matrix</sub>", |
|
width=1400, |
|
height=700, |
|
font=dict(size=12) |
|
) |
|
|
|
|
|
fig.update_xaxes(tickangle=45, row=1, col=1) |
|
fig.update_xaxes(tickangle=45, row=1, col=2) |
|
fig.update_yaxes(tickangle=0, row=1, col=1) |
|
fig.update_yaxes(tickangle=0, row=1, col=2) |
|
|
|
return fig |
|
|
|
def main(): |
|
"""Main application.""" |
|
|
|
if 'analysis_mode' not in st.session_state: |
|
st.session_state.analysis_mode = "📊 Overview Dashboard" |
|
if 'use_verified_models' not in st.session_state: |
|
st.session_state.use_verified_models = True |
|
if 'selected_categories' not in st.session_state: |
|
st.session_state.selected_categories = [] |
|
if 'filter_zeros' not in st.session_state: |
|
st.session_state.filter_zeros = True |
|
if 'min_models' not in st.session_state: |
|
st.session_state.min_models = 10 |
|
|
|
st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>', |
|
unsafe_allow_html=True) |
|
|
|
|
|
st.sidebar.header("🎛️ Controls") |
|
|
|
|
|
    analysis_options = ["📊 Overview Dashboard", "🔥 Correlation Heatmap", "📈 Scatter Plot Explorer",
                        "🎯 Model Performance", "🔬 Uncertainty Analysis"]
    # Fall back to the first mode if session state holds a stale mode name
    # (e.g. "📋 Statistical Summary" from an older version of the app).
    analysis_mode = st.sidebar.selectbox(
        "Choose Analysis Mode",
        analysis_options,
        index=analysis_options.index(st.session_state.analysis_mode)
        if st.session_state.analysis_mode in analysis_options else 0,
        key="analysis_mode"
    )
|
|
|
|
|
st.sidebar.subheader("Data Filters") |
|
|
|
|
|
use_verified_models = st.sidebar.checkbox( |
|
"Include only verified models", |
|
value=st.session_state.use_verified_models, |
|
key="use_verified_models" |
|
) |
|
|
|
|
|
start_time = time.time() |
|
df = load_comprehensive_data(use_verified_models) |
|
stderr_df = load_stderr_data(use_verified_models) |
|
load_time = time.time() - start_time |
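    # load_time is measured but not currently displayed anywhere in the UI.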
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if df.empty: |
|
st.error("No data available. Please check that the CSV files are properly uploaded and accessible.") |
|
return |
|
|
|
|
|
df = filter_target_benchmarks(df) |
|
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping() |
|
|
|
|
|
if not st.session_state.selected_categories: |
|
st.session_state.selected_categories = list(benchmark_categories.keys()) |
|
|
|
|
|
selected_categories = st.sidebar.multiselect( |
|
"Select Benchmark Categories", |
|
list(benchmark_categories.keys()), |
|
default=st.session_state.selected_categories, |
|
key="selected_categories" |
|
) |
|
|
|
|
|
filtered_benchmarks = [] |
|
for category in selected_categories: |
|
for bench_name in benchmark_categories[category]: |
|
actual_name = target_benchmarks.get(bench_name) |
|
if actual_name in df.columns: |
|
filtered_benchmarks.append(actual_name) |
|
|
|
if filtered_benchmarks: |
|
df_display = df[filtered_benchmarks].copy() |
|
else: |
|
df_display = df.copy() |
|
|
|
|
|
filter_zeros = st.sidebar.checkbox( |
|
"Filter out zero/near-zero values", |
|
value=st.session_state.filter_zeros, |
|
key="filter_zeros" |
|
) |
|
if filter_zeros: |
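        # Treat exact zeros and near-zero scores (< 0.01) as missing values.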
|
for col in df_display.columns: |
|
df_display.loc[(df_display[col] == 0) | (df_display[col] < 0.01), col] = np.nan |
|
|
|
|
|
coverage_counts = [df_display[col].notna().sum() for col in df_display.columns] |
|
if coverage_counts: |
|
min_coverage = min(coverage_counts) |
|
max_coverage = max(coverage_counts) |
|
        # Clamp the default so it never exceeds the slider's max_value
        # (max_coverage can be below 10 when few models are loaded).
        default_min = min(max(10, min_coverage), max_coverage)

        if st.session_state.min_models > max_coverage:
            st.session_state.min_models = default_min
|
|
|
min_models = st.sidebar.slider( |
|
"Minimum models per benchmark", |
|
min_value=0, |
|
max_value=max_coverage, |
|
value=st.session_state.min_models, |
|
help=f"Current range: {min_coverage} to {max_coverage} models. Set to 0 to include all benchmarks.", |
|
key="min_models" |
|
) |
|
else: |
|
min_models = 10 |
|
|
|
|
|
valid_benchmarks = [] |
|
for col in df_display.columns: |
|
if df_display[col].notna().sum() >= min_models: |
|
valid_benchmarks.append(col) |
|
df_display = df_display[valid_benchmarks] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if analysis_mode == "📊 Overview Dashboard": |
|
show_overview_dashboard(df_display, stderr_df) |
|
|
|
elif analysis_mode == "🔥 Correlation Heatmap": |
|
show_interactive_heatmap(df_display, stderr_df) |
|
|
|
elif analysis_mode == "📈 Scatter Plot Explorer": |
|
show_scatter_explorer(df_display, stderr_df) |
|
|
|
elif analysis_mode == "🎯 Model Performance": |
|
show_model_performance(df_display) |
|
|
|
elif analysis_mode == "🔬 Uncertainty Analysis": |
|
show_uncertainty_analysis(df_display, stderr_df) |
|
|
|
def show_overview_dashboard(df, stderr_df): |
|
"""Show the overview dashboard.""" |
|
st.header("📊 Overview Dashboard") |
|
|
|
|
|
col1, col2, col3, col4 = st.columns(4) |
|
|
|
with col1: |
|
st.metric("Models", len(df)) |
|
|
|
with col2: |
|
st.metric("Benchmarks", len(df.columns)) |
|
|
|
with col3: |
|
total_evals = df.notna().sum().sum() |
|
st.metric("Total Evaluations", f"{total_evals:,}") |
|
|
|
with col4: |
|
avg_coverage = (df.notna().sum() / len(df)).mean() * 100 |
|
st.metric("Avg Coverage", f"{avg_coverage:.1f}%") |
|
|
|
|
|
st.subheader("Benchmark Coverage") |
|
|
|
coverage_data = [] |
|
target_benchmarks, benchmark_categories, colors, col_to_category = get_focused_benchmark_mapping() |
|
|
|
for col in df.columns: |
|
coverage = int(df[col].notna().sum()) |
|
category = col_to_category.get(col, 'Unknown') |
|
clean_name = clean_benchmark_name(col) |
|
|
|
|
|
        coverage_data.append({
            'Benchmark': str(clean_name),
            'Coverage': coverage,
            'Percentage': float(coverage / len(df) * 100),
            'Category': str(category)
        })
|
|
|
if coverage_data: |
|
coverage_df = pd.DataFrame(coverage_data).sort_values('Coverage', ascending=True) |
|
|
|
|
|
coverage_df['Coverage'] = coverage_df['Coverage'].astype(int) |
|
coverage_df['Percentage'] = coverage_df['Percentage'].astype(float) |
|
coverage_df['Benchmark'] = coverage_df['Benchmark'].astype(str) |
|
coverage_df['Category'] = coverage_df['Category'].astype(str) |
|
|
|
|
|
fig = px.bar(coverage_df, |
|
x='Coverage', |
|
y='Benchmark', |
|
color='Category', |
|
color_discrete_map=colors, |
|
title="Model Coverage by Benchmark", |
|
labels={'Coverage': 'Number of Models'}, |
|
orientation='h', |
|
text='Coverage') |
|
|
|
|
|
fig.update_traces(texttemplate='%{text}', textposition='outside') |
|
fig.update_layout( |
|
height=max(400, len(coverage_df) * 25), |
|
showlegend=True, |
|
xaxis_title="Number of Models", |
|
yaxis_title="Benchmark" |
|
) |
|
|
|
st.plotly_chart(fig, use_container_width=True) |
|
else: |
|
st.warning("No coverage data available to display.") |
|
|
|
|
|
st.subheader("Quick Correlation Insights") |
|
|
|
corr_matrix = compute_correlations(df, 'kendall') |
|
|
|
|
|
pairs = [] |
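    # Collect every unique benchmark pair with its correlation and category labels.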
|
for i, bench1 in enumerate(corr_matrix.columns): |
|
for j, bench2 in enumerate(corr_matrix.columns[i+1:], i+1): |
|
if not pd.isna(corr_matrix.iloc[i, j]): |
|
cat1 = col_to_category.get(bench1, 'Unknown') |
|
cat2 = col_to_category.get(bench2, 'Unknown') |
|
pairs.append((bench1, bench2, corr_matrix.iloc[i, j], cat1, cat2)) |
|
|
|
pairs.sort(key=lambda x: abs(x[2]), reverse=True) |
|
|
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
st.markdown("**🔥 Top 5 Highest Correlations**") |
|
for i, (bench1, bench2, corr, cat1, cat2) in enumerate(pairs[:5]): |
|
st.write(f"{i+1}. {clean_benchmark_name(bench1)} ↔ {clean_benchmark_name(bench2)} r = {corr:.3f}") |
|
|
|
with col2: |
|
st.markdown("**📊 Category Analysis**") |
|
within_cat = [p[2] for p in pairs if p[3] == p[4]] |
|
across_cat = [p[2] for p in pairs if p[3] != p[4]] |
|
|
|
if within_cat: |
|
st.write(f"Within-category avg: {np.mean(within_cat):.3f}") |
|
if across_cat: |
|
st.write(f"Across-category avg: {np.mean(across_cat):.3f}") |
|
|
|
st.write(f"Total pairs analyzed: {len(pairs)}") |
|
|
|
def show_interactive_heatmap(df, stderr_df): |
|
"""Display interactive correlation heatmap with various options.""" |
|
st.header("🔥 Correlation Heatmap") |
|
|
|
col1, col2, col3 = st.columns(3) |
|
|
|
with col1: |
|
|
|
        stderr_available = stderr_df is not None and not stderr_df.empty
|
uncertainty_aware = False |
|
if stderr_available: |
|
uncertainty_aware = st.checkbox( |
|
"🔬 Uncertainty-Aware Analysis", |
|
value=False, |
|
help="Use measurement uncertainties to weight correlations (requires standard error data)" |
|
) |
|
|
|
|
|
if uncertainty_aware: |
|
st.selectbox( |
|
"Correlation Method", |
|
["pearson"], |
|
index=0, |
|
disabled=True, |
|
help="**Uncertainty-aware analysis uses Pearson correlations only**\n\nWeighted correlations require parametric methods to properly account for measurement uncertainties." |
|
) |
|
method = "pearson" |
|
else: |
|
method = st.selectbox( |
|
"Correlation Method", |
|
["kendall", "pearson"], |
|
help="Pearson: Measures linear relationships\nKendall: Measures ordinal relationships" |
|
) |
|
|
|
|
|
if uncertainty_aware and stderr_df is not None: |
|
st.info("🔬 **Uncertainty-Aware Mode**: Correlations are weighted by inverse measurement variance. " |
|
"Diagonal shows reliability coefficients (proportion of variance that is 'true signal' vs measurement error).") |
|
|
|
|
|
available_benchmarks = list(df.columns) |
|
score_to_stderr_mapping = match_scores_with_stderr(df, stderr_df, available_benchmarks) |
|
|
|
if len(score_to_stderr_mapping) == 0: |
|
st.warning("No matching standard error data found for the selected benchmarks. " |
|
"Falling back to regular correlation analysis.") |
|
uncertainty_aware = False |
|
else: |
|
|
|
benchmarks_with_stderr = list(score_to_stderr_mapping.keys()) |
|
df_stderr = df[benchmarks_with_stderr].copy() |
|
|
|
st.success(f"Found standard error data for {len(score_to_stderr_mapping)} benchmarks: " |
|
f"{', '.join([clean_benchmark_name(b) for b in benchmarks_with_stderr])}") |
|
|
|
|
|
common_models = df_stderr.index.intersection(stderr_df.index) |
|
df_aligned = df_stderr.loc[common_models] |
|
stderr_aligned = stderr_df.loc[common_models] |
|
|
|
st.write(f"**Analysis scope**: {len(common_models)} models with both scores and standard errors") |
|
|
|
|
|
with st.spinner("Computing uncertainty-weighted correlations..."): |
|
corr_df, pvalue_df, weighted_corr_df, weighted_pvalue_df = create_uncertainty_aware_correlation_matrix( |
|
df_aligned, stderr_aligned, score_to_stderr_mapping |
|
) |
|
|
|
|
|
fig = create_uncertainty_weighted_heatmap_plotly( |
|
weighted_corr_df, |
|
title_prefix=f"Uncertainty-Weighted {method.capitalize()} Correlations" |
|
) |
|
|
|
st.plotly_chart(fig, use_container_width=True) |
|
|
|
|
|
with st.expander("📊 Reliability Statistics", expanded=False): |
|
st.write("**Benchmark Reliability Coefficients** (proportion of variance that is true signal):") |
|
reliability_data = [] |
|
for bench in weighted_corr_df.columns: |
|
diag_val = weighted_corr_df.loc[bench, bench] |
|
if not pd.isna(diag_val): |
|
reliability_data.append({ |
|
'Benchmark': clean_benchmark_name(bench), |
|
'Reliability': f"{diag_val*100:.1f}%", |
|
'Category': next((cat for cat, benchs in get_focused_benchmark_mapping()[1].items() |
|
for b in benchs if get_focused_benchmark_mapping()[0].get(b) == bench), 'Unknown') |
|
}) |
|
|
|
if reliability_data: |
|
reliability_df = pd.DataFrame(reliability_data) |
|
st.dataframe(reliability_df, use_container_width=True) |
|
|
|
avg_reliability = pd.to_numeric([d['Reliability'].rstrip('%') for d in reliability_data]).mean() / 100 |
|
st.metric("Average Reliability", f"{avg_reliability:.3f} ({avg_reliability*100:.1f}%)") |
|
|
|
|
|
with st.expander("📈 Impact of Uncertainty Weighting", expanded=False): |
|
st.write("**Correlation Changes** (Weighted - Regular):") |
|
|
|
diff_data = [] |
|
for i, bench1 in enumerate(corr_df.columns): |
|
for j, bench2 in enumerate(corr_df.columns): |
|
if i < j: |
|
regular_corr = corr_df.iloc[i, j] |
|
weighted_corr = weighted_corr_df.iloc[i, j] |
|
|
|
if not (pd.isna(regular_corr) or pd.isna(weighted_corr)): |
|
diff = weighted_corr - regular_corr |
|
diff_data.append({ |
|
'Benchmark Pair': f"{clean_benchmark_name(bench1)} vs {clean_benchmark_name(bench2)}", |
|
'Regular': f"{regular_corr:.3f}", |
|
'Weighted': f"{weighted_corr:.3f}", |
|
'Difference': f"{diff:+.3f}", |
|
'Abs Difference': abs(diff) |
|
}) |
|
|
|
if diff_data: |
|
diff_df = pd.DataFrame(diff_data) |
|
|
|
diff_df_sorted = diff_df.sort_values('Abs Difference', ascending=False) |
|
st.dataframe(diff_df_sorted.drop('Abs Difference', axis=1), use_container_width=True) |
|
|
|
|
|
diffs = [float(d['Difference']) for d in diff_data] |
|
col1, col2, col3 = st.columns(3) |
|
with col1: |
|
st.metric("Mean Change", f"{np.mean(diffs):+.4f}") |
|
with col2: |
|
st.metric("Max |Change|", f"{max(abs(d) for d in diffs):.4f}") |
|
with col3: |
|
st.metric("Large Changes (|Δ| > 0.1)", f"{sum(1 for d in diffs if abs(d) > 0.1)}") |
|
|
|
|
|
if df.empty: |
|
st.error("No data available.") |
|
return |
|
|
|
|
|
corr_matrix = compute_correlations(df, method) |
|
|
|
if corr_matrix.empty: |
|
st.error("Unable to compute correlations.") |
|
return |
|
|
|
|
|
fig = create_interactive_heatmap(corr_matrix, f"{method.capitalize()} Correlation Matrix") |
|
|
|
st.plotly_chart(fig, use_container_width=True) |
|
|
|
|
|
st.subheader("Correlation Statistics") |
|
|
|
|
|
mask = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1) |
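    # Upper triangle only (k=1 excludes the diagonal) so each benchmark pair is counted once.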
|
corr_values = corr_matrix.where(mask).stack().dropna() |
|
|
|
col1, col2, col3, col4 = st.columns(4) |
|
|
|
with col1: |
|
st.metric("Mean Correlation", f"{corr_values.mean():.3f}") |
|
|
|
with col2: |
|
st.metric("Median Correlation", f"{corr_values.median():.3f}") |
|
|
|
with col3: |
|
st.metric("Max Correlation", f"{corr_values.max():.3f}") |
|
|
|
with col4: |
|
st.metric("Min Correlation", f"{corr_values.min():.3f}") |
|
|
|
|
|
st.subheader("Correlation Distribution") |
|
|
|
fig_hist = px.histogram(corr_values, |
|
nbins=20, |
|
title="Distribution of Pairwise Correlations", |
|
labels={'value': 'Correlation Coefficient', 'count': 'Frequency'}) |
|
st.plotly_chart(fig_hist, use_container_width=True) |
|
|
|
|
|
with st.expander("ℹ️ About Correlation Methods", expanded=False): |
|
st.markdown(""" |
|
**Pearson**: Measures linear relationships. Values range from -1 to +1. |
|
- +1: Perfect positive linear relationship |
|
- 0: No linear relationship |
|
- -1: Perfect negative linear relationship |
|
|
|
**Kendall**: Measures ordinal association using concordant/discordant pairs. |
|
- More robust for small samples |
|
- Better for data with many tied values |
|
|
|
**Uncertainty-Aware Analysis**: When available, uses measurement standard errors to: |
|
- Weight correlations by inverse measurement variance |
|
- Show reliability coefficients (signal-to-noise ratios) on diagonal |
|
- Provide more accurate correlation estimates for noisy data |
|
""") |
|
|
|
def show_scatter_explorer(df, stderr_df): |
|
"""Show the scatter plot explorer with integrated simplified modeling.""" |
|
|
|
if 'scatter_x_benchmark' not in st.session_state: |
|
st.session_state.scatter_x_benchmark = df.columns[0] if len(df.columns) > 0 else None |
|
if 'scatter_y_benchmark' not in st.session_state: |
|
        st.session_state.scatter_y_benchmark = df.columns[1] if len(df.columns) > 1 else (df.columns[0] if len(df.columns) > 0 else None)
|
|
|
st.header("📈 Scatter Plot Explorer") |
|
|
|
|
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
|
|
x_index = 0 |
|
if st.session_state.scatter_x_benchmark in df.columns: |
|
x_index = list(df.columns).index(st.session_state.scatter_x_benchmark) |
|
|
|
x_benchmark = st.selectbox( |
|
"X-axis Benchmark", |
|
df.columns, |
|
index=x_index, |
|
format_func=clean_benchmark_name, |
|
key="scatter_x_benchmark" |
|
) |
|
|
|
with col2: |
|
|
|
y_index = 1 if len(df.columns) > 1 else 0 |
|
if st.session_state.scatter_y_benchmark in df.columns: |
|
y_index = list(df.columns).index(st.session_state.scatter_y_benchmark) |
|
|
|
y_benchmark = st.selectbox( |
|
"Y-axis Benchmark", |
|
df.columns, |
|
index=y_index, |
|
format_func=clean_benchmark_name, |
|
key="scatter_y_benchmark" |
|
) |
|
|
|
if x_benchmark and y_benchmark and x_benchmark != y_benchmark: |
|
|
|
fig, models = create_advanced_scatter_plot(df, x_benchmark, y_benchmark, stderr_df) |
|
|
|
if fig and models: |
|
st.plotly_chart(fig, use_container_width=True) |
|
|
|
|
|
best_model = models[0] |
|
st.info(f"**Best fit: {best_model['name']}** (R² = {best_model['r2']:.3f})") |
|
|
|
|
|
if best_model['type'] == 'linear': |
|
st.caption("📏 Linear relationship: One benchmark increases proportionally with the other.") |
|
elif best_model['type'] == 'saturation': |
|
if 'direction' in best_model and best_model['direction'] == 'flipped': |
|
st.caption("📈 Inverse saturation: The Y-axis benchmark plateaus as X-axis benchmark increases.") |
|
else: |
|
st.caption("📈 Saturation: One benchmark plateaus as the other increases.") |
|
|
|
|
|
with st.expander("ℹ️ How saturation fitting works", expanded=False): |
|
st.markdown(""" |
|
**Saturation Model**: `y = a × (1 - e^(-b×x)) + c` |
|
|
|
**Bidirectional Fitting Process**: |
|
1. **Try both directions**: Fit `y = f(x)` and `x = f(y)` |
|
2. **Choose best fit**: Select direction with higher R² score |
|
3. **Consistent plotting**: Curve coordinates are computed in the best-fitting direction and plotted identically regardless of axis orientation |
|
|
|
**Why this matters**: Some relationships are better modeled in one direction (e.g., performance plateaus as model size increases). The algorithm automatically finds the best direction and ensures the curve looks the same whether you plot X vs Y or Y vs X. |
|
|
|
**Parameters**: |
|
- `a`: Maximum change (amplitude) |
|
- `b`: Rate of saturation (higher = faster plateau) |
|
- `c`: Baseline offset |
|
""") |
|
|
|
|
|
if best_model.get('preferred', False) and 'preference_reason' in best_model: |
|
st.caption(f"ℹ️ {best_model['preference_reason']}") |
|
|
|
else: |
|
st.warning("Insufficient data for modeling (need at least 5 data points).") |
|
|
|
|
|
common_data = df[[x_benchmark, y_benchmark]].dropna() |
|
|
|
if len(common_data) >= 3: |
|
col1, col2, col3 = st.columns(3) |
|
|
|
|
|
pearson_r, pearson_p = pearsonr(common_data[x_benchmark], common_data[y_benchmark]) |
|
kendall_r, kendall_p = kendalltau(common_data[x_benchmark], common_data[y_benchmark]) |
|
|
|
|
|
            def format_pvalue(p):
                """Format a p-value and return (label, short interpretation)."""
                if p < 0.001:
                    info = "p < 0.001 indicates very strong evidence of an association, typically reflecting both an adequate sample size and a sizable effect."
                    return "p < 0.001", info
                elif p < 0.05:
                    info = "p < 0.05 indicates moderate evidence of an association."
                    return f"p = {p:.3f}", info
                elif p < 0.1:
                    info = "p < 0.1 indicates only weak evidence, often due to a small sample and/or a weak relationship."
                    return f"p = {p:.3f}", info
                else:
                    info = "p >= 0.1 indicates little evidence of an association, often due to a small sample and/or a weak relationship."
                    return f"p = {p:.3f}", info
|
|
|
|
|
with col1: |
|
p_value, info = format_pvalue(pearson_p) |
|
st.metric("Pearson r", f"{pearson_r:.3f}", help="Pearson's r is a parametric measure of linear correlation.") |
|
st.caption(p_value, help=info) |
|
|
|
with col2: |
|
p_value, info = format_pvalue(kendall_p) |
|
st.metric("Kendall τ", f"{kendall_r:.3f}", help="Kendall's tau is a non-parametric measure of ordinal correlation that is robust to outliers.") |
|
st.caption(p_value, help=info) |
|
|
|
with col3: |
|
|
|
st.subheader("Data Points") |
|
display_data = common_data.copy() |
|
display_data.columns = [clean_benchmark_name(col) for col in display_data.columns] |
|
st.dataframe(display_data, use_container_width=True) |
|
else: |
|
st.info("Please select two different benchmarks to compare.") |
|
|
|
def show_model_performance(df): |
|
"""Show model performance analysis.""" |
|
|
|
if 'model_search_term' not in st.session_state: |
|
st.session_state.model_search_term = "" |
|
if 'use_rank_imputation' not in st.session_state: |
|
st.session_state.use_rank_imputation = True |
|
if 'min_corr' not in st.session_state: |
|
st.session_state.min_corr = 0.3 |
|
if 'min_benchmarks_for_ranking' not in st.session_state: |
|
st.session_state.min_benchmarks_for_ranking = 6 |
|
|
|
st.header("🎯 Model Performance Analysis") |
|
|
|
|
|
search_term = st.text_input( |
|
"🔍 Search for models", |
|
value=st.session_state.model_search_term, |
|
placeholder="Enter model name or part of name", |
|
key="model_search_term" |
|
) |
|
|
|
|
|
if search_term: |
|
matching_models = df.index[df.index.str.contains(search_term, case=False, na=False)] |
|
if len(matching_models) > 0: |
|
df_display = df.loc[matching_models] |
|
else: |
|
st.warning(f"No models found matching '{search_term}'") |
|
df_display = df |
|
else: |
|
df_display = df |
|
|
|
|
|
st.subheader("Model Rankings") |
|
|
|
|
|
col1, col2, col3, col4 = st.columns(4) |
|
|
|
with col1: |
|
use_rank_imputation = st.checkbox( |
|
"Use rank-based estimation", |
|
value=st.session_state.use_rank_imputation, |
|
help="Estimate missing rankings using Kendall rank correlations between benchmarks. More fair than simple averaging.", |
|
key="use_rank_imputation" |
|
) |
|
|
|
with col2: |
|
if use_rank_imputation: |
|
|
|
rank_method = "kendall" |
|
st.info("🔢 Using Kendall rank correlation (robust to outliers and tied values)") |
|
else: |
|
rank_method = "none" |
|
|
|
with col3: |
|
if use_rank_imputation: |
|
min_corr = st.slider( |
|
"Min correlation threshold", |
|
min_value=0.1, |
|
max_value=0.8, |
|
value=st.session_state.min_corr, |
|
step=0.1, |
|
help="Minimum rank correlation required to use a benchmark for prediction", |
|
key="min_corr" |
|
) |
|
else: |
|
min_corr = 0.3 |
|
|
|
with col4: |
|
min_benchmarks_for_ranking = st.slider( |
|
"Min benchmarks required", |
|
min_value=1, |
|
max_value=12, |
|
value=st.session_state.min_benchmarks_for_ranking, |
|
step=1, |
|
help="Minimum number of original benchmarks required for a model to be included in ranking", |
|
key="min_benchmarks_for_ranking" |
|
) |
|
|
|
|
|
|
|
if use_rank_imputation and len(df) > 50: |
|
with st.spinner(f"Computing consensus rankings for {len(df)} models..."): |
|
full_ranking_df, rank_matrix, metadata = create_consensus_ranking( |
|
df, |
|
method=rank_method, |
|
use_rank_imputation=use_rank_imputation, |
|
min_benchmarks_for_ranking=min_benchmarks_for_ranking |
|
) |
|
else: |
|
full_ranking_df, rank_matrix, metadata = create_consensus_ranking( |
|
df, |
|
method=rank_method, |
|
use_rank_imputation=use_rank_imputation, |
|
min_benchmarks_for_ranking=min_benchmarks_for_ranking |
|
) |
|
|
|
|
|
if search_term: |
|
matching_models = df.index[df.index.str.contains(search_term, case=False, na=False)] |
|
if len(matching_models) > 0: |
|
|
|
ranking_df = full_ranking_df[full_ranking_df['Full_Model_Name'].isin(matching_models)] |
|
else: |
|
st.warning(f"No models found matching '{search_term}'") |
|
ranking_df = full_ranking_df.head(0) |
|
else: |
|
ranking_df = full_ranking_df |
|
|
|
|
|
if search_term: |
|
if len(ranking_df) > 0: |
|
st.info(f"🔍 Found {len(ranking_df)} models matching '{search_term}'. " |
|
f"Rankings computed on full dataset of {len(full_ranking_df)} models.") |
|
else: |
|
st.warning(f"No models found matching '{search_term}'") |
|
elif metadata['models_filtered_out'] > 0: |
|
st.info(f"ℹ️ Filtered out {metadata['models_filtered_out']} models with fewer than {metadata['min_benchmarks_required']} benchmarks. " |
|
f"Ranking {len(ranking_df)} models on {metadata['total_benchmarks']} benchmarks.") |
|
else: |
|
st.success(f"✅ All {len(ranking_df)} models meet the minimum benchmark requirement ({metadata['min_benchmarks_required']} benchmarks).") |
|
|
|
|
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
|
|
if search_term: |
|
st.markdown(f"**🔍 Models matching '{search_term}'**") |
|
models_to_show = ranking_df |
|
show_count = len(ranking_df) |
|
else: |
|
st.markdown("**🏆 Top 10 Models**") |
|
models_to_show = ranking_df.head(10) |
|
show_count = min(10, len(ranking_df)) |
|
|
|
if metadata['imputation_used']: |
|
st.caption(f"🔬 Using {metadata['method']} rank correlations with {metadata['total_estimates']} estimated ranks") |
|
else: |
|
st.caption("📊 Using median rank of available rankings") |
|
|
|
|
|
|
|
# Use a scrollable fixed-height container only for long search-result lists;
# the per-model rendering is identical either way.
results_container = st.container(height=400) if (search_term and show_count > 20) else st.container()
with results_container:
    for i, (idx, row) in enumerate(models_to_show.iterrows()):
        actual_rank = full_ranking_df.index.get_loc(idx) + 1

        estimated_info = f" (+{row['Estimated_Ranks']} est.)" if row['Estimated_Ranks'] > 0 else ""
        coverage_info = f"{row['Coverage_Pct']:.0f}%"

        st.write(f"{actual_rank}. **{row['Model']}** (median rank: {row['Consensus_Rank']:.1f})")
        if metadata['imputation_used']:
            st.caption(f" 📊 {row['Original_Benchmarks']}/{row['Total_Benchmarks']} benchmarks{estimated_info}")
        else:
            st.caption(f" 📊 {row['Original_Benchmarks']} benchmarks ({coverage_info} coverage)")
|
|
|
|
|
if search_term: |
|
if show_count == 0: |
|
st.info("No models found matching the search term.") |
|
else: |
|
st.info(f"Found {show_count} model(s) matching '{search_term}'") |
|
|
|
with col2: |
|
st.markdown("**📊 Ranking Distribution**") |
|
|
|
|
|
fig = px.histogram( |
|
ranking_df, |
|
x='Consensus_Rank', |
|
nbins=20, |
|
title="Distribution of Consensus Rankings", |
|
labels={'Consensus_Rank': 'Median Consensus Rank (lower is better)', 'count': 'Number of Models'}
|
) |
|
fig.update_layout(height=400) |
|
st.plotly_chart(fig, use_container_width=True) |
|
|
|
|
|
if metadata['imputation_used']: |
|
with st.expander("ℹ️ How Rank-Based Estimation Works"): |
|
st.write(f""" |
|
**Method**: {metadata['method'].title()} rank correlation |
|
|
|
**Process**: |
|
1. Convert benchmark scores to ranks (1st, 2nd, 3rd, etc.) |
|
2. Calculate rank correlations between all benchmark pairs |
|
3. For missing data: predict rank using weighted average of available ranks |
|
4. Weights based on rank correlation strength (min threshold: {min_corr}) |
|
5. Final consensus rank = median rank across all benchmarks |
|
|
|
**Optimizations**: |
|
- Pre-compute correlation matrices for efficiency |
|
- Limit to top 5 most correlated benchmarks per prediction |
|
- Cache results to avoid recomputation |
|
|
|
**Upsides**: |
|
- Eliminates bias from models tested only on easier/harder benchmarks |
|
- Uses the correlation structure to make informed predictions |
|
- Focuses on relative ranking rather than absolute scores |
|
- More robust to outliers and scale differences |
|
- Median consensus rank is less affected by extreme outlier rankings |
|
|
|
**Statistics**: |
|
- Total rank estimates made: {metadata['total_estimates']:,} |
|
- Models with estimated ranks: {metadata['models_with_estimates']} |
|
""") |
|
else: |
|
with st.expander("ℹ️ Simple Ranking Method"): |
|
st.write(""" |
|
**Method**: Median rank of available rankings |
|
|
|
**Limitation**: Models tested on fewer or easier benchmarks may appear artificially better. |
|
|
|
**Recommendation**: Enable rank-based estimation for fairer comparisons. |
|
""") |
|
|
|
|
|
st.subheader("Model Comparison") |
|
|
|
|
|
if 'selected_benchmarks_for_radar' not in st.session_state: |
|
available_benchmarks = list(df_display.columns) |
|
default_benchmarks = available_benchmarks[:min(8, len(available_benchmarks))] |
|
st.session_state.selected_benchmarks_for_radar = default_benchmarks |
|
|
|
if 'complete_data_only' not in st.session_state: |
|
st.session_state.complete_data_only = True |
|
|
|
if 'selected_models_for_radar' not in st.session_state: |
|
st.session_state.selected_models_for_radar = [] |
|
|
|
|
|
st.subheader("📊 Benchmark & Model Selection") |
|
|
|
col1, col2 = st.columns([2, 1]) |
|
|
|
with col1: |
|
available_benchmarks = list(df_display.columns) |
|
|
|
|
|
valid_benchmarks = [b for b in st.session_state.selected_benchmarks_for_radar if b in available_benchmarks] |
|
if not valid_benchmarks: |
|
valid_benchmarks = available_benchmarks[:min(8, len(available_benchmarks))] |
|
st.session_state.selected_benchmarks_for_radar = valid_benchmarks |
|
|
|
selected_benchmarks_for_radar = st.multiselect( |
|
"Select benchmarks for radar chart", |
|
available_benchmarks, |
|
default=valid_benchmarks, |
|
format_func=clean_benchmark_name, |
|
help="Choose which benchmarks to display in the radar chart", |
|
key="selected_benchmarks_for_radar" |
|
) |
|
|
|
with col2: |
|
complete_data_only = st.checkbox( |
|
"Complete data only", |
|
value=st.session_state.complete_data_only, |
|
help="Show only models that have data for ALL selected benchmarks", |
|
key="complete_data_only" |
|
) |
|
|
|
|
|
if complete_data_only and selected_benchmarks_for_radar: |
|
|
|
models_with_complete_data = [] |
|
for model in df_display.index: |
|
has_all_data = True |
|
for benchmark in selected_benchmarks_for_radar: |
|
if pd.isna(df_display.loc[model, benchmark]): |
|
has_all_data = False |
|
break |
|
if has_all_data: |
|
models_with_complete_data.append(model) |
|
|
|
available_models_for_selection = models_with_complete_data |
|
models_info = f"({len(available_models_for_selection)} models with complete data)" |
|
else: |
|
available_models_for_selection = df_display.index.tolist() |
|
models_info = f"({len(available_models_for_selection)} models total)" |
|
|
|
|
|
if available_models_for_selection: |
|
|
|
top_models_from_ranking = full_ranking_df['Full_Model_Name'].head(5).tolist() |
|
default_selection = [m for m in top_models_from_ranking if m in available_models_for_selection][:3] |
|
|
|
|
|
|
|
ranking_order = {model: rank for rank, model in enumerate(full_ranking_df['Full_Model_Name'].tolist())} |
|
|
|
|
|
available_models_sorted = sorted( |
|
available_models_for_selection, |
|
key=lambda x: ranking_order.get(x, float('inf')) |
|
) |
|
|
|
|
|
valid_selected_models = [m for m in st.session_state.selected_models_for_radar if m in available_models_for_selection] |
|
if not valid_selected_models and default_selection: |
|
valid_selected_models = default_selection |
|
st.session_state.selected_models_for_radar = valid_selected_models |
|
else: |
|
default_selection = [] |
|
valid_selected_models = [] |
|
available_models_sorted = [] |
|
|
|
selected_models = st.multiselect( |
|
f"Select models to compare {models_info}", |
|
available_models_sorted, |
|
default=valid_selected_models, |
|
help="Models are ordered by ranking (best to worst) and filtered based on benchmark selection and complete data setting above", |
|
key="selected_models_for_radar" |
|
) |
|
|
|
if selected_models: |
|
comparison_data = df_display.loc[selected_models].T |
|
comparison_data.index = [clean_benchmark_name(idx) for idx in comparison_data.index] |
|
|
|
|
|
st.subheader("📊 Performance Radar Chart") |
|
|
|
if not selected_benchmarks_for_radar: |
|
st.info("Please select at least one benchmark above for the radar chart.") |
|
elif len(selected_models) == 0: |
|
st.info("Please select models above to see the radar chart comparison.") |
|
elif len(selected_models) > 10: |
|
st.warning(f"Too many models selected ({len(selected_models)}). Please select 10 or fewer models for the radar chart.") |
|
st.info("💡 **Tip**: Use the search box above to filter models, then select a smaller subset for comparison.") |
|
else: |
|
|
|
if len(selected_models) > 3 or len(selected_benchmarks_for_radar) > 8: |
|
with st.spinner("Generating radar chart..."): |
|
fig = create_optimized_radar_chart(df_display, selected_models, selected_benchmarks_for_radar) |
|
else: |
|
fig = create_optimized_radar_chart(df_display, selected_models, selected_benchmarks_for_radar) |
|
|
|
if fig: |
|
st.plotly_chart(fig, use_container_width=True) |
|
|
|
|
|
if not complete_data_only: |
|
missing_info = [] |
|
for model in selected_models: |
|
missing_benchmarks = [] |
|
for benchmark in selected_benchmarks_for_radar: |
|
if pd.isna(df_display.loc[model, benchmark]): |
|
missing_benchmarks.append(clean_benchmark_name(benchmark)) |
|
if missing_benchmarks: |
|
missing_info.append(f"• {model.split('/')[-1]}: {', '.join(missing_benchmarks)}") |
|
|
|
if missing_info: |
|
with st.expander("ℹ️ Missing Data Information"): |
|
st.write("Missing values are shown as 0 in the radar chart:") |
|
for info in missing_info: |
|
st.write(info) |
|
else: |
|
|
|
st.info("✅ All selected models have complete data for the chosen benchmarks.") |
|
|
|
|
|
if len(selected_models) > 5: |
|
st.info(f"💡 **Viewing {len(selected_models)} models**: For better readability, consider selecting fewer models or use the detailed comparison table below.") |
|
|
|
|
|
st.subheader("Detailed Comparison") |
|
st.dataframe(comparison_data, use_container_width=True) |
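# --- Illustrative sketch (not called by the app) --------------------------------
# The expander above describes rank-based estimation in prose. The helper below is
# a minimal sketch of how a missing rank could be estimated from a models-by-
# benchmarks matrix of ranks (1 = best, NaN = not evaluated). The actual logic
# lives in create_consensus_ranking(); this function name and its arguments are
# hypothetical and exist only for illustration.
def _example_estimate_missing_rank(rank_matrix, model, target_bench, min_corr=0.3, top_k=5):
    """Estimate `model`'s rank on `target_bench` from its ranks on correlated benchmarks."""
    # Kendall rank correlations between the target benchmark and all other benchmarks
    corrs = rank_matrix.corr(method='kendall')[target_bench].drop(target_bench)
    # Keep benchmarks the model was actually evaluated on; for simplicity this sketch
    # also ignores negatively correlated benchmarks.
    has_rank = rank_matrix.loc[model].drop(labels=[target_bench]).notna()
    usable = corrs[(corrs >= min_corr) & has_rank]
    if usable.empty:
        return np.nan
    # Weight the model's available ranks by correlation strength (top-k benchmarks only)
    top = usable.sort_values(ascending=False).head(top_k)
    weights = top / top.sum()
    return float((rank_matrix.loc[model, top.index] * weights).sum())
# ---------------------------------------------------------------------------------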
|
|
|
def show_uncertainty_analysis(df, stderr_df): |
|
"""Show uncertainty analysis if standard error data is available.""" |
|
|
|
if 'uncertainty_x_benchmark' not in st.session_state: |
|
st.session_state.uncertainty_x_benchmark = None |
|
if 'uncertainty_y_benchmark' not in st.session_state: |
|
st.session_state.uncertainty_y_benchmark = None |
|
|
|
st.header("🔬 Uncertainty Analysis") |
|
|
|
if stderr_df is None: |
|
st.warning("Standard error data not available. This analysis requires benchmark_standard_errors.csv") |
|
return |
|
|
|
st.info("This section analyzes measurement uncertainty and reliability of benchmark evaluations.") |
|
|
|
|
|
matched_benchmarks = [] |
|
for score_col in df.columns: |
|
|
|
potential_stderr_cols = [ |
|
f"{score_col}_std_err", |
|
f"{score_col.replace('_accuracy', '_accuracy_std_err')}", |
|
f"{score_col.replace('_accuracy_avg', '_accuracy_std_err')}" |
|
] |
|
|
|
for stderr_col in potential_stderr_cols: |
|
if stderr_col in stderr_df.columns: |
|
matched_benchmarks.append((score_col, stderr_col)) |
|
break |
|
|
|
if not matched_benchmarks: |
|
st.warning("No matching standard error data found for the selected benchmarks.") |
|
return |
|
|
|
st.success(f"Found standard error data for {len(matched_benchmarks)} benchmarks.") |
|
|
|
|
|
st.subheader("📊 Measurement Precision") |
|
|
|
precision_data = [] |
|
for score_col, stderr_col in matched_benchmarks: |
|
scores = df[score_col].dropna() |
|
stderrs = stderr_df[stderr_col].dropna() |
|
|
|
if len(stderrs) > 0: |
|
mean_stderr = stderrs.mean() |
|
median_stderr = stderrs.median() |
|
|
|
|
|
if len(scores) > 0: |
|
signal_std = scores.std() |
|
snr = signal_std / mean_stderr if mean_stderr > 0 else float('inf') |
|
else: |
|
snr = 0 |
|
|
|
precision_data.append({ |
|
'Benchmark': clean_benchmark_name(score_col), |
|
'Mean StdErr': mean_stderr, |
|
'Median StdErr': median_stderr, |
|
'Signal/Noise': snr, |
|
'N Models': len(stderrs) |
|
}) |
|
|
|
if precision_data: |
|
precision_df = pd.DataFrame(precision_data) |
|
st.dataframe(precision_df, use_container_width=True) |
|
|
|
|
|
fig = px.scatter(precision_df, |
|
x='Mean StdErr', |
|
y='Signal/Noise', |
|
size='N Models', |
|
hover_name='Benchmark', |
|
title="Measurement Precision: Signal-to-Noise vs Standard Error", |
|
labels={'Signal/Noise': 'Signal-to-Noise Ratio'}) |
|
st.plotly_chart(fig, use_container_width=True) |
|
|
|
|
|
st.subheader("🎯 Uncertainty-Aware Scatter Plot") |
|
|
|
|
|
available_benchmarks = [score_col for score_col, _ in matched_benchmarks] |
|
|
|
|
|
if (st.session_state.uncertainty_x_benchmark not in available_benchmarks or |
|
st.session_state.uncertainty_y_benchmark not in available_benchmarks): |
|
st.session_state.uncertainty_x_benchmark = available_benchmarks[0] if available_benchmarks else None |
|
st.session_state.uncertainty_y_benchmark = available_benchmarks[1] if len(available_benchmarks) > 1 else available_benchmarks[0] if available_benchmarks else None |
|
|
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
|
|
x_index = 0 |
|
if st.session_state.uncertainty_x_benchmark in available_benchmarks: |
|
x_index = available_benchmarks.index(st.session_state.uncertainty_x_benchmark) |
|
|
|
x_benchmark = st.selectbox( |
|
"X-axis Benchmark (with uncertainty)", |
|
available_benchmarks, |
|
index=x_index, |
|
format_func=clean_benchmark_name, |
|
key="uncertainty_x_benchmark" |
|
) |
|
|
|
with col2: |
|
|
|
y_index = 1 if len(available_benchmarks) > 1 else 0 |
|
if st.session_state.uncertainty_y_benchmark in available_benchmarks: |
|
y_index = available_benchmarks.index(st.session_state.uncertainty_y_benchmark) |
|
|
|
y_benchmark = st.selectbox( |
|
"Y-axis Benchmark (with uncertainty)", |
|
available_benchmarks, |
|
index=y_index, |
|
format_func=clean_benchmark_name, |
|
key="uncertainty_y_benchmark" |
|
) |
|
|
|
if x_benchmark and y_benchmark and x_benchmark != y_benchmark: |
|
|
|
matched_data = match_scores_with_stderr(df, stderr_df, {x_benchmark, y_benchmark}) |
|
|
|
if not matched_data: |
|
st.error("No matching data found between scores and stderr.") |
|
return |
|
|
|
|
|
if x_benchmark not in matched_data or y_benchmark not in matched_data: |
|
missing = [] |
|
if x_benchmark not in matched_data: |
|
missing.append(clean_benchmark_name(x_benchmark)) |
|
if y_benchmark not in matched_data: |
|
missing.append(clean_benchmark_name(y_benchmark)) |
|
st.error(f"No stderr data found for: {', '.join(missing)}") |
|
return |
|
|
|
|
|
score_to_stderr_mapping = matched_data |
|
|
|
|
|
combined_data = df[[x_benchmark, y_benchmark]].copy() |
|
stderr_x_col = score_to_stderr_mapping[x_benchmark] |
|
stderr_y_col = score_to_stderr_mapping[y_benchmark] |
|
|
|
|
|
combined_data[stderr_x_col] = stderr_df[stderr_x_col] |
|
combined_data[stderr_y_col] = stderr_df[stderr_y_col] |
|
|
|
|
|
matched_data_df = combined_data.dropna() |
|
|
|
if len(matched_data_df) < 3: |
|
st.error("Insufficient data points with both scores and stderr (need at least 3).") |
|
return |
|
|
|
|
|
if len(matched_data_df) >= 3: |
|
fig = go.Figure() |
|
|
|
|
|
fig.add_trace(go.Scatter( |
|
x=matched_data_df[x_benchmark], |
|
y=matched_data_df[y_benchmark], |
|
error_x=dict( |
|
type='data', |
|
array=matched_data_df[score_to_stderr_mapping[x_benchmark]], |
|
visible=True |
|
), |
|
error_y=dict( |
|
type='data', |
|
array=matched_data_df[score_to_stderr_mapping[y_benchmark]], |
|
visible=True |
|
), |
|
mode='markers', |
|
marker=dict(size=8, opacity=0.7), |
|
text=matched_data_df.index, |
|
hovertemplate='<b>%{text}</b><br>' + |
|
f'{clean_benchmark_name(x_benchmark)}: %{{x:.3f}} ± %{{error_x:.3f}}<br>' + |
|
f'{clean_benchmark_name(y_benchmark)}: %{{y:.3f}} ± %{{error_y:.3f}}<extra></extra>', |
|
name='Models' |
|
)) |
|
|
|
|
|
from sklearn.linear_model import LinearRegression |
|
X = matched_data_df[x_benchmark].values.reshape(-1, 1) |
|
y = matched_data_df[y_benchmark].values |
|
|
|
model = LinearRegression() |
|
model.fit(X, y) |
|
|
|
x_line = np.linspace(X.min(), X.max(), 100) |
|
y_line = model.predict(x_line.reshape(-1, 1)) |
|
|
|
fig.add_trace(go.Scatter( |
|
x=x_line, |
|
y=y_line, |
|
mode='lines', |
|
name=f'Linear Fit (R² = {model.score(X, y):.3f})', |
|
line=dict(dash='dash', color='red') |
|
)) |
|
|
|
fig.update_layout( |
|
title=f"Uncertainty-Aware Analysis: {clean_benchmark_name(x_benchmark)} vs {clean_benchmark_name(y_benchmark)}", |
|
xaxis_title=clean_benchmark_name(x_benchmark), |
|
yaxis_title=clean_benchmark_name(y_benchmark), |
|
hovermode='closest' |
|
) |
|
|
|
st.plotly_chart(fig, use_container_width=True) |
|
|
|
|
|
st.subheader("📊 Uncertainty Metrics") |
|
|
|
col1, col2, col3 = st.columns(3) |
|
|
|
with col1: |
|
avg_x_err = matched_data_df[score_to_stderr_mapping[x_benchmark]].mean() |
|
st.metric("Avg X Error", f"{avg_x_err:.4f}") |
|
|
|
with col2: |
|
avg_y_err = matched_data_df[score_to_stderr_mapping[y_benchmark]].mean() |
|
st.metric("Avg Y Error", f"{avg_y_err:.4f}") |
|
|
|
with col3: |
|
|
|
x_snr = matched_data_df[x_benchmark].std() / avg_x_err |
|
st.metric("X Signal/Noise", f"{x_snr:.2f}") |
|
|
|
|
|
st.subheader("📋 Data with Uncertainties") |
|
display_cols = [x_benchmark, score_to_stderr_mapping[x_benchmark], |
|
y_benchmark, score_to_stderr_mapping[y_benchmark]] |
|
display_data = matched_data_df[display_cols].copy() |
|
|
|
|
|
new_names = { |
|
x_benchmark: f"{clean_benchmark_name(x_benchmark)} (Score)", |
|
score_to_stderr_mapping[x_benchmark]: f"{clean_benchmark_name(x_benchmark)} (±Error)", |
|
y_benchmark: f"{clean_benchmark_name(y_benchmark)} (Score)", |
|
score_to_stderr_mapping[y_benchmark]: f"{clean_benchmark_name(y_benchmark)} (±Error)" |
|
} |
|
display_data = display_data.rename(columns=new_names) |
|
|
|
st.dataframe(display_data, use_container_width=True) |
|
|
|
else: |
|
st.warning("Need at least 3 data points for uncertainty analysis.") |
|
|
|
|
|
def fit_linear_model(x, y): |
|
"""Fit a simple linear model.""" |
|
try: |
|
from sklearn.linear_model import LinearRegression |
|
from sklearn.metrics import r2_score, mean_squared_error |
|
|
|
X = x.reshape(-1, 1) |
|
model = LinearRegression() |
|
model.fit(X, y) |
|
|
|
y_pred = model.predict(X) |
|
r2 = r2_score(y, y_pred) |
|
mse = mean_squared_error(y, y_pred) |
|
|
|
|
|
def predict(x_new): |
|
return model.predict(np.array(x_new).reshape(-1, 1)) |
|
|
|
return { |
|
'params': [model.coef_[0], model.intercept_], |
|
'r2': r2, |
|
'mse': mse, |
|
'model_func': predict, |
|
'name': f'Linear (slope={model.coef_[0]:.3f})', |
|
'type': 'linear' |
|
} |
|
except Exception as e: |
|
print(f"Linear model fitting failed: {e}") |
|
return None |
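# --- Illustrative usage (not executed) -------------------------------------------
# fit_linear_model() expects 1-D NumPy arrays and returns a dict with the fitted
# parameters, R², MSE and a vectorised predictor. A sketch on synthetic data
# (variable names are illustrative only):
#
#   x_demo = np.linspace(0.2, 0.9, 30)
#   y_demo = 0.8 * x_demo + 0.05
#   fit = fit_linear_model(x_demo, y_demo)
#   if fit is not None:
#       print(fit['name'], fit['r2'])           # R² is ~1.0 for noise-free data
#       print(fit['model_func']([0.5, 0.7]))    # predictions at new points
# ----------------------------------------------------------------------------------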
|
|
|
|
|
def fit_hockey_stick_model(x, y): |
|
""" |
|
Fit a hockey stick (saturation) model: y = a * min(x, threshold) + b |
|
""" |
|
def hockey_stick(x, a, b, threshold): |
|
return a * np.minimum(x, threshold) + b |
|
|
|
try: |
|
|
|
x_thresh_guess = np.percentile(x, 75) |
|
linear_fit = np.polyfit(x, y, 1) |
|
|
|
|
|
p0 = [linear_fit[0], linear_fit[1], x_thresh_guess] |
|
|
|
|
|
popt, pcov = curve_fit(hockey_stick, x, y, p0=p0, maxfev=2000) |
|
|
|
|
|
y_pred = hockey_stick(x, *popt) |
|
r2 = r2_score(y, y_pred) |
|
|
|
|
|
residuals = y - y_pred |
|
mse = np.mean(residuals**2) |
|
|
|
return { |
|
'params': popt, |
|
'r2': r2, |
|
'mse': mse, |
|
'model_func': lambda x_new: hockey_stick(x_new, *popt), |
|
'name': f'Hockey Stick (threshold={popt[2]:.3f})', |
|
'param_names': ['slope', 'intercept', 'threshold'] |
|
} |
|
except Exception:
|
return None |
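# --- Illustrative usage (not executed) -------------------------------------------
# The hockey-stick model above is y = a * min(x, threshold) + b: linear growth up
# to `threshold`, flat afterwards. A synthetic check (names are illustrative only):
#
#   x_demo = np.linspace(0.0, 1.0, 50)
#   y_demo = 0.9 * np.minimum(x_demo, 0.6) + 0.1
#   fit = fit_hockey_stick_model(x_demo, y_demo)
#   if fit is not None:
#       slope, intercept, threshold = fit['params']   # expect roughly 0.9, 0.1, 0.6
# ----------------------------------------------------------------------------------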
|
|
|
def fit_saturation_model(x, y): |
|
""" |
|
Fit saturation model: y = a * (1 - exp(-b * x)) + c |
|
Tries both directions (x vs y and y vs x) and chooses the better fit. |
|
Returns curve coordinates computed in the best direction for consistent plotting. |
|
""" |
|
def saturation(x, a, b, c): |
|
return a * (1 - np.exp(-b * x)) + c |
|
|
|
def fit_direction(x_data, y_data, direction_name): |
|
"""Helper function to fit saturation in one direction""" |
|
try: |
|
|
|
y_range = np.max(y_data) - np.min(y_data) |
|
p0 = [y_range, 1.0, np.min(y_data)] |
|
|
|
|
|
popt, pcov = curve_fit(saturation, x_data, y_data, p0=p0, maxfev=2000) |
|
|
|
|
|
y_pred = saturation(x_data, *popt) |
|
r2 = r2_score(y_data, y_pred) |
|
|
|
|
|
residuals = y_data - y_pred |
|
mse = np.mean(residuals**2) |
|
|
|
return { |
|
'params': popt, |
|
'r2': r2, |
|
'mse': mse, |
|
'direction': direction_name, |
|
'x_data': x_data, |
|
'y_data': y_data |
|
} |
|
except Exception:
|
return None |
|
|
|
|
|
normal_fit = fit_direction(x, y, 'normal') |
|
|
|
|
|
flipped_fit = fit_direction(y, x, 'flipped') |
|
|
|
|
|
best_fit = None |
|
if normal_fit and flipped_fit: |
|
if normal_fit['r2'] >= flipped_fit['r2']: |
|
best_fit = normal_fit |
|
else: |
|
best_fit = flipped_fit |
|
elif normal_fit: |
|
best_fit = normal_fit |
|
elif flipped_fit: |
|
best_fit = flipped_fit |
|
|
|
if best_fit is None: |
|
return None |
|
|
|
|
|
if best_fit['direction'] == 'normal': |
|
|
|
x_curve_data = best_fit['x_data'] |
|
y_curve_data = best_fit['y_data'] |
|
curve_x_range = np.linspace(x_curve_data.min(), x_curve_data.max(), 100) |
|
curve_y_values = saturation(curve_x_range, *best_fit['params']) |
|
|
|
|
|
curve_coords = { |
|
'x_coords': curve_x_range, |
|
'y_coords': curve_y_values |
|
} |
|
|
|
model_func = lambda x_new: saturation(x_new, *best_fit['params']) |
|
name = f'Saturation (rate={best_fit["params"][1]:.3f})' |
|
else: |
|
|
|
y_curve_data = best_fit['x_data'] |
|
x_curve_data = best_fit['y_data'] |
|
|
|
|
|
y_range = np.linspace(y_curve_data.min(), y_curve_data.max(), 100) |
|
x_fitted = saturation(y_range, *best_fit['params']) |
|
|
|
|
|
curve_coords = { |
|
'x_coords': x_fitted, |
|
'y_coords': y_range |
|
} |
|
|
|
|
|
a, b, c = best_fit['params'] |
|
|
|
def inverse_saturation(x_new): |
|
|
|
|
|
x_new = np.asarray(x_new) |
|
result = np.full_like(x_new, np.nan, dtype=float) |
|
|
|
|
|
if a > 0 and b > 0: |
|
|
|
valid_mask = (x_new >= c) & (x_new < c + a * 0.999) |
|
|
|
if np.any(valid_mask): |
|
x_valid = x_new[valid_mask] |
|
ratio = (x_valid - c) / a |
|
ratio = np.clip(ratio, 1e-10, 0.999) |
|
result[valid_mask] = -np.log(1 - ratio) / b |
|
|
|
return result |
|
|
|
model_func = inverse_saturation |
|
name = f'Saturation-Inv (rate={best_fit["params"][1]:.3f})' |
|
|
|
return { |
|
'params': best_fit['params'], |
|
'r2': best_fit['r2'], |
|
'mse': best_fit['mse'], |
|
'model_func': model_func, |
|
'name': name, |
|
'param_names': ['amplitude', 'rate', 'offset'], |
|
'direction': best_fit['direction'], |
|
'curve_coords': curve_coords |
|
} |
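# --- Illustrative usage (not executed) -------------------------------------------
# fit_saturation_model() tries y = a * (1 - exp(-b * x)) + c in both directions and
# keeps the better fit. When result['direction'] == 'flipped', model_func() is the
# inverse mapping and can return NaN outside its valid range, so plotting code
# should prefer result['curve_coords'], as create_advanced_scatter_plot() does below.
#
#   x_demo = np.linspace(0.0, 1.0, 50)
#   y_demo = 0.7 * (1 - np.exp(-3.0 * x_demo)) + 0.2
#   fit = fit_saturation_model(x_demo, y_demo)
#   if fit is not None:
#       print(fit['name'], fit['direction'], fit['r2'])
# ----------------------------------------------------------------------------------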
|
|
|
def fit_polynomial_model(x, y, degree=2): |
|
""" |
|
Fit polynomial model of specified degree |
|
""" |
|
try: |
|
|
|
poly_coeffs = np.polyfit(x, y, degree) |
|
poly_func = np.poly1d(poly_coeffs) |
|
|
|
|
|
y_pred = poly_func(x) |
|
r2 = r2_score(y, y_pred) |
|
|
|
|
|
residuals = y - y_pred |
|
mse = np.mean(residuals**2) |
|
|
|
return { |
|
'params': poly_coeffs, |
|
'r2': r2, |
|
'mse': mse, |
|
'model_func': lambda x_new: poly_func(x_new), |
|
'name': f'Polynomial (degree={degree})', |
|
'param_names': [f'coeff_{i}' for i in range(degree+1)] |
|
} |
|
except Exception:
|
return None |
|
|
|
|
|
def fit_random_forest_model(x, y): |
|
""" |
|
Fit Random Forest model for non-parametric regression with overfitting prevention |
|
""" |
|
try: |
|
|
|
X = x.values.reshape(-1, 1) if hasattr(x, 'values') else x.reshape(-1, 1) |
|
|
|
|
|
n_samples = len(x) |
|
|
|
|
|
if n_samples < 30: |
|
|
|
rf = RandomForestRegressor( |
|
n_estimators=50, |
|
max_depth=2, |
|
min_samples_split=max(2, n_samples // 10), |
|
min_samples_leaf=max(1, n_samples // 20), |
|
max_features=1, |
|
random_state=42, |
|
bootstrap=True, |
|
oob_score=True if n_samples > 10 else False |
|
) |
|
elif n_samples < 100: |
|
|
|
rf = RandomForestRegressor( |
|
n_estimators=100, |
|
max_depth=3, |
|
min_samples_split=max(2, n_samples // 8), |
|
min_samples_leaf=max(1, n_samples // 15), |
|
max_features=1, |
|
random_state=42, |
|
bootstrap=True, |
|
oob_score=True |
|
) |
|
else: |
|
|
|
rf = RandomForestRegressor( |
|
n_estimators=100, |
|
max_depth=4, |
|
min_samples_split=max(2, n_samples // 6), |
|
min_samples_leaf=max(2, n_samples // 12), |
|
max_features=1, |
|
random_state=42, |
|
bootstrap=True, |
|
oob_score=True |
|
) |
|
|
|
rf.fit(X, y) |
|
|
|
|
|
y_pred = rf.predict(X) |
|
r2 = r2_score(y, y_pred) |
|
mse = np.mean((y - y_pred)**2) |
|
|
|
|
|
oob_r2 = getattr(rf, 'oob_score_', None) |
|
display_r2 = oob_r2 if oob_r2 is not None else r2 |
|
|
|
return { |
|
'model': rf, |
|
'r2': display_r2, |
|
'r2_train': r2, |
|
'mse': mse, |
|
'model_func': lambda x_new: rf.predict(x_new.reshape(-1, 1) if len(x_new.shape) == 1 else x_new), |
|
'name': f'Random Forest (OOB)' if oob_r2 is not None else 'Random Forest', |
|
'param_names': ['n_estimators', 'max_depth', 'min_samples_split'] |
|
} |
|
except Exception:
|
return None |
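# --- Illustrative usage (not executed) -------------------------------------------
# fit_random_forest_model() scales tree depth and leaf sizes with the number of
# points and reports the out-of-bag R² when available, which is a less optimistic
# estimate than the training R². Synthetic sketch (names are illustrative only):
#
#   rng = np.random.default_rng(0)
#   x_demo = rng.uniform(0.2, 0.9, 80)
#   y_demo = 0.6 * x_demo + rng.normal(0, 0.02, 80)
#   fit = fit_random_forest_model(x_demo, y_demo)
#   if fit is not None:
#       print(fit['name'], fit['r2'], fit['r2_train'])   # OOB R² vs training R²
# ----------------------------------------------------------------------------------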
|
|
|
def detect_clusters_and_fit(x, y, n_clusters=2): |
|
""" |
|
Detect clusters in the data and fit separate models |
|
""" |
|
try: |
|
|
|
data = np.column_stack([x, y]) |
|
scaler = StandardScaler() |
|
data_scaled = scaler.fit_transform(data) |
|
|
|
|
|
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10) |
|
cluster_labels = kmeans.fit_predict(data_scaled) |
|
|
|
|
|
cluster_models = [] |
|
total_r2_weighted = 0 |
|
total_mse_weighted = 0 |
|
total_points = len(x) |
|
|
|
for i in range(n_clusters): |
|
mask = cluster_labels == i |
|
if np.sum(mask) >= 3: |
|
x_cluster = x[mask] |
|
y_cluster = y[mask] |
|
|
|
|
|
coeffs = np.polyfit(x_cluster, y_cluster, 1) |
|
poly_func = np.poly1d(coeffs) |
|
|
|
y_pred_cluster = poly_func(x_cluster) |
|
r2_cluster = r2_score(y_cluster, y_pred_cluster) |
|
mse_cluster = np.mean((y_cluster - y_pred_cluster)**2) |
|
|
|
cluster_models.append({ |
|
'coeffs': coeffs, |
|
'mask': mask, |
|
'r2': r2_cluster, |
|
'mse': mse_cluster, |
|
'n_points': np.sum(mask) |
|
}) |
|
|
|
|
|
weight = np.sum(mask) / total_points |
|
total_r2_weighted += r2_cluster * weight |
|
total_mse_weighted += mse_cluster * weight |
|
|
|
if len(cluster_models) > 0:
    # Route each new point to the cluster whose mean x is closest and apply
    # that cluster's linear fit.
    cluster_x_means = [np.mean(x[m['mask']]) for m in cluster_models]

    def cluster_predict(x_new):
        x_arr = np.atleast_1d(np.asarray(x_new, dtype=float))
        result = np.empty_like(x_arr)
        for j, xv in enumerate(x_arr):
            nearest = int(np.argmin([abs(xv - cm) for cm in cluster_x_means]))
            result[j] = np.polyval(cluster_models[nearest]['coeffs'], xv)
        return result
|
|
|
return { |
|
'cluster_models': cluster_models, |
|
'cluster_labels': cluster_labels, |
|
'r2': total_r2_weighted, |
|
'mse': total_mse_weighted, |
|
'model_func': cluster_predict, |
|
'name': f'Clustered Linear (k={n_clusters})', |
|
'param_names': [f'cluster_{i}_slope' for i in range(len(cluster_models))] |
|
} |
|
except Exception:
|
pass |
|
|
|
return None |
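# --- Illustrative usage (not executed) -------------------------------------------
# detect_clusters_and_fit() standardises (x, y), runs KMeans, and fits one line per
# cluster; the reported R²/MSE are weighted by cluster size. A sketch on synthetic
# two-regime data (names are illustrative only):
#
#   x_demo = np.concatenate([np.linspace(0.1, 0.4, 20), np.linspace(0.6, 0.9, 20)])
#   y_demo = np.concatenate([0.5 * x_demo[:20], 0.9 * x_demo[20:] - 0.2])
#   fit = detect_clusters_and_fit(x_demo, y_demo, n_clusters=2)
#   if fit is not None:
#       for cm in fit['cluster_models']:
#           print(cm['n_points'], cm['coeffs'])   # per-cluster slope and intercept
# ----------------------------------------------------------------------------------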
|
|
|
def fit_all_models(x, y): |
|
""" |
|
Fit simplified model set: only linear and saturation models. |
|
Returns only the single best model between linear and saturation. |
|
""" |
|
models = [] |
|
|
|
|
|
try: |
|
linear_coeffs = np.polyfit(x, y, 1) |
|
linear_func = np.poly1d(linear_coeffs) |
|
y_pred_linear = linear_func(x) |
|
r2_linear = r2_score(y, y_pred_linear) |
|
mse_linear = np.mean((y - y_pred_linear)**2) |
|
|
|
linear_model = { |
|
'name': 'Linear', |
|
'r2': r2_linear, |
|
'mse': mse_linear, |
|
'model_func': lambda x_new: linear_func(x_new), |
|
'params': linear_coeffs, |
|
'param_names': ['slope', 'intercept'], |
|
'type': 'linear' |
|
} |
|
models.append(linear_model) |
|
except Exception:
|
pass |
|
|
|
|
|
saturation_result = fit_saturation_model(x, y) |
|
if saturation_result: |
|
saturation_result['type'] = 'saturation' |
|
models.append(saturation_result) |
|
|
|
if not models: |
|
return [] |
|
|
|
|
|
models.sort(key=lambda m: m['r2'], reverse=True) |
|
best_model = models[0] |
|
|
|
|
|
if len(models) > 1: |
|
|
|
linear_model = next((m for m in models if m['type'] == 'linear'), None) |
|
if linear_model: |
|
|
|
|
|
|
|
if best_model['r2'] < 0.5: |
|
linear_model['name'] = 'Linear' |
|
linear_model['preferred'] = True |
|
linear_model['preference_reason'] = f"Preferred due to poor overall performance (best R² = {best_model['r2']:.3f} < 0.5)" |
|
return [linear_model] |
|
elif linear_model['r2'] > 0.7 and (best_model['r2'] - linear_model['r2']) < 0.1: |
|
linear_model['name'] = 'Linear' |
|
linear_model['preferred'] = True |
|
linear_model['preference_reason'] = f"Preferred due to good linear fit (R² = {linear_model['r2']:.3f}) with minimal improvement from saturation model" |
|
return [linear_model] |
|
|
|
|
|
return [best_model] |
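# --- Illustrative usage (not executed) -------------------------------------------
# fit_all_models() returns a single-element list containing whichever of the linear
# and saturation fits is kept; when the linear fit is chosen despite a slightly
# better saturation R², it carries a 'preferred' flag and a 'preference_reason'.
# A sketch (x_vals/y_vals as prepared by the callers):
#
#   models_demo = fit_all_models(x_vals, y_vals)
#   if models_demo:
#       best = models_demo[0]
#       print(best['name'], best['r2'], best.get('preference_reason', ''))
# ----------------------------------------------------------------------------------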
|
|
|
def create_advanced_scatter_plot(df, x_bench, y_bench, stderr_df=None): |
|
"""Create an advanced scatter plot with single best model fit.""" |
|
if x_bench not in df.columns or y_bench not in df.columns: |
|
return None, None |
|
|
|
|
|
common_data = df[[x_bench, y_bench]].dropna() |
|
|
|
if len(common_data) < 5: |
|
return None, None |
|
|
|
x_vals = common_data[x_bench].values |
|
y_vals = common_data[y_bench].values |
|
|
|
|
|
models = fit_all_models(x_vals, y_vals) |
|
|
|
if not models: |
|
return None, None |
|
|
|
best_model = models[0] |
|
|
|
|
|
fig = go.Figure() |
|
|
|
|
|
fig.add_trace(go.Scatter( |
|
x=x_vals, |
|
y=y_vals, |
|
mode='markers', |
|
text=common_data.index, |
|
hovertemplate=( |
|
"<b>%{text}</b><br>" + |
|
f"{clean_benchmark_name(x_bench)}: %{{x:.3f}}<br>" + |
|
f"{clean_benchmark_name(y_bench)}: %{{y:.3f}}<br>" + |
|
"<extra></extra>" |
|
), |
|
marker=dict(size=8, opacity=0.7, color='steelblue'), |
|
name='Data Points' |
|
)) |
|
|
|
|
|
try: |
|
|
|
if 'curve_coords' in best_model: |
|
x_line_valid = best_model['curve_coords']['x_coords'] |
|
y_line_valid = best_model['curve_coords']['y_coords'] |
|
|
|
|
|
valid_mask = ~np.isnan(y_line_valid) & ~np.isnan(x_line_valid) |
|
if np.any(valid_mask): |
|
x_line_valid = x_line_valid[valid_mask] |
|
y_line_valid = y_line_valid[valid_mask] |
|
else: |
|
|
|
x_line = np.linspace(x_vals.min(), x_vals.max(), 100) |
|
y_line = best_model['model_func'](x_line) |
|
|
|
|
|
valid_mask = ~np.isnan(y_line) |
|
if np.any(valid_mask): |
|
x_line_valid = x_line[valid_mask] |
|
y_line_valid = y_line[valid_mask] |
|
else: |
|
x_line_valid = y_line_valid = np.array([]) |
|
|
|
if len(x_line_valid) > 0: |
|
|
|
model_name = f"{best_model['name']} (R²={best_model['r2']:.3f})" |
|
|
|
|
|
# Color the fitted curve by model family: red = linear, green = saturation,
# dark blue when the simpler linear model was preferred over a marginally better fit.
line_width = 3
line_color = 'red' if best_model['type'] == 'linear' else 'green'
if best_model.get('preferred', False):
    line_color = 'darkblue'
|
|
|
fig.add_trace(go.Scatter( |
|
x=x_line_valid, |
|
y=y_line_valid, |
|
mode='lines', |
|
name=model_name, |
|
line=dict(color=line_color, width=line_width) |
|
)) |
|
except Exception as e: |
|
st.warning(f"Could not plot model curve: {e}") |
|
|
|
|
|
fig.update_layout( |
|
title=f"{clean_benchmark_name(y_bench)} vs {clean_benchmark_name(x_bench)}", |
|
xaxis_title=clean_benchmark_name(x_bench), |
|
yaxis_title=clean_benchmark_name(y_bench), |
|
showlegend=True, |
|
width=800, |
|
height=600 |
|
) |
|
|
|
return fig, models |
|
|
|
def show_advanced_modeling(df, stderr_df): |
|
"""Show the advanced modeling interface for benchmark prediction.""" |
|
st.header("🤖 Advanced Benchmark Modeling & Prediction") |
|
|
|
st.markdown(""" |
|
This section provides advanced modeling capabilities to better understand and predict benchmark relationships. |
|
It handles the different correlation patterns seen across benchmark pairs: hockey stick (saturation), linear, and noisy or clustered relationships.
|
""") |
|
|
|
|
|
st.subheader("🛠️ Model Configuration") |
|
|
|
col1, col2, col3 = st.columns(3) |
|
|
|
with col1: |
|
modeling_approach = st.selectbox( |
|
"Choose Modeling Approach", |
|
["Single Pair Analysis", "Multi-Benchmark Prediction", "Ensemble Prediction"], |
|
help="Single Pair: Analyze relationship between two benchmarks\nMulti-Benchmark: Predict one benchmark from multiple others\nEnsemble: Combine multiple models for robust prediction" |
|
) |
|
|
|
with col2: |
|
min_data_points = st.slider( |
|
"Minimum Data Points", |
|
min_value=5, |
|
max_value=50, |
|
value=10, |
|
help="Minimum number of models needed for reliable modeling" |
|
) |
|
|
|
with col3: |
|
cross_validation = st.checkbox( |
|
"Cross Validation", |
|
value=True, |
|
help="Use cross-validation to assess model generalization" |
|
) |
|
|
|
if modeling_approach == "Single Pair Analysis": |
|
show_single_pair_analysis(df, stderr_df, min_data_points, cross_validation) |
|
elif modeling_approach == "Multi-Benchmark Prediction": |
|
show_multi_benchmark_prediction(df, stderr_df, min_data_points, cross_validation) |
|
else: |
|
show_ensemble_prediction(df, stderr_df, min_data_points, cross_validation) |
|
|
|
def show_single_pair_analysis(df, stderr_df, min_data_points, cross_validation): |
|
"""Detailed single pair analysis with all model types.""" |
|
st.subheader("🔍 Single Pair Deep Analysis") |
|
|
|
col1, col2 = st.columns(2) |
|
|
|
with col1: |
|
x_benchmark = st.selectbox("Predictor Benchmark", df.columns, format_func=clean_benchmark_name) |
|
with col2: |
|
y_benchmark = st.selectbox("Target Benchmark", df.columns, |
|
index=1 if len(df.columns) > 1 else 0, |
|
format_func=clean_benchmark_name) |
|
|
|
if x_benchmark == y_benchmark: |
|
st.warning("Please select different benchmarks for meaningful analysis.") |
|
return |
|
|
|
|
|
common_data = df[[x_benchmark, y_benchmark]].dropna() |
|
|
|
if len(common_data) < min_data_points: |
|
st.error(f"Insufficient data: {len(common_data)} points available, {min_data_points} required.") |
|
return |
|
|
|
x_vals = common_data[x_benchmark].values |
|
y_vals = common_data[y_benchmark].values |
|
|
|
|
|
with st.spinner("Fitting models..."): |
|
models = fit_all_models(x_vals, y_vals) |
|
|
|
if not models: |
|
st.error("Failed to fit any models to the data.") |
|
return |
|
|
|
|
|
if cross_validation and len(common_data) >= 10: |
|
with st.spinner("Performing cross-validation..."): |
|
cv_results = perform_cross_validation(x_vals, y_vals, models[:5]) |
|
|
|
st.subheader("📊 Cross-Validation Results") |
|
cv_df = pd.DataFrame(cv_results) |
|
st.dataframe(cv_df, use_container_width=True) |
|
|
|
|
|
fig, _ = create_advanced_scatter_plot(df, x_benchmark, y_benchmark, stderr_df) |
|
if fig: |
|
st.plotly_chart(fig, use_container_width=True) |
|
|
|
|
|
st.subheader("🏆 Model Performance Ranking") |
|
|
|
model_data = [] |
|
for i, model in enumerate(models): |
|
model_data.append({ |
|
'Rank': i + 1, |
|
'Model': model['name'], |
|
'R² Score': f"{model['r2']:.4f}", |
|
'MSE': f"{model['mse']:.6f}", |
|
'Type': model['type'], |
|
'Recommended': get_model_recommendation(model, x_vals, y_vals) |
|
}) |
|
|
|
model_df = pd.DataFrame(model_data) |
|
st.dataframe(model_df, use_container_width=True) |
|
|
|
|
|
st.subheader("🔍 Pattern Analysis") |
|
|
|
best_model = models[0] |
|
pattern_type = analyze_relationship_pattern(x_vals, y_vals, best_model) |
|
|
|
pattern_colors = { |
|
'Linear': 'info', |
|
'Hockey Stick': 'warning', |
|
'Saturation': 'warning', |
|
'Non-linear': 'info', |
|
'Clustered': 'error', |
|
'Noisy': 'error' |
|
} |
|
|
|
pattern_color = pattern_colors.get(pattern_type, 'info') |
|
|
|
if pattern_color == 'warning': |
|
st.warning(f"**Pattern Detected: {pattern_type}**\n\n{get_pattern_explanation(pattern_type)}") |
|
elif pattern_color == 'error': |
|
st.error(f"**Pattern Detected: {pattern_type}**\n\n{get_pattern_explanation(pattern_type)}") |
|
else: |
|
st.info(f"**Pattern Detected: {pattern_type}**\n\n{get_pattern_explanation(pattern_type)}") |
|
|
|
|
|
st.subheader("🎯 Interactive Prediction") |
|
|
|
col1, col2, col3 = st.columns(3) |
|
|
|
with col1: |
|
selected_model_idx = st.selectbox( |
|
"Choose Model for Prediction", |
|
range(len(models[:5])), |
|
format_func=lambda i: f"{models[i]['name']} (R²={models[i]['r2']:.3f})" |
|
) |
|
|
|
with col2: |
|
x_input = st.number_input( |
|
f"{clean_benchmark_name(x_benchmark)} Score", |
|
min_value=0.0, |
|
max_value=1.0, |
|
value=0.5, |
|
step=0.01, |
|
format="%.3f" |
|
) |
|
|
|
with col3: |
|
if st.button("🔮 Predict", type="primary"): |
|
selected_model = models[selected_model_idx] |
|
try: |
|
prediction = selected_model['model_func'](np.array([x_input]))[0] |
|
confidence = calculate_prediction_confidence(selected_model, x_vals, y_vals, x_input) |
|
|
|
st.success(f"**Predicted {clean_benchmark_name(y_benchmark)}: {prediction:.3f}**") |
|
st.info(f"Model: {selected_model['name']} | Confidence: {confidence}") |
|
except Exception as e: |
|
st.error(f"Prediction failed: {str(e)}") |
|
|
|
def show_multi_benchmark_prediction(df, stderr_df, min_data_points, cross_validation): |
|
"""Multi-benchmark prediction interface.""" |
|
st.subheader("🎯 Multi-Benchmark Prediction") |
|
|
|
st.info("Predict one benchmark using multiple others as predictors.") |
|
|
|
|
|
target_benchmark = st.selectbox( |
|
"Select Target Benchmark to Predict", |
|
df.columns, |
|
format_func=clean_benchmark_name |
|
) |
|
|
|
|
|
predictor_benchmarks = st.multiselect( |
|
"Select Predictor Benchmarks", |
|
[col for col in df.columns if col != target_benchmark], |
|
default=[col for col in df.columns if col != target_benchmark][:3], |
|
format_func=clean_benchmark_name |
|
) |
|
|
|
if not predictor_benchmarks: |
|
st.warning("Please select at least one predictor benchmark.") |
|
return |
|
|
|
|
|
all_benchmarks = [target_benchmark] + predictor_benchmarks |
|
complete_data = df[all_benchmarks].dropna() |
|
|
|
if len(complete_data) < min_data_points: |
|
st.error(f"Insufficient complete data: {len(complete_data)} models available, {min_data_points} required.") |
|
return |
|
|
|
|
|
X = complete_data[predictor_benchmarks].values |
|
y = complete_data[target_benchmark].values |
|
|
|
|
|
with st.spinner("Training multi-benchmark models..."): |
|
ensemble_results = fit_multi_benchmark_models(X, y, predictor_benchmarks) |
|
|
|
|
|
st.subheader("📊 Multi-Benchmark Model Performance") |
|
|
|
results_data = [] |
|
for model_name, result in ensemble_results.items(): |
|
results_data.append({ |
|
'Model': model_name, |
|
'R² Score': f"{result['r2']:.4f}", |
|
'MAE': f"{result['mae']:.4f}", |
|
'Importance Type': result.get('importance', 'N/A')
|
}) |
|
|
|
results_df = pd.DataFrame(results_data) |
|
st.dataframe(results_df, use_container_width=True) |
|
|
|
|
|
best_model_name = max(ensemble_results.keys(), key=lambda k: ensemble_results[k]['r2']) |
|
best_model = ensemble_results[best_model_name] |
|
|
|
if 'feature_importance' in best_model: |
|
st.subheader("📈 Feature Importance") |
|
|
|
importance_data = pd.DataFrame({ |
|
'Benchmark': [clean_benchmark_name(b) for b in predictor_benchmarks], |
|
'Importance': best_model['feature_importance'] |
|
}).sort_values('Importance', ascending=True) |
|
|
|
fig_importance = px.bar( |
|
importance_data, |
|
x='Importance', |
|
y='Benchmark', |
|
orientation='h', |
|
title=f"Feature Importance for Predicting {clean_benchmark_name(target_benchmark)}" |
|
) |
|
st.plotly_chart(fig_importance, use_container_width=True) |
|
|
|
|
|
st.subheader("🎯 Multi-Benchmark Prediction") |
|
|
|
st.write("Enter scores for predictor benchmarks:") |
|
|
|
input_values = {} |
|
cols = st.columns(min(len(predictor_benchmarks), 3)) |
|
|
|
for i, benchmark in enumerate(predictor_benchmarks): |
|
with cols[i % 3]: |
|
input_values[benchmark] = st.number_input( |
|
clean_benchmark_name(benchmark), |
|
min_value=0.0, |
|
max_value=1.0, |
|
value=float(df[benchmark].median()), |
|
step=0.001, |
|
format="%.3f", |
|
key=f"input_{benchmark}" |
|
) |
|
|
|
if st.button("🔮 Predict from Multiple Benchmarks", type="primary"): |
|
input_array = np.array([[input_values[b] for b in predictor_benchmarks]]) |
|
|
|
|
|
prediction = best_model['model'].predict(input_array)[0] |
|
|
|
st.success(f"**Predicted {clean_benchmark_name(target_benchmark)}: {prediction:.3f}**") |
|
st.info(f"Using model: {best_model_name} (R² = {best_model['r2']:.3f})") |
|
|
|
def show_ensemble_prediction(df, stderr_df, min_data_points, cross_validation): |
|
"""Ensemble prediction combining multiple approaches.""" |
|
st.subheader("🎭 Ensemble Prediction") |
|
|
|
st.info("Combine multiple modeling approaches for robust predictions.") |
|
|
|
|
|
st.write("🚧 Ensemble prediction coming soon! This will combine:") |
|
st.write("- Multiple model types (linear, non-linear, clustering)") |
|
st.write("- Multiple predictor sets") |
|
st.write("- Uncertainty quantification") |
|
st.write("- Robust prediction intervals") |
|
|
|
|
|
|
|
def perform_cross_validation(x, y, models, n_folds=5): |
|
"""Perform cross-validation on models.""" |
|
from sklearn.model_selection import KFold |
|
|
|
cv_results = [] |
|
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42) |
|
|
|
for model in models: |
|
fold_r2_scores = [] |
|
fold_mae_scores = [] |
|
|
|
for train_idx, test_idx in kf.split(x): |
|
x_train, x_test = x[train_idx], x[test_idx] |
|
y_train, y_test = y[train_idx], y[test_idx] |
|
|
|
try:
    # Refit the candidate model type on the training fold only. fit_all_models()
    # labels its models 'linear' / 'saturation'; the name-based checks keep the
    # other fit_* helpers usable here as well.
    if 'Random Forest' in model['name']:
        fitted_model = fit_random_forest_model(x_train, y_train)
    elif model.get('type') == 'saturation' or 'Saturation' in model['name']:
        fitted_model = fit_saturation_model(x_train, y_train)
    elif 'Hockey' in model['name']:
        fitted_model = fit_hockey_stick_model(x_train, y_train)
    elif 'Polynomial' in model['name']:
        degree = 2 if 'degree=2' in model['name'] else 3
        fitted_model = fit_polynomial_model(x_train, y_train, degree)
    else:
        # Linear and any remaining model types fall back to a degree-1 polynomial fit.
        fitted_model = fit_polynomial_model(x_train, y_train, 1)

    if fitted_model:
        y_pred = fitted_model['model_func'](x_test)
        fold_r2_scores.append(r2_score(y_test, y_pred))
        fold_mae_scores.append(mean_absolute_error(y_test, y_pred))
except Exception:
    continue
|
|
|
if fold_r2_scores: |
|
cv_results.append({ |
|
'Model': model['name'], |
|
'CV R² Mean': f"{np.mean(fold_r2_scores):.4f}", |
|
'CV R² Std': f"{np.std(fold_r2_scores):.4f}", |
|
'CV MAE Mean': f"{np.mean(fold_mae_scores):.4f}", |
|
'CV MAE Std': f"{np.std(fold_mae_scores):.4f}" |
|
}) |
|
|
|
return cv_results |
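# --- Illustrative usage (not executed) -------------------------------------------
# perform_cross_validation() refits each candidate model type on every training
# fold and scores it on the held-out fold, so its R²/MAE are less optimistic than
# the in-sample numbers reported by fit_all_models(). A sketch:
#
#   models_demo = fit_all_models(x_vals, y_vals)
#   cv_demo = perform_cross_validation(x_vals, y_vals, models_demo, n_folds=5)
#   # -> list of dicts with 'CV R² Mean', 'CV R² Std', 'CV MAE Mean', 'CV MAE Std'
# ----------------------------------------------------------------------------------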
|
|
|
def get_model_recommendation(model, x_vals, y_vals): |
|
"""Get recommendation for when to use this model.""" |
|
model_name = model['name'] |
|
r2 = model['r2'] |
|
|
|
|
|
is_preferred_linear = model.get('preferred', False)  # set by fit_all_models() when the simpler linear fit is kept
|
has_preference_reason = 'preference_reason' in model |
|
|
|
|
|
is_rf_oob = 'Random Forest' in model_name and '(OOB)' in model_name |
|
|
|
if is_preferred_linear: |
|
if has_preference_reason: |
|
return f"Recommended: {model.get('preference_reason', 'Simple model preferred')}" |
|
else: |
|
return "Recommended: Simple linear model preferred" |
|
elif r2 < 0.3: |
|
return "Poor fit - not recommended" |
|
elif 'Hockey Stick' in model_name: |
|
return "Good for saturation patterns" |
|
elif 'Saturation' in model_name: |
|
return "Good for gradual leveling off" |
|
elif 'Polynomial' in model_name: |
|
return "Good for curved relationships" |
|
elif 'Clustered' in model_name: |
|
return "Good for grouped data" |
|
elif 'Random Forest' in model_name: |
|
if is_rf_oob: |
|
if r2 > 0.7: |
|
return "Excellent non-parametric fit (OOB validated)" |
|
elif r2 > 0.5: |
|
return "Good non-parametric fit (OOB validated)" |
|
else: |
|
return "Moderate non-parametric fit - consider simpler models" |
|
else: |
|
return "Non-parametric model - may overfit on small datasets" |
|
elif 'Linear' in model_name: |
|
if r2 > 0.8: |
|
return "Excellent linear fit - highly recommended" |
|
elif r2 > 0.6: |
|
return "Good linear fit - recommended" |
|
elif r2 > 0.4: |
|
return "Moderate linear fit - simple and interpretable" |
|
else: |
|
return "Weak linear fit - consider other patterns" |
|
elif r2 > 0.8: |
|
return "Excellent fit - highly recommended" |
|
elif r2 > 0.6: |
|
return "Good fit - recommended" |
|
else: |
|
return "Moderate fit - use with caution" |
|
|
|
def analyze_relationship_pattern(x_vals, y_vals, best_model): |
|
"""Analyze the relationship pattern between benchmarks.""" |
|
model_name = best_model['name'] |
|
r2 = best_model['r2'] |
|
|
|
|
|
if 'Hockey Stick' in model_name and r2 > 0.6: |
|
return 'Hockey Stick' |
|
elif 'Saturation' in model_name and r2 > 0.6: |
|
return 'Saturation' |
|
elif 'Clustered' in model_name and r2 > 0.5: |
|
return 'Clustered' |
|
elif r2 < 0.4: |
|
return 'Noisy' |
|
elif 'Polynomial' in model_name and r2 > 0.6: |
|
return 'Non-linear' |
|
else: |
|
return 'Linear' |
|
|
|
def get_pattern_explanation(pattern_type): |
|
"""Get explanation for different pattern types.""" |
|
explanations = { |
|
'Linear': "The benchmarks show a consistent linear relationship. Performance on one benchmark predicts the other reliably.", |
|
'Hockey Stick': "One benchmark saturates while the other continues improving. This suggests a capability ceiling for one benchmark.", |
|
'Saturation': "The relationship shows gradual leveling off, indicating diminishing returns at higher performance levels.", |
|
'Non-linear': "The relationship is curved but predictable. Consider the full range when making predictions.", |
|
'Clustered': "The data shows distinct groups or clusters. Different model families may follow different patterns.", |
|
'Noisy': "The relationship is weak or highly variable. Predictions should be made with caution and wide confidence intervals." |
|
} |
|
|
|
return explanations.get(pattern_type, "Unknown pattern type.") |
|
|
|
def calculate_prediction_confidence(model, x_vals, y_vals, x_input): |
|
"""Calculate prediction confidence.""" |
|
|
|
x_min, x_max = x_vals.min(), x_vals.max() |
|
|
|
if x_min <= x_input <= x_max: |
|
|
|
distance_from_center = abs(x_input - np.median(x_vals)) |
|
max_distance = max(abs(x_min - np.median(x_vals)), abs(x_max - np.median(x_vals))) |
|
confidence_score = 1.0 - (distance_from_center / max_distance) |
|
|
|
r2_factor = model['r2'] |
|
overall_confidence = confidence_score * r2_factor |
|
|
|
if overall_confidence > 0.8: |
|
return "High" |
|
elif overall_confidence > 0.5: |
|
return "Medium" |
|
else: |
|
return "Low" |
|
else: |
|
return "Very Low (Extrapolation)" |
|
|
|
def fit_multi_benchmark_models(X, y, predictor_names): |
|
"""Fit multiple models for multi-benchmark prediction.""" |
|
from sklearn.linear_model import LinearRegression, Ridge |
|
from sklearn.ensemble import RandomForestRegressor |
|
|
|
models = {} |
|
|
|
|
|
try: |
|
lr = LinearRegression() |
|
lr.fit(X, y) |
|
y_pred = lr.predict(X) |
|
|
|
models['Linear Regression'] = { |
|
'model': lr, |
|
'r2': r2_score(y, y_pred), |
|
'mae': mean_absolute_error(y, y_pred), |
|
'feature_importance': np.abs(lr.coef_), |
|
'importance': 'Linear coefficients' |
|
} |
|
except Exception:
|
pass |
|
|
|
|
|
try: |
|
ridge = Ridge(alpha=1.0) |
|
ridge.fit(X, y) |
|
y_pred = ridge.predict(X) |
|
|
|
models['Ridge Regression'] = { |
|
'model': ridge, |
|
'r2': r2_score(y, y_pred), |
|
'mae': mean_absolute_error(y, y_pred), |
|
'feature_importance': np.abs(ridge.coef_), |
|
'importance': 'Regularized coefficients' |
|
} |
|
except Exception:
|
pass |
|
|
|
|
|
try: |
|
n_samples, n_features = X.shape |
|
|
|
|
|
if n_samples < 30: |
|
|
|
rf = RandomForestRegressor( |
|
n_estimators=50, |
|
max_depth=2, |
|
min_samples_split=max(2, n_samples // 8), |
|
min_samples_leaf=max(1, n_samples // 15), |
|
max_features=min(2, n_features), |
|
random_state=42, |
|
bootstrap=True, |
|
oob_score=True if n_samples > 10 else False |
|
) |
|
elif n_samples < 100: |
|
|
|
rf = RandomForestRegressor( |
|
n_estimators=100, |
|
max_depth=3, |
|
min_samples_split=max(2, n_samples // 6), |
|
min_samples_leaf=max(1, n_samples // 12), |
|
max_features=min(3, max(1, n_features // 2)), |
|
random_state=42, |
|
bootstrap=True, |
|
oob_score=True |
|
) |
|
else: |
|
|
|
rf = RandomForestRegressor( |
|
n_estimators=100, |
|
max_depth=5, |
|
min_samples_split=max(2, n_samples // 5), |
|
min_samples_leaf=max(2, n_samples // 10), |
|
max_features='sqrt', |
|
random_state=42, |
|
bootstrap=True, |
|
oob_score=True |
|
) |
|
|
|
rf.fit(X, y) |
|
y_pred = rf.predict(X) |
|
|
|
|
|
oob_r2 = getattr(rf, 'oob_score_', None) |
|
train_r2 = r2_score(y, y_pred) |
|
display_r2 = oob_r2 if oob_r2 is not None else train_r2 |
|
|
|
models['Random Forest'] = { |
|
'model': rf, |
|
'r2': display_r2, |
|
'r2_train': train_r2, |
|
'mae': mean_absolute_error(y, y_pred), |
|
'feature_importance': rf.feature_importances_, |
|
'importance': f'Tree-based importance {"(OOB validated)" if oob_r2 is not None else ""}' |
|
} |
|
except Exception:
|
pass |
|
|
|
return models |
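# --- Illustrative usage (not executed) -------------------------------------------
# fit_multi_benchmark_models() trains Linear, Ridge and a size-aware Random Forest
# on an (n_models x n_predictors) matrix X and a target vector y, and reports
# feature importances (|coefficients| for the linear models, impurity-based
# importances for the forest). A sketch using the arrays built by the caller:
#
#   X_demo = complete_data[predictor_benchmarks].values
#   y_demo = complete_data[target_benchmark].values
#   fits = fit_multi_benchmark_models(X_demo, y_demo, predictor_benchmarks)
#   best_name = max(fits, key=lambda k: fits[k]['r2'])
#   print(best_name, fits[best_name]['r2'])
# ----------------------------------------------------------------------------------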
|
|
|
if __name__ == "__main__": |
|
main() |