|
from __future__ import annotations |
|
import gradio as gr |
|
import pandas as pd |
|
from pathlib import Path |
|
from typing import Union |
|
|
|
|
|
# Directory two levels above this file (the parent of this module's folder).
BASE_DIR = Path(__file__).resolve().parent.parent

# CSV file the leaderboard is read from: <BASE_DIR>/data/leaderboard.csv
DATA_PATH = BASE_DIR.joinpath("data", "leaderboard.csv")
|
|
|
|
|
# Rows whose category equals this value get a colored, emoji-prefixed model name.
CATEGORY_TO_HIGHLIGHT = "Deep Research Agent"

# Emoji placed in front of highlighted model names (and shown in the legend).
HIGHLIGHT_EMOJI = "🚀"
|
|
|
|
|
# Raw CSV metric column -> short header shown in the leaderboard table.
COLUMN_RENAME_MAP = {
    "overall_score": "overall",
    "comprehensiveness": "comp.",
    "insight": "insight",  # already short; maps to itself
    "instruction_following": "inst.",
    "readability": "read.",
    "citation_accuracy": "c.acc.",
    "effective_citations": "eff.c.",
}
|
|
|
|
|
# Category label -> model names belonging to it.
# Key order matters: it is the order of the category checkboxes, and the first
# category whose list contains a model name wins when rows are tagged.
MODEL_CATEGORIES = {
    "Deep Research Agent": [
        "gemini-2.5-pro-deepresearch",
        "grok-deeper-search",
        "openai-deepresearch",
        "perplexity-Research",
        "doubao-deepresearch",
        "kimi-researcher",
        "claude-research",
        "langchain-open-deep-research",
    ],
    "LLM with Search": [
        "claude-3-7-sonnet-with-search",
        "claude-3-5-sonnet-with-search",
        "sonar-reasoning-pro",
        "sonar-reasoning",
        "sonar-pro",
        "sonar",
        "gemini-2.5-pro-preview-05-06",
        "gpt-4o-search-preview",
        "gpt-4.1",
        "gemini-2.5-flash-preview-04-17",
        "gpt-4o-mini-search-preview",
        # NOTE(review): this entry is grouped with the agent entries in
        # MODEL_LINKS / MODEL_LICENSE_TYPE -- confirm the category is intended.
        "nvidia-aiq-research-assistant",
        "gpt-4.1-mini",
    ],
}
|
|
|
|
|
# Model name -> URL used to turn the model name into a hyperlink.
# An empty string means "no link": the name is rendered as plain text.
MODEL_LINKS = {
    # Entries that have a landing page or repository.
    "gemini-2.5-pro-deepresearch": "https://gemini.google/overview/deep-research/",
    "grok-deeper-search": "https://x.ai/news/grok-3",
    "openai-deepresearch": "https://openai.com/zh-Hans-CN/index/introducing-deep-research/",
    "perplexity-Research": "https://www.perplexity.ai/hub/blog/introducing-perplexity-deep-research",
    "doubao-deepresearch": "https://www.doubao.com/chat/",
    "kimi-researcher": "https://moonshotai.github.io/Kimi-Researcher/",
    "claude-research": "https://www.anthropic.com/news/research",
    "nvidia-aiq-research-assistant": "https://github.com/NVIDIA-AI-Blueprints/aiq-research-assistant",
    "langchain-open-deep-research": "https://github.com/langchain-ai/open_deep_research",
    # Entries without a link, kept explicitly so every known model has a key.
    **dict.fromkeys(
        (
            "claude-3-7-sonnet-with-search",
            "claude-3-5-sonnet-with-search",
            "sonar-reasoning-pro",
            "sonar-reasoning",
            "sonar-pro",
            "sonar",
            "gemini-2.5-pro-preview-05-06",
            "gpt-4o-search-preview",
            "gpt-4.1",
            "gemini-2.5-flash-preview-04-17",
            "gpt-4o-mini-search-preview",
            "gpt-4.1-mini",
        ),
        "",
    ),
}
|
|
|
|
|
# Model name -> license label shown in the "license_type" column.
# Models missing from this mapping are labelled "Unknown" by the loader.
MODEL_LICENSE_TYPE = {
    **dict.fromkeys(
        (
            "gemini-2.5-pro-deepresearch",
            "grok-deeper-search",
            "openai-deepresearch",
            "perplexity-Research",
            "doubao-deepresearch",
            "kimi-researcher",
            "claude-research",
        ),
        "Proprietary",
    ),
    # The only two entries that are not labelled "Proprietary".
    "nvidia-aiq-research-assistant": "Apache 2.0",
    "langchain-open-deep-research": "MIT",
    **dict.fromkeys(
        (
            "claude-3-7-sonnet-with-search",
            "claude-3-5-sonnet-with-search",
            "sonar-reasoning-pro",
            "sonar-reasoning",
            "sonar-pro",
            "sonar",
            "gemini-2.5-pro-preview-05-06",
            "gpt-4o-search-preview",
            "gpt-4.1",
            "gemini-2.5-flash-preview-04-17",
            "gpt-4o-mini-search-preview",
            "gpt-4.1-mini",
        ),
        "Proprietary",
    ),
}
|
|
|
def load_leaderboard() -> pd.DataFrame:
    """Read the leaderboard CSV and tag each row with a category and license.

    Column headers are stripped of surrounding whitespace. Two columns are
    then added, both derived from the ``model`` column:

    * ``category``: the first key of ``MODEL_CATEGORIES`` whose list contains
      the model name, or ``"Others"`` when no list contains it.
    * ``license_type``: the ``MODEL_LICENSE_TYPE`` entry for the model name,
      or ``"Unknown"`` when there is none.

    Returns:
        The DataFrame read from ``DATA_PATH`` with the two extra columns.

    Raises:
        FileNotFoundError: If ``DATA_PATH`` does not exist.
    """
    if not DATA_PATH.exists():
        # The second line of the message is Chinese for: "run
        # rank_leaderboard.py first to generate data/leaderboard.csv".
        message = (
            f"Leaderboard file not found: {DATA_PATH}.\n"
            "→ 先运行 rank_leaderboard.py 生成 data/leaderboard.csv"
        )
        raise FileNotFoundError(message)

    frame = pd.read_csv(DATA_PATH)
    frame.columns = [header.strip() for header in frame.columns]

    def category_of(name):
        # First matching category in MODEL_CATEGORIES order, else "Others".
        matching = (
            label
            for label, members in MODEL_CATEGORIES.items()
            if name in members
        )
        return next(matching, "Others")

    def license_of(name):
        return MODEL_LICENSE_TYPE.get(name, "Unknown")

    frame['category'] = frame['model'].apply(category_of)
    frame['license_type'] = frame['model'].apply(license_of)
    return frame
|
|
|
def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
    """Turn raw leaderboard rows into the table that is displayed.

    Processing order:

    1. Sort by ``overall_score`` (highest first) and prepend a 1-based
       ``Rank`` column reflecting that order.
    2. Rename metric columns according to ``COLUMN_RENAME_MAP``.
    3. Round every present metric cell to two decimals; cells equal to
       ``"-"`` and missing values are left untouched.
    4. Replace each ``model`` cell with an HTML snippet: a link when
       ``MODEL_LINKS`` has a non-blank URL for the model, otherwise a
       ``<span class="model-name">``. Models in ``CATEGORY_TO_HIGHLIGHT``
       are additionally colored and prefixed with ``HIGHLIGHT_EMOJI``.

    Args:
        df: Frame with at least ``overall_score``, ``model`` and
            ``category`` columns. It is not modified.

    Returns:
        A new, ranked and formatted DataFrame.
    """
    ordered = df.sort_values(by='overall_score', ascending=False)
    ordered = ordered.reset_index(drop=True)
    ordered.insert(0, "Rank", range(1, len(ordered) + 1))
    ordered = ordered.rename(columns=COLUMN_RENAME_MAP)

    def round_score(value):
        # "-" is a placeholder and NaN means missing: pass both through as-is.
        if value == "-" or pd.isna(value):
            return value
        return round(float(value), 2)

    score_columns = ('overall', 'comp.', 'insight', 'inst.', 'read.', 'c.acc.', 'eff.c.')
    for name in score_columns:
        if name not in ordered.columns:
            continue
        ordered[name] = ordered[name].apply(round_score)

    def render_model_cell(row):
        name = row['model']
        url = MODEL_LINKS.get(name, "")

        label = name
        if row['category'] == CATEGORY_TO_HIGHLIGHT:
            label = f'<span style="color: #823AFF;">{HIGHLIGHT_EMOJI} {name}</span>'

        if url and url.strip():
            return f'<a href="{url}" target="_blank" style="text-decoration: none;">{label}</a>'
        # No usable link: emit a plain span that still carries the raw name.
        return f'<span class="model-name" data-model="{name}">{label}</span>'

    ordered['model'] = ordered.apply(render_model_cell, axis=1)
    return ordered
|
|
|
def filter_data(search_text: str, selected_categories: list) -> pd.DataFrame:
    """Reload the leaderboard, apply the UI filters, and rank what remains.

    Args:
        search_text: Text typed into the search box. It is matched as a
            literal, case-insensitive substring of the ``model`` column.
            Blank text (or ``None``) disables the search filter.
        selected_categories: Category labels to keep. An empty selection
            disables the category filter, so all rows are kept.

    Returns:
        The filtered rows passed through ``make_ranked``. Because ranking
        happens after filtering, ``Rank`` is relative to the rows shown.
    """
    df = load_leaderboard()

    query = (search_text or "").strip()
    if query:
        # regex=False: the box holds free-form user text. Treated as a regex,
        # input such as "(" raises re.error and "." matches any character.
        matches = df['model'].str.contains(query, case=False, na=False, regex=False)
        df = df[matches]

    if selected_categories:
        df = df[df['category'].isin(selected_categories)]

    return make_ranked(df)
|
|
|
def create_leaderboard_tab():
    """Build the "🏆Leaderboard" tab and wire up its filters.

    The tab holds a model search box, a category checkbox group (all
    categories ticked initially), the ranked leaderboard table, and a
    Markdown legend describing the columns. Changing either filter re-runs
    ``filter_data`` and replaces the table contents.

    Returns:
        The search ``gr.Textbox`` component.
    """
    category_names = list(MODEL_CATEGORIES.keys())

    with gr.Tab("🏆Leaderboard"):
        with gr.Row():
            with gr.Column(scale=1):
                search_box = gr.Textbox(
                    label="Model Search",
                    placeholder="Entering model name to search...",
                    value="",
                )
            with gr.Column(scale=2):
                category_checkboxes = gr.CheckboxGroup(
                    label="Model Categories",
                    choices=category_names,
                    value=list(category_names),
                )

        initial_df = make_ranked(load_leaderboard())

        # Every column is shown as text except "model", which holds HTML.
        datatypes = [
            "html" if column == 'model' else "str"
            for column in initial_df.columns
        ]

        table = gr.Dataframe(
            value=initial_df,
            datatype=datatypes,
            max_height=600,
            show_label=False,
            elem_id="leaderboard_table",
            interactive=False,
            wrap=False,
            column_widths=[
                "80px",
                "350px",
                "100px",
                "100px",
                "100px",
                "100px",
                "100px",
                "100px",
                "100px",
                "200px",
                "150px",
            ],
        )

        def update_display(search_text, selected_categories):
            return filter_data(search_text, selected_categories)

        # Both controls trigger the same refresh with the same inputs.
        for control in (search_box, category_checkboxes):
            control.change(
                fn=update_display,
                inputs=[search_box, category_checkboxes],
                outputs=table,
            )

        with gr.Row():
            gr.Markdown(f"""
            ### 📊 Column Descriptions
            - **Rank**: Model ranking based on overall score
            - **model**: Model name (<span style="color: #823AFF;">{HIGHLIGHT_EMOJI} = {CATEGORY_TO_HIGHLIGHT}</span>)
            - **overall**: Overall Score (weighted average of all metrics)
            - **comp.**: Comprehensiveness - How thorough and complete the research is
            - **insight**: Insight Quality - Depth and value of analysis
            - **inst.**: Instruction Following - Adherence to user instructions
            - **read.**: Readability - Clarity and organization of content
            - **c.acc.**: Citation Accuracy - Correctness of references
            - **eff.c.**: Effective Citations - Relevance and quality of sources
            - **category**: Model category
            - **license_type**: The software license type of the model/service

            💡 **Tip**: Model names are clickable when links are available. Visit the GitHub repositories for more details!
            """)

    return search_box