Upload 2 files
app.py +4 -4
llm_judge_results.jsonl +59 -59
app.py CHANGED
@@ -24,7 +24,7 @@ color_map = {
 CAPTION_V2 = f"""**ProfBench**: Over 7,000 brand-new expert-authored response–criterion pairs across 80 professional tasks spanning PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
 ProfBench is a high-quality, text-only dataset that represents the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
 [Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
-Want to see your favorite models added? Run it with [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), send us the scores or ping
+Want to see your favorite models added? Run it with [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), send us the scores or ping zhilinw/viviennez [at] nvidia.com to run it for you!"""
 
 
 def color_model_type_column(df, color_map):
@@ -111,7 +111,7 @@ with gr.Blocks(theme=theme) as app:
         with gr.TabItem("Report Generation"):
             with gr.Row():
                 with gr.Column(scale=7):
-                    gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by the gpt-oss-120b (mixed) judge on the lite dataset (160 samples)")
+                    gr.Markdown("Report Generation Leaderboard: LLMs generate reports with just the prompt, which are then evaluated by the gpt-oss-120b (mixed) judge on the lite dataset (160 samples) \nEvaluation and cost estimation last performed on 29 Oct 2025.")
 
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):
@@ -154,7 +154,7 @@ with gr.Blocks(theme=theme) as app:
 
         with gr.TabItem("LLM Judge"):
             with gr.Row():
-                gr.Markdown("LLM Judge Leaderboard: LLM judges are evaluated on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether an LLM judge displays bias towards/against any model, using a Bias Index. The Overall score is calculated as Overall F1 - Bias Index.")
+                gr.Markdown("LLM Judge Leaderboard: LLM judges are evaluated on whether they can accurately predict the human-labelled criterion fulfilment across 3 different models (o3, Grok4, R1-0528). We consider not only macro-F1 across 3486 samples but also whether an LLM judge displays bias towards/against any model, using a Bias Index. The Overall score is calculated as Overall F1 - Bias Index. \nEvaluation and cost estimation last performed on 20 Sep 2025.")
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):
                     with gr.Row():
@@ -197,7 +197,7 @@ with gr.Blocks(theme=theme) as app:
         with gr.TabItem("Report Generation w Docs"):
             with gr.Row():
                 with gr.Column(scale=7):
-                    gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge.")
+                    gr.Markdown("Report Generation Leaderboard with Grounding Documents: LLMs generate reports with the human-curated reference documents as context. Results below are based on the full dataset and gpt-oss-120b (mixed) as judge. \nEvaluation and cost estimation last performed on 20 Sep 2025.")
 
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):
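Note: the "Overall F1 - Bias Index" formula quoted in the LLM Judge caption can be sanity-checked against llm_judge_results.jsonl (diffed below). A minimal stdlib sketch, assuming the file sits in the working directory; reading "Bias Index" as the max-minus-min spread of the three per-model bias columns is an inference from the data rather than documented behaviour, but both relations hold for every row in the new file (e.g. GPT-4.1: 76.3 - 0.9 = 75.4):

import json

with open("llm_judge_results.jsonl") as f:  # assumed relative path
    for row in map(json.loads, f):
        biases = [row["Bias to o3"], row["Bias to R1-0528"], row["Bias to Grok4"]]
        bias_index = max(biases) - min(biases)    # spread across the 3 judged models
        overall = row["Overall F1"] - bias_index  # formula from the caption
        assert abs(bias_index - row["Bias Index"]) <= 0.25  # slack for 1-decimal rounding
        assert abs(overall - row["Overall"]) <= 0.25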
llm_judge_results.jsonl CHANGED
@@ -1,59 +1,59 @@
-{"Model": "OpenAI/GPT-4.1", "Category": "Closed-source Instruct", "Overall": 75.4, "Physics": 80.9, "Chemistry": 69.2, "Finance": 71.0, "Consulting": 80.0, "Extraction": 79.8, "Reasoning": 74.4, "Style": 65.8, "Overall F1": 76.3, "o3": 5.5, "R1-0528": 4.6, "Grok4": 5.0, "Bias
-{"Model": "OpenAI/GPT-4.1-mini", "Category": "Closed-source Instruct", "Overall": 74.9, "Physics": 83.9, "Chemistry": 67.3, "Finance": 69.1, "Consulting": 80.6, "Extraction": 79.2, "Reasoning": 74.7, "Style": 69.8, "Overall F1": 76.4, "o3": -0.2, "R1-0528": 1.2, "Grok4": -0.3, "Bias
-{"Model": "OpenAI/GPT-4.1-nano", "Category": "Closed-source Instruct", "Overall": 54.1, "Physics": 69.8, "Chemistry": 62.9, "Finance": 66.7, "Consulting": 68.4, "Extraction": 71.0, "Reasoning": 65.6, "Style": 63.5, "Overall F1": 67.9, "o3": -14.5, "R1-0528": -2.1, "Grok4": -0.7, "Bias
-{"Model": "Google/Gemini-2.5-Flash", "Category": "Closed-source Instruct", "Overall": 73.4, "Physics": 82.9, "Chemistry": 67.3, "Finance": 70.8, "Consulting": 79.6, "Extraction": 79.2, "Reasoning": 74.5, "Style": 67.7, "Overall F1": 76.3, "o3": -4.2, "R1-0528": -6.6, "Grok4": -7.1, "Bias
-{"Model": "Google/Gemini-2.5-Flash-Lite", "Category": "Closed-source Instruct", "Overall": 73.3, "Physics": 83.6, "Chemistry": 68.2, "Finance": 68.2, "Consulting": 80.6, "Extraction": 77.9, "Reasoning": 75.0, "Style": 71.0, "Overall F1": 76.4, "o3": -1.1, "R1-0528": 2.0, "Grok4": 0.6, "Bias
-{"Model": "Anthropic/claude-sonnet-4", "Category": "Closed-source Instruct", "Overall": 70.2, "Physics": 85.0, "Chemistry": 66.9, "Finance": 68.1, "Consulting": 76.3, "Extraction": 77.6, "Reasoning": 73.3, "Style": 64.1, "Overall F1": 75.2, "o3": -6.5, "R1-0528": -5.2, "Grok4": -10.2, "Bias
-{"Model": "anthropic/claude-3.5-haiku", "Category": "Closed-source Instruct", "Overall": 72.5, "Physics": 78.9, "Chemistry": 67.2, "Finance": 71.2, "Consulting": 76.7, "Extraction": 76.9, "Reasoning": 73.3, "Style": 65.4, "Overall F1": 74.9, "o3": -1.7, "R1-0528": 0.7, "Grok4": -1.4, "Bias
-{"Model": "Qwen/Qwen3-235B-A22B-Instruct-2507", "Category": "Open-weight Instruct", "Overall": 75.1, "Physics": 86.5, "Chemistry": 69.3, "Finance": 69.3, "Consulting": 79.6, "Extraction": 79.2, "Reasoning": 76.0, "Style": 64.6, "Overall F1": 77.3, "o3": 3.8, "R1-0528": 2.2, "Grok4": 1.6, "Bias
-{"Model": "Qwen/Qwen3-30B-A3B-instruct-2507", "Category": "Open-weight Instruct", "Overall": 73.1, "Physics": 82.0, "Chemistry": 68.3, "Finance": 67.3, "Consulting": 79.7, "Extraction": 76.5, "Reasoning": 74.5, "Style": 64.7, "Overall F1": 75.5, "o3": 4.7, "R1-0528": 7.1, "Grok4": 5.3, "Bias
-{"Model": "MoonshotAI/Kimi-K2-Instruct-0905", "Category": "Open-weight Instruct", "Overall": 74.7, "Physics": 84.5, "Chemistry": 69.9, "Finance": 67.5, "Consulting": 81.9, "Extraction": 80.2, "Reasoning": 75.5, "Style": 65.9, "Overall F1": 77.0, "o3": 7.5, "R1-0528": 6.1, "Grok4": 5.2, "Bias
-{"Model": "MoonshotAI/Kimi-K2-Instruct-0711", "Category": "Open-weight Instruct", "Overall": 75.2, "Physics": 85.3, "Chemistry": 69.5, "Finance": 68.3, "Consulting": 82.3, "Extraction": 80.3, "Reasoning": 76.1, "Style": 66.4, "Overall F1": 77.6, "o3": 7.1, "R1-0528": 6.1, "Grok4": 4.7, "Bias
-{"Model": "DeepSeek-AI/DeepSeek-V3.1", "Category": "Open-weight Instruct", "Overall": 72.8, "Physics": 79.6, "Chemistry": 68.2, "Finance": 68.3, "Consulting": 78.7, "Extraction": 77.4, "Reasoning": 73.9, "Style": 65.8, "Overall F1": 75.2, "o3": 0.2, "R1-0528": -1.5, "Grok4": -2.2, "Bias
-{"Model": "DeepSeek-AI/DeepSeek-V3-0324", "Category": "Open-weight Instruct", "Overall": 72.6, "Physics": 84.5, "Chemistry": 68.0, "Finance": 67.0, "Consulting": 78.3, "Extraction": 77.7, "Reasoning": 74.6, "Style": 63.5, "Overall F1": 75.7, "o3": 1.5, "R1-0528": 2.4, "Grok4": -0.7, "Bias
-{"Model": "nvidia/llama-3.1-nemotron-nano-8b-v1", "Category": "Open-weight Instruct", "Overall": 55.8, "Physics": 56.5, "Chemistry": 59.5, "Finance": 57.3, "Consulting": 56.7, "Extraction": 61.3, "Reasoning": 58.6, "Style": 59.1, "Overall F1": 59.3, "o3": -28.5, "R1-0528": -26.5, "Grok4": -30.0, "Bias
-{"Model": "nvidia/llama-3.3-nemotron-super-49b-v1", "Category": "Open-weight Instruct", "Overall": 68.8, "Physics": 77.2, "Chemistry": 65.1, "Finance": 70.2, "Consulting": 72.1, "Extraction": 74.1, "Reasoning": 70.7, "Style": 64.1, "Overall F1": 72.3, "o3": -15.7, "R1-0528": -12.2, "Grok4": -13.0, "Bias
-{"Model": "nvidia/llama-3.1-nemotron-ultra-253b-v1", "Category": "Open-weight Instruct", "Overall": 67.4, "Physics": 84.8, "Chemistry": 63.6, "Finance": 66.6, "Consulting": 61.8, "Extraction": 72.6, "Reasoning": 67.8, "Style": 57.8, "Overall F1": 69.6, "o3": -10.0, "R1-0528": -11.4, "Grok4": -9.2, "Bias
-{"Model": "meta/llama-4-maverick-17b-128e-instruct", "Category": "Open-weight Instruct", "Overall": 67.9, "Physics": 64.9, "Chemistry": 66.7, "Finance": 73.4, "Consulting": 76.4, "Extraction": 76.5, "Reasoning": 70.4, "Style": 67.9, "Overall F1": 72.4, "o3": -14.3, "R1-0528": -10.5, "Grok4": -9.8, "Bias
-{"Model": "meta/llama-4-scout-17b-16e-instruct", "Category": "Open-weight Instruct", "Overall": 65.9, "Physics": 60.4, "Chemistry": 69.4, "Finance": 71.3, "Consulting": 75.6, "Extraction": 76.2, "Reasoning": 69.9, "Style": 62.0, "Overall F1": 71.8, "o3": -14.5, "R1-0528": -10.2, "Grok4": -8.6, "Bias
-{"Model": "meta/llama-3.1-405b-instruct", "Category": "Open-weight Instruct", "Overall": 71.6, "Physics": 85.1, "Chemistry": 69.1, "Finance": 67.6, "Consulting": 81.7, "Extraction": 77.7, "Reasoning": 75.5, "Style": 65.5, "Overall F1": 77.0, "o3": 11.5, "R1-0528": 6.1, "Grok4": 9.4, "Bias
-{"Model": "meta/llama-3.3-70b-instruct", "Category": "Open-weight Instruct", "Overall": 74.1, "Physics": 84.6, "Chemistry": 66.5, "Finance": 71.6, "Consulting": 79.1, "Extraction": 78.1, "Reasoning": 75.4, "Style": 64.6, "Overall F1": 76.7, "o3": -3.1, "R1-0528": -0.8, "Grok4": -3.4, "Bias
-{"Model": "meta/llama-3.1-70b-instruct", "Category": "Open-weight Instruct", "Overall": 70.7, "Physics": 82.1, "Chemistry": 66.7, "Finance": 72.6, "Consulting": 76.0, "Extraction": 77.5, "Reasoning": 73.9, "Style": 64.7, "Overall F1": 75.4, "o3": -6.2, "R1-0528": -1.5, "Grok4": -4.1, "Bias
-{"Model": "meta/llama-3.1-8b-instruct", "Category": "Open-weight Instruct", "Overall": 63.1, "Physics": 76.2, "Chemistry": 69.3, "Finance": 70.2, "Consulting": 71.0, "Extraction": 76.6, "Reasoning": 71.5, "Style": 61.7, "Overall F1": 73.2, "o3": -4.0, "R1-0528": 6.1, "Grok4": -1.5, "Bias
-{"Model": "meta/llama-3.2-3b-instruct", "Category": "Open-weight Instruct", "Overall": 58.3, "Physics": 67.6, "Chemistry": 63.8, "Finance": 59.7, "Consulting": 66.1, "Extraction": 68.8, "Reasoning": 64.6, "Style": 54.6, "Overall F1": 66.2, "o3": 8.8, "R1-0528": 16.7, "Grok4": 13.1, "Bias
-{"Model": "meta/llama-3.1-1b-instruct", "Category": "Open-weight Instruct", "Overall": 39.5, "Physics": 31.9, "Chemistry": 48.4, "Finance": 44.9, "Consulting": 55.8, "Extraction": 47.8, "Reasoning": 43.2, "Style": 46.2, "Overall F1": 45.7, "o3": 31.0, "R1-0528": 33.1, "Grok4": 37.2, "Bias
-{"Model": "OpenAI/GPT-5 (high)", "Category": "Closed-source Reasoning", "Overall": 76.0, "Physics": 90.2, "Chemistry": 68.2, "Finance": 69.4, "Consulting": 80.9, "Extraction": 78.3, "Reasoning": 76.7, "Style": 79.1, "Overall F1": 78.3, "o3": 1.0, "R1-0528": -0.8, "Grok4": -1.3, "Bias
-{"Model": "OpenAI/GPT-5 (med)", "Category": "Closed-source Reasoning", "Overall": 76.7, "Physics": 89.2, "Chemistry": 67.9, "Finance": 69.0, "Consulting": 80.9, "Extraction": 78.1, "Reasoning": 76.3, "Style": 77.3, "Overall F1": 77.9, "o3": 0.0, "R1-0528": -0.9, "Grok4": -1.2, "Bias
-{"Model": "OpenAI/GPT-5 (low)", "Category": "Closed-source Reasoning", "Overall": 76.3, "Physics": 88.6, "Chemistry": 69.3, "Finance": 69.0, "Consulting": 80.9, "Extraction": 78.1, "Reasoning": 76.6, "Style": 79.4, "Overall F1": 78.1, "o3": 0.3, "R1-0528": -1.5, "Grok4": -1.4, "Bias
-{"Model": "OpenAI/GPT-5 (minimal)", "Category": "Closed-source Reasoning", "Overall": 71.9, "Physics": 86.8, "Chemistry": 68.6, "Finance": 71.2, "Consulting": 77.5, "Extraction": 78.9, "Reasoning": 75.2, "Style": 64.8, "Overall F1": 77.0, "o3": -0.5, "R1-0528": -5.6, "Grok4": -5.0, "Bias
-{"Model": "OpenAI/GPT-5-mini (high)", "Category": "Closed-source Reasoning", "Overall": 75.3, "Physics": 84.5, "Chemistry": 69.2, "Finance": 70.4, "Consulting": 82.8, "Extraction": 78.4, "Reasoning": 75.9, "Style": 74.1, "Overall F1": 77.7, "o3": 6.6, "R1-0528": 4.2, "Grok4": 4.6, "Bias
-{"Model": "OpenAI/GPT-5-mini (med)", "Category": "Closed-source Reasoning", "Overall": 74.4, "Physics": 83.3, "Chemistry": 68.2, "Finance": 69.9, "Consulting": 81.5, "Extraction": 78.1, "Reasoning": 74.6, "Style": 72.8, "Overall F1": 76.7, "o3": 6.3, "R1-0528": 4.0, "Grok4": 4.3, "Bias
-{"Model": "OpenAI/GPT-5-mini (low)", "Category": "Closed-source Reasoning", "Overall": 74.7, "Physics": 82.9, "Chemistry": 68.5, "Finance": 70.3, "Consulting": 81.7, "Extraction": 77.4, "Reasoning": 74.6, "Style": 78.0, "Overall F1": 76.8, "o3": 5.9, "R1-0528": 3.8, "Grok4": 4.6, "Bias
-{"Model": "OpenAI/GPT-5-mini (minimal)", "Category": "Closed-source Reasoning", "Overall": 66.7, "Physics": 81.7, "Chemistry": 64.0, "Finance": 69.1, "Consulting": 76.0, "Extraction": 75.9, "Reasoning": 72.5, "Style": 58.8, "Overall F1": 73.8, "o3": -4.0, "R1-0528": -6.2, "Grok4": -11.1, "Bias
-{"Model": "OpenAI/GPT-5-nano (high)", "Category": "Closed-source Reasoning", "Overall": 71.9, "Physics": 86.8, "Chemistry": 67.6, "Finance": 68.7, "Consulting": 79.8, "Extraction": 77.6, "Reasoning": 75.1, "Style": 74.0, "Overall F1": 76.9, "o3": 5.3, "R1-0528": 0.3, "Grok4": 3.1, "Bias
-{"Model": "OpenAI/GPT-5-nano (med)", "Category": "Closed-source Reasoning", "Overall": 72.7, "Physics": 85.6, "Chemistry": 67.0, "Finance": 68.7, "Consulting": 79.7, "Extraction": 77.1, "Reasoning": 74.3, "Style": 78.3, "Overall F1": 76.4, "o3": 3.4, "R1-0528": -0.3, "Grok4": 1.7, "Bias
-{"Model": "OpenAI/GPT-5-nano (low)", "Category": "Closed-source Reasoning", "Overall": 73.6, "Physics": 83.5, "Chemistry": 67.6, "Finance": 68.6, "Consulting": 77.7, "Extraction": 76.9, "Reasoning": 73.5, "Style": 70.9, "Overall F1": 75.4, "o3": 2.4, "R1-0528": 0.6, "Grok4": 1.9, "Bias
-{"Model": "OpenAI/GPT-5-nano (minimal)", "Category": "Closed-source Reasoning", "Overall": 55.0, "Physics": 68.8, "Chemistry": 55.3, "Finance": 60.9, "Consulting": 63.0, "Extraction": 65.8, "Reasoning": 62.1, "Style": 54.3, "Overall F1": 63.2, "o3": -18.7, "R1-0528": -19.6, "Grok4": -26.9, "Bias
-{"Model": "OpenAI/o3 (high)", "Category": "Closed-source Reasoning", "Overall": 76.4, "Physics": 88.3, "Chemistry": 68.2, "Finance": 69.3, "Consulting": 81.1, "Extraction": 79.1, "Reasoning": 76.1, "Style": 75.3, "Overall F1": 77.9, "o3": 2.0, "R1-0528": 0.5, "Grok4": 0.8, "Bias
-{"Model": "OpenAI/o3 (med)", "Category": "Closed-source Reasoning", "Overall": 76.0, "Physics": 89.3, "Chemistry": 69.1, "Finance": 68.9, "Consulting": 81.0, "Extraction": 79.3, "Reasoning": 76.4, "Style": 76.9, "Overall F1": 78.2, "o3": 3.0, "R1-0528": 0.8, "Grok4": 1.5, "Bias
-{"Model": "OpenAI/o3 (low)", "Category": "Closed-source Reasoning", "Overall": 76.4, "Physics": 88.9, "Chemistry": 69.3, "Finance": 70.3, "Consulting": 81.9, "Extraction": 79.7, "Reasoning": 76.8, "Style": 76.7, "Overall F1": 78.7, "o3": 3.8, "R1-0528": 1.5, "Grok4": 2.6, "Bias
-{"Model": "OpenAI/o4-mini (high)", "Category": "Closed-source Reasoning", "Overall": 75.8, "Physics": 88.5, "Chemistry": 68.9, "Finance": 70.5, "Consulting": 81.5, "Extraction": 78.7, "Reasoning": 76.8, "Style": 76.5, "Overall F1": 78.4, "o3": 4.5, "R1-0528": 2.7, "Grok4": 1.9, "Bias
-{"Model": "OpenAI/o4-mini (med)", "Category": "Closed-source Reasoning", "Overall": 75.8, "Physics": 88.1, "Chemistry": 69.6, "Finance": 70.8, "Consulting": 81.6, "Extraction": 78.9, "Reasoning": 76.8, "Style": 74.1, "Overall F1": 78.6, "o3": 4.0, "R1-0528": 2.8, "Grok4": 1.2, "Bias
-{"Model": "OpenAI/o4-mini (low)", "Category": "Closed-source Reasoning", "Overall": 76.8, "Physics": 88.6, "Chemistry": 70.1, "Finance": 70.1, "Consulting": 81.0, "Extraction": 78.8, "Reasoning": 76.8, "Style": 74.1, "Overall F1": 78.5, "o3": 3.4, "R1-0528": 3.3, "Grok4": 1.7, "Bias
-{"Model": "xAI/grok-4", "Category": "Closed-source Reasoning", "Overall": 75.9, "Physics": 86.1, "Chemistry": 68.5, "Finance": 70.7, "Consulting": 80.8, "Extraction": 78.5, "Reasoning": 76.3, "Style": 75.2, "Overall F1": 77.7, "o3": 0.7, "R1-0528": 2.5, "Grok4": 1.8, "Bias
-{"Model": "xAI/grok-3-mini", "Category": "Closed-source Reasoning", "Overall": 75.1, "Physics": 85.8, "Chemistry": 66.9, "Finance": 69.4, "Consulting": 82.0, "Extraction": 78.1, "Reasoning": 75.3, "Style": 75.2, "Overall F1": 77.2, "o3": 4.5, "R1-0528": 2.4, "Grok4": 2.9, "Bias
-{"Model": "Anthropic/claude-sonnet-4-20250514", "Category": "Closed-source Reasoning", "Overall": 70.9, "Physics": 75.7, "Chemistry": 66.3, "Finance": 69.9, "Consulting": 77.8, "Extraction": 77.5, "Reasoning": 72.3, "Style": 66.0, "Overall F1": 74.0, "o3": -11.2, "R1-0528": -8.1, "Grok4": -10.7, "Bias
-{"Model": "Google/Gemini-2.5-Pro", "Category": "Closed-source Reasoning", "Overall": 78.2, "Physics": 87.3, "Chemistry": 70.2, "Finance": 71.9, "Consulting": 82.6, "Extraction": 81.3, "Reasoning": 77.4, "Style": 76.8, "Overall F1": 79.2, "o3": 3.1, "R1-0528": 2.8, "Grok4": 2.1, "Bias
-{"Model": "Google/Gemini-2.5-Flash (Thinking)", "Category": "Closed-source Reasoning", "Overall": 78.1, "Physics": 87.0, "Chemistry": 68.7, "Finance": 71.6, "Consulting": 81.2, "Extraction": 80.1, "Reasoning": 76.7, "Style": 74.6, "Overall F1": 78.4, "o3": 2.3, "R1-0528": 2.5, "Grok4": 2.2, "Bias
-{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 74.7, "Physics": 83.7, "Chemistry": 67.0, "Finance": 72.2, "Consulting": 81.9, "Extraction": 78.7, "Reasoning": 75.9, "Style": 79.1, "Overall F1": 77.5, "o3": -1.1, "R1-0528": 0.2, "Grok4": -2.6, "Bias
-{"Model": "OpenAI/gpt-oss-20b (high)", "Category": "Open-weight Reasoning", "Overall": 74.4, "Physics": 89.3, "Chemistry": 68.7, "Finance": 68.5, "Consulting": 80.7, "Extraction": 77.8, "Reasoning": 76.5, "Style": 77.7, "Overall F1": 77.9, "o3": 3.3, "R1-0528": -0.2, "Grok4": 0.9, "Bias
-{"Model": "OpenAI/gpt-oss-20b (medium)", "Category": "Open-weight Reasoning", "Overall": 74.8, "Physics": 87.7, "Chemistry": 68.3, "Finance": 69.7, "Consulting": 80.9, "Extraction": 78.5, "Reasoning": 76.3, "Style": 76.2, "Overall F1": 77.8, "o3": 3.6, "R1-0528": 1.1, "Grok4": 0.6, "Bias
-{"Model": "OpenAI/gpt-oss-20b (low)", "Category": "Open-weight Reasoning", "Overall": 75.6, "Physics": 85.4, "Chemistry": 69.3, "Finance": 70.8, "Consulting": 79.2, "Extraction": 77.6, "Reasoning": 76.3, "Style": 71.1, "Overall F1": 77.5, "o3": 0.4, "R1-0528": -0.3, "Grok4": 1.6, "Bias
-{"Model": "OpenAI/gpt-oss-120b (high)", "Category": "Open-weight Reasoning", "Overall": 75.4, "Physics": 89.5, "Chemistry": 68.9, "Finance": 69.7, "Consulting": 80.8, "Extraction": 78.9, "Reasoning": 76.7, "Style": 80.8, "Overall F1": 78.4, "o3": 1.6, "R1-0528": -1.4, "Grok4": 0.3, "Bias
-{"Model": "OpenAI/gpt-oss-120b (med)", "Category": "Open-weight Reasoning", "Overall": 75.8, "Physics": 88.1, "Chemistry": 67.4, "Finance": 70.5, "Consulting": 79.9, "Extraction": 79.6, "Reasoning": 76.0, "Style": 75.3, "Overall F1": 77.7, "o3": 0.6, "R1-0528": -1.3, "Grok4": -0.9, "Bias
-{"Model": "OpenAI/gpt-oss-120b (low)", "Category": "Open-weight Reasoning", "Overall": 76.7, "Physics": 86.0, "Chemistry": 67.2, "Finance": 72.1, "Consulting": 79.0, "Extraction": 79.2, "Reasoning": 75.7, "Style": 72.4, "Overall F1": 77.3, "o3": -1.0, "R1-0528": -1.6, "Grok4": -1.5, "Bias
-{"Model": "OpenAI/gpt-oss-120b (mixed)", "Category": "Open-weight Reasoning", "Overall": 78.2, "Physics": 89.5, "Chemistry": 68.9, "Finance": 72.2, "Consulting": 79.7, "Extraction": 79.7, "Reasoning": 76.9, "Style": 80.8, "Overall F1": 78.7, "o3": -0.5, "R1-0528": -0.9, "Grok4": -1.0, "Bias
-{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 76.6, "Physics": 84.3, "Chemistry": 69.3, "Finance": 70.8, "Consulting": 80.3, "Extraction": 78.9, "Reasoning": 75.6, "Style": 72.0, "Overall F1": 77.3, "o3": 3.2, "R1-0528": 3.3, "Grok4": 2.6, "Bias
-{"Model": "DeepSeek-AI/DeepSeek-R1-0528", "Category": "Open-weight Reasoning", "Overall": 69.4, "Physics": 79.6, "Chemistry": 65.1, "Finance": 68.5, "Consulting": 71.6, "Extraction": 74.7, "Reasoning": 70.9, "Style": 64.1, "Overall F1": 72.2, "o3": -11.6, "R1-0528": -9.3, "Grok4": -8.8, "Bias
-{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 39.8, "Physics": 46.7, "Chemistry": 35.9, "Finance": 45.4, "Consulting": 35.8, "Extraction": 42.1, "Reasoning": 41.2, "Style": 35.3, "Overall F1": 41.5, "o3": -0.2, "R1-0528": -1.3, "Grok4": 0.4, "Bias
-{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 76.5, "Physics": 87.2, "Chemistry": 67.9, "Finance": 69.0, "Consulting": 80.4, "Extraction": 79.3, "Reasoning": 75.6, "Style": 74.3, "Overall F1": 77.3, "o3": -1.0, "R1-0528": -1.8, "Grok4": -1.5, "Bias
+{"Model": "OpenAI/GPT-4.1", "Category": "Closed-source Instruct", "Overall": 75.4, "Physics": 80.9, "Chemistry": 69.2, "Finance": 71.0, "Consulting": 80.0, "Extraction": 79.8, "Reasoning": 74.4, "Style": 65.8, "Overall F1": 76.3, "Bias to o3": 5.5, "Bias to R1-0528": 4.6, "Bias to Grok4": 5.0, "Bias Index": 0.9, "Input Tokens": 1619.0, "Output Tokens": 1.0, "Cost": 11.31}
+{"Model": "OpenAI/GPT-4.1-mini", "Category": "Closed-source Instruct", "Overall": 74.9, "Physics": 83.9, "Chemistry": 67.3, "Finance": 69.1, "Consulting": 80.6, "Extraction": 79.2, "Reasoning": 74.7, "Style": 69.8, "Overall F1": 76.4, "Bias to o3": -0.2, "Bias to R1-0528": 1.2, "Bias to Grok4": -0.3, "Bias Index": 1.5, "Input Tokens": 1619.0, "Output Tokens": 1.0, "Cost": 2.26}
+{"Model": "OpenAI/GPT-4.1-nano", "Category": "Closed-source Instruct", "Overall": 54.1, "Physics": 69.8, "Chemistry": 62.9, "Finance": 66.7, "Consulting": 68.4, "Extraction": 71.0, "Reasoning": 65.6, "Style": 63.5, "Overall F1": 67.9, "Bias to o3": -14.5, "Bias to R1-0528": -2.1, "Bias to Grok4": -0.7, "Bias Index": 13.8, "Input Tokens": 1619.0, "Output Tokens": 1.0, "Cost": 0.56}
+{"Model": "Google/Gemini-2.5-Flash", "Category": "Closed-source Instruct", "Overall": 73.4, "Physics": 82.9, "Chemistry": 67.3, "Finance": 70.8, "Consulting": 79.6, "Extraction": 79.2, "Reasoning": 74.5, "Style": 67.7, "Overall F1": 76.3, "Bias to o3": -4.2, "Bias to R1-0528": -6.6, "Bias to Grok4": -7.1, "Bias Index": 2.9, "Input Tokens": 1779.0, "Output Tokens": 1.0, "Cost": 1.87}
+{"Model": "Google/Gemini-2.5-Flash-Lite", "Category": "Closed-source Instruct", "Overall": 73.3, "Physics": 83.6, "Chemistry": 68.2, "Finance": 68.2, "Consulting": 80.6, "Extraction": 77.9, "Reasoning": 75.0, "Style": 71.0, "Overall F1": 76.4, "Bias to o3": -1.1, "Bias to R1-0528": 2.0, "Bias to Grok4": 0.6, "Bias Index": 3.1, "Input Tokens": 1779.0, "Output Tokens": 1.0, "Cost": 0.62}
+{"Model": "Anthropic/claude-sonnet-4", "Category": "Closed-source Instruct", "Overall": 70.2, "Physics": 85.0, "Chemistry": 66.9, "Finance": 68.1, "Consulting": 76.3, "Extraction": 77.6, "Reasoning": 73.3, "Style": 64.1, "Overall F1": 75.2, "Bias to o3": -6.5, "Bias to R1-0528": -5.2, "Bias to Grok4": -10.2, "Bias Index": 5.0, "Input Tokens": 1913.0, "Output Tokens": 1.0, "Cost": 20.06}
+{"Model": "anthropic/claude-3.5-haiku", "Category": "Closed-source Instruct", "Overall": 72.5, "Physics": 78.9, "Chemistry": 67.2, "Finance": 71.2, "Consulting": 76.7, "Extraction": 76.9, "Reasoning": 73.3, "Style": 65.4, "Overall F1": 74.9, "Bias to o3": -1.7, "Bias to R1-0528": 0.7, "Bias to Grok4": -1.4, "Bias Index": 2.4, "Input Tokens": 1913.0, "Output Tokens": 1.0, "Cost": 5.35}
+{"Model": "Qwen/Qwen3-235B-A22B-Instruct-2507", "Category": "Open-weight Instruct", "Overall": 75.1, "Physics": 86.5, "Chemistry": 69.3, "Finance": 69.3, "Consulting": 79.6, "Extraction": 79.2, "Reasoning": 76.0, "Style": 64.6, "Overall F1": 77.3, "Bias to o3": 3.8, "Bias to R1-0528": 2.2, "Bias to Grok4": 1.6, "Bias Index": 2.2, "Input Tokens": 1779.0, "Output Tokens": 1.0, "Cost": 0.48}
+{"Model": "Qwen/Qwen3-30B-A3B-instruct-2507", "Category": "Open-weight Instruct", "Overall": 73.1, "Physics": 82.0, "Chemistry": 68.3, "Finance": 67.3, "Consulting": 79.7, "Extraction": 76.5, "Reasoning": 74.5, "Style": 64.7, "Overall F1": 75.5, "Bias to o3": 4.7, "Bias to R1-0528": 7.1, "Bias to Grok4": 5.3, "Bias Index": 2.4, "Input Tokens": 1778.0, "Output Tokens": 1.0, "Cost": 0.32}
+{"Model": "MoonshotAI/Kimi-K2-Instruct-0905", "Category": "Open-weight Instruct", "Overall": 74.7, "Physics": 84.5, "Chemistry": 69.9, "Finance": 67.5, "Consulting": 81.9, "Extraction": 80.2, "Reasoning": 75.5, "Style": 65.9, "Overall F1": 77.0, "Bias to o3": 7.5, "Bias to R1-0528": 6.1, "Bias to Grok4": 5.2, "Bias Index": 2.3, "Input Tokens": 1623.0, "Output Tokens": 1.0, "Cost": 0.81}
+{"Model": "MoonshotAI/Kimi-K2-Instruct-0711", "Category": "Open-weight Instruct", "Overall": 75.2, "Physics": 85.3, "Chemistry": 69.5, "Finance": 68.3, "Consulting": 82.3, "Extraction": 80.3, "Reasoning": 76.1, "Style": 66.4, "Overall F1": 77.6, "Bias to o3": 7.1, "Bias to R1-0528": 6.1, "Bias to Grok4": 4.7, "Bias Index": 2.4, "Input Tokens": 1636.0, "Output Tokens": 1.0, "Cost": 0.81}
+{"Model": "DeepSeek-AI/DeepSeek-V3.1", "Category": "Open-weight Instruct", "Overall": 72.8, "Physics": 79.6, "Chemistry": 68.2, "Finance": 68.3, "Consulting": 78.7, "Extraction": 77.4, "Reasoning": 73.9, "Style": 65.8, "Overall F1": 75.2, "Bias to o3": 0.2, "Bias to R1-0528": -1.5, "Bias to Grok4": -2.2, "Bias Index": 2.4, "Input Tokens": 1586.0, "Output Tokens": 1.0, "Cost": 1.11}
+{"Model": "DeepSeek-AI/DeepSeek-V3-0324", "Category": "Open-weight Instruct", "Overall": 72.6, "Physics": 84.5, "Chemistry": 68.0, "Finance": 67.0, "Consulting": 78.3, "Extraction": 77.7, "Reasoning": 74.6, "Style": 63.5, "Overall F1": 75.7, "Bias to o3": 1.5, "Bias to R1-0528": 2.4, "Bias to Grok4": -0.7, "Bias Index": 3.1, "Input Tokens": 1585.0, "Output Tokens": 1.0, "Cost": 1.11}
+{"Model": "nvidia/llama-3.1-nemotron-nano-8b-v1", "Category": "Open-weight Instruct", "Overall": 55.8, "Physics": 56.5, "Chemistry": 59.5, "Finance": 57.3, "Consulting": 56.7, "Extraction": 61.3, "Reasoning": 58.6, "Style": 59.1, "Overall F1": 59.3, "Bias to o3": -28.5, "Bias to R1-0528": -26.5, "Bias to Grok4": -30.0, "Bias Index": 3.5, "Input Tokens": 1633.0, "Output Tokens": 1.0, "Cost": 0.09}
+{"Model": "nvidia/llama-3.3-nemotron-super-49b-v1", "Category": "Open-weight Instruct", "Overall": 68.8, "Physics": 77.2, "Chemistry": 65.1, "Finance": 70.2, "Consulting": 72.1, "Extraction": 74.1, "Reasoning": 70.7, "Style": 64.1, "Overall F1": 72.3, "Bias to o3": -15.7, "Bias to R1-0528": -12.2, "Bias to Grok4": -13.0, "Bias Index": 3.5, "Input Tokens": 1637.0, "Output Tokens": 1.0, "Cost": 0.74}
+{"Model": "nvidia/llama-3.1-nemotron-ultra-253b-v1", "Category": "Open-weight Instruct", "Overall": 67.4, "Physics": 84.8, "Chemistry": 63.6, "Finance": 66.6, "Consulting": 61.8, "Extraction": 72.6, "Reasoning": 67.8, "Style": 57.8, "Overall F1": 69.6, "Bias to o3": -10.0, "Bias to R1-0528": -11.4, "Bias to Grok4": -9.2, "Bias Index": 2.2, "Input Tokens": 1637.0, "Output Tokens": 1.0, "Cost": 3.43}
+{"Model": "meta/llama-4-maverick-17b-128e-instruct", "Category": "Open-weight Instruct", "Overall": 67.9, "Physics": 64.9, "Chemistry": 66.7, "Finance": 73.4, "Consulting": 76.4, "Extraction": 76.5, "Reasoning": 70.4, "Style": 67.9, "Overall F1": 72.4, "Bias to o3": -14.3, "Bias to R1-0528": -10.5, "Bias to Grok4": -9.8, "Bias Index": 4.5, "Input Tokens": 1566.0, "Output Tokens": 1.0, "Cost": 0.82}
+{"Model": "meta/llama-4-scout-17b-16e-instruct", "Category": "Open-weight Instruct", "Overall": 65.9, "Physics": 60.4, "Chemistry": 69.4, "Finance": 71.3, "Consulting": 75.6, "Extraction": 76.2, "Reasoning": 69.9, "Style": 62.0, "Overall F1": 71.8, "Bias to o3": -14.5, "Bias to R1-0528": -10.2, "Bias to Grok4": -8.6, "Bias Index": 5.9, "Input Tokens": 1565.0, "Output Tokens": 1.0, "Cost": 0.44}
+{"Model": "meta/llama-3.1-405b-instruct", "Category": "Open-weight Instruct", "Overall": 71.6, "Physics": 85.1, "Chemistry": 69.1, "Finance": 67.6, "Consulting": 81.7, "Extraction": 77.7, "Reasoning": 75.5, "Style": 65.5, "Overall F1": 77.0, "Bias to o3": 11.5, "Bias to R1-0528": 6.1, "Bias to Grok4": 9.4, "Bias Index": 5.4, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 4.54}
+{"Model": "meta/llama-3.3-70b-instruct", "Category": "Open-weight Instruct", "Overall": 74.1, "Physics": 84.6, "Chemistry": 66.5, "Finance": 71.6, "Consulting": 79.1, "Extraction": 78.1, "Reasoning": 75.4, "Style": 64.6, "Overall F1": 76.7, "Bias to o3": -3.1, "Bias to R1-0528": -0.8, "Bias to Grok4": -3.4, "Bias Index": 2.6, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.22}
+{"Model": "meta/llama-3.1-70b-instruct", "Category": "Open-weight Instruct", "Overall": 70.7, "Physics": 82.1, "Chemistry": 66.7, "Finance": 72.6, "Consulting": 76.0, "Extraction": 77.5, "Reasoning": 73.9, "Style": 64.7, "Overall F1": 75.4, "Bias to o3": -6.2, "Bias to R1-0528": -1.5, "Bias to Grok4": -4.1, "Bias Index": 4.7, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.22}
+{"Model": "meta/llama-3.1-8b-instruct", "Category": "Open-weight Instruct", "Overall": 63.1, "Physics": 76.2, "Chemistry": 69.3, "Finance": 70.2, "Consulting": 71.0, "Extraction": 76.6, "Reasoning": 71.5, "Style": 61.7, "Overall F1": 73.2, "Bias to o3": -4.0, "Bias to R1-0528": 6.1, "Bias to Grok4": -1.5, "Bias Index": 10.1, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.09}
+{"Model": "meta/llama-3.2-3b-instruct", "Category": "Open-weight Instruct", "Overall": 58.3, "Physics": 67.6, "Chemistry": 63.8, "Finance": 59.7, "Consulting": 66.1, "Extraction": 68.8, "Reasoning": 64.6, "Style": 54.6, "Overall F1": 66.2, "Bias to o3": 8.8, "Bias to R1-0528": 16.7, "Bias to Grok4": 13.1, "Bias Index": 7.9, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.02}
+{"Model": "meta/llama-3.1-1b-instruct", "Category": "Open-weight Instruct", "Overall": 39.5, "Physics": 31.9, "Chemistry": 48.4, "Finance": 44.9, "Consulting": 55.8, "Extraction": 47.8, "Reasoning": 43.2, "Style": 46.2, "Overall F1": 45.7, "Bias to o3": 31.0, "Bias to R1-0528": 33.1, "Bias to Grok4": 37.2, "Bias Index": 6.2, "Input Tokens": 1628.0, "Output Tokens": 1.0, "Cost": 0.02}
+{"Model": "OpenAI/GPT-5 (high)", "Category": "Closed-source Reasoning", "Overall": 76.0, "Physics": 90.2, "Chemistry": 68.2, "Finance": 69.4, "Consulting": 80.9, "Extraction": 78.3, "Reasoning": 76.7, "Style": 79.1, "Overall F1": 78.3, "Bias to o3": 1.0, "Bias to R1-0528": -0.8, "Bias to Grok4": -1.3, "Bias Index": 2.3, "Input Tokens": 1618.0, "Output Tokens": 668.0, "Cost": 30.34}
+{"Model": "OpenAI/GPT-5 (med)", "Category": "Closed-source Reasoning", "Overall": 76.7, "Physics": 89.2, "Chemistry": 67.9, "Finance": 69.0, "Consulting": 80.9, "Extraction": 78.1, "Reasoning": 76.3, "Style": 77.3, "Overall F1": 77.9, "Bias to o3": 0.0, "Bias to R1-0528": -0.9, "Bias to Grok4": -1.2, "Bias Index": 1.2, "Input Tokens": 1619.0, "Output Tokens": 287.0, "Cost": 17.06}
+{"Model": "OpenAI/GPT-5 (low)", "Category": "Closed-source Reasoning", "Overall": 76.3, "Physics": 88.6, "Chemistry": 69.3, "Finance": 69.0, "Consulting": 80.9, "Extraction": 78.1, "Reasoning": 76.6, "Style": 79.4, "Overall F1": 78.1, "Bias to o3": 0.3, "Bias to R1-0528": -1.5, "Bias to Grok4": -1.4, "Bias Index": 1.8, "Input Tokens": 1618.0, "Output Tokens": 130.0, "Cost": 11.58}
+{"Model": "OpenAI/GPT-5 (minimal)", "Category": "Closed-source Reasoning", "Overall": 71.9, "Physics": 86.8, "Chemistry": 68.6, "Finance": 71.2, "Consulting": 77.5, "Extraction": 78.9, "Reasoning": 75.2, "Style": 64.8, "Overall F1": 77.0, "Bias to o3": -0.5, "Bias to R1-0528": -5.6, "Bias to Grok4": -5.0, "Bias Index": 5.1, "Input Tokens": 1618.0, "Output Tokens": 7.0, "Cost": 7.29}
+{"Model": "OpenAI/GPT-5-mini (high)", "Category": "Closed-source Reasoning", "Overall": 75.3, "Physics": 84.5, "Chemistry": 69.2, "Finance": 70.4, "Consulting": 82.8, "Extraction": 78.4, "Reasoning": 75.9, "Style": 74.1, "Overall F1": 77.7, "Bias to o3": 6.6, "Bias to R1-0528": 4.2, "Bias to Grok4": 4.6, "Bias Index": 2.4, "Input Tokens": 1619.0, "Output Tokens": 497.0, "Cost": 4.88}
+{"Model": "OpenAI/GPT-5-mini (med)", "Category": "Closed-source Reasoning", "Overall": 74.4, "Physics": 83.3, "Chemistry": 68.2, "Finance": 69.9, "Consulting": 81.5, "Extraction": 78.1, "Reasoning": 74.6, "Style": 72.8, "Overall F1": 76.7, "Bias to o3": 6.3, "Bias to R1-0528": 4.0, "Bias to Grok4": 4.3, "Bias Index": 2.3, "Input Tokens": 1618.0, "Output Tokens": 228.0, "Cost": 3.0}
+{"Model": "OpenAI/GPT-5-mini (low)", "Category": "Closed-source Reasoning", "Overall": 74.7, "Physics": 82.9, "Chemistry": 68.5, "Finance": 70.3, "Consulting": 81.7, "Extraction": 77.4, "Reasoning": 74.6, "Style": 78.0, "Overall F1": 76.8, "Bias to o3": 5.9, "Bias to R1-0528": 3.8, "Bias to Grok4": 4.6, "Bias Index": 2.1, "Input Tokens": 1618.0, "Output Tokens": 92.0, "Cost": 2.05}
+{"Model": "OpenAI/GPT-5-mini (minimal)", "Category": "Closed-source Reasoning", "Overall": 66.7, "Physics": 81.7, "Chemistry": 64.0, "Finance": 69.1, "Consulting": 76.0, "Extraction": 75.9, "Reasoning": 72.5, "Style": 58.8, "Overall F1": 73.8, "Bias to o3": -4.0, "Bias to R1-0528": -6.2, "Bias to Grok4": -11.1, "Bias Index": 7.1, "Input Tokens": 1618.0, "Output Tokens": 7.0, "Cost": 1.46}
+{"Model": "OpenAI/GPT-5-nano (high)", "Category": "Closed-source Reasoning", "Overall": 71.9, "Physics": 86.8, "Chemistry": 67.6, "Finance": 68.7, "Consulting": 79.8, "Extraction": 77.6, "Reasoning": 75.1, "Style": 74.0, "Overall F1": 76.9, "Bias to o3": 5.3, "Bias to R1-0528": 0.3, "Bias to Grok4": 3.1, "Bias Index": 5.0, "Input Tokens": 1618.0, "Output Tokens": 1309.0, "Cost": 2.11}
+{"Model": "OpenAI/GPT-5-nano (med)", "Category": "Closed-source Reasoning", "Overall": 72.7, "Physics": 85.6, "Chemistry": 67.0, "Finance": 68.7, "Consulting": 79.7, "Extraction": 77.1, "Reasoning": 74.3, "Style": 78.3, "Overall F1": 76.4, "Bias to o3": 3.4, "Bias to R1-0528": -0.3, "Bias to Grok4": 1.7, "Bias Index": 3.7, "Input Tokens": 1618.0, "Output Tokens": 479.0, "Cost": 0.95}
+{"Model": "OpenAI/GPT-5-nano (low)", "Category": "Closed-source Reasoning", "Overall": 73.6, "Physics": 83.5, "Chemistry": 67.6, "Finance": 68.6, "Consulting": 77.7, "Extraction": 76.9, "Reasoning": 73.5, "Style": 70.9, "Overall F1": 75.4, "Bias to o3": 2.4, "Bias to R1-0528": 0.6, "Bias to Grok4": 1.9, "Bias Index": 1.8, "Input Tokens": 1619.0, "Output Tokens": 141.0, "Cost": 0.48}
+{"Model": "OpenAI/GPT-5-nano (minimal)", "Category": "Closed-source Reasoning", "Overall": 55.0, "Physics": 68.8, "Chemistry": 55.3, "Finance": 60.9, "Consulting": 63.0, "Extraction": 65.8, "Reasoning": 62.1, "Style": 54.3, "Overall F1": 63.2, "Bias to o3": -18.7, "Bias to R1-0528": -19.6, "Bias to Grok4": -26.9, "Bias Index": 8.2, "Input Tokens": 1618.0, "Output Tokens": 7.0, "Cost": 0.29}
+{"Model": "OpenAI/o3 (high)", "Category": "Closed-source Reasoning", "Overall": 76.4, "Physics": 88.3, "Chemistry": 68.2, "Finance": 69.3, "Consulting": 81.1, "Extraction": 79.1, "Reasoning": 76.1, "Style": 75.3, "Overall F1": 77.9, "Bias to o3": 2.0, "Bias to R1-0528": 0.5, "Bias to Grok4": 0.8, "Bias Index": 1.5, "Input Tokens": 1618.0, "Output Tokens": 350.0, "Cost": 21.04}
+{"Model": "OpenAI/o3 (med)", "Category": "Closed-source Reasoning", "Overall": 76.0, "Physics": 89.3, "Chemistry": 69.1, "Finance": 68.9, "Consulting": 81.0, "Extraction": 79.3, "Reasoning": 76.4, "Style": 76.9, "Overall F1": 78.2, "Bias to o3": 3.0, "Bias to R1-0528": 0.8, "Bias to Grok4": 1.5, "Bias Index": 2.2, "Input Tokens": 1618.0, "Output Tokens": 207.0, "Cost": 17.05}
+{"Model": "OpenAI/o3 (low)", "Category": "Closed-source Reasoning", "Overall": 76.4, "Physics": 88.9, "Chemistry": 69.3, "Finance": 70.3, "Consulting": 81.9, "Extraction": 79.7, "Reasoning": 76.8, "Style": 76.7, "Overall F1": 78.7, "Bias to o3": 3.8, "Bias to R1-0528": 1.5, "Bias to Grok4": 2.6, "Bias Index": 2.3, "Input Tokens": 1618.0, "Output Tokens": 98.0, "Cost": 14.01}
+{"Model": "OpenAI/o4-mini (high)", "Category": "Closed-source Reasoning", "Overall": 75.8, "Physics": 88.5, "Chemistry": 68.9, "Finance": 70.5, "Consulting": 81.5, "Extraction": 78.7, "Reasoning": 76.8, "Style": 76.5, "Overall F1": 78.4, "Bias to o3": 4.5, "Bias to R1-0528": 2.7, "Bias to Grok4": 1.9, "Bias Index": 2.6, "Input Tokens": 1618.0, "Output Tokens": 308.0, "Cost": 10.93}
+{"Model": "OpenAI/o4-mini (med)", "Category": "Closed-source Reasoning", "Overall": 75.8, "Physics": 88.1, "Chemistry": 69.6, "Finance": 70.8, "Consulting": 81.6, "Extraction": 78.9, "Reasoning": 76.8, "Style": 74.1, "Overall F1": 78.6, "Bias to o3": 4.0, "Bias to R1-0528": 2.8, "Bias to Grok4": 1.2, "Bias Index": 2.8, "Input Tokens": 1618.0, "Output Tokens": 228.0, "Cost": 9.7}
+{"Model": "OpenAI/o4-mini (low)", "Category": "Closed-source Reasoning", "Overall": 76.8, "Physics": 88.6, "Chemistry": 70.1, "Finance": 70.1, "Consulting": 81.0, "Extraction": 78.8, "Reasoning": 76.8, "Style": 74.1, "Overall F1": 78.5, "Bias to o3": 3.4, "Bias to R1-0528": 3.3, "Bias to Grok4": 1.7, "Bias Index": 1.7, "Input Tokens": 1618.0, "Output Tokens": 104.0, "Cost": 7.8}
+{"Model": "xAI/grok-4", "Category": "Closed-source Reasoning", "Overall": 75.9, "Physics": 86.1, "Chemistry": 68.5, "Finance": 70.7, "Consulting": 80.8, "Extraction": 78.5, "Reasoning": 76.3, "Style": 75.2, "Overall F1": 77.7, "Bias to o3": 0.7, "Bias to R1-0528": 2.5, "Bias to Grok4": 1.8, "Bias Index": 1.8, "Input Tokens": 1549.0, "Output Tokens": 812.0, "Cost": 58.7}
+{"Model": "xAI/grok-3-mini", "Category": "Closed-source Reasoning", "Overall": 75.1, "Physics": 85.8, "Chemistry": 66.9, "Finance": 69.4, "Consulting": 82.0, "Extraction": 78.1, "Reasoning": 75.3, "Style": 75.2, "Overall F1": 77.2, "Bias to o3": 4.5, "Bias to R1-0528": 2.4, "Bias to Grok4": 2.9, "Bias Index": 2.1, "Input Tokens": 1549.0, "Output Tokens": 633.0, "Cost": 2.72}
+{"Model": "Anthropic/claude-sonnet-4-20250514", "Category": "Closed-source Reasoning", "Overall": 70.9, "Physics": 75.7, "Chemistry": 66.3, "Finance": 69.9, "Consulting": 77.8, "Extraction": 77.5, "Reasoning": 72.3, "Style": 66.0, "Overall F1": 74.0, "Bias to o3": -11.2, "Bias to R1-0528": -8.1, "Bias to Grok4": -10.7, "Bias Index": 3.1, "Input Tokens": 1940.0, "Output Tokens": 810.0, "Cost": 62.64}
+{"Model": "Google/Gemini-2.5-Pro", "Category": "Closed-source Reasoning", "Overall": 78.2, "Physics": 87.3, "Chemistry": 70.2, "Finance": 71.9, "Consulting": 82.6, "Extraction": 81.3, "Reasoning": 77.4, "Style": 76.8, "Overall F1": 79.2, "Bias to o3": 3.1, "Bias to R1-0528": 2.8, "Bias to Grok4": 2.1, "Bias Index": 1.0, "Input Tokens": 1779.0, "Output Tokens": 967.0, "Cost": 41.46}
+{"Model": "Google/Gemini-2.5-Flash (Thinking)", "Category": "Closed-source Reasoning", "Overall": 78.1, "Physics": 87.0, "Chemistry": 68.7, "Finance": 71.6, "Consulting": 81.2, "Extraction": 80.1, "Reasoning": 76.7, "Style": 74.6, "Overall F1": 78.4, "Bias to o3": 2.3, "Bias to R1-0528": 2.5, "Bias to Grok4": 2.2, "Bias Index": 0.3, "Input Tokens": 1779.0, "Output Tokens": 695.0, "Cost": 7.92}
+{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 74.7, "Physics": 83.7, "Chemistry": 67.0, "Finance": 72.2, "Consulting": 81.9, "Extraction": 78.7, "Reasoning": 75.9, "Style": 79.1, "Overall F1": 77.5, "Bias to o3": -1.1, "Bias to R1-0528": 0.2, "Bias to Grok4": -2.6, "Bias Index": 2.8, "Input Tokens": 1779.0, "Output Tokens": 1670.0, "Cost": 2.95}
+{"Model": "OpenAI/gpt-oss-20b (high)", "Category": "Open-weight Reasoning", "Overall": 74.4, "Physics": 89.3, "Chemistry": 68.7, "Finance": 68.5, "Consulting": 80.7, "Extraction": 77.8, "Reasoning": 76.5, "Style": 77.7, "Overall F1": 77.9, "Bias to o3": 3.3, "Bias to R1-0528": -0.2, "Bias to Grok4": 0.9, "Bias Index": 3.5, "Input Tokens": 1679.0, "Output Tokens": 465.0, "Cost": 0.46}
+{"Model": "OpenAI/gpt-oss-20b (medium)", "Category": "Open-weight Reasoning", "Overall": 74.8, "Physics": 87.7, "Chemistry": 68.3, "Finance": 69.7, "Consulting": 80.9, "Extraction": 78.5, "Reasoning": 76.3, "Style": 76.2, "Overall F1": 77.8, "Bias to o3": 3.6, "Bias to R1-0528": 1.1, "Bias to Grok4": 0.6, "Bias Index": 3.0, "Input Tokens": 1683.0, "Output Tokens": 216.0, "Cost": 0.35}
+{"Model": "OpenAI/gpt-oss-20b (low)", "Category": "Open-weight Reasoning", "Overall": 75.6, "Physics": 85.4, "Chemistry": 69.3, "Finance": 70.8, "Consulting": 79.2, "Extraction": 77.6, "Reasoning": 76.3, "Style": 71.1, "Overall F1": 77.5, "Bias to o3": 0.4, "Bias to R1-0528": -0.3, "Bias to Grok4": 1.6, "Bias Index": 1.9, "Input Tokens": 1677.0, "Output Tokens": 85.0, "Cost": 0.28}
+{"Model": "OpenAI/gpt-oss-120b (high)", "Category": "Open-weight Reasoning", "Overall": 75.4, "Physics": 89.5, "Chemistry": 68.9, "Finance": 69.7, "Consulting": 80.8, "Extraction": 78.9, "Reasoning": 76.7, "Style": 80.8, "Overall F1": 78.4, "Bias to o3": 1.6, "Bias to R1-0528": -1.4, "Bias to Grok4": 0.3, "Bias Index": 3.0, "Input Tokens": 1683.0, "Output Tokens": 439.0, "Cost": 0.88}
+{"Model": "OpenAI/gpt-oss-120b (med)", "Category": "Open-weight Reasoning", "Overall": 75.8, "Physics": 88.1, "Chemistry": 67.4, "Finance": 70.5, "Consulting": 79.9, "Extraction": 79.6, "Reasoning": 76.0, "Style": 75.3, "Overall F1": 77.7, "Bias to o3": 0.6, "Bias to R1-0528": -1.3, "Bias to Grok4": -0.9, "Bias Index": 1.9, "Input Tokens": 1683.0, "Output Tokens": 196.0, "Cost": 0.63}
+{"Model": "OpenAI/gpt-oss-120b (low)", "Category": "Open-weight Reasoning", "Overall": 76.7, "Physics": 86.0, "Chemistry": 67.2, "Finance": 72.1, "Consulting": 79.0, "Extraction": 79.2, "Reasoning": 75.7, "Style": 72.4, "Overall F1": 77.3, "Bias to o3": -1.0, "Bias to R1-0528": -1.6, "Bias to Grok4": -1.5, "Bias Index": 0.6, "Input Tokens": 1683.0, "Output Tokens": 84.0, "Cost": 0.5}
+{"Model": "OpenAI/gpt-oss-120b (mixed)", "Category": "Open-weight Reasoning", "Overall": 78.2, "Physics": 89.5, "Chemistry": 68.9, "Finance": 72.2, "Consulting": 79.7, "Extraction": 79.7, "Reasoning": 76.9, "Style": 80.8, "Overall F1": 78.7, "Bias to o3": -0.5, "Bias to R1-0528": -0.9, "Bias to Grok4": -1.0, "Bias Index": 0.5, "Input Tokens": 1683.0, "Output Tokens": 282.0, "Cost": 0.7}
+{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 76.6, "Physics": 84.3, "Chemistry": 69.3, "Finance": 70.8, "Consulting": 80.3, "Extraction": 78.9, "Reasoning": 75.6, "Style": 72.0, "Overall F1": 77.3, "Bias to o3": 3.2, "Bias to R1-0528": 3.3, "Bias to Grok4": 2.6, "Bias Index": 0.7, "Input Tokens": 1587.0, "Output Tokens": 657.0, "Cost": 2.94}
+{"Model": "DeepSeek-AI/DeepSeek-R1-0528", "Category": "Open-weight Reasoning", "Overall": 69.4, "Physics": 79.6, "Chemistry": 65.1, "Finance": 68.5, "Consulting": 71.6, "Extraction": 74.7, "Reasoning": 70.9, "Style": 64.1, "Overall F1": 72.2, "Bias to o3": -11.6, "Bias to R1-0528": -9.3, "Bias to Grok4": -8.8, "Bias Index": 2.8, "Input Tokens": 1601.0, "Output Tokens": 693.0, "Cost": 3.05}
+{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 39.8, "Physics": 46.7, "Chemistry": 35.9, "Finance": 45.4, "Consulting": 35.8, "Extraction": 42.1, "Reasoning": 41.2, "Style": 35.3, "Overall F1": 41.5, "Bias to o3": -0.2, "Bias to R1-0528": -1.3, "Bias to Grok4": 0.4, "Bias Index": 1.7, "Input Tokens": 1780.0, "Output Tokens": 742.0, "Cost": 1.1}
+{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 76.5, "Physics": 87.2, "Chemistry": 67.9, "Finance": 69.0, "Consulting": 80.4, "Extraction": 79.3, "Reasoning": 75.6, "Style": 74.3, "Overall F1": 77.3, "Bias to o3": -1.0, "Bias to R1-0528": -1.8, "Bias to Grok4": -1.5, "Bias Index": 0.8, "Input Tokens": 1782.0, "Output Tokens": 1245.0, "Cost": 1.84}
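For completeness, a sketch of how the renamed keys surface once the file is loaded. This assumes pandas-style loading (the app's actual loading code is not part of this diff): each JSONL line becomes one leaderboard row, and the new "Bias to ..." keys become column headers directly.

import pandas as pd

# One JSON object per line -> one leaderboard row per model.
df = pd.read_json("llm_judge_results.jsonl", lines=True)

# The renamed keys ("Bias to o3" etc.) and the token/cost fields
# appear as columns without any further mapping.
cols = ["Model", "Category", "Overall", "Overall F1",
        "Bias to o3", "Bias to R1-0528", "Bias to Grok4", "Bias Index",
        "Input Tokens", "Output Tokens", "Cost"]
print(df[cols].sort_values("Overall", ascending=False).head(10).to_string(index=False))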