Upload 3 files
Browse files- app.py +1 -1
- report_generation.jsonl +2 -2
- report_generation_w_docs.jsonl +2 -2
app.py
CHANGED
|
@@ -21,7 +21,7 @@ color_map = {
|
|
| 21 |
"Open-weight Reasoning": "#f59c03",
|
| 22 |
}
|
| 23 |
|
| 24 |
-
CAPTION_V2 = f"""**ProfBench**:
|
| 25 |
ProfBench is a high-quality, text-only dataset that represents the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
|
| 26 |
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
|
| 27 |
Want to see your favorite models added? Run it with [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), send us the scores or ping us to run it for you!"""
|
|
|
|
| 21 |
"Open-weight Reasoning": "#f59c03",
|
| 22 |
}
|
| 23 |
|
| 24 |
+
CAPTION_V2 = f"""**ProfBench**: Over 7,000 brand-new expert-authored response–criterion pairs across 80 professional tasks spanning PhD STEM (Chemistry, Physics) and MBA Services (Finance, Consulting) domains. \n
|
| 25 |
ProfBench is a high-quality, text-only dataset that represents the complex reasoning tasks faced by professionals in fields like finance and chemistry. We're not talking about simple Q&A or retrieval-based tasks. We're talking about multi-page assignments that require deep domain knowledge and reasoning. Can AI generate comprehensive reports by applying the nuanced reasoning that a PhD-level physicist/chemist or an MBA-level consultant/financier would have? \n
|
| 26 |
[Blog](https://huggingface.co/blog/nvidia/profbench) | [Paper](https://arxiv.org/abs/2510.18941) | [Data](https://huggingface.co/datasets/nvidia/ProfBench) | [Code](https://github.com/NVlabs/ProfBench) | [Nemo Evaluator SDK](https://github.com/NVIDIA-NeMo/Evaluator)\n
|
| 27 |
Want to see your favorite models added? Run it with [Nemo Evaluator SDK for scalable evaluation](https://github.com/NVIDIA-NeMo/Evaluator) or [ProfBench code for quick evaluation](https://github.com/NVlabs/ProfBench), send us the scores or ping us to run it for you!"""
|
report_generation.jsonl
CHANGED
|
@@ -8,8 +8,8 @@
|
|
| 8 |
{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 44.3, "Physics": 32.3, "Chemistry": 52.7, "Finance": 31.3, "Consulting": 61.0, "Extraction": 35.8, "Reasoning": 43.3, "Style": 56.7, "Response Characters": 12153, "Input Tokens": 480, "Output Tokens": 17302, "Cost": 1.12}
|
| 9 |
{"Model": "xAI/grok-4-0709", "Category": "Closed-source Reasoning", "Overall": 45.1, "Physics": 20.6, "Chemistry": 59.8, "Finance": 29.4, "Consulting": 70.5, "Extraction": 40.1, "Reasoning": 48.4, "Style": 65.2, "Response Characters": 4977, "Input Tokens": 1126, "Output Tokens": 17957, "Cost": 43.64}
|
| 10 |
{"Model": "Anthropic/claude-sonnet-4 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 42.5, "Physics": 39.5, "Chemistry": 53.3, "Finance": 21.2, "Consulting": 56.1, "Extraction": 29.5, "Reasoning": 42.5, "Style": 66.1, "Response Characters": 3621, "Input Tokens": 559, "Output Tokens": 7924, "Cost": 19.29}
|
| 11 |
-
{"Model": "OpenAI/gpt-oss-120b", "Category": "Open-weight Reasoning", "Overall": 50.0, "Physics": 43.6, "Chemistry": 53.5, "Finance": 35.3, "Consulting": 67.6, "Extraction": 39.7, "Reasoning": 51.5, "Style": 63.4, "Response Characters": 8657, "Input Tokens": 530, "Output Tokens": 4817, "Cost": 0.31}
|
| 12 |
-
{"Model": "OpenAI/gpt-oss-20b", "Category": "Open-weight Reasoning", "Overall": 42.3, "Physics": 33.6, "Chemistry": 40.5, "Finance": 28.7, "Consulting": 66.4, "Extraction": 29.9, "Reasoning": 44.1, "Style": 59.1, "Response Characters": 5609, "Input Tokens": 508, "Output Tokens": 5375, "Cost": 0.12}
|
| 13 |
{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.9, "Physics": 32.3, "Chemistry": 53.9, "Finance": 35.6, "Consulting": 61.9, "Extraction": 39.4, "Reasoning": 50.0, "Style": 59.1, "Response Characters": 5760, "Input Tokens": 415, "Output Tokens": 6253, "Cost": 0.81}
|
| 14 |
{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 41.1, "Physics": 32.7, "Chemistry": 46.4, "Finance": 23.5, "Consulting": 61.9, "Extraction": 35.0, "Reasoning": 46.1, "Style": 63.4, "Response Characters": 11390, "Input Tokens": 490, "Output Tokens": 5568, "Cost": 0.54}
|
| 15 |
{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 37.6, "Physics": 19.0, "Chemistry": 44.7, "Finance": 25.6, "Consulting": 61.3, "Extraction": 29.4, "Reasoning": 42.4, "Style": 73.1, "Response Characters": 5892, "Input Tokens": 469, "Output Tokens": 6376, "Cost": 0.3}
|
|
|
|
| 8 |
{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 44.3, "Physics": 32.3, "Chemistry": 52.7, "Finance": 31.3, "Consulting": 61.0, "Extraction": 35.8, "Reasoning": 43.3, "Style": 56.7, "Response Characters": 12153, "Input Tokens": 480, "Output Tokens": 17302, "Cost": 1.12}
|
| 9 |
{"Model": "xAI/grok-4-0709", "Category": "Closed-source Reasoning", "Overall": 45.1, "Physics": 20.6, "Chemistry": 59.8, "Finance": 29.4, "Consulting": 70.5, "Extraction": 40.1, "Reasoning": 48.4, "Style": 65.2, "Response Characters": 4977, "Input Tokens": 1126, "Output Tokens": 17957, "Cost": 43.64}
|
| 10 |
{"Model": "Anthropic/claude-sonnet-4 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 42.5, "Physics": 39.5, "Chemistry": 53.3, "Finance": 21.2, "Consulting": 56.1, "Extraction": 29.5, "Reasoning": 42.5, "Style": 66.1, "Response Characters": 3621, "Input Tokens": 559, "Output Tokens": 7924, "Cost": 19.29}
|
| 11 |
+
{"Model": "OpenAI/gpt-oss-120b (high)", "Category": "Open-weight Reasoning", "Overall": 50.0, "Physics": 43.6, "Chemistry": 53.5, "Finance": 35.3, "Consulting": 67.6, "Extraction": 39.7, "Reasoning": 51.5, "Style": 63.4, "Response Characters": 8657, "Input Tokens": 530, "Output Tokens": 4817, "Cost": 0.31}
|
| 12 |
+
{"Model": "OpenAI/gpt-oss-20b (high)", "Category": "Open-weight Reasoning", "Overall": 42.3, "Physics": 33.6, "Chemistry": 40.5, "Finance": 28.7, "Consulting": 66.4, "Extraction": 29.9, "Reasoning": 44.1, "Style": 59.1, "Response Characters": 5609, "Input Tokens": 508, "Output Tokens": 5375, "Cost": 0.12}
|
| 13 |
{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 45.9, "Physics": 32.3, "Chemistry": 53.9, "Finance": 35.6, "Consulting": 61.9, "Extraction": 39.4, "Reasoning": 50.0, "Style": 59.1, "Response Characters": 5760, "Input Tokens": 415, "Output Tokens": 6253, "Cost": 0.81}
|
| 14 |
{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 41.1, "Physics": 32.7, "Chemistry": 46.4, "Finance": 23.5, "Consulting": 61.9, "Extraction": 35.0, "Reasoning": 46.1, "Style": 63.4, "Response Characters": 11390, "Input Tokens": 490, "Output Tokens": 5568, "Cost": 0.54}
|
| 15 |
{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 37.6, "Physics": 19.0, "Chemistry": 44.7, "Finance": 25.6, "Consulting": 61.3, "Extraction": 29.4, "Reasoning": 42.4, "Style": 73.1, "Response Characters": 5892, "Input Tokens": 469, "Output Tokens": 6376, "Cost": 0.3}
|
report_generation_w_docs.jsonl
CHANGED
|
@@ -8,8 +8,8 @@
|
|
| 8 |
{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 49.4, "Physics": 31.7, "Chemistry": 53.1, "Finance": 44.6, "Consulting": 68.0, "Extraction": 48.3, "Reasoning": 48.8, "Style": 54.0, "Response Characters": 10058.0, "Input Tokens": 6086.0, "Output Tokens": 18584.0, "Cost": 5.15}
|
| 9 |
{"Model": "xAI/grok-4-0709", "Category": "Closed-source Reasoning", "Overall": 53.4, "Physics": 33.6, "Chemistry": 62.2, "Finance": 44.3, "Consulting": 73.4, "Extraction": 51.9, "Reasoning": 51.6, "Style": 64.1, "Response Characters": 5380.0, "Input Tokens": 13481.0, "Output Tokens": 9885.0, "Cost": 122.78}
|
| 10 |
{"Model": "Anthropic/claude-sonnet-4 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 55.8, "Physics": 43.9, "Chemistry": 57.1, "Finance": 50.8, "Consulting": 71.4, "Extraction": 53.8, "Reasoning": 54.0, "Style": 61.8, "Response Characters": 3866.0, "Input Tokens": 51044.0, "Output Tokens": 6916.0, "Cost": 164.39}
|
| 11 |
-
{"Model": "OpenAI/gpt-oss-120b", "Category": "Open-weight Reasoning", "Overall": 54.9, "Physics": 49.1, "Chemistry": 55.3, "Finance": 45.5, "Consulting": 69.4, "Extraction": 48.7, "Reasoning": 55.5, "Style": 59.0, "Response Characters": 7442.0, "Input Tokens": 11606.0, "Output Tokens": 4572.0, "Cost": 1.35}
|
| 12 |
-
{"Model": "OpenAI/gpt-oss-20b", "Category": "Open-weight Reasoning", "Overall": 48.4, "Physics": 41.4, "Chemistry": 46.5, "Finance": 39.8, "Consulting": 66.0, "Extraction": 40.9, "Reasoning": 48.2, "Style": 56.2, "Response Characters": 5331.0, "Input Tokens": 11600.0, "Output Tokens": 4705.0, "Cost": 0.75}
|
| 13 |
{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 53.8, "Physics": 44.8, "Chemistry": 59.8, "Finance": 43.3, "Consulting": 67.4, "Extraction": 51.1, "Reasoning": 53.0, "Style": 60.5, "Response Characters": 5239.0, "Input Tokens": 11258.0, "Output Tokens": 7486.0, "Cost": 5.27}
|
| 14 |
{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 54.0, "Physics": 45.1, "Chemistry": 61.4, "Finance": 42.3, "Consulting": 67.3, "Extraction": 51.4, "Reasoning": 51.6, "Style": 61.9, "Response Characters": 6046.0, "Input Tokens": 12442.0, "Output Tokens": 9256.0, "Cost": 2.47}
|
| 15 |
{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 44.6, "Physics": 34.4, "Chemistry": 45.4, "Finance": 36.8, "Consulting": 61.8, "Extraction": 40.4, "Reasoning": 42.3, "Style": 63.9, "Response Characters": 4757.0, "Input Tokens": 12339.0, "Output Tokens": 9027.0, "Cost": 2.16}
|
|
|
|
| 8 |
{"Model": "Google/Gemini-2.5-Flash-Lite (Thinking)", "Category": "Closed-source Reasoning", "Overall": 49.4, "Physics": 31.7, "Chemistry": 53.1, "Finance": 44.6, "Consulting": 68.0, "Extraction": 48.3, "Reasoning": 48.8, "Style": 54.0, "Response Characters": 10058.0, "Input Tokens": 6086.0, "Output Tokens": 18584.0, "Cost": 5.15}
|
| 9 |
{"Model": "xAI/grok-4-0709", "Category": "Closed-source Reasoning", "Overall": 53.4, "Physics": 33.6, "Chemistry": 62.2, "Finance": 44.3, "Consulting": 73.4, "Extraction": 51.9, "Reasoning": 51.6, "Style": 64.1, "Response Characters": 5380.0, "Input Tokens": 13481.0, "Output Tokens": 9885.0, "Cost": 122.78}
|
| 10 |
{"Model": "Anthropic/claude-sonnet-4 (Thinking)", "Category": "Closed-source Reasoning", "Overall": 55.8, "Physics": 43.9, "Chemistry": 57.1, "Finance": 50.8, "Consulting": 71.4, "Extraction": 53.8, "Reasoning": 54.0, "Style": 61.8, "Response Characters": 3866.0, "Input Tokens": 51044.0, "Output Tokens": 6916.0, "Cost": 164.39}
|
| 11 |
+
{"Model": "OpenAI/gpt-oss-120b (high)", "Category": "Open-weight Reasoning", "Overall": 54.9, "Physics": 49.1, "Chemistry": 55.3, "Finance": 45.5, "Consulting": 69.4, "Extraction": 48.7, "Reasoning": 55.5, "Style": 59.0, "Response Characters": 7442.0, "Input Tokens": 11606.0, "Output Tokens": 4572.0, "Cost": 1.35}
|
| 12 |
+
{"Model": "OpenAI/gpt-oss-20b (high)", "Category": "Open-weight Reasoning", "Overall": 48.4, "Physics": 41.4, "Chemistry": 46.5, "Finance": 39.8, "Consulting": 66.0, "Extraction": 40.9, "Reasoning": 48.2, "Style": 56.2, "Response Characters": 5331.0, "Input Tokens": 11600.0, "Output Tokens": 4705.0, "Cost": 0.75}
|
| 13 |
{"Model": "DeepSeek-AI/DeepSeek-V3.1 (Thinking)", "Category": "Open-weight Reasoning", "Overall": 53.8, "Physics": 44.8, "Chemistry": 59.8, "Finance": 43.3, "Consulting": 67.4, "Extraction": 51.1, "Reasoning": 53.0, "Style": 60.5, "Response Characters": 5239.0, "Input Tokens": 11258.0, "Output Tokens": 7486.0, "Cost": 5.27}
|
| 14 |
{"Model": "Qwen/Qwen3-235B-A22B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 54.0, "Physics": 45.1, "Chemistry": 61.4, "Finance": 42.3, "Consulting": 67.3, "Extraction": 51.4, "Reasoning": 51.6, "Style": 61.9, "Response Characters": 6046.0, "Input Tokens": 12442.0, "Output Tokens": 9256.0, "Cost": 2.47}
|
| 15 |
{"Model": "Qwen/Qwen3-30B-A3B-Thinking-2507", "Category": "Open-weight Reasoning", "Overall": 44.6, "Physics": 34.4, "Chemistry": 45.4, "Finance": 36.8, "Consulting": 61.8, "Extraction": 40.4, "Reasoning": 42.3, "Style": 63.9, "Response Characters": 4757.0, "Input Tokens": 12339.0, "Output Tokens": 9027.0, "Cost": 2.16}
|