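// NOTE(review): the three lines below look like web-page residue captured with
// this file (uploader avatar alt text, commit message, short commit hash), not
// source code. They are commented out so the module parses; confirm and remove.
// Presidentlin's picture
// x
// 567ce0b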
import { Benchmark } from "./types";
/**
 * Static table of Google (Gemini) models with list pricing and published
 * benchmark scores.
 *
 * Each entry carries a display name, the provider, an input and output price,
 * the URL the numbers were taken from, and a `benchmark` map of score keys to
 * numeric values.
 *
 * Conventions used in this table:
 * - Models whose price depends on prompt length appear twice, once per context
 *   tier, with identical benchmark scores and different prices.
 * - Metrics kept as `//` comments inside a `benchmark` map are recorded for
 *   reference only and are not part of the exported data.
 *
 * NOTE(review): prices are presumably USD per 1M tokens and scores presumably
 * percentages; neither unit is stated here, so confirm against `./types`.
 */
export const googleBenchmarks: Benchmark[] = [
  {
    model: "Gemini 2.5 Pro (Thinking-enabled, <=200k context)",
    provider: "Google",
    inputPrice: 1.25,
    outputPrice: 10.0,
    source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
    benchmark: {
      livecodebench_v6: 69.0,
      aider_polyglot: 82.2,
      swe_bench_verified: 67.2,
      gpqa_diamond: 86.4,
      aime_2025: 88.0,
      humanitys_last_exam: 21.6,
      simpleqa: 54.0,
      facts_grounding: 87.8,
      global_mmlu_lite: 89.2,
      mrcr_v2_avg_128k: 58.0,
      mrcr_v2_pointwise_1m: 16.4,
      mmmu: 82.0,
    },
  },
  {
    model: "Gemini 2.5 Pro (Thinking-enabled, >200k context)",
    provider: "Google",
    inputPrice: 2.5,
    outputPrice: 15.0,
    source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
    benchmark: {
      livecodebench_v6: 69.0,
      aider_polyglot: 82.2,
      swe_bench_verified: 67.2,
      gpqa_diamond: 86.4,
      aime_2025: 88.0,
      humanitys_last_exam: 21.6,
      simpleqa: 54.0,
      facts_grounding: 87.8,
      global_mmlu_lite: 89.2,
      mrcr_v2_avg_128k: 58.0,
      mrcr_v2_pointwise_1m: 16.4,
      mmmu: 82.0,
    },
  },
  {
    model: "Gemini 2.5 Pro Experimental (03-25)",
    provider: "Google",
    inputPrice: 1.25,
    outputPrice: 10.0,
    source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
    benchmark: {
      livecodebench_v6: 70.4,
      aider_polyglot: 74.0,
      swe_bench_verified: 63.8,
      gpqa_diamond: 84.0,
      aime_2025: 86.7,
      humanitys_last_exam: 18.8,
      simpleqa: 52.9,
      global_mmlu_lite: 89.8,
      // NOTE(review): these two values are far above the report-sourced
      // 2.5 Pro entries (58.0 / 16.4); presumably a different MRCR version
      // was used by this source. Confirm they belong under the `mrcr_v2_*` keys.
      mrcr_v2_avg_128k: 94.5,
      mrcr_v2_pointwise_1m: 83.1,
      mmmu: 81.7,
      // vibe_eval: 69.4,
      // video_mme: not reported
    },
  },
  {
    model: "Gemini 2.5 Pro Preview (05-06)",
    provider: "Google",
    inputPrice: 1.25,
    outputPrice: 10.0,
    source: "https://blog.google/products/gemini/gemini-2-5-pro-updates/",
    benchmark: {
      livecodebench_v6: 75.6,
      aider_polyglot: 76.5,
      swe_bench_verified: 63.2,
      gpqa_diamond: 83.0,
      aime_2025: 83.0,
      humanitys_last_exam: 17.8,
      simpleqa: 50.8,
      global_mmlu_lite: 88.6,
      // NOTE(review): same concern as the 03-25 entry; values look like they
      // come from a different MRCR version than the report-sourced rows.
      mrcr_v2_avg_128k: 93.0,
      mrcr_v2_pointwise_1m: 82.9,
      mmmu: 79.6,
      // vibe_eval: 65.6,
      // video_mme: 84.8,
    },
  },
  {
    model: "Gemini 2.5 Flash (Thinking-enabled, default)",
    provider: "Google",
    inputPrice: 0.30,
    // Was 3.5, which does not pair with the 0.30 input price: the cited
    // announcement's price sheet uses one output price for thinking and
    // non-thinking, matching the Non-Thinking entry below.
    outputPrice: 2.50,
    source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
    benchmark: {
      livecodebench_v6: 55.4,
      aider_polyglot: 56.7,
      swe_bench_verified: 60.3,
      gpqa_diamond: 82.8,
      aime_2025: 72.0,
      humanitys_last_exam: 11.0,
      simpleqa: 26.9,
      facts_grounding: 85.3,
      global_mmlu_lite: 88.4,
      mrcr_v2_avg_128k: 54.3,
      mrcr_v2_pointwise_1m: 21.0,
      mmmu: 79.7,
      // loft_128k: 82.1,
      // loft_1m: 58.9,
    },
  },
  {
    model: "Gemini 2.5 Flash (Non-Thinking)",
    provider: "Google",
    inputPrice: 0.30,
    outputPrice: 2.50,
    source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
    benchmark: {
      humanitys_last_exam: 8.4,
      gpqa_diamond: 78.3,
      aime_2025: 61.6,
      livecodebench_v6: 41.1,
      aider_polyglot: 44.0,
      swe_bench_verified: 50.0,
      simpleqa: 25.8,
      facts_grounding: 83.4,
      mmmu: 76.9,
      // vibe_eval: 66.2,
      mrcr_v2_avg_128k: 34.1,
      mrcr_v2_pointwise_1m: 16.8,
      global_mmlu_lite: 85.8,
      // loft_128k: 76.2,
      // loft_1m: 49.5,
    },
  },
  {
    model: "Gemini 2.5 Flash-Lite (Non-Thinking)",
    provider: "Google",
    inputPrice: 0.10,
    outputPrice: 0.40,
    source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
    benchmark: {
      humanitys_last_exam: 5.1,
      gpqa_diamond: 64.6,
      aime_2025: 49.8,
      livecodebench_v6: 33.7,
      aider_polyglot: 26.7,
      swe_bench_verified: 42.6,
      simpleqa: 10.7,
      facts_grounding: 84.1,
      mmmu: 72.9,
      // vibe_eval: 51.3,
      mrcr_v2_avg_128k: 16.6,
      mrcr_v2_pointwise_1m: 4.1,
      global_mmlu_lite: 81.1,
      // loft_128k: 65.7,
      // loft_1m: 31.1,
    },
  },
  {
    model: "Gemini 2.5 Flash-Lite (Thinking)",
    provider: "Google",
    inputPrice: 0.10,
    outputPrice: 0.40,
    source: "https://blog.google/products/gemini/gemini-2-5-model-family-expands/",
    benchmark: {
      humanitys_last_exam: 6.9,
      gpqa_diamond: 66.7,
      aime_2025: 63.1,
      livecodebench_v6: 34.3,
      aider_polyglot: 27.1,
      swe_bench_verified: 44.9,
      simpleqa: 13.0,
      facts_grounding: 86.8,
      mmmu: 72.9,
      // vibe_eval: 57.5,
      mrcr_v2_avg_128k: 30.6,
      mrcr_v2_pointwise_1m: 5.4,
      global_mmlu_lite: 84.5,
      // loft_128k: 67.3,
      // loft_1m: 38.4,
    },
  },
  {
    model: "Gemini 2.0 Flash-Lite",
    provider: "Google",
    // Was 0.10 / 0.40, a copy of the Gemini 2.0 Flash prices; Flash-Lite is
    // listed at a lower rate on the Gemini API pricing page.
    inputPrice: 0.075,
    outputPrice: 0.30,
    source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
    benchmark: {
      livecodebench_v6: 29.1,
      aider_polyglot: 10.5,
      swe_bench_verified: 23.1,
      gpqa_diamond: 50.5,
      aime_2025: 23.8,
      humanitys_last_exam: 4.6,
      simpleqa: 16.5,
      facts_grounding: 82.4,
      global_mmlu_lite: 78.0,
      // loft_128k: 50.7,
      // loft_1m: 7.6,
    },
  },
  {
    model: "Gemini Diffusion",
    provider: "Google",
    // NOTE(review): zero prices presumably mean "no public pricing" rather
    // than "free"; confirm how consumers of this table treat 0.
    inputPrice: 0,
    outputPrice: 0,
    source: "https://deepmind.google/models/gemini-diffusion/",
    benchmark: {
      livecodebench_v6: 30.9,
      bigcodebench: 45.4,
      lbpp_v2: 56.8,
      swe_bench_verified: 22.9,
      humaneval: 89.6,
      mbpp: 76.0,
      gpqa_diamond: 40.4,
      aime_2025: 23.3,
      bigbench_extra_hard: 15.0,
      global_mmlu_lite: 69.1,
    },
  },
  {
    model: "Gemini 2.0 Flash",
    provider: "Google",
    inputPrice: 0.1,
    outputPrice: 0.4,
    source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
    benchmark: {
      aime_2025: 29.7,
      gpqa_diamond: 65.2,
      simpleqa: 29.9,
      global_mmlu_lite: 83.4,
      livecodebench_v6: 29.1,
      mmmu: 69.3,
      facts_grounding: 84.6,
      humanitys_last_exam: 5.1,
      mrcr_v2_avg_128k: 19.0,
      mrcr_v2_pointwise_1m: 5.3,
      // loft_128k: 58.0,
      // loft_1m: 7.6,
    },
  },
  {
    model: "Gemini 1.5 Pro (<=128k context)",
    provider: "Google",
    inputPrice: 1.25,
    outputPrice: 5.00,
    source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
    benchmark: {
      livecodebench_v6: 29.7,
      aider_polyglot: 16.9,
      swe_bench_verified: 34.2,
      gpqa_diamond: 58.1,
      aime_2025: 17.5,
      humanitys_last_exam: 4.6,
      simpleqa: 24.9,
      facts_grounding: 80.0,
      global_mmlu_lite: 80.8,
      mrcr_v2_avg_128k: 26.2,
      mrcr_v2_pointwise_1m: 12.1,
      mmmu: 67.7,
    },
  },
  {
    model: "Gemini 1.5 Pro (>128k context)",
    provider: "Google",
    inputPrice: 2.50,
    outputPrice: 10.00,
    source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
    benchmark: {
      livecodebench_v6: 29.7,
      aider_polyglot: 16.9,
      swe_bench_verified: 34.2,
      gpqa_diamond: 58.1,
      aime_2025: 17.5,
      humanitys_last_exam: 4.6,
      simpleqa: 24.9,
      facts_grounding: 80.0,
      global_mmlu_lite: 80.8,
      mrcr_v2_avg_128k: 26.2,
      mrcr_v2_pointwise_1m: 12.1,
      mmmu: 67.7,
    },
  },
  // Gemini 1.5 Flash
  {
    model: "Gemini 1.5 Flash (<=128k context)",
    provider: "Google",
    inputPrice: 0.075,
    outputPrice: 0.30,
    source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
    benchmark: {
      livecodebench_v6: 30.3,
      aider_polyglot: 2.8,
      swe_bench_verified: 19.7,
      gpqa_diamond: 50.0,
      aime_2025: 14.7,
      simpleqa: 8.6,
      facts_grounding: 82.9,
      global_mmlu_lite: 72.5,
      mrcr_v2_avg_128k: 18.4,
      mrcr_v2_pointwise_1m: 10.2,
      mmmu: 58.3,
    },
  },
  {
    model: "Gemini 1.5 Flash (>128k context)",
    provider: "Google",
    inputPrice: 0.15,
    outputPrice: 0.60,
    source: "https://storage.googleapis.com/deepmind-media/gemini/gemini_v2_5_report.pdf",
    benchmark: {
      livecodebench_v6: 30.3,
      aider_polyglot: 2.8,
      swe_bench_verified: 19.7,
      gpqa_diamond: 50.0,
      aime_2025: 14.7,
      simpleqa: 8.6,
      facts_grounding: 82.9,
      global_mmlu_lite: 72.5,
      mrcr_v2_avg_128k: 18.4,
      mrcr_v2_pointwise_1m: 10.2,
      mmmu: 58.3,
    },
  },
];