import type { Benchmark } from "./types";

/**
 * Pricing and benchmark scores for OpenAI models.
 *
 * Each entry records the model name, provider, input/output price, the URL the
 * figures were taken from, and a sparse map of benchmark scores. Metrics that
 * have no recorded value for a model are left as commented-out lines so the gap
 * is visible to maintainers without polluting the data.
 *
 * NOTE(review): price units are not stated here (presumably USD per 1M
 * tokens) and scores look like percentages — confirm against the `Benchmark`
 * type in `./types`.
 *
 * Only values attributed to the model itself belong in `benchmark`; figures
 * measured on a different model or configuration must not be copied in as
 * stand-ins.
 */
export const openaiBenchmarks: Benchmark[] = [
  {
    model: "GPT-4o-2024-11-20",
    provider: "OpenAI",
    inputPrice: 2.5,
    outputPrice: 10.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 85.7,
      gpqa: 46.0,
      humaneval: 90.2,
      simpleqa: 38.8,
      // math: 68.5,
      // mgsm: 90.3,
      // drop: 81.5,
    },
  },
  {
    model: "GPT-4o-2024-08-06",
    provider: "OpenAI",
    inputPrice: 2.5,
    outputPrice: 10.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 88.7,
      gpqa: 53.1,
      humaneval: 90.2,
      simpleqa: 40.1,
      // math: 75.9,
      // mgsm: 90.0,
      // drop: 79.8,
    },
  },
  {
    model: "GPT-4o-2024-05-13",
    provider: "OpenAI",
    inputPrice: 5.0,
    outputPrice: 15.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 87.2,
      gpqa: 49.9,
      humaneval: 91.0,
      simpleqa: 39.0,
      // math: 76.6,
      // mgsm: 89.9,
      // drop: 83.7,
    },
  },
  {
    model: "GPT-4o-mini-2024-07-18",
    provider: "OpenAI",
    inputPrice: 0.15,
    outputPrice: 0.60,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 82.0,
      gpqa: 40.2,
      humaneval: 87.2,
      mmmu: 59.4,
      simpleqa: 9.5,
      // mgsm: 87.0,
      // drop: 79.7,
      // math: 70.2,
    },
  },
  {
    model: "GPT-4.1-2025-04-14",
    provider: "OpenAI",
    inputPrice: 2.0,
    outputPrice: 8.0,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 90.2,
      gpqa: 66.3,
      gpqa_diamond: 66.3,
      humaneval: 94.5,
      simpleqa: 41.6,
      swe_bench_verified: 54.6,
      aider_polyglot: 52.9,
      // NOTE(review): mmmlu is identical to mmlu here and in the mini/nano
      // entries below — looks like a copy-paste; verify against the source page.
      mmmlu: 90.2,
      video_mme: 72.0,
      aime_24: 48.1,
      // No value recorded for the metrics below.
      // NOTE(review): the original note here read "Not yet in BenchmarkMetric",
      // but aime_2025 and humanitys_last_exam are used by other entries in this
      // file — confirm which keys are genuinely unsupported.
      // aime_2025: undefined,
      // mmlu_pro: undefined,
      // egoschema: undefined,
      // loft: undefined,
      // lcb: undefined,
      // bigcodebench: undefined,
      // mbpp: undefined,
      // livecodebench_v6: undefined,
      // lbpp_v2: undefined,
      // bigbench_extra_hard: undefined,
      // global_mmlu_lite: undefined,
      // facts_grounding: undefined,
      // humanitys_last_exam: undefined,
      mrcr_v2_avg_128k: 57.2,
      mrcr_v2_pointwise_1m: 46.3,
    },
  },
  {
    model: "GPT-4.1-mini-2025-04-14",
    provider: "OpenAI",
    inputPrice: 0.4,
    outputPrice: 1.6,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 87.5,
      gpqa: 65.0,
      gpqa_diamond: 65.0,
      humaneval: 93.8,
      simpleqa: 16.8,
      swe_bench_verified: 23.6,
      aider_polyglot: 31.6,
      mmmlu: 87.5, // NOTE(review): same value as mmlu — verify
      aime_24: 49.6,
      mrcr_v2_avg_128k: 47.2,
      mrcr_v2_pointwise_1m: 33.3,
      // video_mme: undefined,
    },
  },
  {
    model: "GPT-4.1-nano-2025-04-14",
    provider: "OpenAI",
    inputPrice: 0.1,
    outputPrice: 0.4,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 80.1,
      gpqa: 50.3,
      gpqa_diamond: 50.3,
      humaneval: 87.0,
      simpleqa: 7.6,
      swe_bench_verified: 9.8,
      aider_polyglot: 6.2,
      mmmlu: 80.1, // NOTE(review): same value as mmlu — verify
      aime_24: 29.4,
      mrcr_v2_avg_128k: 36.6,
      mrcr_v2_pointwise_1m: 12.0,
      // video_mme: undefined,
    },
  },
  {
    model: "GPT-4.5-preview-2025-02-27",
    provider: "OpenAI",
    inputPrice: 75.0,
    outputPrice: 150.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 90.8,
      gpqa: 69.5,
      simpleqa: 62.5,
      humaneval: 88.6,
      // mgsm: 86.9,
      // drop: 83.4,
      // math: 87.1,
    },
  },
  {
    model: "GPT-4-turbo-2024-04-09",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 86.7,
      gpqa: 49.3,
      humaneval: 88.2,
      simpleqa: 24.2,
      // math: 73.4,
      // mgsm: 89.6,
      // drop: 86.0,
    },
  },
  {
    model: "GPT-4-0125-preview",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 85.4,
      gpqa: 41.4,
      humaneval: 86.6,
      // math: 64.5,
      // mgsm: 85.1,
      // drop: 81.5,
    },
  },
  {
    model: "GPT-4-1106-preview",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 84.7,
      gpqa: 42.5,
      humaneval: 83.7,
      // math: 64.3,
      // mgsm: 87.1,
      // drop: 83.2,
    },
  },
  {
    model: "OpenAI o3",
    provider: "OpenAI",
    inputPrice: 2.0,
    outputPrice: 8.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 91.6, // "o3 (no tools)"
      aime_2025: 88.9, // "o3 (no tools)"
      // codeforces: 2706, // "o3 (with terminal)"
      gpqa_diamond: 83.3, // "o3 (no tools)"
      humanitys_last_exam: 20.32, // "o3 (no tools)"
      mmmu: 82.9,
      // mathvista: 86.8,
      // charxiv_reasoning: 78.6,
      // swe_lancer_ic_swe_diamond: 65250, // "o3-high"
      swe_bench_verified: 69.1,
      aider_polyglot: 81.3, // "(whole)"
      // scale_multichallenge: 56.51,
      // browsecomp: 8.35, // "o3 with python + browsing"
      tau_bench_airline: 52.0, // "(Airline)"
      tau_bench_retail: 73.9, // "(Retail)"
    },
  },
  {
    model: "OpenAI o3-pro",
    provider: "OpenAI",
    inputPrice: 20.0,
    outputPrice: 80.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      // No o3-pro-specific scores are recorded. The two values previously
      // listed here were not o3-pro measurements, so they are kept only as
      // commented-out context instead of being published under this model:
      // gpqa_diamond: 83.3, // placeholder copied from o3 (no tools)
      // humanitys_last_exam: 24.90, // "o3 (python + browsing tools)", i.e. an o3 run
    },
  },
  {
    model: "OpenAI o4-mini",
    provider: "OpenAI",
    inputPrice: 1.10,
    outputPrice: 4.40,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 93.4, // "o4-mini (no tools)"
      aime_2025: 92.7, // "o4-mini (no tools)"
      // codeforces: 2719, // "o4-mini (with terminal)"
      gpqa_diamond: 81.4, // "o4-mini (no tools)"
      humanitys_last_exam: 14.28, // "o4-mini (no tools)"
      mmmu: 81.6,
      // mathvista: 84.3,
      // charxiv_reasoning: 72.0,
      // swe_lancer_ic_swe_diamond: 56375, // "o4-mini-high"
      swe_bench_verified: 68.1,
      aider_polyglot: 68.9, // "(whole)"
      // scale_multichallenge: 42.99,
      // browsecomp: 1.5, // "o4-mini with python + browsing tools"
      tau_bench_airline: 49.2, // "(Airline)"
      tau_bench_retail: 71.8, // "(Retail)"
    },
  },
  {
    model: "OpenAI o1",
    provider: "OpenAI",
    inputPrice: 15.0,
    outputPrice: 60.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 74.3,
      aime_2025: 79.2,
      // codeforces: 189,
      gpqa_diamond: 78.0,
      // NOTE(review): labelled "o1-pro" in the source, not plain o1 — confirm
      // whether this value should be attributed to this entry.
      humanitys_last_exam: 8.12,
      mmmu: 77.6,
      // mathvista: 71.8,
      // charxiv_reasoning: 55.1,
      // swe_lancer_ic_swe_diamond: 28500, // "o1-high"
      swe_bench_verified: 48.9,
      aider_polyglot: 64.4, // "(whole)"
      // scale_multichallenge: 44.93,
      // browsecomp: 1.94, // labelled "4o + browsing" in the source; NOTE(review): may not be an o1 result
      tau_bench_airline: 50.0, // "(Airline)"
      tau_bench_retail: 70.8, // "(Retail)"
    },
  },
  {
    model: "OpenAI o3-mini",
    provider: "OpenAI",
    inputPrice: 1.10,
    outputPrice: 4.40,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 87.3,
      aime_2025: 86.5,
      // codeforces: 1207,
      gpqa_diamond: 77.0,
      humanitys_last_exam: 13.40,
      // MMMU, MathVista and CharXiv-Reasoning are not listed for o3-mini.
      // swe_lancer_ic_swe_diamond: 17375, // "o3-mini-high"
      swe_bench_verified: 49.3,
      aider_polyglot: 61.7, // "(diff)"
      // scale_multichallenge: 39.89,
      // BrowseComp is not listed for o3-mini.
      tau_bench_airline: 32.4, // "(Airline)"
      tau_bench_retail: 57.6, // "(Retail)"
    },
  },
];