import { Benchmark } from "./types";
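
// A plausible shape for the imported `Benchmark` type, inferred from how the
// entries below use it (the authoritative definition lives in ./types; the
// field names and the `BenchmarkMetric` key union are assumptions here):
//
// export interface Benchmark {
//   model: string;
//   provider: string;
//   inputPrice: number;  // USD per 1M input tokens
//   outputPrice: number; // USD per 1M output tokens
//   source: string;
//   benchmark: Partial<Record<BenchmarkMetric, number>>;
// }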
export const openaiBenchmarks: Benchmark[] = [
  {
    model: "GPT-4o-2024-11-20",
    provider: "OpenAI",
    inputPrice: 2.5,
    outputPrice: 10.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 85.7,
      gpqa: 46.0,
      humaneval: 90.2,
      simpleqa: 38.8,
      // math: 68.5,
      // mgsm: 90.3,
      // drop: 81.5,
    },
  },
  {
    model: "GPT-4o-2024-08-06",
    provider: "OpenAI",
    inputPrice: 2.5,
    outputPrice: 10.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 88.7,
      gpqa: 53.1,
      humaneval: 90.2,
      simpleqa: 40.1,
      // math: 75.9,
      // mgsm: 90.0,
      // drop: 79.8,
    },
  },
  {
    model: "GPT-4o-2024-05-13",
    provider: "OpenAI",
    inputPrice: 5.0,
    outputPrice: 15.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 87.2,
      gpqa: 49.9,
      humaneval: 91.0,
      simpleqa: 39.0,
      // math: 76.6,
      // mgsm: 89.9,
      // drop: 83.7,
    },
  },
  {
    model: "GPT-4o-mini-2024-07-18",
    provider: "OpenAI",
    inputPrice: 0.15,
    outputPrice: 0.60,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 82.0,
      gpqa: 40.2,
      humaneval: 87.2,
      mmmu: 59.4,
      simpleqa: 9.5,
      // mgsm: 87.0,
      // drop: 79.7,
      // math: 70.2,
    },
  },
  {
    model: "GPT-4.1-2025-04-14",
    provider: "OpenAI",
    inputPrice: 2.0,
    outputPrice: 8.0,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 90.2,
      gpqa: 66.3,
      gpqa_diamond: 66.3,
      humaneval: 94.5,
      simpleqa: 41.6,
      swe_bench_verified: 54.6,
      aider_polyglot: 52.9,
      mmmlu: 90.2,
      video_mme: 72.0,
      // Not yet in BenchmarkMetric:
      aime_24: 48.1,
      // aime_2025: undefined,
      // mmlu_pro: undefined,
      // egoschema: undefined,
      // loft: undefined,
      // lcb: undefined,
      // bigcodebench: undefined,
      // mbpp: undefined,
      // livecodebench_v6: undefined,
      // lbpp_v2: undefined,
      // bigbench_extra_hard: undefined,
      // global_mmlu_lite: undefined,
      // facts_grounding: undefined,
      // humanitys_last_exam: undefined,
      mrcr_v2_avg_128k: 57.2,
      mrcr_v2_pointwise_1m: 46.3,
    },
  },
  {
    model: "GPT-4.1-mini-2025-04-14",
    provider: "OpenAI",
    inputPrice: 0.4,
    outputPrice: 1.6,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 87.5,
      gpqa: 65.0,
      gpqa_diamond: 65.0,
      humaneval: 93.8,
      simpleqa: 16.8,
      swe_bench_verified: 23.6,
      aider_polyglot: 31.6,
      mmmlu: 87.5,
      aime_24: 49.6,
      mrcr_v2_avg_128k: 47.2,
      mrcr_v2_pointwise_1m: 33.3,
      // video_mme: undefined,
    },
  },
  {
    model: "GPT-4.1-nano-2025-04-14",
    provider: "OpenAI",
    inputPrice: 0.1,
    outputPrice: 0.4,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 80.1,
      gpqa: 50.3,
      gpqa_diamond: 50.3,
      humaneval: 87.0,
      simpleqa: 7.6,
      swe_bench_verified: 9.8,
      aider_polyglot: 6.2,
      mmmlu: 80.1,
      aime_24: 29.4,
      mrcr_v2_avg_128k: 36.6,
      mrcr_v2_pointwise_1m: 12.0,
      // video_mme: undefined,
    },
  },
  {
    model: "GPT-4.5-preview-2025-02-27",
    provider: "OpenAI",
    inputPrice: 75.0,
    outputPrice: 150.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 90.8,
      gpqa: 69.5,
      simpleqa: 62.5,
      humaneval: 88.6,
      // mgsm: 86.9,
      // drop: 83.4,
      // math: 87.1,
    },
  },
  {
    model: "GPT-4-turbo-2024-04-09",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 86.7,
      gpqa: 49.3,
      humaneval: 88.2,
      simpleqa: 24.2,
      // math: 73.4,
      // mgsm: 89.6,
      // drop: 86.0,
    },
  },
  {
    model: "GPT-4-0125-preview",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 85.4,
      gpqa: 41.4,
      humaneval: 86.6,
      // math: 64.5,
      // mgsm: 85.1,
      // drop: 81.5,
    },
  },
  {
    model: "GPT-4-1106-preview",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 84.7,
      gpqa: 42.5,
      humaneval: 83.7,
      // math: 64.3,
      // mgsm: 87.1,
      // drop: 83.2,
    },
  },
  {
    model: "OpenAI o3",
    provider: "OpenAI",
    inputPrice: 2.0,
    outputPrice: 8.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 91.6, // "o3 (no tools)"
      aime_2025: 88.9, // "o3 (no tools)"
      //codeforces: 2706, // "o3 (with terminal)"
      gpqa_diamond: 83.3, // "o3 (no tools)"
      humanitys_last_exam: 20.32, // "o3 (no tools)"
      mmmu: 82.9,
      //mathvista: 86.8,
      //charxiv_reasoning: 78.6,
      //swe_lancer_ic_swe_diamond: 65250, // "o3-high"
      swe_bench_verified: 69.1,
      aider_polyglot: 81.3, // "(whole)"
      //scale_multichallenge: 56.51,
      //browsecomp: 8.35, // "o3 with python + browsing tools"
      tau_bench_airline: 52.0, // "(Airline)"
      tau_bench_retail: 73.9, // "(Retail)"
    },
  },
  {
    model: "OpenAI o3-pro",
    provider: "OpenAI",
    inputPrice: 20.0,
    outputPrice: 80.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      // The announcement does not list benchmarks for o3-pro; it is only described as
      // "designed to think longer and provide the most reliable responses."
      // The values below are placeholders that assume performance similar to, or slightly
      // better than, o3.
      gpqa_diamond: 83.3, // placeholder: copied from the "o3 (no tools)" figure
      humanitys_last_exam: 24.90, // "o3 (python + browsing tools)"; assumed to reflect the pro tier
    },
  },
  {
    model: "OpenAI o4-mini",
    provider: "OpenAI",
    inputPrice: 1.10,
    outputPrice: 4.40,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 93.4, // "o4-mini (no tools)"
      aime_2025: 92.7, // "o4-mini (no tools)"
      //codeforces: 2719, // "o4-mini (with terminal)"
      gpqa_diamond: 81.4, // "o4-mini (no tools)"
      humanitys_last_exam: 14.28, // "o4-mini (no tools)"
      mmmu: 81.6,
      //mathvista: 84.3,
      //charxiv_reasoning: 72.0,
      //swe_lancer_ic_swe_diamond: 56375, // "o4-mini-high"
      swe_bench_verified: 68.1,
      aider_polyglot: 68.9, // "(whole)"
      //scale_multichallenge: 42.99,
      //browsecomp: 1.5, // "o4-mini with python + browsing tools"
      tau_bench_airline: 49.2, // "(Airline)"
      tau_bench_retail: 71.8, // "(Retail)"
    },
  },
  {
    model: "OpenAI o1",
    provider: "OpenAI",
    inputPrice: 15.0,
    outputPrice: 60.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 74.3,
      aime_2025: 79.2,
      //codeforces: 189,
      gpqa_diamond: 78.0,
      humanitys_last_exam: 8.12, // "o1-pro"
      mmmu: 77.6,
      //mathvista: 71.8,
      //charxiv_reasoning: 55.1,
      //swe_lancer_ic_swe_diamond: 28500, // "o1-high"
      swe_bench_verified: 48.9,
      aider_polyglot: 64.4, // "(whole)"
      //scale_multichallenge: 44.93,
      //browsecomp: 1.94, // labeled "4o + browsing" in the source, which looks like a typo; it likely refers to o1 with browsing
      tau_bench_airline: 50.0, // "(Airline)"
      tau_bench_retail: 70.8, // "(Retail)"
    },
  },
  {
    model: "OpenAI o3-mini",
    provider: "OpenAI",
    inputPrice: 1.10,
    outputPrice: 4.40,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 87.3,
      aime_2025: 86.5,
      //codeforces: 1207,
      gpqa_diamond: 77.0,
      humanitys_last_exam: 13.40,
      // MMMU, MathVista, and CharXiv-Reasoning are not reported for o3-mini;
      // presumably lower than o4-mini, so they are omitted here.
      //swe_lancer_ic_swe_diamond: 17375, // "o3-mini-high"
      swe_bench_verified: 49.3,
      aider_polyglot: 61.7, // "(diff)"
      //scale_multichallenge: 39.89,
      // BrowseComp is not reported for o3-mini.
      tau_bench_airline: 32.4, // "(Airline)"
      tau_bench_retail: 57.6, // "(Retail)"
    },
  },
];
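
// Minimal usage sketch (illustrative, not part of the dataset): find the
// cheapest model whose reported score on a given metric clears a threshold.
// The 3:1 input/output token weighting in the cost blend is an assumption
// made for this example, not an official figure.
export function cheapestAbove(
  metric: string,
  minScore: number,
  entries: Benchmark[] = openaiBenchmarks,
): Benchmark | undefined {
  // Blended USD price per 1M tokens, assuming 3 input tokens per output token.
  const blendedPrice = (e: Benchmark) => 3 * e.inputPrice + e.outputPrice;
  return entries
    .filter((e) => {
      const score = (e.benchmark as Record<string, number | undefined>)[metric];
      return score !== undefined && score >= minScore;
    })
    .sort((a, b) => blendedPrice(a) - blendedPrice(b))[0];
}
// e.g. cheapestAbove("gpqa_diamond", 80) returns the o4-mini entry given the data above.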