import { Benchmark } from "./types";
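
// A plausible shape for the imported `Benchmark` type, inferred from how the
// entries below use it (the authoritative definition lives in ./types; the
// field names and the `BenchmarkMetric` key union are assumptions here):
//
// export interface Benchmark {
//   model: string;
//   provider: string;
//   inputPrice: number;  // USD per 1M input tokens
//   outputPrice: number; // USD per 1M output tokens
//   source: string;
//   benchmark: Partial<Record<BenchmarkMetric, number>>;
// }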
export const openaiBenchmarks: Benchmark[] = [
  {
    model: "GPT-4o-2024-11-20",
    provider: "OpenAI",
    inputPrice: 2.5,
    outputPrice: 10.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 85.7,
      gpqa: 46.0,
      humaneval: 90.2,
      simpleqa: 38.8,
      // math: 68.5,
      // mgsm: 90.3,
      // drop: 81.5,
    },
  },
  {
    model: "GPT-4o-2024-08-06",
    provider: "OpenAI",
    inputPrice: 2.5,
    outputPrice: 10.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 88.7,
      gpqa: 53.1,
      humaneval: 90.2,
      simpleqa: 40.1,
      // math: 75.9,
      // mgsm: 90.0,
      // drop: 79.8,
    },
  },
  {
    model: "GPT-4o-2024-05-13",
    provider: "OpenAI",
    inputPrice: 5.0,
    outputPrice: 15.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 87.2,
      gpqa: 49.9,
      humaneval: 91.0,
      simpleqa: 39.0,
      // math: 76.6,
      // mgsm: 89.9,
      // drop: 83.7,
    },
  },
  {
    model: "GPT-4o-mini-2024-07-18",
    provider: "OpenAI",
    inputPrice: 0.15,
    outputPrice: 0.60,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 82.0,
      gpqa: 40.2,
      humaneval: 87.2,
      mmmu: 59.4,
      simpleqa: 9.5,
      // mgsm: 87.0,
      // drop: 79.7,
      // math: 70.2,
    },
  },
  {
    model: "GPT-4.1-2025-04-14",
    provider: "OpenAI",
    inputPrice: 2.0,
    outputPrice: 8.0,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 90.2,
      gpqa: 66.3,
      gpqa_diamond: 66.3,
      humaneval: 94.5,
      simpleqa: 41.6,
      swe_bench_verified: 54.6,
      aider_polyglot: 52.9,
      mmmlu: 90.2,
      video_mme: 72.0,
      // Not yet in BenchmarkMetric:
      aime_24: 48.1,
      // aime_2025: undefined,
      // mmlu_pro: undefined,
      // egoschema: undefined,
      // loft: undefined,
      // lcb: undefined,
      // bigcodebench: undefined,
      // mbpp: undefined,
      // livecodebench_v6: undefined,
      // lbpp_v2: undefined,
      // bigbench_extra_hard: undefined,
      // global_mmlu_lite: undefined,
      // facts_grounding: undefined,
      // humanitys_last_exam: undefined,
      mrcr_v2_avg_128k: 57.2,
      mrcr_v2_pointwise_1m: 46.3,
    },
  },
  {
    model: "GPT-4.1-mini-2025-04-14",
    provider: "OpenAI",
    inputPrice: 0.4,
    outputPrice: 1.6,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 87.5,
      gpqa: 65.0,
      gpqa_diamond: 65.0,
      humaneval: 93.8,
      simpleqa: 16.8,
      swe_bench_verified: 23.6,
      aider_polyglot: 31.6,
      mmmlu: 87.5,
      aime_24: 49.6,
      mrcr_v2_avg_128k: 47.2,
      mrcr_v2_pointwise_1m: 33.3,
      // video_mme: undefined,
    },
  },
  {
    model: "GPT-4.1-nano-2025-04-14",
    provider: "OpenAI",
    inputPrice: 0.1,
    outputPrice: 0.4,
    source: "https://openai.com/index/gpt-4-1/",
    benchmark: {
      mmlu: 80.1,
      gpqa: 50.3,
      gpqa_diamond: 50.3,
      humaneval: 87.0,
      simpleqa: 7.6,
      swe_bench_verified: 9.8,
      aider_polyglot: 6.2,
      mmmlu: 80.1,
      aime_24: 29.4,
      mrcr_v2_avg_128k: 36.6,
      mrcr_v2_pointwise_1m: 12.0,
      // video_mme: undefined,
    },
  },
  {
    model: "GPT-4.5-preview-2025-02-27",
    provider: "OpenAI",
    inputPrice: 75.0,
    outputPrice: 150.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 90.8,
      gpqa: 69.5,
      simpleqa: 62.5,
      humaneval: 88.6,
      // mgsm: 86.9,
      // drop: 83.4,
      // math: 87.1,
    },
  },
  {
    model: "GPT-4-turbo-2024-04-09",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 86.7,
      gpqa: 49.3,
      humaneval: 88.2,
      simpleqa: 24.2,
      // math: 73.4,
      // mgsm: 89.6,
      // drop: 86.0,
    },
  },
  {
    model: "GPT-4-0125-preview",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 85.4,
      gpqa: 41.4,
      humaneval: 86.6,
      // math: 64.5,
      // mgsm: 85.1,
      // drop: 81.5,
    },
  },
  {
    model: "GPT-4-1106-preview",
    provider: "OpenAI",
    inputPrice: 10.0,
    outputPrice: 30.0,
    source: "https://github.com/openai/simple-evals",
    benchmark: {
      mmlu: 84.7,
      gpqa: 42.5,
      humaneval: 83.7,
      // math: 64.3,
      // mgsm: 87.1,
      // drop: 83.2,
    },
  },
  {
    model: "OpenAI o3",
    provider: "OpenAI",
    inputPrice: 2.0,
    outputPrice: 8.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 91.6, // "o3 (no tools)"
      aime_2025: 88.9, // "o3 (no tools)"
      //codeforces: 2706, // "o3 (with terminal)"
      gpqa_diamond: 83.3, // "o3 (no tools)"
      humanitys_last_exam: 20.32, // "o3 (no tools)"
      mmmu: 82.9,
      //mathvista: 86.8,
      //charxiv_reasoning: 78.6,
      //swe_lancer_ic_swe_diamond: 65250, // "o3-high"
      swe_bench_verified: 69.1,
      aider_polyglot: 81.3, // "(whole)"
      //scale_multichallenge: 56.51,
      //browsecomp: 8.35, // "o3 with python + browsing tools"
      tau_bench_airline: 52.0, // "(Airline)"
      tau_bench_retail: 73.9, // "(Retail)"
    },
  },
  {
    model: "OpenAI o3-pro",
    provider: "OpenAI",
    inputPrice: 20.0,
    outputPrice: 80.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      // The announcement does not list benchmarks for o3-pro; it is only described as
      // "designed to think longer and provide the most reliable responses."
      // The values below are placeholders that assume performance similar to, or slightly
      // better than, o3.
      gpqa_diamond: 83.3, // placeholder: copied from the "o3 (no tools)" figure
      humanitys_last_exam: 24.90, // "o3 (python + browsing tools)"; assumed to reflect the pro tier
    },
  },
  {
    model: "OpenAI o4-mini",
    provider: "OpenAI",
    inputPrice: 1.10,
    outputPrice: 4.40,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 93.4, // "o4-mini (no tools)"
      aime_2025: 92.7, // "o4-mini (no tools)"
      //codeforces: 2719, // "o4-mini (with terminal)"
      gpqa_diamond: 81.4, // "o4-mini (no tools)"
      humanitys_last_exam: 14.28, // "o4-mini (no tools)"
      mmmu: 81.6,
      //mathvista: 84.3,
      //charxiv_reasoning: 72.0,
      //swe_lancer_ic_swe_diamond: 56375, // "o4-mini-high"
      swe_bench_verified: 68.1,
      aider_polyglot: 68.9, // "(whole)"
      //scale_multichallenge: 42.99,
      //browsecomp: 1.5, // "o4-mini with python + browsing tools"
      tau_bench_airline: 49.2, // "(Airline)"
      tau_bench_retail: 71.8, // "(Retail)"
    },
  },
  {
    model: "OpenAI o1",
    provider: "OpenAI",
    inputPrice: 15.0,
    outputPrice: 60.0,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 74.3,
      aime_2025: 79.2,
      //codeforces: 189,
      gpqa_diamond: 78.0,
      humanitys_last_exam: 8.12, // "o1-pro"
      mmmu: 77.6,
      //mathvista: 71.8,
      //charxiv_reasoning: 55.1,
      //swe_lancer_ic_swe_diamond: 28500, // "o1-high"
      swe_bench_verified: 48.9,
      aider_polyglot: 64.4, // "(whole)"
      //scale_multichallenge: 44.93,
      //browsecomp: 1.94, // labeled "4o + browsing" in the source, which looks like a typo; it likely refers to o1 with browsing
      tau_bench_airline: 50.0, // "(Airline)"
      tau_bench_retail: 70.8, // "(Retail)"
    },
  },
  {
    model: "OpenAI o3-mini",
    provider: "OpenAI",
    inputPrice: 1.10,
    outputPrice: 4.40,
    source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
    benchmark: {
      aime_24: 87.3,
      aime_2025: 86.5,
      //codeforces: 1207,
      gpqa_diamond: 77.0,
      humanitys_last_exam: 13.40,
      // MMMU, MathVista, and CharXiv-Reasoning are not reported for o3-mini;
      // presumably lower than o4-mini, so they are omitted here.
      //swe_lancer_ic_swe_diamond: 17375, // "o3-mini-high"
      swe_bench_verified: 49.3,
      aider_polyglot: 61.7, // "(diff)"
      //scale_multichallenge: 39.89,
      // BrowseComp is not reported for o3-mini.
      tau_bench_airline: 32.4, // "(Airline)"
      tau_bench_retail: 57.6, // "(Retail)"
    },
  },
];
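
// Minimal usage sketch (illustrative, not part of the dataset): find the
// cheapest model whose reported score on a given metric clears a threshold.
// The 3:1 input/output token weighting in the cost blend is an assumption
// made for this example, not an official figure.
export function cheapestAbove(
  metric: string,
  minScore: number,
  entries: Benchmark[] = openaiBenchmarks,
): Benchmark | undefined {
  // Blended USD price per 1M tokens, assuming 3 input tokens per output token.
  const blendedPrice = (e: Benchmark) => 3 * e.inputPrice + e.outputPrice;
  return entries
    .filter((e) => {
      const score = (e.benchmark as Record<string, number | undefined>)[metric];
      return score !== undefined && score >= minScore;
    })
    .sort((a, b) => blendedPrice(a) - blendedPrice(b))[0];
}
// e.g. cheapestAbove("gpqa_diamond", 80) returns the o4-mini entry given the data above.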