import { Benchmark } from "./types";
export const deepseekBenchmarks: Benchmark[] = [
  {
    model: "DeepSeek-R1-0528",
    provider: "DeepSeek",
    inputPrice: 0.55, // Placeholder, update if pricing becomes available
    outputPrice: 2.19,
    benchmark: {
      aime_24: 91.4,
      aime_2025: 87.5,
      gpqa_diamond: 81.0,
      gpqa: 81.0, // For compatibility; can remove if you want to only use gpqa_diamond
      mmlu_pro: 85.0,
      mmlu: 93.4, // MMLU-Redux score, stored under the mmlu key
      simpleqa: 27.8,
      lcb: 73.3, // LiveCodeBench
      aider_polyglot: 71.6,
      swe_bench_verified: 57.6,
      // Optional or less frequent benchmarks:
      humanitys_last_exam: 17.7,
      tau_bench_airline: 53.5,
      tau_bench_retail: 63.9,
      // Not in BenchmarkMetric, but useful (commented for type safety):
      // codeforces_div1: 1930,
      // frames: 83.0,
      // bfcl_v3_multiturn: 37.0,
      // cnmo_2024: 86.9,
      // hmmt_2025: 79.4,
    },
    source: "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
  },
  {
    model: "DeepSeek-V3-0324",
    provider: "DeepSeek",
    inputPrice: 0.27, // Placeholder, adjust if actual pricing becomes available
    outputPrice: 1.10,
    benchmark: {
      mmlu: 87.1, // From original DeepSeek-V3
      mmlu_pro: 81.2, // Updated in V3-0324
      gpqa: 68.4, // Updated in V3-0324
      gpqa_diamond: 59.1, // From V3
      aime_24: 59.4, // Updated in V3-0324
      lcb: 49.2, // Updated LiveCodeBench
      simpleqa: 24.9, // From V3
      aider_polyglot: 49.6, // From V3
      swe_bench_verified: 42.0, // From V3
    },
    source: "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
  },
  {
    model: "DeepSeek-V3",
    provider: "DeepSeek",
    inputPrice: 0.27, // Placeholder, update if real pricing is known
    outputPrice: 1.10,
    benchmark: {
      mmlu: 87.1,
      mmlu_pro: 64.4,
      // mmlu_redux: 86.2, // Commented: not in BenchmarkMetric
      gpqa_diamond: 59.1,
      simpleqa: 24.9,
      aime_24: 39.2,
      lcb: 37.6, // LiveCodeBench (Pass@1)
      aider_polyglot: 49.6,
      swe_bench_verified: 42.0,
      // Optional or not yet in your schema:
      // humanitys_last_exam: undefined,
      // codeforces: 51.6,
      // drop: 89.0,
      // gsm8k: 89.3,
      // math_em: 61.6,
      // mgsm: 79.8,
      // cmath: 90.7,
      // cruxeval_i: 67.3,
      // cruxeval_o: 69.8,
      // triviaqa: 82.9,
      // naturalquestions: 40.0,
      // agieval: 79.6,
      // hellaSwag: 88.9,
      // piqa: 84.7,
      // winogrande: 84.9,
    },
    source: "https://huggingface.co/deepseek-ai/DeepSeek-V3",
  },
  {
    model: "DeepSeek-R1",
    provider: "DeepSeek",
    inputPrice: 0.55, // Placeholder, update if pricing becomes available
    outputPrice: 2.19,
    benchmark: {
      mmlu: 90.8,
      mmlu_pro: 84.0,
      gpqa_diamond: 71.5,
      simpleqa: 30.1,
      lcb: 65.9, // LiveCodeBench (Pass@1-CoT)
      swe_bench_verified: 49.2,
      aider_polyglot: 53.3,
      aime_24: 79.8,
      // aime_2025: undefined, // not provided
      // gpqa: undefined, // use gpqa_diamond
      // egoschema: undefined,
      // mmmu: undefined,
      // loft: undefined,
      // humanitys_last_exam: undefined, // optional
    },
    source: "https://huggingface.co/deepseek-ai/DeepSeek-R1",
  },
];
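
// Illustrative usage sketch (an assumption, not part of the original module): a small
// lookup helper so callers can pull one model's entry by name, e.g. to read aime_24 or
// swe_bench_verified for "DeepSeek-R1-0528". Field names follow the data above; the
// Benchmark type itself is defined in "./types".
export function findDeepseekBenchmark(model: string): Benchmark | undefined {
  return deepseekBenchmarks.find((entry) => entry.model === model);
}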