// NOTE(review): lines above this file's import were web-scrape residue from a
// GitHub commit page (author avatar text / commit message / hash d7502bf).
// Preserved here as a comment so the file compiles.
import { Benchmark } from "./types";
/**
 * Benchmark scores and API pricing for DeepSeek models, transcribed from the
 * models' Hugging Face cards (see `source` on each entry).
 *
 * Conventions:
 * - `inputPrice` / `outputPrice`: USD per 1M tokens. Values match DeepSeek's
 *   published API rates at time of writing — TODO(review): confirm against the
 *   current DeepSeek pricing page; the original author marked them tentative.
 * - Scores are percentages (pass@1 / accuracy) unless noted otherwise.
 * - `lcb` = LiveCodeBench; `aime_24` = AIME 2024; `mmlu` holds MMLU-Redux
 *   where the model card only reports that variant.
 * - Metrics not present in the `Benchmark` type are kept as commented-out
 *   lines so the data isn't lost if the schema grows.
 */
export const deepseekBenchmarks: Benchmark[] = [
{
model: "DeepSeek-R1-0528",
provider: "DeepSeek",
inputPrice: 0.55, // USD / 1M input tokens — verify against current DeepSeek pricing
outputPrice: 2.19, // USD / 1M output tokens
benchmark: {
aime_24: 91.4,
aime_2025: 87.5,
gpqa_diamond: 81.0,
gpqa: 81.0, // duplicated from gpqa_diamond for schema compatibility; drop once callers use gpqa_diamond only
mmlu_pro: 85.0,
mmlu: 93.4, // model card reports MMLU-Redux; stored under `mmlu`
simpleqa: 27.8,
lcb: 73.3, // LiveCodeBench
aider_polyglot: 71.6,
swe_bench_verified: 57.6,
// Optional or less frequent benchmarks:
humanitys_last_exam: 17.7,
// Reported on the model card but not in BenchmarkMetric (kept for reference):
// codeforces_div1: 1930,
// frames: 83.0,
tau_bench_airline: 53.5,
tau_bench_retail: 63.9,
// bfcl_v3_multiturn: 37.0,
// cnmo_2024: 86.9,
// hmmt_2025: 79.4,
},
source: "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
},
{
model: "DeepSeek-V3-0324",
provider: "DeepSeek",
inputPrice: 0.27, // USD / 1M input tokens — verify against current DeepSeek pricing
outputPrice: 1.10, // USD / 1M output tokens
benchmark: {
// "From V3" = score inherited from the original DeepSeek-V3 card;
// "Updated in V3-0324" = re-reported on the 0324 card.
mmlu: 87.1, // From original DeepSeek-V3
mmlu_pro: 81.2, // Updated in V3-0324
gpqa: 68.4, // Updated in V3-0324
gpqa_diamond: 59.1, // From V3
aime_24: 59.4, // Updated in V3-0324
lcb: 49.2, // Updated LiveCodeBench
simpleqa: 24.9, // From V3
aider_polyglot: 49.6, // From V3
swe_bench_verified: 42.0 // From V3
},
source: "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
},
{
model: "DeepSeek-V3",
provider: "DeepSeek",
inputPrice: 0.27, // USD / 1M input tokens — verify against current DeepSeek pricing
outputPrice: 1.10, // USD / 1M output tokens
benchmark: {
mmlu: 87.1,
mmlu_pro: 64.4,
// mmlu_redux: 86.2, // Commented: not in BenchmarkMetric
gpqa_diamond: 59.1,
simpleqa: 24.9,
aime_24: 39.2,
lcb: 37.6, // LiveCodeBench (Pass@1)
aider_polyglot: 49.6,
swe_bench_verified: 42.0,
// Reported on the model card but not in the current schema (kept for reference):
// humanitys_last_exam: undefined,
// codeforces: 51.6,
// drop: 89.0,
// gsm8k: 89.3,
// math_em: 61.6,
// mgsm: 79.8,
// cmath: 90.7,
// cruxeval_i: 67.3,
// cruxeval_o: 69.8,
// triviaqa: 82.9,
// naturalquestions: 40.0,
// agieval: 79.6,
// hellaSwag: 88.9,
// piqa: 84.7,
// winogrande: 84.9,
},
source: "https://huggingface.co/deepseek-ai/DeepSeek-V3",
},
{
model: "DeepSeek-R1",
provider: "DeepSeek",
inputPrice: 0.55, // USD / 1M input tokens — verify against current DeepSeek pricing
outputPrice: 2.19, // USD / 1M output tokens
benchmark: {
mmlu: 90.8,
mmlu_pro: 84.0,
gpqa_diamond: 71.5,
simpleqa: 30.1,
lcb: 65.9, // LiveCodeBench (Pass@1-CoT)
swe_bench_verified: 49.2,
aider_polyglot: 53.3,
aime_24: 79.8,
// Not reported on the R1 card / not in the current schema:
// aime_2025: undefined, // not provided
// gpqa: undefined, // use gpqa_diamond
// egoschema: undefined,
// mmmu: undefined,
// loft: undefined,
// humanitys_last_exam: undefined, // optional
},
source: "https://huggingface.co/deepseek-ai/DeepSeek-R1",
},
];