import { Benchmark } from "./types";
/**
 * Published benchmark results for DeepSeek models, transcribed from each
 * model's Hugging Face model card (see the per-entry `source` URL).
 *
 * Conventions:
 * - `inputPrice`/`outputPrice` are placeholders pending confirmed pricing.
 * - For these models the published GPQA number is the Diamond split, so
 *   `gpqa` mirrors `gpqa_diamond` for consumers that read either key.
 * - Scores that have no corresponding key in `BenchmarkMetric` are kept as
 *   comments (for type safety) rather than dropped, so the data isn't lost.
 */
export const deepseekBenchmarks: Benchmark[] = [
  {
    model: "DeepSeek-R1-0528",
    provider: "DeepSeek",
    inputPrice: 0.55, // Placeholder, update if pricing becomes available
    outputPrice: 2.19,
    benchmark: {
      aime_24: 91.4,
      aime_2025: 87.5,
      gpqa_diamond: 81.0,
      gpqa: 81.0, // Mirrors gpqa_diamond for compatibility
      mmlu_pro: 85.0,
      mmlu: 93.4, // MMLU-Redux assumed to be "mmlu"
      simpleqa: 27.8,
      lcb: 73.3, // LiveCodeBench
      aider_polyglot: 71.6,
      swe_bench_verified: 57.6,
      // Optional or less frequent benchmarks:
      humanitys_last_exam: 17.7,
      // Not in BenchmarkMetric, but useful (commented for type safety):
      // codeforces_div1: 1930,
      // frames: 83.0,
      tau_bench_airline: 53.5,
      tau_bench_retail: 63.9,
      // bfcl_v3_multiturn: 37.0,
      // cnmo_2024: 86.9,
      // hmmt_2025: 79.4,
    },
    source: "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
  },
  {
    model: "DeepSeek-V3-0324",
    provider: "DeepSeek",
    inputPrice: 0.27, // Placeholder — adjust if actual pricing becomes available
    outputPrice: 1.10,
    benchmark: {
      mmlu: 87.1, // Carried over from original DeepSeek-V3 (not re-reported for 0324)
      mmlu_pro: 81.2, // Updated in V3-0324
      gpqa: 68.4, // Updated in V3-0324 (published score is GPQA-Diamond)
      // FIX: was 59.1 — that is the stale original-V3 score; the V3-0324
      // model card reports GPQA-Diamond 68.4, so both keys must agree.
      gpqa_diamond: 68.4,
      aime_24: 59.4, // Updated in V3-0324
      lcb: 49.2, // Updated LiveCodeBench
      simpleqa: 24.9, // From V3
      aider_polyglot: 49.6, // From V3
      swe_bench_verified: 42.0, // From V3
    },
    source: "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
  },
  {
    model: "DeepSeek-V3",
    provider: "DeepSeek",
    inputPrice: 0.27, // Placeholder — update if real pricing is known
    outputPrice: 1.10,
    benchmark: {
      mmlu: 87.1,
      // NOTE(review): 64.4 looks low for V3's MMLU-Pro — verify against the
      // model card before relying on it.
      mmlu_pro: 64.4,
      // mmlu_redux: 86.2, // Commented: not in BenchmarkMetric
      gpqa_diamond: 59.1,
      simpleqa: 24.9,
      aime_24: 39.2,
      lcb: 37.6, // LiveCodeBench (Pass@1)
      aider_polyglot: 49.6,
      swe_bench_verified: 42.0,
      // Optional or not yet in your schema:
      // humanitys_last_exam: undefined,
      // codeforces: 51.6,
      // drop: 89.0,
      // gsm8k: 89.3,
      // math_em: 61.6,
      // mgsm: 79.8,
      // cmath: 90.7,
      // cruxeval_i: 67.3,
      // cruxeval_o: 69.8,
      // triviaqa: 82.9,
      // naturalquestions: 40.0,
      // agieval: 79.6,
      // hellaSwag: 88.9,
      // piqa: 84.7,
      // winogrande: 84.9,
    },
    source: "https://huggingface.co/deepseek-ai/DeepSeek-V3",
  },
  {
    model: "DeepSeek-R1",
    provider: "DeepSeek",
    inputPrice: 0.55, // Placeholder, update if pricing becomes available
    outputPrice: 2.19,
    benchmark: {
      mmlu: 90.8,
      mmlu_pro: 84.0,
      gpqa_diamond: 71.5,
      simpleqa: 30.1,
      lcb: 65.9, // LiveCodeBench (Pass@1-CoT)
      swe_bench_verified: 49.2,
      aider_polyglot: 53.3,
      aime_24: 79.8,
      // aime_2025: undefined, // not provided
      // gpqa: undefined, // use gpqa_diamond
      // egoschema: undefined,
      // mmmu: undefined,
      // loft: undefined,
      // humanitys_last_exam: undefined, // optional
    },
    source: "https://huggingface.co/deepseek-ai/DeepSeek-R1",
  },
];