File size: 3,940 Bytes
713e157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7502bf
 
713e157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7502bf
6f8c125
713e157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import { Benchmark } from "./types";

/**
 * Benchmark scores and pricing for DeepSeek models, one entry per model.
 *
 * Each entry records the model card it was transcribed from in `source`.
 * Metrics that are not part of the `Benchmark` metric set are kept as
 * commented-out lines so they can be enabled once the schema supports them.
 *
 * Conventions used in this list:
 * - `gpqa` and `gpqa_diamond` hold the same score when both are present
 *   (`gpqa` is kept only for compatibility with consumers that read it).
 * - `inputPrice` / `outputPrice` are marked as placeholders by the original
 *   author; presumably USD per 1M tokens — confirm against `Benchmark` in
 *   `./types` and current provider pricing.
 */
export const deepseekBenchmarks: Benchmark[] = [
    {
        model: "DeepSeek-R1-0528",
        provider: "DeepSeek",
        inputPrice: 0.55, // Placeholder, update if pricing becomes available
        outputPrice: 2.19,
        benchmark: {
            aime_24: 91.4,
            aime_2025: 87.5,
            gpqa_diamond: 81.0,
            gpqa: 81.0, // For compatibility; can remove if you want to only use gpqa_diamond
            mmlu_pro: 85.0,
            mmlu: 93.4, // MMLU-Redux assumed to be "mmlu"
            simpleqa: 27.8,
            lcb: 73.3, // LiveCodeBench
            aider_polyglot: 71.6,
            swe_bench_verified: 57.6,
            // Optional or less frequent benchmarks:
            humanitys_last_exam: 17.7,
            // Not in BenchmarkMetric, but useful (commented for type safety):
            // codeforces_div1: 1930,
            // frames: 83.0,
            tau_bench_airline: 53.5,
            tau_bench_retail: 63.9,
            // bfcl_v3_multiturn: 37.0,
            // cnmo_2024: 86.9,
            // hmmt_2025: 79.4,
        },
        source: "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
    },

    {
        model: "DeepSeek-V3-0324",
        provider: "DeepSeek",
        inputPrice: 0.27, // Placeholder — adjust if actual pricing becomes available
        outputPrice: 1.10,
        benchmark: {
            mmlu: 87.1,              // From original DeepSeek-V3
            mmlu_pro: 81.2,          // Updated in V3-0324
            gpqa: 68.4,              // Updated in V3-0324
            // Kept equal to `gpqa`: both keys describe the same benchmark in this
            // file, so the stale V3 score (59.1) must not be reported for 0324.
            gpqa_diamond: 68.4,      // Updated in V3-0324
            aime_24: 59.4,           // Updated in V3-0324
            lcb: 49.2,               // Updated LiveCodeBench
            simpleqa: 24.9,          // From V3
            aider_polyglot: 49.6,    // From V3
            swe_bench_verified: 42.0 // From V3
        },
        source: "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
    },
    {
        model: "DeepSeek-V3",
        provider: "DeepSeek",
        inputPrice: 0.27, // Placeholder — update if real pricing is known
        outputPrice: 1.10,
        benchmark: {
            mmlu: 87.1,
            mmlu_pro: 64.4,
            // mmlu_redux: 86.2, // Commented: not in BenchmarkMetric
            gpqa_diamond: 59.1,
            simpleqa: 24.9,
            aime_24: 39.2,
            lcb: 37.6, // LiveCodeBench (Pass@1)
            aider_polyglot: 49.6,
            swe_bench_verified: 42.0,

            // Optional or not yet in your schema:
            // humanitys_last_exam: undefined,
            // codeforces: 51.6,
            // drop: 89.0,
            // gsm8k: 89.3,
            // math_em: 61.6,
            // mgsm: 79.8,
            // cmath: 90.7,
            // cruxeval_i: 67.3,
            // cruxeval_o: 69.8,
            // triviaqa: 82.9,
            // naturalquestions: 40.0,
            // agieval: 79.6,
            // hellaSwag: 88.9,
            // piqa: 84.7,
            // winogrande: 84.9,
        },
        source: "https://huggingface.co/deepseek-ai/DeepSeek-V3",
    },
    {
        model: "DeepSeek-R1",
        provider: "DeepSeek",
        inputPrice: 0.55, // Placeholder, update if pricing becomes available
        outputPrice: 2.19,
        benchmark: {
            mmlu: 90.8,
            mmlu_pro: 84.0,
            gpqa_diamond: 71.5,
            simpleqa: 30.1,
            lcb: 65.9, // LiveCodeBench (Pass@1-CoT)
            swe_bench_verified: 49.2,
            aider_polyglot: 53.3,
            aime_24: 79.8,
            // aime_2025: undefined, // not provided
            // gpqa: undefined,      // use gpqa_diamond
            // egoschema: undefined,
            // mmmu: undefined,
            // loft: undefined,
            // humanitys_last_exam: undefined, // optional
        },
        source: "https://huggingface.co/deepseek-ai/DeepSeek-R1",
    },
];