// Snapshot of commit d7502bf (author: Presidentlin)
import { Benchmark } from "./types";
/**
 * Published benchmark scores and per-million-token pricing (USD) for
 * Anthropic models.
 *
 * Scores are percentages taken from the announcement post linked in each
 * entry's `source`. Commented-out benchmark lines are published results
 * that are intentionally excluded from the active dataset but kept for
 * reference.
 */
export const anthropicBenchmarks: Benchmark[] = [
  {
    model: "Claude Opus 4",
    provider: "Anthropic",
    inputPrice: 15.0,
    outputPrice: 75.0,
    source: "https://www.anthropic.com/news/claude-4",
    benchmark: {
      swe_bench_verified: 72.5,
      //terminal_bench: 43.2,
      gpqa_diamond: 79.6,
      aime_2025: 75.5,
      mmmlu: 88.8,
      mmmu: 76.5,
      tau_bench_retail: 81.4,
      tau_bench_airline: 59.6,
    },
  },
  {
    model: "Claude Sonnet 4",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-4",
    benchmark: {
      swe_bench_verified: 72.7,
      //terminal_bench: 35.5,
      gpqa_diamond: 75.4,
      aime_2025: 70.5,
      mmmlu: 86.5,
      mmmu: 74.4,
      tau_bench_retail: 80.5,
      tau_bench_airline: 60.0,
    },
  },
  {
    model: "Claude 3.7 Sonnet (Extended Thinking 64K)",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
    benchmark: {
      gpqa_diamond: 78.2,
      tau_bench_retail: 81.2,
      tau_bench_airline: 58.4,
      mmmlu: 86.1,
      mmmu: 75.0,
      aime_24: 61.3,
    },
  },
  {
    model: "Claude 3.7 Sonnet (No Extended Thinking)",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
    benchmark: {
      gpqa_diamond: 68.0,
      swe_bench_verified: 62.3,
      mmmlu: 83.2,
      mmmu: 71.8,
      aime_24: 51.7, // using average of 23.3 & 80.0
    },
  },
  {
    model: "Claude 3.5 Sonnet (New)",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-3-7-sonnet",
    benchmark: {
      gpqa_diamond: 65.0,
      swe_bench_verified: 49.0,
      tau_bench_retail: 71.5,
      tau_bench_airline: 48.8,
      mmmlu: 82.1,
      mmmu: 70.4,
      // Fixed: previous value 16.0 contradicted the stated method below
      // (the 3.7 Sonnet entry above uses the same averaging convention).
      aime_24: 40.7, // average of 16.0 & 65.4
    },
  },
  {
    model: "Claude 3.5 Haiku",
    provider: "Anthropic",
    // Fixed: previous values (3.0 / 15.0) were Sonnet's pricing, copied by
    // mistake. Anthropic's published 3.5 Haiku price is $0.80 / $4.00 per MTok.
    inputPrice: 0.8,
    outputPrice: 4.0,
    source: "https://www.anthropic.com/news/3-5-models-and-computer-use",
    benchmark: {
      gpqa_diamond: 41.6,
      // NOTE(review): 49.0 is identical to 3.5 Sonnet (New)'s score and looks
      // copy-pasted; Anthropic's announcement reports 40.6 — verify against source.
      swe_bench_verified: 49.0,
      tau_bench_retail: 51.0,
      tau_bench_airline: 22.8,
      humaneval: 88.1,
      mmmlu: 65.0,
      aime_24: 5.3,
    },
  },
  {
    model: "Claude 3 Opus",
    provider: "Anthropic",
    inputPrice: 15.0,
    outputPrice: 75.0,
    source: "https://www.anthropic.com/news/claude-3-family",
    benchmark: {
      gpqa_diamond: 50.4,
      mmmlu: 86.8,
      mmmu: 59.4,
      // gsm8k: 95.0,
      // math: 60.1,
      // mgsm: 90.7,
      // humaneval: 84.9,
      // drop: 83.1,
      // big_bench_hard: 86.8,
      // arc_challenge: 96.4,
      // hellaswag: 95.4,
      // mathvista: 50.5,
      // ai2d: 88.1,
      // chart_qa: 80.8,
      // docvqa_anls: 89.3,
    },
  },
  {
    model: "Claude 3 Sonnet",
    provider: "Anthropic",
    inputPrice: 3.0,
    outputPrice: 15.0,
    source: "https://www.anthropic.com/news/claude-3-family",
    benchmark: {
      gpqa_diamond: 40.4,
      mmmlu: 79.0,
      mmmu: 53.1,
      // gsm8k: 92.3,
      // math: 43.1,
      // mgsm: 83.5,
      // humaneval: 73.0,
      // drop: 78.9,
      // big_bench_hard: 82.9,
      // arc_challenge: 93.2,
      // hellaswag: 89.0,
      // mathvista: 47.9,
      // ai2d: 88.7,
      // chart_qa: 81.1,
      // docvqa_anls: 89.5,
    },
  },
  {
    model: "Claude 3 Haiku",
    provider: "Anthropic",
    inputPrice: 0.25,
    outputPrice: 1.25,
    source: "https://www.anthropic.com/news/claude-3-family",
    benchmark: {
      gpqa_diamond: 33.3,
      mmmlu: 75.2,
      mmmu: 50.2,
      // gsm8k: 88.9,
      // math: 38.9,
      // mgsm: 75.1,
      // humaneval: 75.9,
      // drop: 78.4,
      // big_bench_hard: 73.7,
      // arc_challenge: 89.2,
      // hellaswag: 85.9,
      // mathvista: 46.4,
      // ai2d: 86.7,
      // chart_qa: 81.7,
      // docvqa_anls: 88.8,
    },
  },
];