import { Benchmark } from "./types";

/**
 * Benchmark entries for OpenAI models.
 *
 * Each entry records the model name, the provider, an input and an output
 * price, the URL the figures were taken from (`source`), and a sparse map of
 * benchmark scores keyed by metric name. A metric that is commented out inside
 * an entry is kept only as a note of a value seen in the source; it is not
 * part of the exported data.
 *
 * NOTE(review): prices look like USD per 1M tokens and scores like
 * percentages — confirm against the `Benchmark` type in ./types.
 */
export const openaiBenchmarks: Benchmark[] = [
    // ---- GPT-4o family (figures from the simple-evals repository) ----
    {
        model: "GPT-4o-2024-11-20",
        provider: "OpenAI",
        inputPrice: 2.5,
        outputPrice: 10.0,
        source: "https://github.com/openai/simple-evals",
        benchmark: {
            mmlu: 85.7,
            gpqa: 46.0,
            humaneval: 90.2,
            simpleqa: 38.8,
            // math: 68.5,
            // mgsm: 90.3,
            // drop: 81.5,
        },
    },
    {
        model: "GPT-4o-2024-08-06",
        provider: "OpenAI",
        inputPrice: 2.5,
        outputPrice: 10.0,
        source: "https://github.com/openai/simple-evals",
        benchmark: {
            mmlu: 88.7,
            gpqa: 53.1,
            humaneval: 90.2,
            simpleqa: 40.1,
            // math: 75.9,
            // mgsm: 90.0,
            // drop: 79.8,
        },
    },
    {
        model: "GPT-4o-2024-05-13",
        provider: "OpenAI",
        inputPrice: 5.0,
        outputPrice: 15.0,
        source: "https://github.com/openai/simple-evals",
        benchmark: {
            mmlu: 87.2,
            gpqa: 49.9,
            humaneval: 91.0,
            simpleqa: 39.0,
            // math: 76.6,
            // mgsm: 89.9,
            // drop: 83.7,
        },
    },
    {
        model: "GPT-4o-mini-2024-07-18",
        provider: "OpenAI",
        inputPrice: 0.15,
        outputPrice: 0.60,
        source: "https://github.com/openai/simple-evals",
        benchmark: {
            mmlu: 82.0,
            gpqa: 40.2,
            humaneval: 87.2,
            mmmu: 59.4,
            simpleqa: 9.5,
            // mgsm: 87.0,
            // drop: 79.7,
            // math: 70.2,
        },
    },

    // ---- GPT-4.1 family (figures from the GPT-4.1 announcement) ----
    {
        model: "GPT-4.1-2025-04-14",
        provider: "OpenAI",
        inputPrice: 2.0,
        outputPrice: 8.0,
        source: "https://openai.com/index/gpt-4-1/",
        benchmark: {
            mmlu: 90.2,
            gpqa: 66.3,
            gpqa_diamond: 66.3,
            humaneval: 94.5,
            simpleqa: 41.6,
            swe_bench_verified: 54.6,
            aider_polyglot: 52.9,
            // Multilingual MMLU is a separate benchmark from MMLU and has its own score.
            mmmlu: 87.3,
            video_mme: 72.0,
            aime_24: 48.1,
            // No score recorded for the metrics below
            // (some of them may not exist in BenchmarkMetric yet).
            // aime_2025: undefined,
            // mmlu_pro: undefined,
            // egoschema: undefined,
            // loft: undefined,
            // lcb: undefined,
            // bigcodebench: undefined,
            // mbpp: undefined,
            // livecodebench_v6: undefined,
            // lbpp_v2: undefined,
            // bigbench_extra_hard: undefined,
            // global_mmlu_lite: undefined,
            // facts_grounding: undefined,
            // humanitys_last_exam: undefined,
            mrcr_v2_avg_128k: 57.2,
            mrcr_v2_pointwise_1m: 46.3,
        },
    },
    {
        model: "GPT-4.1-mini-2025-04-14",
        provider: "OpenAI",
        inputPrice: 0.4,
        outputPrice: 1.6,
        source: "https://openai.com/index/gpt-4-1/",
        benchmark: {
            mmlu: 87.5,
            gpqa: 65.0,
            gpqa_diamond: 65.0,
            humaneval: 93.8,
            simpleqa: 16.8,
            swe_bench_verified: 23.6,
            aider_polyglot: 31.6,
            // Multilingual MMLU is a separate benchmark from MMLU and has its own score.
            mmmlu: 78.5,
            aime_24: 49.6,
            mrcr_v2_avg_128k: 47.2,
            mrcr_v2_pointwise_1m: 33.3,
            // video_mme: undefined,
        },
    },
    {
        model: "GPT-4.1-nano-2025-04-14",
        provider: "OpenAI",
        inputPrice: 0.1,
        outputPrice: 0.4,
        source: "https://openai.com/index/gpt-4-1/",
        benchmark: {
            mmlu: 80.1,
            gpqa: 50.3,
            gpqa_diamond: 50.3,
            humaneval: 87.0,
            simpleqa: 7.6,
            // NOTE(review): 9.8 looks like the Aider polyglot (whole) figure rather
            // than a SWE-bench Verified score — confirm against the announcement.
            swe_bench_verified: 9.8,
            aider_polyglot: 6.2,
            // Multilingual MMLU is a separate benchmark from MMLU and has its own score.
            mmmlu: 66.9,
            aime_24: 29.4,
            mrcr_v2_avg_128k: 36.6,
            mrcr_v2_pointwise_1m: 12.0,
            // video_mme: undefined,
        },
    },

    // ---- GPT-4.5 and GPT-4 family (figures from the simple-evals repository) ----
    {
        model: "GPT-4.5-preview-2025-02-27",
        provider: "OpenAI",
        inputPrice: 75.0,
        outputPrice: 150.0,
        source: "https://github.com/openai/simple-evals",
        benchmark: {
            mmlu: 90.8,
            gpqa: 69.5,
            simpleqa: 62.5,
            humaneval: 88.6,
            // mgsm: 86.9,
            // drop: 83.4,
            // math: 87.1,
        },
    },
    {
        model: "GPT-4-turbo-2024-04-09",
        provider: "OpenAI",
        inputPrice: 10.0,
        outputPrice: 30.0,
        source: "https://github.com/openai/simple-evals",
        benchmark: {
            mmlu: 86.7,
            gpqa: 49.3,
            humaneval: 88.2,
            simpleqa: 24.2,
            // math: 73.4,
            // mgsm: 89.6,
            // drop: 86.0,
        },
    },
    {
        model: "GPT-4-0125-preview",
        provider: "OpenAI",
        inputPrice: 10.0,
        outputPrice: 30.0,
        source: "https://github.com/openai/simple-evals",
        benchmark: {
            mmlu: 85.4,
            gpqa: 41.4,
            humaneval: 86.6,
            // math: 64.5,
            // mgsm: 85.1,
            // drop: 81.5,
        },
    },
    {
        model: "GPT-4-1106-preview",
        provider: "OpenAI",
        inputPrice: 10.0,
        outputPrice: 30.0,
        source: "https://github.com/openai/simple-evals",
        benchmark: {
            mmlu: 84.7,
            gpqa: 42.5,
            humaneval: 83.7,
            // math: 64.3,
            // mgsm: 87.1,
            // drop: 83.2,
        },
    },

    // ---- o-series reasoning models (figures from the o3 / o4-mini announcement) ----
    // Quoted strings in trailing comments are the configuration labels used by the source.
    {
        model: "OpenAI o3",
        provider: "OpenAI",
        inputPrice: 2.0,
        outputPrice: 8.0,
        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
        benchmark: {
            aime_24: 91.6, // "o3 (no tools)"
            aime_2025: 88.9, // "o3 (no tools)"
            // codeforces: 2706, // "o3 (with terminal)"
            gpqa_diamond: 83.3, // "o3 (no tools)"
            humanitys_last_exam: 20.32, // "o3 (no tools)"
            mmmu: 82.9,
            // mathvista: 86.8,
            // charxiv_reasoning: 78.6,
            // swe_lancer_ic_swe_diamond: 65250, // "o3-high"
            swe_bench_verified: 69.1,
            aider_polyglot: 81.3, // "(whole)"
            // scale_multichallenge: 56.51,
            // browsecomp: 8.35, // "o3 with python +browsing*"
            tau_bench_airline: 52.0, // "(Airline)"
            tau_bench_retail: 73.9, // "(Retail)"
        },
    },
    {
        model: "OpenAI o3-pro",
        provider: "OpenAI",
        inputPrice: 20.0,
        outputPrice: 80.0,
        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
        benchmark: {
            // NOTE(review): the source does not list o3-pro benchmarks; it only
            // describes the model as "designed to think longer and provide the most
            // reliable responses." Both values below are stand-ins, not published
            // o3-pro results — replace them once official figures are available.
            gpqa_diamond: 83.3, // Placeholder: copied from the o3 entry.
            humanitys_last_exam: 24.90, // Placeholder: the "o3 (python + browsing tools)" figure.
        },
    },
    {
        model: "OpenAI o4-mini",
        provider: "OpenAI",
        inputPrice: 1.10,
        outputPrice: 4.40,
        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
        benchmark: {
            aime_24: 93.4, // "o4-mini (no tools)"
            aime_2025: 92.7, // "o4-mini (no tools)"
            // codeforces: 2719, // "o4-mini (with terminal)"
            gpqa_diamond: 81.4, // "o4-mini (no tools)"
            humanitys_last_exam: 14.28, // "o4-mini (no tools)"
            mmmu: 81.6,
            // mathvista: 84.3,
            // charxiv_reasoning: 72.0,
            // swe_lancer_ic_swe_diamond: 56375, // "o4-mini-high"
            swe_bench_verified: 68.1,
            aider_polyglot: 68.9, // "(whole)"
            // scale_multichallenge: 42.99,
            // browsecomp: 1.5, // "o4-mini with python +browsing** tools"
            tau_bench_airline: 49.2, // "(Airline)"
            tau_bench_retail: 71.8, // "(Retail)"
        },
    },
    {
        model: "OpenAI o1",
        provider: "OpenAI",
        inputPrice: 15.0,
        outputPrice: 60.0,
        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
        benchmark: {
            aime_24: 74.3,
            aime_2025: 79.2,
            // codeforces: 189,
            gpqa_diamond: 78.0,
            // NOTE(review): the source labels this figure "o1-pro", not plain o1.
            humanitys_last_exam: 8.12, // "o1-pro"
            mmmu: 77.6,
            // mathvista: 71.8,
            // charxiv_reasoning: 55.1,
            // swe_lancer_ic_swe_diamond: 28500, // "o1-high"
            swe_bench_verified: 48.9,
            aider_polyglot: 64.4, // "(whole)"
            // scale_multichallenge: 44.93,
            // browsecomp: 1.94, // "4o + browsing" - this seems to be a typo in the source, likely refers to o1's browsing capability
            tau_bench_airline: 50.0, // "(Airline)"
            tau_bench_retail: 70.8, // "(Retail)"
        },
    },
    {
        model: "OpenAI o3-mini",
        provider: "OpenAI",
        inputPrice: 1.10,
        outputPrice: 4.40,
        source: "https://openai.com/blog/introducing-openai-o3-and-o4-mini",
        benchmark: {
            aime_24: 87.3,
            aime_2025: 86.5,
            // codeforces: 1207,
            gpqa_diamond: 77.0,
            humanitys_last_exam: 13.40,
            // MMMU, MathVista and CharXiv-Reasoning are not listed for o3-mini in the source.
            // swe_lancer_ic_swe_diamond: 17375, // "o3-mini-high"
            swe_bench_verified: 49.3,
            // NOTE(review): this is the diff-format score, whereas the other
            // o-series entries quote the whole-format score.
            aider_polyglot: 61.7, // "(diff)"
            // scale_multichallenge: 39.89,
            // BrowseComp is not listed for o3-mini in the source.
            tau_bench_airline: 32.4, // "(Airline)"
            tau_bench_retail: 57.6, // "(Retail)"
        },
    },
];