# Spaces: HPAI-BSC / TuRTLe-Leaderboard (Running) - File size: 8,050 Bytes

import csv
import json
import locale
import os
import sys
from typing import Dict, Union

import pandas as pd

# Static metadata for the models that can appear in a results CSV.
#
# Keys are model display names; ``get_model_params_and_url`` looks a name up
# here by exact string match, so a key must be spelled exactly like the model
# name in the first CSV column. Names missing from this table fall back to
# ("-", 0.0, "Unknown", "Unknown").
#
# Each value is a 4-tuple, read positionally as:
#   [0] model URL
#   [1] parameter count (presumably in billions -- TODO confirm)
#   [2] model type: "General", "Coding" or "RTL-Specific"
#   [3] release tag: "V1" or "V2" (presumably the leaderboard release in which
#       the model was added -- TODO confirm)
model_details = {
    "DeepSeek R1-0528": (
        "https://huggingface.co/deepseek-ai/DeepSeek-R1-0528",
        685,
        "General",
        "V2",
    ),
    "DeepSeek R1": (
        "https://huggingface.co/deepseek-ai/DeepSeek-R1",
        685,
        "General",
        "V1",
    ),
    "Llama 3.1 405B": (
        "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8",
        406,
        "General",
        "V1",
    ),
    # NOTE(review): the key says "236B" while the URL and the parameter count
    # say 235B -- confirm the key matches the CSV model name before renaming.
    "Qwen3 236B A22B": (
        "https://huggingface.co/Qwen/Qwen3-235B-A22B",
        235,
        "General",
        "V2",
    ),
    "Llama 3.(1-3) 70B": (
        "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
        70.6,
        "General",
        "V1",
    ),
    "Qwen2.5 72B": (
        "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
        72.7,
        "General",
        "V1",
    ),
    "QwQ 32B": ("https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2"),
    "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General", "V1"),
    "StarChat2 15B v0.1": (
        "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
        16,
        "General",
        "V1",
    ),
    "DeepSeek R1 Distill Qwen 14B": (
        "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
        14.8,
        "General",
        "V1",
    ),
    "CodeLlama 70B": (
        "https://huggingface.co/codellama/CodeLlama-70b-hf",
        69,
        "Coding",
        "V1",
    ),
    "QwenCoder 2.5 32B": (
        "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
        32.5,
        "Coding",
        "V1",
    ),
    "DeepSeek Coder 33B": (
        "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
        33.3,
        "Coding",
        "V1",
    ),
    "QwenCoder 2.5 14B": (
        "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
        14.7,
        "Coding",
        "V1",
    ),
    "DeepCoder 14B": (
        "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
        14.8,
        "Coding",
        "V2",
    ),
    "OpenCoder 8B": (
        "https://huggingface.co/infly/OpenCoder-8B-Instruct",
        7.77,
        "Coding",
        "V1",
    ),
    "SeedCoder 8B": (
        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct",
        8.25,
        "Coding",
        "V2",
    ),
    "SeedCoder 8B Reasoning": (
        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
        8.25,
        "Coding",
        "V2",
    ),
    "QwenCoder 2.5 7B": (
        "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
        7.61,
        "Coding",
        "V1",
    ),
    "DeepSeek Coder 6.7B": (
        "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
        6.74,
        "Coding",
        "V1",
    ),
    "HaVen-CodeQwen": (
        "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
        7.25,
        "RTL-Specific",
        "V1",
    ),
    "CodeV R1 Distill Qwen 7B": (
        "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B",
        7.62,
        "RTL-Specific",
        "V2",
    ),
    "CodeV-CL-7B": (
        "https://huggingface.co/yang-z/CodeV-CL-7B",
        6.74,
        "RTL-Specific",
        "V1",
    ),
    "CodeV-QW-7B": (
        "https://huggingface.co/yang-z/CodeV-QW-7B",
        7.25,
        "RTL-Specific",
        "V1",
    ),
    "CodeV-DS-6.7B": (
        "https://huggingface.co/yang-z/CodeV-DS-6.7B",
        6.74,
        "RTL-Specific",
        "V1",
    ),
    "RTLCoder Mistral": (
        "https://huggingface.co/ishorn5/RTLCoder-v1.1",
        7.24,
        "RTL-Specific",
        "V1",
    ),
    "RTLCoder DeepSeek": (
        "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
        6.74,
        "RTL-Specific",
        "V1",
    ),
    "OriGen": ("https://huggingface.co/henryen/OriGen", 6.74, "RTL-Specific", "V1"),
    "Qwen3 Coder 480B A35B": (
        "https://huggingface.co/Qwen/Qwen3-Coder-480B-A35B-Instruct",
        480,
        "Coding",
        "V2",
    ),
    "Magistral Small 2506": (
        "https://huggingface.co/mistralai/Magistral-Small-2506",
        23.6,
        "General",
        "V2",
    ),
}


def get_headers(reader, agg=False) -> Union[list, tuple[list, list]]:
    """Consume the header row(s) of a results CSV and return their labels.

    The first cell of every header row is dropped (it sits above the model
    name column); the remaining cells are returned as a list.

    Args:
        reader: Iterable of CSV rows (for example a ``csv.reader``). Only the
            header rows are consumed, so the caller can keep iterating the
            same reader to get the data rows.
        agg: When false (default) two header rows are read: metric names,
            then benchmark names. When true only the metric row is read.

    Returns:
        ``(metrics, benchs)`` when ``agg`` is false, or just ``metrics`` when
        ``agg`` is true. A missing header row yields an empty list.
    """
    # iter() is a no-op on a csv.reader and lets plain lists of rows work too.
    rows = iter(reader)

    first = next(rows, None)
    metrics = first[1:] if first is not None else []
    if agg:
        # Return before touching the next row: in aggregated files it is
        # already the first data row and must stay available to the caller.
        return metrics

    second = next(rows, None)
    benchs = second[1:] if second is not None else []
    return metrics, benchs


def get_model_params_and_url(model) -> tuple[str, Union[int, float], str, str]:
    """Look up the static metadata of a model in ``model_details``.

    Args:
        model: Model display name, matched exactly against the table keys.

    Returns:
        ``(url, params, model_type, release)``. Names that are not in the
        table yield the placeholder ``("-", 0.0, "Unknown", "Unknown")``.
    """
    details = model_details.get(model)
    if details is None:
        return "-", 0.0, "Unknown", "Unknown"
    # Named ``model_type`` rather than ``type`` so the builtin is not shadowed.
    url, params, model_type, release = details
    return url, params, model_type, release


def parse_results(csv_path: str) -> list[dict]:
    """Flatten a wide results CSV into one record per model and result column.

    The file is expected to start with the two header rows read by
    ``get_headers`` (metric names, then benchmark names). Every following row
    holds a model name in its first cell and one result per header column
    after it. Results may use a decimal comma ("12,5").

    Blank rows and rows without a model name are skipped. A blank result cell,
    or a cell missing because the row is shorter than the header, produces no
    record instead of aborting the whole parse.

    Args:
        csv_path: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A list of dicts with the keys "Model", "Model Type", "Benchmark",
        "Task", "Result", "Model URL", "Params" and "Release".

    Raises:
        ValueError: If a non-blank result cell is not a number.
    """
    dataset = []
    models = []
    with open(csv_path, newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        metrics, benchs = get_headers(reader)
        # The header pairs are the same for every row, so expand the "EM"
        # abbreviation once instead of once per cell.
        columns = [
            ("Exact Matching (EM)" if metric == "EM" else metric, bench)
            for metric, bench in zip(metrics, benchs)
        ]
        for row in reader:
            if not row or all(not cell.strip() for cell in row):
                continue
            model = row[0]
            if not model:
                continue
            url, params, model_type, release = get_model_params_and_url(model)
            models.append(model)
            # zip() stops at the shorter sequence, so a short row simply
            # contributes fewer records.
            for (metric, bench), cell in zip(columns, row[1:]):
                value = cell.strip()
                if not value:
                    continue
                try:
                    result = float(value.replace(",", "."))
                except ValueError as err:
                    raise ValueError(
                        f"Invalid result {cell!r} for model {model!r} "
                        f"({bench} / {metric})"
                    ) from err
                dataset.append(
                    {
                        "Model": model,
                        "Model Type": model_type,
                        "Benchmark": bench,
                        "Task": metric,
                        "Result": result,
                        "Model URL": url,
                        "Params": params,
                        "Release": release,
                    }
                )
    print(models)
    return dataset


def parse_agg(csv_path: str = "results/aggregated_scores_icarus.csv") -> pd.DataFrame:
    """Load the aggregated-scores CSV at ``csv_path`` into a DataFrame.

    The file is handed to ``pandas.read_csv`` with default options; no
    columns are renamed or converted here.
    """
    aggregated = pd.read_csv(csv_path)
    return aggregated


def writeJson(data: list, path: str):
    """Write ``data`` to ``path`` as indented JSON and print "Done".

    Args:
        data: JSON-serialisable records to dump.
        path: Destination file; it is created or overwritten.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # is pinned to UTF-8 -- the encoding read_json() uses to load the file --
    # instead of depending on the platform's locale default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print("Done")


def read_json(json_path: str = "results/results_icarus.json"):
    """Return the parsed contents of the UTF-8 encoded JSON file ``json_path``."""
    with open(json_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)


def read_data(
    json_path: str = "results/results_icarus.json",
) -> tuple[pd.DataFrame, list, list, str]:
    """Load the parsed results JSON and derive the benchmark/metric lists.

    The records are loaded with ``read_json`` and turned into a DataFrame in
    which "Task" is renamed to "Metric" and "Result" to "Score", and "Params"
    is converted to numbers (values that cannot be converted become NaN).

    Returns:
        ``(df, benchmarks, metrics, default_metric)`` where ``benchmarks`` is
        the list of unique benchmark names in reverse sorted order,
        ``metrics`` the unique metric names in order of first appearance, and
        ``default_metric`` is "Functionality (FNC)" when that metric exists,
        otherwise the first metric.
    """
    column_names = {
        "Model": "Model",
        "Benchmark": "Benchmark",
        "Task": "Metric",
        "Result": "Score",
        "EM": "Exact Matching (EM)",
    }
    df = pd.DataFrame(read_json(json_path)).rename(columns=column_names)
    df["Params"] = pd.to_numeric(df["Params"], errors="coerce")

    benchmarks = df["Benchmark"].unique().tolist()
    benchmarks.sort(reverse=True)

    metrics = df["Metric"].unique().tolist()
    preferred = "Functionality (FNC)"
    if preferred in metrics:
        default_metric = preferred
    else:
        default_metric = metrics[0]
    return df, benchmarks, metrics, default_metric


if __name__ == "__main__":
    # Command-line entry point: parse the CSV given as the first argument and
    # write the records to a JSON file with the same base name next to it.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python results/parse.py <path_to_input_csv>")
        sys.exit(1)

    csv_path = cli_args[0]
    if not os.path.exists(csv_path):
        print(f"Error: File not found at {csv_path}")
        sys.exit(1)

    # Swap the input extension for ".json" to get the output path.
    base_path, _extension = os.path.splitext(csv_path)
    json_path = base_path + ".json"

    print(f"Parsing {csv_path}...")
    writeJson(parse_results(csv_path), json_path)