Commit 1df4c13 · Parent: b410573

Release TuRTLe V2

Changed files:
- app.py +18 -11
- results/aggregated_scores.csv +28 -22
- results/parse.py +75 -10
- results/results.csv +27 -21
- results/results.json +0 -0
- results/v1/aggregated_scores.csv +22 -0
- results/v1/results.csv +23 -0
- results/v1/results.json +0 -0
- utils.py +194 -67
app.py CHANGED
@@ -191,19 +191,16 @@ with gr.Blocks(
 
     gr.HTML(
         """
-        <
-        <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/>
-        </
+        <div align="center">
+        <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/>
+        </div>
         """
     )
     gr.HTML(
         """
         <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
         <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
-        <div style="text-align: center; margin-bottom:
-        <p style="margin-bottom: 15px;">Welcome to the TuRTLe Model Leaderboard! TuRTLe is a <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b> for hardware design.
-        Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b> (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs.
-        Use the filters below to explore different RTL benchmarks and models.</p>
+        <div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
         <a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
             <button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                 GitHub Repo
@@ -221,9 +218,19 @@ with gr.Blocks(
                 How to submit
             </button>
         </a>
-
-
-
+        <p style="margin-top: 15px;">If you have any inquiries or wish to collaborate:
+        <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
+        </p>
+        </div>
+        """
+    )
+    gr.HTML(
+        """
+        <div style=" margin-top:-10px !important;">
+        <p style="margin-bottom: 15px; text-align: start !important;">Welcome to the TuRTLe Model Leaderboard! TuRTLe is a <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b> for hardware design.
+        Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b> (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs.
+        Use the filters below to explore different RTL benchmarks and models.</p>
+        <p style="margin-top: 15px; text-align: start !important;"><span style="font-variant: small-caps; font-weight: bold;">NEW UPDATE (JUNE 2025)</span>: We have made our framework open-source on GitHub and added 6 new recent models, for a total of 27 models and 5 RTL benchmarks!</p>
         </div>
         """
     )
@@ -353,7 +360,7 @@ with gr.Blocks(
     citation_button = gr.Textbox(
         value=CITATION_BUTTON_TEXT,
         label=CITATION_BUTTON_LABEL,
-        lines=
+        lines=10,
         elem_id="citation-button",
         show_copy_button=True,
     )
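
For readers unfamiliar with the pattern above: static HTML is injected through gr.HTML blocks, and the citation box is a plain gr.Textbox. A minimal self-contained sketch of the same pattern (the CITATION_* values and logo path here are placeholders, not the Space's real assets):

import gradio as gr

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = "@misc{turtle2025, title={TuRTLe}, ...}"  # placeholder

with gr.Blocks() as demo:
    # Static HTML blocks render the logo and the intro text.
    gr.HTML('<div align="center"><img src="logo.png" width="220"/></div>')
    gr.HTML("<p>Welcome to the TuRTLe Model Leaderboard!</p>")
    # lines=10 sizes the citation box; show_copy_button adds a copy control.
    citation_button = gr.Textbox(
        value=CITATION_BUTTON_TEXT,
        label=CITATION_BUTTON_LABEL,
        lines=10,
        elem_id="citation-button",
        show_copy_button=True,
    )

demo.launch()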
results/aggregated_scores.csv CHANGED
@@ -1,22 +1,28 @@
-Model,Agg S2R,Agg MC,Agg VerilogEval S2R,Agg VerilogEval MC,Agg RTLLM,Agg VeriGen
-… (21 previous V1 score rows; see results/v1/aggregated_scores.csv added below)
+Model,Agg S2R,Agg MC,Agg VerilogEval S2R,Agg VerilogEval MC,Agg RTLLM,Agg VeriGen
+DeepSeek R1,75.53,72.96,77.67,77.55,68.49,57.82
+Llama 3.1 405B,53.23,53.88,56.55,54.35,42.26,52.35
+Qwen3 236B A22B,69.82,61.71,74.83,68.36,53.31,39.8
+Llama 3.(1-3) 70B,39.48,43.29,39.47,40.83,39.53,51.42
+Qwen2.5 72B,49.36,47.23,50.22,50.74,46.51,35.65
+QwQ 32B,62.6,39.46,65.02,38.68,54.6,42.03
+Qwen2.5 32B,50.39,38.93,50.86,41.01,48.86,32.09
+StarChat2 15B v0.1,38.76,38.98,36.68,35.58,45.61,50.2
+DeepSeek R1 Distill Qwen 14B,23.14,23.3,24.94,24.3,17.22,20.01
+CodeLlama 70B,33.04,32.86,32.2,32.27,35.81,34.8
+DeepSeek Coder 33B,27.03,36.31,22.65,37.64,41.47,31.91
+QwenCoder 2.5 32B,44.02,43.75,43.68,44.05,45.15,42.76
+QwenCoder 2.5 14B,37.69,38.97,35.32,40.26,45.5,34.72
+DeepCoder 14B,26.4,30.75,27.32,33.18,23.38,22.75
+OpenCoder 8B,30.06,35.86,26.8,34.57,40.83,40.1
+SeedCoder 8B,50.89,34.05,51.71,36.52,48.2,25.89
+SeedCoder 8B Reasoning,43.75,47.1,47.85,46.76,30.22,48.23
+QwenCoder 2.5 7B,14.15,32.86,6.57,33.0,39.16,32.4
+"DeepSeek Coder 6,7B",31.87,27.89,28.88,28.99,41.75,24.27
+RTLCoder Mistral,21.82,28.65,23.71,26.34,15.58,36.27
+RTLCoder DeepSeek,37.22,36.64,38.33,36.13,33.58,38.33
+OriGen,52.88,51.89,53.0,50.0,52.47,58.12
+CodeV R1 Distill Qwen 7B,36.12,26.84,32.35,20.56,48.57,47.55
+HaVen-CodeQwen,43.58,47.13,44.67,47.23,39.98,46.8
+CodeV-CL-7B,14.73,33.73,12.71,30.93,21.38,42.97
+CodeV-QW-7B,20.37,50.11,18.82,50.64,25.48,48.38
+CodeV-DS-6.7B,19.62,47.1,14.8,46.08,35.52,50.46
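
The "Agg " columns above feed the leaderboard's ranking. A minimal sketch (assuming pandas) of the fallback ranking utils.py computes when no specific aggregate column is selected, i.e. the mean of all "Agg " columns:

import pandas as pd

df = pd.read_csv("results/aggregated_scores.csv")
# Same column selection as utils.py: every column whose name starts with "Agg ".
agg_cols = [c for c in df.columns if c.startswith("Agg ")]
df["Average_Agg"] = df[agg_cols].mean(axis=1).round(2)
print(df[["Model", "Average_Agg"]].sort_values("Average_Agg", ascending=False).head())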
results/parse.py CHANGED
@@ -1,96 +1,159 @@
 import csv
 import json
 import locale
+import os
 from typing import Dict, Union
 
 import pandas as pd
 
 model_details = {
-    "DeepSeek R1": (
+    "DeepSeek R1": (
+        "https://huggingface.co/deepseek-ai/DeepSeek-R1",
+        685,
+        "General",
+        "V1",
+    ),
     "Llama 3.1 405B": (
-        "https://huggingface.co/
+        "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8",
         406,
         "General",
+        "V1",
+    ),
+    "Qwen3 236B A22B": (
+        "https://huggingface.co/Qwen/Qwen3-235B-A22B",
+        235,
+        "General",
+        "V2",
     ),
     "Llama 3.(1-3) 70B": (
         "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
         70.6,
         "General",
+        "V1",
     ),
     "Qwen2.5 72B": (
         "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
         72.7,
         "General",
+        "V1",
     ),
-    "
+    "QwQ 32B": ("https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2"),
+    "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General", "V1"),
     "StarChat2 15B v0.1": (
         "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
         16,
         "General",
+        "V1",
     ),
     "DeepSeek R1 Distill Qwen 14B": (
         "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
         14.8,
         "General",
+        "V1",
     ),
     "CodeLlama 70B": (
         "https://huggingface.co/codellama/CodeLlama-70b-hf",
         69,
         "Coding",
+        "V1",
     ),
     "QwenCoder 2.5 32B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
         32.5,
         "Coding",
+        "V1",
     ),
     "DeepSeek Coder 33B": (
         "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
         33.3,
         "Coding",
+        "V1",
     ),
     "QwenCoder 2.5 14B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
         14.7,
         "Coding",
+        "V1",
+    ),
+    "DeepCoder 14B": (
+        "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
+        14.8,
+        "Coding",
+        "V2",
     ),
     "OpenCoder 8B": (
         "https://huggingface.co/infly/OpenCoder-8B-Instruct",
         7.77,
         "Coding",
+        "V1",
+    ),
+    "SeedCoder 8B": (
+        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct",
+        8.25,
+        "Coding",
+        "V2",
+    ),
+    "SeedCoder 8B Reasoning": (
+        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
+        8.25,
+        "Coding",
+        "V2",
     ),
     "QwenCoder 2.5 7B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
         7.61,
         "Coding",
+        "V1",
     ),
     "DeepSeek Coder 6,7B": (
         "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
         6.74,
         "Coding",
+        "V1",
     ),
     "HaVen-CodeQwen": (
         "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
         7.25,
         "RTL-Specific",
+        "V1",
+    ),
+    "CodeV R1 Distill Qwen 7B": (
+        "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B",
+        7.62,
+        "RTL-Specific",
+        "V2",
     ),
-    "CodeV-CL-7B": ("https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific"),
-    "CodeV-QW-7B": ("https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific"),
+    "CodeV-CL-7B": (
+        "https://huggingface.co/yang-z/CodeV-CL-7B",
+        6.74,
+        "RTL-Specific",
+        "V1",
+    ),
+    "CodeV-QW-7B": (
+        "https://huggingface.co/yang-z/CodeV-QW-7B",
+        7.25,
+        "RTL-Specific",
+        "V1",
+    ),
     "CodeV-DS-6.7B": (
         "https://huggingface.co/yang-z/CodeV-DS-6.7B",
         6.74,
         "RTL-Specific",
+        "V1",
     ),
     "RTLCoder Mistral": (
         "https://huggingface.co/ishorn5/RTLCoder-v1.1",
         7.24,
         "RTL-Specific",
+        "V1",
     ),
     "RTLCoder DeepSeek": (
         "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
         6.74,
         "RTL-Specific",
+        "V1",
     ),
-    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific"),
+    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific", "V1"),
 }
 
 
@@ -107,13 +170,14 @@ def get_headers(reader, agg=False) -> Union[list, list]:
     return metrics, benchs
 
 
-def get_model_params_and_url(model) -> Union[str, str, float]:
+def get_model_params_and_url(model) -> Union[str, str, float, str]:
     if model not in model_details:
         return "-", "-", "-"
     url = model_details[model][0]
     params = model_details[model][1]
     type = model_details[model][2]
-    return url, params, type
+    release = model_details[model][3]
+    return url, params, type, release
 
 
 def parse_results(csv_path: str) -> list[dict]:
@@ -123,12 +187,12 @@ def parse_results(csv_path: str) -> list[dict]:
     """
     dataset = []
     models = []
-    with open(csv_path, newline="") as csvfile:
+    with open(os.path.join("results", csv_path), newline="") as csvfile:
         reader = csv.reader(csvfile, delimiter=",")
         metrics, benchs = get_headers(reader)
         for i, row in enumerate(reader):
             model = row[0]
-            url, params, type = get_model_params_and_url(model)
+            url, params, type, release = get_model_params_and_url(model)
             models.append(model)
             row = row[1:]
             ctr = 0
@@ -143,6 +207,7 @@ def parse_results(csv_path: str) -> list[dict]:
                 record["Result"] = float(row[ctr].replace(",", "."))
                 record["Model URL"] = url
                 record["Params"] = params
+                record["Release"] = release
                 dataset.append(record)
                 ctr += 1
     print(models)
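
One detail worth flagging in the new get_model_params_and_url: the model_details tuples now carry a fourth release field and callers unpack four values, but the not-found branch still returns only three ("-", "-", "-"), which would raise a ValueError for any model missing from model_details. A minimal standalone sketch of the V2 lookup with that branch padded (the two sample entries are copied from the diff; the padding itself is a suggested fix, not part of the commit):

model_details = {
    # (model URL, params in billions, model type, leaderboard release)
    "QwQ 32B": ("https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2"),
    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific", "V1"),
}


def get_model_params_and_url(model):
    if model not in model_details:
        # Pad to four values so callers can always unpack url, params, type, release.
        return "-", "-", "-", "-"
    url, params, model_type, release = model_details[model]
    return url, params, model_type, release


print(get_model_params_and_url("QwQ 32B"))  # ('https://huggingface.co/Qwen/QwQ-32B', 32.8, 'General', 'V2')
print(get_model_params_and_url("Unknown"))  # ('-', '-', '-', '-')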
results/results.csv CHANGED
@@ -1,23 +1,29 @@
 ,Syntax (STX),Syntax (STX),Functionality (FNC),Functionality (FNC),Synthesis (SYN),Synthesis (SYN),Power,Power,Performance,Performance,Area,Area,EM,Syntax (STX),Syntax (STX),Functionality (FNC),Functionality (FNC),Synthesis (SYN),Synthesis (SYN),Power,Power,Performance,Performance,Area,Area
 ,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,RTL-Repo,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen
-… (21 previous V1 score rows; see results/v1/results.csv added below)
+DeepSeek R1,97.18,89.80,79.74,65.71,79.62,63.27,78.33,71.34,76.49,64.06,78.19,70.08,-1,97.44,96.47,79.49,60.00,79.49,60.00,78.27,50.25,76.43,60.15,77.96,63.07
+Llama 3.1 405B,87.44,77.14,58.97,45.71,58.85,41.63,57.58,50.88,55.93,32.44,56.13,43.45,34.62,88.59,95.29,56.15,52.94,55.90,52.94,55.13,49.22,53.45,52.52,54.48,55.31
+Qwen3 236B A22B,91.28,80.41,76.92,53.06,76.79,51.43,75.25,57.77,73.56,49.20,75.67,52.95,41.94,82.18,91.76,69.62,40.00,69.62,40.00,69.04,39.09,66.89,40.16,69.15,40.14
+Llama 3.(1-3) 70B,66.15,73.88,40.64,42.45,40.64,39.18,40.46,40.81,38.08,38.14,39.86,39.65,28.72,84.74,89.41,41.67,51.76,41.67,51.76,41.38,50.61,39.75,51.76,41.36,51.88
+Qwen2.5 72B,82.18,79.59,52.44,45.31,51.92,44.08,51.83,46.47,48.75,45.40,50.09,47.65,37.44,80.90,84.71,52.95,35.29,52.69,35.29,51.66,35.82,49.37,35.20,51.18,35.94
+QwQ 32B,87.95,82.45,66.41,56.73,66.41,52.24,66.15,55.83,63.80,51.91,65.12,56.07,-1,58.97,68.24,40.00,42.35,39.62,42.35,39.40,40.90,37.53,42.31,39.10,42.87
+Qwen2.5 32B,88.59,84.08,52.56,50.20,52.18,46.12,52.32,49.73,49.43,46.43,50.82,50.43,28.93,93.21,85.88,41.54,32.94,41.54,32.94,41.31,30.65,40.48,33.11,41.23,32.50
+StarChat2 15B v0.1,88.46,84.90,37.95,44.49,37.95,44.08,37.56,46.95,35.30,43.22,37.19,46.65,13.42,79.74,92.94,36.41,51.76,36.03,51.76,36.08,46.30,34.91,51.49,35.76,52.80
+DeepSeek R1 Distill Qwen 14B,42.18,34.69,25.51,18.37,25.51,16.33,25.36,17.86,24.19,16.48,25.27,17.33,-1,45.00,44.71,25.64,21.18,25.26,21.18,24.79,17.65,23.48,21.08,24.63,21.29
+CodeLlama 70B,67.05,69.80,33.08,36.33,33.08,34.29,32.69,37.19,31.46,34.29,32.44,35.95,24.33,90.77,88.24,33.33,35.29,33.33,35.29,33.02,34.03,30.80,35.15,32.99,35.21
+DeepSeek Coder 33B,62.82,83.67,23.33,42.45,23.08,42.04,22.86,42.29,22.81,39.42,22.29,42.71,24.58,75.26,88.24,39.62,31.76,39.36,31.76,38.23,32.16,36.79,31.46,37.90,32.12
+QwenCoder 2.5 32B,87.18,77.96,45.00,43.27,44.87,43.27,44.25,46.82,43.03,43.20,43.76,45.42,31.07,83.72,87.06,45.64,42.35,45.13,42.35,44.59,42.79,43.01,42.24,44.55,43.25
+QwenCoder 2.5 14B,78.97,81.63,37.82,46.12,37.44,45.31,35.94,45.82,34.83,44.64,35.18,46.05,37.53,80.00,83.53,41.67,35.29,41.15,35.29,40.74,34.17,39.20,35.32,40.83,34.67
+DeepCoder 14B,43.85,39.59,28.08,23.67,28.08,22.04,27.94,25.00,26.26,22.00,27.77,23.15,-1,61.92,48.24,34.10,23.53,33.72,23.53,33.70,21.18,32.17,23.43,33.67,23.65
+OpenCoder 8B,78.21,75.92,28.46,42.86,27.82,40.82,27.34,41.36,25.95,39.77,27.11,41.36,16.17,80.00,95.29,35.64,41.18,35.38,41.18,35.12,37.69,33.47,41.05,35.13,41.55
+SeedCoder 8B,91.41,85.31,53.46,47.35,53.33,46.53,52.86,49.42,50.62,45.60,51.65,49.59,28.23,77.44,94.12,37.31,30.59,37.31,27.06,37.32,23.53,35.35,26.92,36.89,27.23
+SeedCoder 8B Reasoning,67.82,53.47,49.23,30.20,49.23,29.39,48.92,32.04,46.76,28.64,47.87,29.99,-1,83.33,78.82,48.21,50.59,48.08,50.59,47.78,41.74,45.44,50.02,47.06,52.92
+QwenCoder 2.5 7B,20.13,76.33,6.92,38.78,6.67,37.14,6.51,40.65,6.63,37.25,6.56,39.58,28.33,74.10,90.59,33.72,32.94,33.72,32.94,33.59,30.67,31.78,33.01,33.62,33.51
+"DeepSeek Coder 6,7B",82.05,78.78,29.62,41.22,29.49,38.78,29.51,42.62,27.73,39.33,29.41,43.30,24.63,67.18,84.71,31.67,24.71,29.87,24.71,29.78,23.53,27.98,24.50,29.21,24.79
+RTLCoder Mistral,54.87,32.24,24.62,16.33,24.62,15.92,24.28,16.03,22.78,14.71,24.06,16.00,14.77,60.51,85.88,27.05,36.47,27.05,36.47,26.94,34.63,25.22,36.55,26.87,37.64
+RTLCoder DeepSeek,84.62,73.06,39.49,37.14,39.49,34.69,38.91,34.30,37.52,32.76,38.55,33.69,19.35,77.31,85.88,36.92,40.00,36.79,40.00,36.94,35.57,34.84,39.83,36.62,39.60
+OriGen,96.15,81.63,54.23,50.61,54.23,50.61,54.29,53.10,51.57,50.86,53.15,53.44,17.07,92.44,98.82,50.77,58.82,50.77,58.82,50.95,54.14,48.53,58.81,50.51,61.40
+CodeV R1 Distill Qwen 7B,56.92,73.06,33.33,49.80,33.33,47.35,32.58,49.25,32.01,47.45,32.45,49.01,-1,92.69,89.41,21.28,49.41,21.28,49.41,21.04,43.68,19.59,49.06,21.05,49.91
+HaVen-CodeQwen,93.33,80.41,47.31,42.86,46.15,41.22,45.08,40.59,44.26,38.83,44.68,40.53,25.14,93.59,100.00,50.13,47.06,49.49,47.06,47.55,46.60,47.05,47.14,47.09,46.67
+CodeV-CL-7B,32.18,48.16,13.08,24.49,12.95,21.63,12.80,22.25,12.51,20.59,12.82,21.29,12.27,92.05,98.82,31.79,43.53,31.79,43.53,31.74,42.25,29.45,43.46,31.61,43.20
+CodeV-QW-7B,45.38,68.16,19.62,34.29,18.97,26.53,18.91,28.14,18.71,21.80,18.85,26.50,20.94,93.33,100.00,52.31,48.24,51.54,48.24,51.69,48.14,48.79,48.18,51.45,48.81
+CodeV-DS-6.7B,33.59,67.35,15.00,38.78,15.00,37.14,15.10,35.56,14.46,35.13,14.85,35.88,21.26,95.51,100.00,47.05,50.59,47.05,50.59,47.37,50.47,44.35,50.54,46.52,50.36
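
The two header rows above form a (metric, benchmark) hierarchy, and -1 appears to be a sentinel for unavailable scores (filter_RTLRepo in utils.py drops negative scores accordingly). A minimal sketch of reading this layout with pandas (the path and column pick are illustrative):

import pandas as pd

# header=[0, 1] turns the two header rows into a (metric, benchmark) MultiIndex;
# index_col=0 keeps the model names as the row index.
df = pd.read_csv("results/results.csv", header=[0, 1], index_col=0)

fnc_rtllm = df[("Functionality (FNC)", "RTLLM")]
print(fnc_rtllm.sort_values(ascending=False).head())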
results/results.json CHANGED
The diff for this file is too large to render. See raw diff.
results/v1/aggregated_scores.csv ADDED
@@ -0,0 +1,22 @@
+Model,Agg S2R,Agg MC,Agg VerilogEval S2R,Agg VerilogEval MC,Agg RTLLM,Agg VeriGen
+DeepSeek R1,74.84,75.51,77.01,77.81,68.06,54.4
+Llama 3.1 405B,49.72,42.8,53.98,42.92,36.43,41.67
+Llama 3.(1-3) 70B,39.0,38.49,38.64,37.45,40.12,48.05
+Qwen2.5 72B,49.23,48.82,49.17,51.22,49.45,26.75
+Qwen2.5 32B,50.58,40.73,50.53,41.85,50.71,30.46
+StarChat2 15B v0.1,39.04,38.9,37.45,37.69,44.0,49.99
+DeepSeek R1 Distill Qwen 14B,22.98,23.61,23.21,23.47,22.27,24.91
+CodeLlama 70B,31.46,31.29,34.17,29.8,22.99,44.96
+QwenCoder 2.5 32B,42.53,43.71,42.27,43.96,43.33,41.4
+DeepSeek Coder 33B,25.71,36.47,19.49,37.25,45.11,29.29
+QwenCoder 2.5 14B,36.75,38.49,35.61,39.03,40.33,33.55
+OpenCoder 8B,31.13,34.76,27.12,34.55,43.63,36.67
+QwenCoder 2.5 7B,13.86,32.31,6.31,31.75,37.41,37.47
+"DeepSeek Coder 6,7B",31.6,30.03,28.69,30.41,40.67,26.61
+RTLCoder Mistral,21.86,27.2,22.73,26.21,19.15,36.3
+RTLCoder DeepSeek,32.21,37.6,31.75,37.47,33.64,38.81
+OriGen,37.22,41.29,46.0,41.97,9.82,35.07
+HaVen-CodeQwen,41.66,46.09,42.97,46.57,37.55,41.74
+CodeV-CL-7B,28.19,35.7,25.75,35.39,35.79,38.53
+CodeV-QW-7B,20.79,47.26,18.73,50.28,27.23,19.55
+CodeV-DS-6.7B,18.19,44.1,14.28,47.05,30.39,17.03
results/v1/results.csv ADDED
@@ -0,0 +1,23 @@
+,Syntax (STX),Syntax (STX),Functionality (FNC),Functionality (FNC),Synthesis (SYN),Synthesis (SYN),Power,Power,Performance,Performance,Area,Area,EM,Syntax (STX),Syntax (STX),Functionality (FNC),Functionality (FNC),Synthesis (SYN),Synthesis (SYN),Power,Power,Performance,Performance,Area,Area
+,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,RTL-Repo,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen
+DeepSeek R1,"96,54","91,43","79,74","67,76","78,97","63,27","38,94","35,64","37,82","31,95","38,76","34,5","33,02","97,95","94,12","80,26",60,"79,62","54,12","39,35","26,62","38,16","26,97","39,21","28,01"
+Llama 3.1 405B,"89,1","65,71","57,05","37,55","56,67","35,92","27,18","19,7",27,"16,04","26,79","18,91","33,29","91,41","72,94","44,74","45,88","44,1","44,71","21,98","19,24","20,74","22,19","21,66","21,08"
+Llama 3.(1-3) 70B,"67,69","73,88",40,"43,27","39,87","39,18","19,76","20,58","18,69","19,32","19,51","20,28","28,62","70,51","78,82","38,59","48,24","37,95","48,24","18,97","23,22","18,35","24,15","18,86","24,71"
+Qwen2.5 72B,"81,15","82,04","51,15","47,35","50,38","46,53","25,4","24,83","23,83","23,88","24,52","25,46","37,19","81,67","70,59","53,08","27,06","52,56","27,06","26,08","12,74","24,92","13,5","25,83","13,89"
+Qwen2.5 32B,"89,36","86,94","52,95","50,61","51,67","46,94","26,02","25,9","24,46","23,87","25,32","26,29","28,67","93,46","67,06","43,08","32,94","42,31","30,59","21,14",15,"20,48","15,38","21,15","15,31"
+StarChat2 15B v0.1,"86,54","85,71","38,72","42,45","38,59","42,45","19,18","22,44","17,99","21,03",19,"22,53","13,24","81,54","92,94","39,36","50,59","38,59","50,59","19,21","24,02","18,28","25,23","19,05","25,73"
+DeepSeek R1 Distill Qwen 14B,"41,28","40,82","23,85","20,82","23,72","20,41","11,81","12,46","11,25","10,24","11,76","10,7","20,65","42,18","62,35","24,36","25,88","24,1","25,88","11,96","11,54","11,38","12,93","11,86","12,9"
+CodeLlama 70B,"72,05","41,63","35,51","23,27","35,38","22,86","17,32","11,92","16,74","10,85","17,2","11,71","24,58","89,36","89,41","30,9","45,88","30,9","45,88","15,3","21,74","14,19","22,88","15,21","22,82"
+QwenCoder 2.5 32B,"87,69","79,59","45,64","43,27","43,33","42,04","21,51","22,02","20,72","20,95","21,17","22,03","30,44","84,87","72,94","45,51","41,18","44,87","41,18","22,26","20,56","21,48","20,67","22,2","20,87"
+DeepSeek Coder 33B,"57,82","83,67","19,87","43,67","19,87","42,86","9,94","23,28","9,83","21,19","9,47","23,2","30,58","78,72","83,53","39,49","29,41","38,33","29,41","18,92","14,52","18,2","14,74","18,76","14,67"
+QwenCoder 2.5 14B,"79,74","78,37","37,82","41,63","37,05","40,41","18,03","20,14","17,6","20,1","17,78","20,25","37,16","79,36","67,06","40,26","34,12","39,49","34,12","19,74","16,5","19,07","17,07","19,73","16,75"
+OpenCoder 8B,"75,77","75,1","28,59","46,53","28,21","42,86","13,81","22,24","13,16","21,47","13,71","21,73","16,63","79,87","92,94","36,03","43,53","35,51","37,65","17,57","17,19","16,74","18,76","17,52","19,06"
+QwenCoder 2.5 7B,"19,62","77,96","6,41","37,96","6,41","35,51","3,12","19,26","3,18","17,98","3,16","18,87","28,45","75,9","71,76","32,44","37,65","32,44","37,65","16,2","18,38","15,26","18,91","16,16","18,92"
+"DeepSeek Coder 6,7B","80,12","78,37","29,87","40,41","29,36","37,96","14,71","20,72","13,69","19,25","14,64","21,03","24,57","68,85","81,18","32,82","27,06","31,15","27,06","15,53","12,94","14,62","13,39","15,46","13,58"
+RTLCoder Mistral,"52,05","38,78","23,59","19,18","23,59","19,18","11,67","10,08","10,87","8,7","11,56","9,95","14,97","63,59","85,88","26,92","35,29","26,92","35,29","13,43","18,49","12,53","17,61","13,36","18,35"
+RTLCoder DeepSeek,"75,26","68,57","33,33","37,14","32,95","33,06","16,02","17,29","15,71","16,35","15,9","16,82","19,76","84,1","84,71","39,23","38,82","38,59","38,82","19,08","19,1","18,31","19,35","18,82","19,76"
+OriGen,"91,02","23,67","46,54","12,65","46,92","10,61","23,38","5,33","22,18","4,61","23,44","4,79","19,45","79,35","87,06","43,07","35,29","42,95","35,29","21,5","16,55","20,13","17,7","21,33","18,35"
+HaVen-CodeQwen,"90,26","82,45","45,9","40,41","44,36","38,37","21,77","19,1","21,23","18,31","21,46","18,92","25,38","93,33","97,65",50,"48,24","48,72","42,35","23,37","20,21","23,39","21,15","23,09","21,25"
+CodeV-CL-7B,"55,38","69,8","27,05","37,14","26,79","35,1","13,2","18,92","12,39","16,88","13,03","17,89","12,39","91,92","98,82","36,79","44,71","36,41","38,82","18,15","19,06","16,88","19,38","18,05","19,35"
+CodeV-QW-7B,"41,79","71,02","19,1","35,51","18,72","27,76","9,36","14,85","9,36","12,21","9,38","13,78","20,56","93,85","57,65","52,56","25,88","51,15",20,"25,64","9,39","24,22","9,99","25,56","9,94"
+CodeV-DS-6.7B,"30,77","62,45","14,87","33,88","14,62","30,61","7,3","15,49","6,9","14,75","7,22","15,35","21,06","95,13","58,82","48,85","23,53","48,33","17,65","24,02","8,26","22,82","8,81","23,73","8,47"
results/v1/results.json ADDED
The diff for this file is too large to render. See raw diff.
utils.py CHANGED
@@ -1,89 +1,216 @@
-import pandas as pd
+import sys
+
 import gradio as gr
-import plotly.graph_objects as go
-import plotly.express as px
 import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
 
-type_emoji = {
-    "RTL-Specific": "🔴",
-    "General": "🟢",
-    "Coding": "🔵",
-}
+type_emoji = {"RTL-Specific": "🔴", "General": "🟢", "Coding": "🔵"}
 
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+def model_hyperlink(link, model_name, release):
+    if release == "V1":
+        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    else:
+        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a> <span style="font-variant: all-small-caps; font-weight: 600">new</span>'
 
 
 def handle_special_cases(benchmark, metric):
-    if metric ==
-        benchmark =
-    elif benchmark ==
-        metric =
+    if metric == "Exact Matching (EM)":
+        benchmark = "RTL-Repo"
+    elif benchmark == "RTL-Repo":
+        metric = "Exact Matching (EM)"
     return benchmark, metric
 
 
 def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
-    …
+    subset = subset.drop(subset[subset.Score < 0.0].index)
+    details = subset[
+        ["Model", "Model URL", "Model Type", "Params", "Release"]
+    ].drop_duplicates("Model")
+    filtered_df = subset[["Model", "Score"]].rename(
+        columns={"Score": "Exact Matching (EM)"}
+    )
+    filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
+    filtered_df["Model"] = filtered_df.apply(
+        lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"]),
+        axis=1,
+    )
+    filtered_df["Type"] = filtered_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
+    filtered_df = filtered_df[["Type", "Model", "Params", "Exact Matching (EM)"]]
+    filtered_df = filtered_df.sort_values(
+        by="Exact Matching (EM)", ascending=False
+    ).reset_index(drop=True)
     return filtered_df
 
 
 def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
-    details = subset[
-    …
+    details = subset[
+        ["Model", "Model URL", "Model Type", "Params", "Release"]
+    ].drop_duplicates("Model")
+    pivot_df = subset.pivot_table(
+        index="Model", columns="Metric", values="Score", aggfunc="mean"
+    ).reset_index()
+
     if df_agg is not None and agg_column is not None and agg_column in df_agg.columns:
-        agg_data = df_agg[[
-        …
+        agg_data = df_agg[["Model", agg_column]].rename(
+            columns={agg_column: "Aggregated ⬆️"}
+        )
+        pivot_df = pd.merge(pivot_df, agg_data, on="Model", how="left")
+    else:  # fallback
+        pivot_df["Aggregated ⬆️"] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+
+    pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
+    pivot_df["Model"] = pivot_df.apply(
+        lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"]),
+        axis=1,
+    )
+    pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
+    pivot_df.rename(
+        columns={
+            "Syntax (STX)": "STX",
+            "Functionality (FNC)": "FNC",
+            "Synthesis (SYN)": "SYN",
+            "Performance": "Perf",
+        },
+        inplace=True,
+    )
+
+    columns_order = [
+        "Type",
+        "Model",
+        "Params",
+        "Aggregated ⬆️",
+        "STX",
+        "FNC",
+        "SYN",
+        "Power",
+        "Perf",
+        "Area",
+    ]
     pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
-    pivot_df = pivot_df.sort_values(by=
+    pivot_df = pivot_df.sort_values(by="Aggregated ⬆️", ascending=False).reset_index(
+        drop=True
+    )
     return pivot_df
 
 
-…
+def custom_agg_s2r(vals):
+    s2r_val = vals.iloc[0]
+    rtllm_val = vals.iloc[1]
+    w1 = 155
+    w2 = 47
+    result = (w1 * s2r_val + w2 * rtllm_val) / (w1 + w2)
+    return round(result, 2)
+
+
+def custom_agg_cc(vals):
+    veval_val = vals.iloc[0]
+    vgen_val = vals.iloc[1]
+    w1 = 155
+    w2 = 17
+    result = (w1 * veval_val + w2 * vgen_val) / (w1 + w2)
+    return round(result, 2)
+
+
+def filter_bench_all(
+    subset: pd.DataFrame, df_agg=None, agg_column=None
+) -> pd.DataFrame:
+    details = subset[
+        ["Model", "Model URL", "Model Type", "Params", "Release"]
+    ].drop_duplicates("Model")
+    if "RTLLM" in subset["Benchmark"].unique():
+        pivot_df = (
+            subset.pivot_table(
+                index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r
+            )
+            .reset_index()
+            .round(2)
+        )
+    else:
+        pivot_df = (
+            subset.pivot_table(
+                index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc
+            )
+            .reset_index()
+            .round(2)
+        )
+
     if df_agg is not None:
         if agg_column is not None and agg_column in df_agg.columns:
-            agg_data = df_agg[[
+            agg_data = df_agg[["Model", agg_column]].rename(
+                columns={agg_column: "Aggregated ⬆️"}
+            )
+            pivot_df = pd.merge(pivot_df, agg_data, on="Model", how="left")
         else:
-            agg_columns = [col for col in df_agg.columns if col.startswith(
+            agg_columns = [col for col in df_agg.columns if col.startswith("Agg ")]
             if agg_columns:
-                df_agg[
-                agg_data = df_agg[[
-                …
+                df_agg["Average_Agg"] = df_agg[agg_columns].mean(axis=1)
+                agg_data = df_agg[["Model", "Average_Agg"]].rename(
+                    columns={"Average_Agg": "Aggregated ⬆️"}
+                )
+                pivot_df = pd.merge(pivot_df, agg_data, on="Model", how="left")
+            else:  # fallback
+                pivot_df["Aggregated ⬆️"] = pivot_df.mean(
+                    axis=1, numeric_only=True
+                ).round(2)
+    else:  # fallback
+        print("We do mean")
+        pivot_df["Aggregated ⬆️"] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+
+    pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
+    pivot_df["Model"] = pivot_df.apply(
+        lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"]),
+        axis=1,
+    )
+    pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
+
+    pivot_df.rename(
+        columns={
+            "Exact Matching (EM)": "EM",
+            "Syntax (STX)": "Agg STX",
+            "Functionality (FNC)": "Agg FNC",
+            "Synthesis (SYN)": "Agg SYN",
+            "Power": "Agg Power",
+            "Performance": "Agg Perf",
+            "Area": "Agg Area",
+        },
+        inplace=True,
+    )
+
+    columns_order = [
+        "Type",
+        "Model",
+        "Params",
+        "Aggregated ⬆️",
+        "Agg STX",
+        "Agg FNC",
+        "Agg SYN",
+        "Agg Power",
+        "Agg Perf",
+        "Agg Area",
+    ]
     pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
-    pivot_df = pivot_df.sort_values(by=
+    pivot_df = pivot_df.sort_values(by="Aggregated ⬆️", ascending=False).reset_index(
+        drop=True
+    )
     return pivot_df
+
+
+def agg_S2R_metrics(verilog_eval_rtl, rtllm):
+    if not verilog_eval_rtl or not rtllm:
+        return None
+    w1 = 155
+    w2 = 47
+    result = (w1 * verilog_eval_rtl + w2 * rtllm) / (w1 + w2)
+    return round(result, 2)
+
+
+def agg_MC_metrics(verilog_eval_cc, verigen):
+    if not verilog_eval_cc or not verigen:
+        return None
+    w1 = 155
+    w2 = 17
+    result = (w1 * verilog_eval_cc + w2 * verigen) / (w1 + w2)
+    return round(result, 2)
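
The custom_agg_s2r / custom_agg_cc helpers (and their scalar twins agg_S2R_metrics / agg_MC_metrics) are fixed weighted means: Spec-to-RTL combines VerilogEval S2R and RTLLM with weights 155 and 47, Module Completion combines VerilogEval MC and VeriGen with weights 155 and 17. The weights presumably reflect the relative size of each benchmark, though the commit does not say so. A worked example using DeepSeek R1's Functionality (FNC) scores from results/results.csv above:

# Spec-to-RTL aggregation, as in custom_agg_s2r / agg_S2R_metrics.
w1, w2 = 155, 47
verilog_eval_s2r_fnc = 79.74  # DeepSeek R1, Functionality (FNC), VerilogEval S2R
rtllm_fnc = 65.71             # DeepSeek R1, Functionality (FNC), RTLLM

agg_fnc = round((w1 * verilog_eval_s2r_fnc + w2 * rtllm_fnc) / (w1 + w2), 2)
print(agg_fnc)  # 76.48

One design note: custom_agg_s2r appears to rely on vals.iloc[0] and vals.iloc[1] arriving from pivot_table in a fixed benchmark order; a variant that keys on the Benchmark column explicitly would be more robust to reordered input.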