Commit 1df4c13 · Parent: b410573

Release TuRTLe V2

Changed files:
- app.py +18 -11
- results/aggregated_scores.csv +28 -22
- results/parse.py +75 -10
- results/results.csv +27 -21
- results/results.json +0 -0
- results/v1/aggregated_scores.csv +22 -0
- results/v1/results.csv +23 -0
- results/v1/results.json +0 -0
- utils.py +194 -67
app.py CHANGED
@@ -191,19 +191,16 @@ with gr.Blocks(
 
     gr.HTML(
         """
-        <
-        <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/>
-        </
+        <div align="center">
+        <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/>
+        </div>
         """
     )
     gr.HTML(
         """
         <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
         <script defer src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/js/all.min.js"></script>
-        <div style="text-align: center; margin-bottom:
-        <p style="margin-bottom: 15px;">Welcome to the TuRTLe Model Leaderboard! TuRTLe is a <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b> for hardware design.
-        Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b> (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs.
-        Use the filters below to explore different RTL benchmarks and models.</p>
+        <div style="text-align: center; margin-bottom: 0px; margin-top: 0px;">
         <a href="https://github.com/HPAI-BSC/TuRTLe" target="_blank" style="text-decoration: none; margin-right: 10px;">
             <button style="background: #333; color: white; padding: 10px 14px; border-radius: 8px; border: none; font-size: 16px; cursor: pointer;">
                 GitHub Repo
@@ -221,9 +218,19 @@ with gr.Blocks(
                 How to submit
             </button>
         </a>
-
-
-
+        <p style="margin-top: 15px;">If you have any inquiries or wish to collaborate:
+        <a href="mailto:hpai@bsc.es">hpai@bsc.es</a>
+        </p>
+        </div>
+        """
+    )
+    gr.HTML(
+        """
+        <div style=" margin-top:-10px !important;">
+        <p style="margin-bottom: 15px; text-align: start !important;">Welcome to the TuRTLe Model Leaderboard! TuRTLe is a <b>unified evaluation framework designed to systematically assess Large Language Models (LLMs) in RTL (Register-Transfer Level) generation</b> for hardware design.
+        Evaluation criteria include <b>syntax correctness, functional accuracy, synthesizability, and post-synthesis quality</b> (PPA: Power, Performance, Area). TuRTLe integrates multiple benchmarks to highlight strengths and weaknesses of available LLMs.
+        Use the filters below to explore different RTL benchmarks and models.</p>
+        <p style="margin-top: 15px; text-align: start !important;"><span style="font-variant: small-caps; font-weight: bold;">NEW UPDATE (JUNE 2025)</span>: We have made our framework open-source on GitHub and added 6 new recent models, for a total of 27 models and 5 RTL benchmarks!</p>
         </div>
         """
     )
@@ -353,7 +360,7 @@ with gr.Blocks(
     citation_button = gr.Textbox(
         value=CITATION_BUTTON_TEXT,
         label=CITATION_BUTTON_LABEL,
-        lines=
+        lines=10,
         elem_id="citation-button",
         show_copy_button=True,
     )
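
For readers unfamiliar with the pattern above: static HTML is injected through gr.HTML blocks, and the citation box is a plain gr.Textbox. A minimal self-contained sketch of the same pattern (the CITATION_* values and logo path here are placeholders, not the Space's real assets):

import gradio as gr

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = "@misc{turtle2025, title={TuRTLe}, ...}"  # placeholder

with gr.Blocks() as demo:
    # Static HTML blocks render the logo and the intro text.
    gr.HTML('<div align="center"><img src="logo.png" width="220"/></div>')
    gr.HTML("<p>Welcome to the TuRTLe Model Leaderboard!</p>")
    # lines=10 sizes the citation box; show_copy_button adds a copy control.
    citation_button = gr.Textbox(
        value=CITATION_BUTTON_TEXT,
        label=CITATION_BUTTON_LABEL,
        lines=10,
        elem_id="citation-button",
        show_copy_button=True,
    )

demo.launch()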
results/aggregated_scores.csv CHANGED
@@ -1,22 +1,28 @@
-Model,Agg S2R,Agg MC,Agg VerilogEval S2R,Agg VerilogEval MC,Agg RTLLM,Agg VeriGen
-… (21 previous V1 score rows; see results/v1/aggregated_scores.csv added below)
+Model,Agg S2R,Agg MC,Agg VerilogEval S2R,Agg VerilogEval MC,Agg RTLLM,Agg VeriGen
+DeepSeek R1,75.53,72.96,77.67,77.55,68.49,57.82
+Llama 3.1 405B,53.23,53.88,56.55,54.35,42.26,52.35
+Qwen3 236B A22B,69.82,61.71,74.83,68.36,53.31,39.8
+Llama 3.(1-3) 70B,39.48,43.29,39.47,40.83,39.53,51.42
+Qwen2.5 72B,49.36,47.23,50.22,50.74,46.51,35.65
+QwQ 32B,62.6,39.46,65.02,38.68,54.6,42.03
+Qwen2.5 32B,50.39,38.93,50.86,41.01,48.86,32.09
+StarChat2 15B v0.1,38.76,38.98,36.68,35.58,45.61,50.2
+DeepSeek R1 Distill Qwen 14B,23.14,23.3,24.94,24.3,17.22,20.01
+CodeLlama 70B,33.04,32.86,32.2,32.27,35.81,34.8
+DeepSeek Coder 33B,27.03,36.31,22.65,37.64,41.47,31.91
+QwenCoder 2.5 32B,44.02,43.75,43.68,44.05,45.15,42.76
+QwenCoder 2.5 14B,37.69,38.97,35.32,40.26,45.5,34.72
+DeepCoder 14B,26.4,30.75,27.32,33.18,23.38,22.75
+OpenCoder 8B,30.06,35.86,26.8,34.57,40.83,40.1
+SeedCoder 8B,50.89,34.05,51.71,36.52,48.2,25.89
+SeedCoder 8B Reasoning,43.75,47.1,47.85,46.76,30.22,48.23
+QwenCoder 2.5 7B,14.15,32.86,6.57,33.0,39.16,32.4
+"DeepSeek Coder 6,7B",31.87,27.89,28.88,28.99,41.75,24.27
+RTLCoder Mistral,21.82,28.65,23.71,26.34,15.58,36.27
+RTLCoder DeepSeek,37.22,36.64,38.33,36.13,33.58,38.33
+OriGen,52.88,51.89,53.0,50.0,52.47,58.12
+CodeV R1 Distill Qwen 7B,36.12,26.84,32.35,20.56,48.57,47.55
+HaVen-CodeQwen,43.58,47.13,44.67,47.23,39.98,46.8
+CodeV-CL-7B,14.73,33.73,12.71,30.93,21.38,42.97
+CodeV-QW-7B,20.37,50.11,18.82,50.64,25.48,48.38
+CodeV-DS-6.7B,19.62,47.1,14.8,46.08,35.52,50.46
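
The "Agg " columns above feed the leaderboard's ranking. A minimal sketch (assuming pandas) of the fallback ranking utils.py computes when no specific aggregate column is selected, i.e. the mean of all "Agg " columns:

import pandas as pd

df = pd.read_csv("results/aggregated_scores.csv")
# Same column selection as utils.py: every column whose name starts with "Agg ".
agg_cols = [c for c in df.columns if c.startswith("Agg ")]
df["Average_Agg"] = df[agg_cols].mean(axis=1).round(2)
print(df[["Model", "Average_Agg"]].sort_values("Average_Agg", ascending=False).head())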
results/parse.py CHANGED
@@ -1,96 +1,159 @@
 import csv
 import json
 import locale
+import os
 from typing import Dict, Union
 
 import pandas as pd
 
 model_details = {
-    "DeepSeek R1": (
+    "DeepSeek R1": (
+        "https://huggingface.co/deepseek-ai/DeepSeek-R1",
+        685,
+        "General",
+        "V1",
+    ),
     "Llama 3.1 405B": (
-        "https://huggingface.co/
+        "https://huggingface.co/RedHatAI/Meta-Llama-3.1-405B-FP8",
         406,
         "General",
+        "V1",
+    ),
+    "Qwen3 236B A22B": (
+        "https://huggingface.co/Qwen/Qwen3-235B-A22B",
+        235,
+        "General",
+        "V2",
     ),
     "Llama 3.(1-3) 70B": (
         "https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct",
         70.6,
         "General",
+        "V1",
     ),
     "Qwen2.5 72B": (
         "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
         72.7,
         "General",
+        "V1",
     ),
-    "
+    "QwQ 32B": ("https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2"),
+    "Qwen2.5 32B": ("https://huggingface.co/Qwen/Qwen2.5-32B", 32.5, "General", "V1"),
     "StarChat2 15B v0.1": (
         "https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1",
         16,
         "General",
+        "V1",
     ),
     "DeepSeek R1 Distill Qwen 14B": (
         "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
         14.8,
         "General",
+        "V1",
     ),
     "CodeLlama 70B": (
         "https://huggingface.co/codellama/CodeLlama-70b-hf",
         69,
         "Coding",
+        "V1",
     ),
     "QwenCoder 2.5 32B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct",
         32.5,
         "Coding",
+        "V1",
     ),
     "DeepSeek Coder 33B": (
         "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
         33.3,
         "Coding",
+        "V1",
     ),
     "QwenCoder 2.5 14B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct",
         14.7,
         "Coding",
+        "V1",
+    ),
+    "DeepCoder 14B": (
+        "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
+        14.8,
+        "Coding",
+        "V2",
     ),
     "OpenCoder 8B": (
         "https://huggingface.co/infly/OpenCoder-8B-Instruct",
         7.77,
         "Coding",
+        "V1",
+    ),
+    "SeedCoder 8B": (
+        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Instruct",
+        8.25,
+        "Coding",
+        "V2",
+    ),
+    "SeedCoder 8B Reasoning": (
+        "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Reasoning-bf16",
+        8.25,
+        "Coding",
+        "V2",
     ),
     "QwenCoder 2.5 7B": (
         "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct",
         7.61,
         "Coding",
+        "V1",
     ),
     "DeepSeek Coder 6,7B": (
         "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct",
         6.74,
         "Coding",
+        "V1",
     ),
     "HaVen-CodeQwen": (
         "https://huggingface.co/yangyiyao/HaVen-CodeQwen",
         7.25,
         "RTL-Specific",
+        "V1",
+    ),
+    "CodeV R1 Distill Qwen 7B": (
+        "https://huggingface.co/zhuyaoyu/CodeV-R1-Distill-Qwen-7B",
+        7.62,
+        "RTL-Specific",
+        "V2",
     ),
-    "CodeV-CL-7B": ("https://huggingface.co/yang-z/CodeV-CL-7B", 6.74, "RTL-Specific"),
-    "CodeV-QW-7B": ("https://huggingface.co/yang-z/CodeV-QW-7B", 7.25, "RTL-Specific"),
+    "CodeV-CL-7B": (
+        "https://huggingface.co/yang-z/CodeV-CL-7B",
+        6.74,
+        "RTL-Specific",
+        "V1",
+    ),
+    "CodeV-QW-7B": (
+        "https://huggingface.co/yang-z/CodeV-QW-7B",
+        7.25,
+        "RTL-Specific",
+        "V1",
+    ),
     "CodeV-DS-6.7B": (
         "https://huggingface.co/yang-z/CodeV-DS-6.7B",
         6.74,
         "RTL-Specific",
+        "V1",
     ),
     "RTLCoder Mistral": (
         "https://huggingface.co/ishorn5/RTLCoder-v1.1",
         7.24,
         "RTL-Specific",
+        "V1",
     ),
     "RTLCoder DeepSeek": (
         "https://huggingface.co/ishorn5/RTLCoder-Deepseek-v1.1",
         6.74,
         "RTL-Specific",
+        "V1",
     ),
-    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific"),
+    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific", "V1"),
 }
 
 
@@ -107,13 +170,14 @@ def get_headers(reader, agg=False) -> Union[list, list]:
     return metrics, benchs
 
 
-def get_model_params_and_url(model) -> Union[str, str, float]:
+def get_model_params_and_url(model) -> Union[str, str, float, str]:
     if model not in model_details:
         return "-", "-", "-"
     url = model_details[model][0]
     params = model_details[model][1]
     type = model_details[model][2]
-    return url, params, type
+    release = model_details[model][3]
+    return url, params, type, release
 
 
 def parse_results(csv_path: str) -> list[dict]:
@@ -123,12 +187,12 @@ def parse_results(csv_path: str) -> list[dict]:
     """
     dataset = []
     models = []
-    with open(csv_path, newline="") as csvfile:
+    with open(os.path.join("results", csv_path), newline="") as csvfile:
         reader = csv.reader(csvfile, delimiter=",")
         metrics, benchs = get_headers(reader)
         for i, row in enumerate(reader):
             model = row[0]
-            url, params, type = get_model_params_and_url(model)
+            url, params, type, release = get_model_params_and_url(model)
             models.append(model)
             row = row[1:]
             ctr = 0
@@ -143,6 +207,7 @@ def parse_results(csv_path: str) -> list[dict]:
                 record["Result"] = float(row[ctr].replace(",", "."))
                 record["Model URL"] = url
                 record["Params"] = params
+                record["Release"] = release
                 dataset.append(record)
                 ctr += 1
     print(models)
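
One detail worth flagging in the new get_model_params_and_url: the model_details tuples now carry a fourth release field and callers unpack four values, but the not-found branch still returns only three ("-", "-", "-"), which would raise a ValueError for any model missing from model_details. A minimal standalone sketch of the V2 lookup with that branch padded (the two sample entries are copied from the diff; the padding itself is a suggested fix, not part of the commit):

model_details = {
    # (model URL, params in billions, model type, leaderboard release)
    "QwQ 32B": ("https://huggingface.co/Qwen/QwQ-32B", 32.8, "General", "V2"),
    "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific", "V1"),
}


def get_model_params_and_url(model):
    if model not in model_details:
        # Pad to four values so callers can always unpack url, params, type, release.
        return "-", "-", "-", "-"
    url, params, model_type, release = model_details[model]
    return url, params, model_type, release


print(get_model_params_and_url("QwQ 32B"))  # ('https://huggingface.co/Qwen/QwQ-32B', 32.8, 'General', 'V2')
print(get_model_params_and_url("Unknown"))  # ('-', '-', '-', '-')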
results/results.csv CHANGED
@@ -1,23 +1,29 @@
 ,Syntax (STX),Syntax (STX),Functionality (FNC),Functionality (FNC),Synthesis (SYN),Synthesis (SYN),Power,Power,Performance,Performance,Area,Area,EM,Syntax (STX),Syntax (STX),Functionality (FNC),Functionality (FNC),Synthesis (SYN),Synthesis (SYN),Power,Power,Performance,Performance,Area,Area
 ,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,RTL-Repo,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen
-… (21 previous V1 score rows; see results/v1/results.csv added below)
+DeepSeek R1,97.18,89.80,79.74,65.71,79.62,63.27,78.33,71.34,76.49,64.06,78.19,70.08,-1,97.44,96.47,79.49,60.00,79.49,60.00,78.27,50.25,76.43,60.15,77.96,63.07
+Llama 3.1 405B,87.44,77.14,58.97,45.71,58.85,41.63,57.58,50.88,55.93,32.44,56.13,43.45,34.62,88.59,95.29,56.15,52.94,55.90,52.94,55.13,49.22,53.45,52.52,54.48,55.31
+Qwen3 236B A22B,91.28,80.41,76.92,53.06,76.79,51.43,75.25,57.77,73.56,49.20,75.67,52.95,41.94,82.18,91.76,69.62,40.00,69.62,40.00,69.04,39.09,66.89,40.16,69.15,40.14
+Llama 3.(1-3) 70B,66.15,73.88,40.64,42.45,40.64,39.18,40.46,40.81,38.08,38.14,39.86,39.65,28.72,84.74,89.41,41.67,51.76,41.67,51.76,41.38,50.61,39.75,51.76,41.36,51.88
+Qwen2.5 72B,82.18,79.59,52.44,45.31,51.92,44.08,51.83,46.47,48.75,45.40,50.09,47.65,37.44,80.90,84.71,52.95,35.29,52.69,35.29,51.66,35.82,49.37,35.20,51.18,35.94
+QwQ 32B,87.95,82.45,66.41,56.73,66.41,52.24,66.15,55.83,63.80,51.91,65.12,56.07,-1,58.97,68.24,40.00,42.35,39.62,42.35,39.40,40.90,37.53,42.31,39.10,42.87
+Qwen2.5 32B,88.59,84.08,52.56,50.20,52.18,46.12,52.32,49.73,49.43,46.43,50.82,50.43,28.93,93.21,85.88,41.54,32.94,41.54,32.94,41.31,30.65,40.48,33.11,41.23,32.50
+StarChat2 15B v0.1,88.46,84.90,37.95,44.49,37.95,44.08,37.56,46.95,35.30,43.22,37.19,46.65,13.42,79.74,92.94,36.41,51.76,36.03,51.76,36.08,46.30,34.91,51.49,35.76,52.80
+DeepSeek R1 Distill Qwen 14B,42.18,34.69,25.51,18.37,25.51,16.33,25.36,17.86,24.19,16.48,25.27,17.33,-1,45.00,44.71,25.64,21.18,25.26,21.18,24.79,17.65,23.48,21.08,24.63,21.29
+CodeLlama 70B,67.05,69.80,33.08,36.33,33.08,34.29,32.69,37.19,31.46,34.29,32.44,35.95,24.33,90.77,88.24,33.33,35.29,33.33,35.29,33.02,34.03,30.80,35.15,32.99,35.21
+DeepSeek Coder 33B,62.82,83.67,23.33,42.45,23.08,42.04,22.86,42.29,22.81,39.42,22.29,42.71,24.58,75.26,88.24,39.62,31.76,39.36,31.76,38.23,32.16,36.79,31.46,37.90,32.12
+QwenCoder 2.5 32B,87.18,77.96,45.00,43.27,44.87,43.27,44.25,46.82,43.03,43.20,43.76,45.42,31.07,83.72,87.06,45.64,42.35,45.13,42.35,44.59,42.79,43.01,42.24,44.55,43.25
+QwenCoder 2.5 14B,78.97,81.63,37.82,46.12,37.44,45.31,35.94,45.82,34.83,44.64,35.18,46.05,37.53,80.00,83.53,41.67,35.29,41.15,35.29,40.74,34.17,39.20,35.32,40.83,34.67
+DeepCoder 14B,43.85,39.59,28.08,23.67,28.08,22.04,27.94,25.00,26.26,22.00,27.77,23.15,-1,61.92,48.24,34.10,23.53,33.72,23.53,33.70,21.18,32.17,23.43,33.67,23.65
+OpenCoder 8B,78.21,75.92,28.46,42.86,27.82,40.82,27.34,41.36,25.95,39.77,27.11,41.36,16.17,80.00,95.29,35.64,41.18,35.38,41.18,35.12,37.69,33.47,41.05,35.13,41.55
+SeedCoder 8B,91.41,85.31,53.46,47.35,53.33,46.53,52.86,49.42,50.62,45.60,51.65,49.59,28.23,77.44,94.12,37.31,30.59,37.31,27.06,37.32,23.53,35.35,26.92,36.89,27.23
+SeedCoder 8B Reasoning,67.82,53.47,49.23,30.20,49.23,29.39,48.92,32.04,46.76,28.64,47.87,29.99,-1,83.33,78.82,48.21,50.59,48.08,50.59,47.78,41.74,45.44,50.02,47.06,52.92
+QwenCoder 2.5 7B,20.13,76.33,6.92,38.78,6.67,37.14,6.51,40.65,6.63,37.25,6.56,39.58,28.33,74.10,90.59,33.72,32.94,33.72,32.94,33.59,30.67,31.78,33.01,33.62,33.51
+"DeepSeek Coder 6,7B",82.05,78.78,29.62,41.22,29.49,38.78,29.51,42.62,27.73,39.33,29.41,43.30,24.63,67.18,84.71,31.67,24.71,29.87,24.71,29.78,23.53,27.98,24.50,29.21,24.79
+RTLCoder Mistral,54.87,32.24,24.62,16.33,24.62,15.92,24.28,16.03,22.78,14.71,24.06,16.00,14.77,60.51,85.88,27.05,36.47,27.05,36.47,26.94,34.63,25.22,36.55,26.87,37.64
+RTLCoder DeepSeek,84.62,73.06,39.49,37.14,39.49,34.69,38.91,34.30,37.52,32.76,38.55,33.69,19.35,77.31,85.88,36.92,40.00,36.79,40.00,36.94,35.57,34.84,39.83,36.62,39.60
+OriGen,96.15,81.63,54.23,50.61,54.23,50.61,54.29,53.10,51.57,50.86,53.15,53.44,17.07,92.44,98.82,50.77,58.82,50.77,58.82,50.95,54.14,48.53,58.81,50.51,61.40
+CodeV R1 Distill Qwen 7B,56.92,73.06,33.33,49.80,33.33,47.35,32.58,49.25,32.01,47.45,32.45,49.01,-1,92.69,89.41,21.28,49.41,21.28,49.41,21.04,43.68,19.59,49.06,21.05,49.91
+HaVen-CodeQwen,93.33,80.41,47.31,42.86,46.15,41.22,45.08,40.59,44.26,38.83,44.68,40.53,25.14,93.59,100.00,50.13,47.06,49.49,47.06,47.55,46.60,47.05,47.14,47.09,46.67
+CodeV-CL-7B,32.18,48.16,13.08,24.49,12.95,21.63,12.80,22.25,12.51,20.59,12.82,21.29,12.27,92.05,98.82,31.79,43.53,31.79,43.53,31.74,42.25,29.45,43.46,31.61,43.20
+CodeV-QW-7B,45.38,68.16,19.62,34.29,18.97,26.53,18.91,28.14,18.71,21.80,18.85,26.50,20.94,93.33,100.00,52.31,48.24,51.54,48.24,51.69,48.14,48.79,48.18,51.45,48.81
+CodeV-DS-6.7B,33.59,67.35,15.00,38.78,15.00,37.14,15.10,35.56,14.46,35.13,14.85,35.88,21.26,95.51,100.00,47.05,50.59,47.05,50.59,47.37,50.47,44.35,50.54,46.52,50.36
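
The two header rows above form a (metric, benchmark) hierarchy, and -1 appears to be a sentinel for unavailable scores (filter_RTLRepo in utils.py drops negative scores accordingly). A minimal sketch of reading this layout with pandas (the path and column pick are illustrative):

import pandas as pd

# header=[0, 1] turns the two header rows into a (metric, benchmark) MultiIndex;
# index_col=0 keeps the model names as the row index.
df = pd.read_csv("results/results.csv", header=[0, 1], index_col=0)

fnc_rtllm = df[("Functionality (FNC)", "RTLLM")]
print(fnc_rtllm.sort_values(ascending=False).head())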
results/results.json CHANGED
The diff for this file is too large to render. See raw diff.
results/v1/aggregated_scores.csv ADDED
@@ -0,0 +1,22 @@
+Model,Agg S2R,Agg MC,Agg VerilogEval S2R,Agg VerilogEval MC,Agg RTLLM,Agg VeriGen
+DeepSeek R1,74.84,75.51,77.01,77.81,68.06,54.4
+Llama 3.1 405B,49.72,42.8,53.98,42.92,36.43,41.67
+Llama 3.(1-3) 70B,39.0,38.49,38.64,37.45,40.12,48.05
+Qwen2.5 72B,49.23,48.82,49.17,51.22,49.45,26.75
+Qwen2.5 32B,50.58,40.73,50.53,41.85,50.71,30.46
+StarChat2 15B v0.1,39.04,38.9,37.45,37.69,44.0,49.99
+DeepSeek R1 Distill Qwen 14B,22.98,23.61,23.21,23.47,22.27,24.91
+CodeLlama 70B,31.46,31.29,34.17,29.8,22.99,44.96
+QwenCoder 2.5 32B,42.53,43.71,42.27,43.96,43.33,41.4
+DeepSeek Coder 33B,25.71,36.47,19.49,37.25,45.11,29.29
+QwenCoder 2.5 14B,36.75,38.49,35.61,39.03,40.33,33.55
+OpenCoder 8B,31.13,34.76,27.12,34.55,43.63,36.67
+QwenCoder 2.5 7B,13.86,32.31,6.31,31.75,37.41,37.47
+"DeepSeek Coder 6,7B",31.6,30.03,28.69,30.41,40.67,26.61
+RTLCoder Mistral,21.86,27.2,22.73,26.21,19.15,36.3
+RTLCoder DeepSeek,32.21,37.6,31.75,37.47,33.64,38.81
+OriGen,37.22,41.29,46.0,41.97,9.82,35.07
+HaVen-CodeQwen,41.66,46.09,42.97,46.57,37.55,41.74
+CodeV-CL-7B,28.19,35.7,25.75,35.39,35.79,38.53
+CodeV-QW-7B,20.79,47.26,18.73,50.28,27.23,19.55
+CodeV-DS-6.7B,18.19,44.1,14.28,47.05,30.39,17.03
results/v1/results.csv ADDED
@@ -0,0 +1,23 @@
+,Syntax (STX),Syntax (STX),Functionality (FNC),Functionality (FNC),Synthesis (SYN),Synthesis (SYN),Power,Power,Performance,Performance,Area,Area,EM,Syntax (STX),Syntax (STX),Functionality (FNC),Functionality (FNC),Synthesis (SYN),Synthesis (SYN),Power,Power,Performance,Performance,Area,Area
+,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,VerilogEval S2R,RTLLM,RTL-Repo,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen,VerilogEval MC,VeriGen
+DeepSeek R1,"96,54","91,43","79,74","67,76","78,97","63,27","38,94","35,64","37,82","31,95","38,76","34,5","33,02","97,95","94,12","80,26",60,"79,62","54,12","39,35","26,62","38,16","26,97","39,21","28,01"
+Llama 3.1 405B,"89,1","65,71","57,05","37,55","56,67","35,92","27,18","19,7",27,"16,04","26,79","18,91","33,29","91,41","72,94","44,74","45,88","44,1","44,71","21,98","19,24","20,74","22,19","21,66","21,08"
+Llama 3.(1-3) 70B,"67,69","73,88",40,"43,27","39,87","39,18","19,76","20,58","18,69","19,32","19,51","20,28","28,62","70,51","78,82","38,59","48,24","37,95","48,24","18,97","23,22","18,35","24,15","18,86","24,71"
+Qwen2.5 72B,"81,15","82,04","51,15","47,35","50,38","46,53","25,4","24,83","23,83","23,88","24,52","25,46","37,19","81,67","70,59","53,08","27,06","52,56","27,06","26,08","12,74","24,92","13,5","25,83","13,89"
+Qwen2.5 32B,"89,36","86,94","52,95","50,61","51,67","46,94","26,02","25,9","24,46","23,87","25,32","26,29","28,67","93,46","67,06","43,08","32,94","42,31","30,59","21,14",15,"20,48","15,38","21,15","15,31"
+StarChat2 15B v0.1,"86,54","85,71","38,72","42,45","38,59","42,45","19,18","22,44","17,99","21,03",19,"22,53","13,24","81,54","92,94","39,36","50,59","38,59","50,59","19,21","24,02","18,28","25,23","19,05","25,73"
+DeepSeek R1 Distill Qwen 14B,"41,28","40,82","23,85","20,82","23,72","20,41","11,81","12,46","11,25","10,24","11,76","10,7","20,65","42,18","62,35","24,36","25,88","24,1","25,88","11,96","11,54","11,38","12,93","11,86","12,9"
+CodeLlama 70B,"72,05","41,63","35,51","23,27","35,38","22,86","17,32","11,92","16,74","10,85","17,2","11,71","24,58","89,36","89,41","30,9","45,88","30,9","45,88","15,3","21,74","14,19","22,88","15,21","22,82"
+QwenCoder 2.5 32B,"87,69","79,59","45,64","43,27","43,33","42,04","21,51","22,02","20,72","20,95","21,17","22,03","30,44","84,87","72,94","45,51","41,18","44,87","41,18","22,26","20,56","21,48","20,67","22,2","20,87"
+DeepSeek Coder 33B,"57,82","83,67","19,87","43,67","19,87","42,86","9,94","23,28","9,83","21,19","9,47","23,2","30,58","78,72","83,53","39,49","29,41","38,33","29,41","18,92","14,52","18,2","14,74","18,76","14,67"
+QwenCoder 2.5 14B,"79,74","78,37","37,82","41,63","37,05","40,41","18,03","20,14","17,6","20,1","17,78","20,25","37,16","79,36","67,06","40,26","34,12","39,49","34,12","19,74","16,5","19,07","17,07","19,73","16,75"
+OpenCoder 8B,"75,77","75,1","28,59","46,53","28,21","42,86","13,81","22,24","13,16","21,47","13,71","21,73","16,63","79,87","92,94","36,03","43,53","35,51","37,65","17,57","17,19","16,74","18,76","17,52","19,06"
+QwenCoder 2.5 7B,"19,62","77,96","6,41","37,96","6,41","35,51","3,12","19,26","3,18","17,98","3,16","18,87","28,45","75,9","71,76","32,44","37,65","32,44","37,65","16,2","18,38","15,26","18,91","16,16","18,92"
+"DeepSeek Coder 6,7B","80,12","78,37","29,87","40,41","29,36","37,96","14,71","20,72","13,69","19,25","14,64","21,03","24,57","68,85","81,18","32,82","27,06","31,15","27,06","15,53","12,94","14,62","13,39","15,46","13,58"
+RTLCoder Mistral,"52,05","38,78","23,59","19,18","23,59","19,18","11,67","10,08","10,87","8,7","11,56","9,95","14,97","63,59","85,88","26,92","35,29","26,92","35,29","13,43","18,49","12,53","17,61","13,36","18,35"
+RTLCoder DeepSeek,"75,26","68,57","33,33","37,14","32,95","33,06","16,02","17,29","15,71","16,35","15,9","16,82","19,76","84,1","84,71","39,23","38,82","38,59","38,82","19,08","19,1","18,31","19,35","18,82","19,76"
+OriGen,"91,02","23,67","46,54","12,65","46,92","10,61","23,38","5,33","22,18","4,61","23,44","4,79","19,45","79,35","87,06","43,07","35,29","42,95","35,29","21,5","16,55","20,13","17,7","21,33","18,35"
+HaVen-CodeQwen,"90,26","82,45","45,9","40,41","44,36","38,37","21,77","19,1","21,23","18,31","21,46","18,92","25,38","93,33","97,65",50,"48,24","48,72","42,35","23,37","20,21","23,39","21,15","23,09","21,25"
+CodeV-CL-7B,"55,38","69,8","27,05","37,14","26,79","35,1","13,2","18,92","12,39","16,88","13,03","17,89","12,39","91,92","98,82","36,79","44,71","36,41","38,82","18,15","19,06","16,88","19,38","18,05","19,35"
+CodeV-QW-7B,"41,79","71,02","19,1","35,51","18,72","27,76","9,36","14,85","9,36","12,21","9,38","13,78","20,56","93,85","57,65","52,56","25,88","51,15",20,"25,64","9,39","24,22","9,99","25,56","9,94"
+CodeV-DS-6.7B,"30,77","62,45","14,87","33,88","14,62","30,61","7,3","15,49","6,9","14,75","7,22","15,35","21,06","95,13","58,82","48,85","23,53","48,33","17,65","24,02","8,26","22,82","8,81","23,73","8,47"
results/v1/results.json ADDED
The diff for this file is too large to render. See raw diff.
utils.py CHANGED
@@ -1,89 +1,216 @@
-import pandas as pd
+import sys
+
 import gradio as gr
-import plotly.graph_objects as go
-import plotly.express as px
 import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
 
-type_emoji = {
-    "RTL-Specific": "🔴",
-    "General": "🟢",
-    "Coding": "🔵",
-}
+type_emoji = {"RTL-Specific": "🔴", "General": "🟢", "Coding": "🔵"}
 
-def model_hyperlink(link, model_name):
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+def model_hyperlink(link, model_name, release):
+    if release == "V1":
+        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    else:
+        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a> <span style="font-variant: all-small-caps; font-weight: 600">new</span>'
 
 
 def handle_special_cases(benchmark, metric):
-    if metric ==
-        benchmark =
-    elif benchmark ==
-        metric =
+    if metric == "Exact Matching (EM)":
+        benchmark = "RTL-Repo"
+    elif benchmark == "RTL-Repo":
+        metric = "Exact Matching (EM)"
     return benchmark, metric
 
 
 def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
-    …
+    subset = subset.drop(subset[subset.Score < 0.0].index)
+    details = subset[
+        ["Model", "Model URL", "Model Type", "Params", "Release"]
+    ].drop_duplicates("Model")
+    filtered_df = subset[["Model", "Score"]].rename(
+        columns={"Score": "Exact Matching (EM)"}
+    )
+    filtered_df = pd.merge(filtered_df, details, on="Model", how="left")
+    filtered_df["Model"] = filtered_df.apply(
+        lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"]),
+        axis=1,
+    )
+    filtered_df["Type"] = filtered_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
+    filtered_df = filtered_df[["Type", "Model", "Params", "Exact Matching (EM)"]]
+    filtered_df = filtered_df.sort_values(
+        by="Exact Matching (EM)", ascending=False
+    ).reset_index(drop=True)
     return filtered_df
 
 
 def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
-    details = subset[
-    …
+    details = subset[
+        ["Model", "Model URL", "Model Type", "Params", "Release"]
+    ].drop_duplicates("Model")
+    pivot_df = subset.pivot_table(
+        index="Model", columns="Metric", values="Score", aggfunc="mean"
+    ).reset_index()
+
     if df_agg is not None and agg_column is not None and agg_column in df_agg.columns:
-        agg_data = df_agg[[
-        …
+        agg_data = df_agg[["Model", agg_column]].rename(
+            columns={agg_column: "Aggregated ⬆️"}
+        )
+        pivot_df = pd.merge(pivot_df, agg_data, on="Model", how="left")
+    else:  # fallback
+        pivot_df["Aggregated ⬆️"] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+
+    pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
+    pivot_df["Model"] = pivot_df.apply(
+        lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"]),
+        axis=1,
+    )
+    pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
+    pivot_df.rename(
+        columns={
+            "Syntax (STX)": "STX",
+            "Functionality (FNC)": "FNC",
+            "Synthesis (SYN)": "SYN",
+            "Performance": "Perf",
+        },
+        inplace=True,
+    )
+
+    columns_order = [
+        "Type",
+        "Model",
+        "Params",
+        "Aggregated ⬆️",
+        "STX",
+        "FNC",
+        "SYN",
+        "Power",
+        "Perf",
+        "Area",
+    ]
     pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
-    pivot_df = pivot_df.sort_values(by=
+    pivot_df = pivot_df.sort_values(by="Aggregated ⬆️", ascending=False).reset_index(
+        drop=True
+    )
     return pivot_df
 
 
-…
+def custom_agg_s2r(vals):
+    s2r_val = vals.iloc[0]
+    rtllm_val = vals.iloc[1]
+    w1 = 155
+    w2 = 47
+    result = (w1 * s2r_val + w2 * rtllm_val) / (w1 + w2)
+    return round(result, 2)
+
+
+def custom_agg_cc(vals):
+    veval_val = vals.iloc[0]
+    vgen_val = vals.iloc[1]
+    w1 = 155
+    w2 = 17
+    result = (w1 * veval_val + w2 * vgen_val) / (w1 + w2)
+    return round(result, 2)
+
+
+def filter_bench_all(
+    subset: pd.DataFrame, df_agg=None, agg_column=None
+) -> pd.DataFrame:
+    details = subset[
+        ["Model", "Model URL", "Model Type", "Params", "Release"]
+    ].drop_duplicates("Model")
+    if "RTLLM" in subset["Benchmark"].unique():
+        pivot_df = (
+            subset.pivot_table(
+                index="Model", columns="Metric", values="Score", aggfunc=custom_agg_s2r
+            )
+            .reset_index()
+            .round(2)
+        )
+    else:
+        pivot_df = (
+            subset.pivot_table(
+                index="Model", columns="Metric", values="Score", aggfunc=custom_agg_cc
+            )
+            .reset_index()
+            .round(2)
+        )
+
     if df_agg is not None:
         if agg_column is not None and agg_column in df_agg.columns:
-            agg_data = df_agg[[
+            agg_data = df_agg[["Model", agg_column]].rename(
+                columns={agg_column: "Aggregated ⬆️"}
+            )
+            pivot_df = pd.merge(pivot_df, agg_data, on="Model", how="left")
         else:
-            agg_columns = [col for col in df_agg.columns if col.startswith(
+            agg_columns = [col for col in df_agg.columns if col.startswith("Agg ")]
             if agg_columns:
-                df_agg[
-                agg_data = df_agg[[
-                …
+                df_agg["Average_Agg"] = df_agg[agg_columns].mean(axis=1)
+                agg_data = df_agg[["Model", "Average_Agg"]].rename(
+                    columns={"Average_Agg": "Aggregated ⬆️"}
+                )
+                pivot_df = pd.merge(pivot_df, agg_data, on="Model", how="left")
+            else:  # fallback
+                pivot_df["Aggregated ⬆️"] = pivot_df.mean(
+                    axis=1, numeric_only=True
+                ).round(2)
+    else:  # fallback
+        print("We do mean")
+        pivot_df["Aggregated ⬆️"] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+
+    pivot_df = pd.merge(pivot_df, details, on="Model", how="left")
+    pivot_df["Model"] = pivot_df.apply(
+        lambda row: model_hyperlink(row["Model URL"], row["Model"], row["Release"]),
+        axis=1,
+    )
+    pivot_df["Type"] = pivot_df["Model Type"].map(lambda x: type_emoji.get(x, ""))
+
+    pivot_df.rename(
+        columns={
+            "Exact Matching (EM)": "EM",
+            "Syntax (STX)": "Agg STX",
+            "Functionality (FNC)": "Agg FNC",
+            "Synthesis (SYN)": "Agg SYN",
+            "Power": "Agg Power",
+            "Performance": "Agg Perf",
+            "Area": "Agg Area",
+        },
+        inplace=True,
+    )
+
+    columns_order = [
+        "Type",
+        "Model",
+        "Params",
+        "Aggregated ⬆️",
+        "Agg STX",
+        "Agg FNC",
+        "Agg SYN",
+        "Agg Power",
+        "Agg Perf",
+        "Agg Area",
+    ]
     pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
-    pivot_df = pivot_df.sort_values(by=
+    pivot_df = pivot_df.sort_values(by="Aggregated ⬆️", ascending=False).reset_index(
+        drop=True
+    )
     return pivot_df
+
+
+def agg_S2R_metrics(verilog_eval_rtl, rtllm):
+    if not verilog_eval_rtl or not rtllm:
+        return None
+    w1 = 155
+    w2 = 47
+    result = (w1 * verilog_eval_rtl + w2 * rtllm) / (w1 + w2)
+    return round(result, 2)
+
+
+def agg_MC_metrics(verilog_eval_cc, verigen):
+    if not verilog_eval_cc or not verigen:
+        return None
+    w1 = 155
+    w2 = 17
+    result = (w1 * verilog_eval_cc + w2 * verigen) / (w1 + w2)
+    return round(result, 2)
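
The custom_agg_s2r / custom_agg_cc helpers (and their scalar twins agg_S2R_metrics / agg_MC_metrics) are fixed weighted means: Spec-to-RTL combines VerilogEval S2R and RTLLM with weights 155 and 47, Module Completion combines VerilogEval MC and VeriGen with weights 155 and 17. The weights presumably reflect the relative size of each benchmark, though the commit does not say so. A worked example using DeepSeek R1's Functionality (FNC) scores from results/results.csv above:

# Spec-to-RTL aggregation, as in custom_agg_s2r / agg_S2R_metrics.
w1, w2 = 155, 47
verilog_eval_s2r_fnc = 79.74  # DeepSeek R1, Functionality (FNC), VerilogEval S2R
rtllm_fnc = 65.71             # DeepSeek R1, Functionality (FNC), RTLLM

agg_fnc = round((w1 * verilog_eval_s2r_fnc + w2 * rtllm_fnc) / (w1 + w2), 2)
print(agg_fnc)  # 76.48

One design note: custom_agg_s2r appears to rely on vals.iloc[0] and vals.iloc[1] arriving from pivot_table in a fixed benchmark order; a variant that keys on the Benchmark column explicitly would be more robust to reordered input.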