Gül Sena Altıntaş committed
Commit 199862a · 1 Parent(s): d9779a0
Small improvement for visualization
app.py CHANGED
@@ -81,14 +81,23 @@ def generate_interactive_tokenization(results):
 
     # Add styles first
     html_parts.append("""
-    <div id="tokenizer-container">
+    <div id="tokenizer-container" class="tokenizer-container">
     <style>
+    .tokenizer-container {
+        display: flex;
+        flex-wrap: wrap;
+        justify-content: space-between;
+        gap: 20px;
+    }
     .tokenizer-section {
         margin-bottom: 20px;
         border: 1px solid #e0e0e0;
         border-radius: 8px;
         padding: 15px;
         background: white;
+        flex-wrap: wrap;
+        display: inline-block;
+        justify-content: space-between;
     }
     .tokenizer-header {
         font-weight: bold;
@@ -157,6 +166,9 @@ def generate_interactive_tokenization(results):
         font-size: 12px;
         display: none;
         z-index: 1000;
+        flex-wrap: wrap;
+        display: inline-block;
+        justify-content: space-between;
     }
     </style>
 
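For context, a hedged sketch of how the new tokenizer-container flex wrapper is presumably consumed by generate_interactive_tokenization: the container now uses display: flex with flex-wrap: wrap and a gap, so the per-tokenizer sections sit side by side and wrap onto new rows instead of stacking vertically. The loop body and section markup below are assumptions for illustration, not part of this commit.

# Sketch only: the rest of generate_interactive_tokenization is not shown in this diff.
def generate_interactive_tokenization_sketch(results):
    html_parts = []
    # Flex container introduced by this commit; each tokenizer section becomes a flex item.
    html_parts.append("""
<div id="tokenizer-container" class="tokenizer-container">
<style>
.tokenizer-container { display: flex; flex-wrap: wrap; justify-content: space-between; gap: 20px; }
.tokenizer-section { border: 1px solid #e0e0e0; border-radius: 8px; padding: 15px; background: white; }
</style>
""")
    # Hypothetical: one .tokenizer-section per tokenizer result.
    for result in results:
        html_parts.append(
            '<div class="tokenizer-section">'
            f'<div class="tokenizer-header">{result["model"]}</div>'
            "</div>"
        )
    html_parts.append("</div>")
    return "".join(html_parts)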
utils.py CHANGED
@@ -8,6 +8,8 @@ from transformers import AutoTokenizer
 
 from mappings import MODEL_MAP, TOKENIZER_INFO
 
+TOKENIZER_CACHE = {}
+
 
 class TokenMonsterTokenizer:
     def __init__(self, name):
@@ -116,25 +118,33 @@ def tokenize_with_tiktoken(text, model):
     }
 
 
+def get_hf_tokenizer(model):
+    model_name = MODEL_MAP.get(model, "gpt2")
+    if model_name in TOKENIZER_CACHE:
+        return TOKENIZER_CACHE[model_name]
+    # Get token from environment
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": 0,
+            "tokens": [],
+            "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
+        }
+
+    if "tokenmonster" in model_name:
+        tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name, token=hf_token, trust_remote_code=True
+        )
+    TOKENIZER_CACHE[model_name] = tokenizer
+    return tokenizer
+
+
 def tokenize_with_hf(text, model):
     try:
-
-        # Get token from environment
-        hf_token = os.getenv("HF_TOKEN")
-        if not hf_token:
-            return {
-                "model": TOKENIZER_INFO[model]["name"],
-                "token_count": 0,
-                "tokens": [],
-                "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
-            }
-
-        if "tokenmonster" in model_name:
-            tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_name, token=hf_token, trust_remote_code=True
-            )
+        tokenizer = get_hf_tokenizer(model)
         token_data = []
         for text_ in text.split("\n"):
             text_ = text_ + "\n"
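The utils.py change adds a module-level tokenizer cache so repeated requests reuse an already loaded tokenizer instead of calling AutoTokenizer.from_pretrained again. A minimal, self-contained sketch of that caching pattern (the helper name and the "gpt2" example are illustrative, not part of the Space's code):

import os

from transformers import AutoTokenizer

TOKENIZER_CACHE = {}


def get_cached_tokenizer(model_name):
    # Reuse a tokenizer loaded earlier in this process, if any.
    if model_name in TOKENIZER_CACHE:
        return TOKENIZER_CACHE[model_name]
    # Otherwise load it once and remember it for the next call.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, token=os.getenv("HF_TOKEN"), trust_remote_code=True
    )
    TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer


# First call loads from the Hub; the second is a plain dictionary lookup.
tok_a = get_cached_tokenizer("gpt2")
tok_b = get_cached_tokenizer("gpt2")
assert tok_a is tok_b

Note that, as committed, get_hf_tokenizer returns the HF_TOKEN error dict itself rather than raising, so tokenize_with_hf receives a dict instead of a tokenizer object when the token is missing.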