Gül Sena Altıntaş committed
Commit 199862a · 1 Parent(s): d9779a0

Small improvement for visualization

Files changed (2):
  1. app.py (+13 -1)
  2. utils.py (+27 -17)
app.py CHANGED
@@ -81,14 +81,23 @@ def generate_interactive_tokenization(results):
 
     # Add styles first
     html_parts.append("""
-    <div id="tokenizer-container">
+    <div id="tokenizer-container" class="tokenizer-container">
     <style>
+    .tokenizer-container {
+        display: flex;
+        flex-wrap: wrap;
+        justify-content: space-between;
+        gap: 20px;
+    }
     .tokenizer-section {
         margin-bottom: 20px;
         border: 1px solid #e0e0e0;
         border-radius: 8px;
         padding: 15px;
         background: white;
+        flex-wrap: wrap;
+        display: inline-block;
+        justify-content: space-between;
     }
     .tokenizer-header {
         font-weight: bold;
@@ -157,6 +166,9 @@ def generate_interactive_tokenization(results):
         font-size: 12px;
         display: none;
         z-index: 1000;
+        flex-wrap: wrap;
+        display: inline-block;
+        justify-content: space-between;
     }
     </style>
 
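For context, the new .tokenizer-container rules turn the wrapper div into a wrapping flex row, so each .tokenizer-section card can sit next to its neighbours instead of stacking vertically. The sketch below is a minimal, standalone illustration of that layout, not the Space's actual generate_interactive_tokenization; the placeholder section names and the output file are invented for demonstration.

# layout_sketch.py - hypothetical standalone demo of the flex layout added in this commit
html_parts = []
html_parts.append("""
<div id="tokenizer-container" class="tokenizer-container">
<style>
.tokenizer-container { display: flex; flex-wrap: wrap; justify-content: space-between; gap: 20px; }
.tokenizer-section { border: 1px solid #e0e0e0; border-radius: 8px; padding: 15px; background: white; }
</style>
""")

# Two placeholder cards stand in for the per-tokenizer result sections.
for name in ["Tokenizer A", "Tokenizer B"]:
    html_parts.append(f'<div class="tokenizer-section">{name}</div>')

html_parts.append("</div>")

# Join and write the page so the wrapping behaviour can be inspected in a browser.
with open("layout_demo.html", "w", encoding="utf-8") as f:
    f.write("".join(html_parts))

With display: flex and flex-wrap: wrap on the container, the sections flow onto a new row automatically when the page narrows, which appears to be the visualization improvement the commit message refers to.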
utils.py CHANGED
@@ -8,6 +8,8 @@ from transformers import AutoTokenizer
 
 from mappings import MODEL_MAP, TOKENIZER_INFO
 
+TOKENIZER_CACHE = {}
+
 
 class TokenMonsterTokenizer:
     def __init__(self, name):
@@ -116,25 +118,33 @@ def tokenize_with_tiktoken(text, model):
     }
 
 
+def get_hf_tokenizer(model):
+    model_name = MODEL_MAP.get(model, "gpt2")
+    if model_name in TOKENIZER_CACHE:
+        return TOKENIZER_CACHE[model_name]
+    # Get token from environment
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": 0,
+            "tokens": [],
+            "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
+        }
+
+    if "tokenmonster" in model_name:
+        tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name, token=hf_token, trust_remote_code=True
+        )
+    TOKENIZER_CACHE[model_name] = tokenizer
+    return tokenizer
+
+
 def tokenize_with_hf(text, model):
     try:
-        model_name = MODEL_MAP.get(model, "gpt2")
-        # Get token from environment
-        hf_token = os.getenv("HF_TOKEN")
-        if not hf_token:
-            return {
-                "model": TOKENIZER_INFO[model]["name"],
-                "token_count": 0,
-                "tokens": [],
-                "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
-            }
-
-        if "tokenmonster" in model_name:
-            tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_name, token=hf_token, trust_remote_code=True
-            )
+        tokenizer = get_hf_tokenizer(model)
         token_data = []
         for text_ in text.split("\n"):
             text_ = text_ + "\n"
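The utils.py change memoizes tokenizer construction: get_hf_tokenizer builds each tokenizer once, stores it in the module-level TOKENIZER_CACHE dict keyed by model name, and returns the cached object on later calls, so repeated tokenize_with_hf requests no longer reload the tokenizer. The snippet below is a stripped-down sketch of the same pattern, assuming the transformers library and an optional HF_TOKEN environment variable; the file and function names are illustrative, not the Space's actual module.

# cache_sketch.py - hypothetical, minimal version of the caching pattern above
import os

from transformers import AutoTokenizer

_TOKENIZER_CACHE = {}


def get_cached_tokenizer(model_name):
    """Build a tokenizer once per model name and reuse it on subsequent calls."""
    if model_name in _TOKENIZER_CACHE:
        return _TOKENIZER_CACHE[model_name]
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        token=os.getenv("HF_TOKEN"),  # only needed for gated models; None is fine otherwise
        trust_remote_code=True,
    )
    _TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer


if __name__ == "__main__":
    tok = get_cached_tokenizer("gpt2")    # first call loads the tokenizer
    same = get_cached_tokenizer("gpt2")   # second call is a plain dict lookup
    assert tok is same
    print(tok.tokenize("Small improvement for visualization"))

A functools.lru_cache over the model name would give a similar effect; the commit uses an explicit dict, which also keeps the TokenMonster special case and the missing-token error path inside one helper.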