Gül Sena Altıntaş committed
Commit 44cdae3 · Parent(s): 199862a
Improvements
app.py
CHANGED
@@ -576,7 +576,13 @@ with gr.Blocks(
                 "tokenmonster",
                 "byt5",
             ],
-            value=[
+            value=[
+                "gpt-4",
+                "llama-3",
+                "gemma-2",
+                "qwen2.5",
+                "tokenmonster",
+            ],
             label="Select tokenizers to compare",
         )
         show_details = gr.Checkbox(
@@ -679,7 +685,7 @@ with gr.Blocks(
         )
 
         # Combine or show separately
-        combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
+        combined_html = f"<h3>Normalized ({norm_method}) Text: {normalized_text} </h3>{norm_html}\n<h2>Original</h2>{orig_html}"
 
         return (
             orig_eff,
utils.py
CHANGED
@@ -110,9 +110,9 @@ def tokenize_with_tiktoken(text, model):
 
     return {
         "model": TOKENIZER_INFO[model]["name"],
-        "token_count": len(
+        "token_count": len(token_data),
         "tokens": token_data,
-        "compression_ratio": len(text) / len(
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
         "encoding": TOKENIZER_INFO[model]["encoding"],
         "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
     }
@@ -187,9 +187,9 @@ def tokenize_with_hf(text, model):
 
     return {
         "model": TOKENIZER_INFO[model]["name"],
-        "token_count": len(
+        "token_count": len(token_data),
         "tokens": token_data,
-        "compression_ratio": len(text) / len(
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
         "encoding": TOKENIZER_INFO[model]["encoding"],
         "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
     }
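The identical edit in tokenize_with_tiktoken and tokenize_with_hf ties the returned stats to token_data: token_count is taken from len(token_data), and compression_ratio falls back to 0 when token_data is empty, so an empty input can no longer raise ZeroDivisionError. A minimal sketch of the pattern; the helper name and parameter list are invented for illustration, since in the Space this dict is built inline inside each tokenizer function:

def summarize_tokenization(text, token_data, model, TOKENIZER_INFO):
    # Hypothetical helper mirroring the fixed return block.
    return {
        "model": TOKENIZER_INFO[model]["name"],
        # Token count comes from the materialized token list.
        "token_count": len(token_data),
        "tokens": token_data,
        # Guarded ratio: an empty token_data yields 0 instead of dividing by zero.
        "compression_ratio": len(text) / len(token_data) if token_data else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }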