Gül Sena Altıntaş committed
Commit 44cdae3 · 1 Parent(s): 199862a

Improvements

Files changed (2)
  1. app.py +8 -2
  2. utils.py +4 -4
app.py CHANGED
@@ -576,7 +576,13 @@ with gr.Blocks(
             "tokenmonster",
             "byt5",
         ],
-        value=["gpt-4", "llama-3", "gpt-2"],
+        value=[
+            "gpt-4",
+            "llama-3",
+            "gemma-2",
+            "qwen2.5",
+            "tokenmonster",
+        ],
         label="Select tokenizers to compare",
     )
     show_details = gr.Checkbox(
@@ -679,7 +685,7 @@ with gr.Blocks(
         )
 
         # Combine or show separately
-        combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
+        combined_html = f"<h3>Normalized ({norm_method}) Text: {normalized_text} </h3>{norm_html}\n<h2>Original</h2>{orig_html}"
 
         return (
             orig_eff,
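
The app.py change widens the default comparison set from three tokenizers to five and adds the normalization method to the comparison heading. Below is a minimal sketch of the default-selection part, assuming the component is a gr.CheckboxGroup inside the gr.Blocks context; the hunk does not show the component's constructor, so the name tokenizer_selector and the full choices list are illustrative.

import gradio as gr

# Illustrative choices list; app.py's real list also includes "tokenmonster" and "byt5".
TOKENIZER_CHOICES = [
    "gpt-4",
    "gpt-2",
    "llama-3",
    "gemma-2",
    "qwen2.5",
    "tokenmonster",
    "byt5",
]

with gr.Blocks() as demo:
    # Assumed to be a CheckboxGroup; the diff only shows its value and label arguments.
    tokenizer_selector = gr.CheckboxGroup(
        choices=TOKENIZER_CHOICES,
        # Defaults after this commit: five tokenizers instead of the previous three.
        value=["gpt-4", "llama-3", "gemma-2", "qwen2.5", "tokenmonster"],
        label="Select tokenizers to compare",
    )
    show_details = gr.Checkbox(label="Show details", value=False)

if __name__ == "__main__":
    demo.launch()
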
utils.py CHANGED
@@ -110,9 +110,9 @@ def tokenize_with_tiktoken(text, model):
 
     return {
         "model": TOKENIZER_INFO[model]["name"],
-        "token_count": len(tokens),
+        "token_count": len(token_data),
         "tokens": token_data,
-        "compression_ratio": len(text) / len(tokens) if tokens else 0,
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
         "encoding": TOKENIZER_INFO[model]["encoding"],
         "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
     }
@@ -187,9 +187,9 @@ def tokenize_with_hf(text, model):
 
     return {
         "model": TOKENIZER_INFO[model]["name"],
-        "token_count": len(token_ids),
+        "token_count": len(token_data),
         "tokens": token_data,
-        "compression_ratio": len(text) / len(token_ids) if token_ids else 0,
+        "compression_ratio": len(text) / len(token_data) if token_data else 0,
         "encoding": TOKENIZER_INFO[model]["encoding"],
         "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
     }
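
In utils.py, both token_count and compression_ratio are now computed from token_data, the per-token list that is actually returned, rather than from the raw tokens / token_ids, so the reported count and ratio always agree with the tokens shown. A minimal sketch of the pattern for the tiktoken path follows, assuming token_data holds exactly one entry per encoded token; the helper name and the fields in each entry are illustrative, not the full utils.py implementation.

import tiktoken

def tokenize_with_tiktoken_sketch(text: str, encoding_name: str = "cl100k_base") -> dict:
    enc = tiktoken.get_encoding(encoding_name)
    token_ids = enc.encode(text)
    # One display record per token; the real token_data in utils.py may carry more fields.
    token_data = [{"id": tid, "text": enc.decode([tid])} for tid in token_ids]
    return {
        # Count and ratio both derive from token_data, matching the tokens actually returned.
        "token_count": len(token_data),
        "tokens": token_data,
        "compression_ratio": len(text) / len(token_data) if token_data else 0,
    }
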