Gül Sena Altıntaş
Refactoring, and visual improvements
c02e89e
raw
history blame
4.45 kB
import os
import re
import tiktoken
from transformers import AutoTokenizer
from mappings import MODEL_MAP, TOKENIZER_INFO
def get_token_type(token_text):
    """Classify a token's text into a coarse display category.

    The checks run in a fixed order and the first one that succeeds wins:
    whitespace-only, ASCII letters only, digits only, punctuation only
    (no word characters and no whitespace), then text wrapped in ``<`` and
    ``>``. Anything else, including the empty string, is ``"mixed"``.

    Args:
        token_text: The token text to classify.

    Returns:
        One of ``"whitespace"``, ``"word"``, ``"number"``,
        ``"punctuation"``, ``"special"`` or ``"mixed"``.
    """
    # Ordered (pattern, label) pairs; order matters because the first
    # matching pattern decides the label.
    ordered_patterns = (
        (r"^\s+$", "whitespace"),
        (r"^[a-zA-Z]+$", "word"),
        (r"^\d+$", "number"),
        (r"^[^\w\s]+$", "punctuation"),
    )
    for pattern, label in ordered_patterns:
        if re.match(pattern, token_text):
            return label

    # Angle-bracketed text is only reached when it is not pure punctuation.
    looks_bracketed = token_text.startswith("<") and token_text.endswith(">")
    if looks_bracketed:
        return "special"
    return "mixed"
def is_subword(token_text, model, is_first):
    """Heuristically decide whether a token continues the previous word.

    Args:
        token_text: Text of the token being classified.
        model: Model key selecting which word-boundary convention to apply.
        is_first: True when the token is the first one in the sequence.
            Outside the ``"bert"`` branch, the first token is never
            reported as a subword.

    Returns:
        True if the token looks like a continuation piece, False otherwise.
    """
    if model == "bert":
        # Continuation pieces are recognised purely by a leading "##",
        # regardless of position.
        return token_text.startswith("##")

    if is_first:
        return False

    if model in ("llama-2", "llama-3", "qwen3"):
        # A word start is signalled either by the "▁" marker or by a plain
        # leading space. The callers in this file pass text produced by
        # decode(), so checking "▁" alone would flag space-prefixed tokens
        # (i.e. word starts) as subwords.
        return not token_text.startswith(("▁", " "))

    # Remaining (BPE) models: a non-empty token without a leading space is
    # treated as a continuation of the previous token.
    return len(token_text) > 0 and not token_text.startswith(" ")
def tokenize_with_tiktoken(text, model):
    """Tokenize *text* with a tiktoken encoding and describe each token.

    Args:
        text: Input string to tokenize.
        model: Model key. ``"gpt-4"`` selects the ``"cl100k_base"``
            encoding; every other key uses ``"gpt2"``. The key is also used
            to look up display metadata in ``TOKENIZER_INFO``.

    Returns:
        A dict with the keys ``"model"``, ``"token_count"``, ``"tokens"``,
        ``"compression_ratio"`` (characters per token, 0 when there are no
        tokens), ``"encoding"`` and ``"vocab_size"``. ``"tokens"`` is a list
        of per-token dicts with ``"text"``, ``"id"``, ``"type"``,
        ``"is_subword"``, ``"bytes"`` and ``"position"``.

    Raises:
        Propagates the lookup error if *model* has no entry in
        ``TOKENIZER_INFO``, and any error raised by tiktoken itself.
    """
    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)

    token_data = []
    for i, token_id in enumerate(tokens):
        # Work from the token's raw bytes: a token may hold only part of a
        # multi-byte UTF-8 character, in which case the displayed text is
        # the replacement character and its encoded length would overstate
        # the token's real size.
        token_bytes = enc.decode_single_token_bytes(token_id)
        token_text = token_bytes.decode("utf-8", errors="replace")
        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": get_token_type(token_text),
                "is_subword": is_subword(token_text, model, i == 0),
                "bytes": len(token_bytes),
                "position": i,
            }
        )

    info = TOKENIZER_INFO[model]
    return {
        "model": info["name"],
        "token_count": len(tokens),
        "tokens": token_data,
        "compression_ratio": len(text) / len(tokens) if tokens else 0,
        "encoding": info["encoding"],
        "vocab_size": info["vocab_size"],
    }
def _hf_error_result(model, message):
    """Build the failure payload returned by ``tokenize_with_hf``."""
    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": 0,
        "tokens": [],
        "compression_ratio": 0,
        "encoding": "Error",
        "vocab_size": 0,
        "error": message,
    }


def tokenize_with_hf(text, model):
    """Tokenize *text* with a Hugging Face tokenizer and describe each token.

    The tokenizer repository is looked up in ``MODEL_MAP`` (falling back to
    ``"gpt2"``) and loaded with the token read from the ``HF_TOKEN``
    environment variable.

    Args:
        text: Input string to tokenize.
        model: Model key used for the ``MODEL_MAP`` lookup and for display
            metadata in ``TOKENIZER_INFO``.

    Returns:
        On success, a dict with ``"model"``, ``"token_count"``,
        ``"tokens"``, ``"compression_ratio"`` (characters per token, 0 when
        there are no tokens), ``"encoding"`` and ``"vocab_size"``. On
        failure (missing ``HF_TOKEN`` or any exception while loading or
        tokenizing), a dict with the same keys set to empty/zero values,
        ``"encoding"`` set to ``"Error"``, and an ``"error"`` message.
    """
    try:
        model_name = MODEL_MAP.get(model, "gpt2")

        # Get token from environment
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            # Same key set as every other failure, so callers can read
            # "compression_ratio", "encoding" and "vocab_size" safely.
            return _hf_error_result(
                model,
                "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
            )

        print(f"DEBUG: Loading model {model_name} with token")
        # NOTE(review): trust_remote_code=True allows code shipped in the
        # model repository to run locally; this is only appropriate while
        # MODEL_MAP lists trusted repositories - confirm.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, token=hf_token, trust_remote_code=True
        )

        tokens = tokenizer.encode(text)
        token_data = []
        for i, token_id in enumerate(tokens):
            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": get_token_type(token_text),
                    "is_subword": is_subword(token_text, model, i == 0),
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )

        info = TOKENIZER_INFO[model]
        return {
            "model": info["name"],
            "token_count": len(tokens),
            "tokens": token_data,
            "compression_ratio": len(text) / len(tokens) if tokens else 0,
            "encoding": info["encoding"],
            "vocab_size": info["vocab_size"],
        }
    except Exception as e:
        error_msg = str(e)

        # Provide helpful error messages
        if "gated repo" in error_msg.lower():
            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
        elif "401" in error_msg:
            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
        elif "not found" in error_msg.lower():
            error_msg = (
                f"Model {model_name} not found. It may have been moved or renamed."
            )

        return _hf_error_result(model, error_msg)