# Tokenizer-comparison utilities for a Hugging Face Space.
import os | |
import re | |
import tiktoken | |
from transformers import AutoTokenizer | |
from mappings import MODEL_MAP, TOKENIZER_INFO | |
def get_token_type(token_text):
    """Classify a decoded token string into a coarse display category.

    Args:
        token_text: The decoded text of a single token.

    Returns:
        One of "special", "whitespace", "word", "number", "punctuation",
        or "mixed" (the fallback, including for the empty string).
    """
    # Check special markers first: a marker made entirely of non-word
    # characters (e.g. "<>", "</>") would otherwise be captured by the
    # punctuation branch below and the special branch could never fire.
    if token_text.startswith("<") and token_text.endswith(">"):
        return "special"
    if re.fullmatch(r"\s+", token_text):
        return "whitespace"
    if re.fullmatch(r"[a-zA-Z]+", token_text):
        return "word"
    if re.fullmatch(r"\d+", token_text):
        return "number"
    if re.fullmatch(r"[^\w\s]+", token_text):
        return "punctuation"
    return "mixed"
def is_subword(token_text, model, is_first):
    """Return True when the token continues the previous word.

    SentencePiece-style models mark word starts with "▁", BERT WordPiece
    marks continuations with "##", and BPE models mark word starts with a
    leading space. Except for BERT, the first token of a sequence is never
    treated as a continuation.
    """
    if model == "bert":
        return token_text.startswith("##")
    if is_first:
        return False
    if model in ("llama-2", "llama-3", "qwen3"):
        return not token_text.startswith("▁")
    # BPE fallback: a non-empty token that does not open with a space
    # continues the current word.
    return bool(token_text) and not token_text.startswith(" ")
def tokenize_with_tiktoken(text, model):
    """Tokenize *text* with a tiktoken encoding and describe each token.

    Args:
        text: Input string to tokenize.
        model: "gpt-4" selects cl100k_base; any other key falls back to gpt2.

    Returns:
        Dict with the model's display metadata (from TOKENIZER_INFO), a
        per-token detail list, and the characters-per-token compression
        ratio (0 when there are no tokens).
    """
    encoding_name = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding_name)
    token_ids = enc.encode(text)

    token_data = []
    for i, token_id in enumerate(token_ids):
        # NOTE(review): decoding ids one at a time can yield U+FFFD for
        # tokens that split a multi-byte character, which skews the "bytes"
        # count; enc.decode_single_token_bytes would be exact — confirm intent.
        token_text = enc.decode([token_id])
        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": get_token_type(token_text),
                "is_subword": is_subword(token_text, model, i == 0),
                "bytes": len(token_text.encode("utf-8")),
                "position": i,
            }
        )

    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(token_ids),
        "tokens": token_data,
        "compression_ratio": len(text) / len(token_ids) if token_ids else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }
def tokenize_with_hf(text, model):
    """Tokenize *text* with a Hugging Face tokenizer and describe each token.

    Args:
        text: Input string to tokenize.
        model: Key into MODEL_MAP / TOKENIZER_INFO.

    Returns:
        Dict with the model's display metadata and per-token details; on
        failure (missing HF_TOKEN, gated/missing repo, auth error) the dict
        carries an "error" message and zeroed token fields instead.
    """
    # Resolve the repo id outside the try block: the except handler below
    # interpolates model_name into its messages, which would itself raise
    # NameError if the lookup had been the failing statement inside the try.
    model_name = MODEL_MAP.get(model, "gpt2")
    try:
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            # Same payload shape as the except-path return so callers can
            # rely on every key being present.
            return {
                "model": TOKENIZER_INFO[model]["name"],
                "token_count": 0,
                "tokens": [],
                "compression_ratio": 0,
                "encoding": "Error",
                "vocab_size": 0,
                "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
            }

        tokenizer = AutoTokenizer.from_pretrained(
            model_name, token=hf_token, trust_remote_code=True
        )
        tokens = tokenizer.encode(text)

        token_data = []
        for i, token_id in enumerate(tokens):
            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": get_token_type(token_text),
                    "is_subword": is_subword(token_text, model, i == 0),
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )

        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(tokens),
            "tokens": token_data,
            "compression_ratio": len(text) / len(tokens) if tokens else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        error_msg = str(e)
        # Map common failure modes to actionable guidance.
        if "gated repo" in error_msg.lower():
            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
        elif "401" in error_msg:
            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
        elif "not found" in error_msg.lower():
            error_msg = (
                f"Model {model_name} not found. It may have been moved or renamed."
            )
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": 0,
            "tokens": [],
            "compression_ratio": 0,
            "encoding": "Error",
            "vocab_size": 0,
            "error": error_msg,
        }