import gradio as gr
from transformers import RobertaTokenizer
import pandas as pd
import json

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
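# NOTE: RoBERTa uses byte-level BPE, so tokens that start a new word
# carry a leading "Ġ" marker (e.g. "Hello world" -> ["Hello", "Ġworld"]).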
def process_text(text, include_special_tokens=False, show_attention_mask=False):
    encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)
    # encode() adds the special tokens <s> and </s>; derive the token strings
    # from the IDs so both lists stay aligned whether or not the special
    # tokens are kept. Truncate to match the encoding above.
    token_ids = tokenizer.encode(text, truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    if not include_special_tokens:
        tokens = tokens[1:-1]
        token_ids = token_ids[1:-1]
    token_info = []
    for position, (token, token_id) in enumerate(zip(tokens, token_ids)):
        info = {
            "Token": token,
            "ID": token_id,
        }
        if show_attention_mask:
            # The encoding always contains the special tokens, so shift the
            # index by one when they were stripped above.
            mask_index = position if include_special_tokens else position + 1
            info["Attention Mask"] = int(encoding["attention_mask"][0][mask_index])
        token_info.append(info)
    df = pd.DataFrame(token_info)

    # max(..., 1) guards the ratio against empty input.
    stats = f"""
Number of tokens: {len(tokens)}
Input text length: {len(text)}
Tokens/character ratio: {len(tokens) / max(len(text), 1):.2f}
Vocabulary size: {tokenizer.vocab_size}
"""
    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
    )
    return df, stats, json_output
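# The three outputs below mirror the three return values of process_text:
# a token table, a plain-text statistics box, and the raw JSON encoding.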
iface = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(
            lines=5, placeholder="Enter text to tokenize...", label="Input Text"
        ),
        gr.Checkbox(label="Include Special Tokens", value=False),
        gr.Checkbox(label="Show Attention Mask", value=False),
    ],
    outputs=[
        gr.Dataframe(
            headers=["Token", "ID", "Attention Mask"], label="Tokenization Results"
        ),
        gr.Textbox(label="Statistics", lines=4),
        gr.JSON(label="JSON Output"),
    ],
    title="RoBERTa Tokenizer Playground",
    description="An interactive demonstration of the RoBERTa tokenizer.",
    theme="default",
)
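# share=True also opens a temporary public Gradio link alongside the local
# server; drop it to keep the demo local-only.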
if __name__ == "__main__":
    iface.launch(share=True)