# tokenizer_i3.py
import os
import json

from transformers import PreTrainedTokenizer

from i3_model import ChunkTokenizer


# ======================================================================
# HuggingFace Tokenizer Wrapper for ChunkTokenizer
# ======================================================================

class I3Tokenizer(PreTrainedTokenizer):
    """
    HuggingFace-compatible tokenizer for the i3 model, wrapping ChunkTokenizer.
    """

    vocab_files_names = {"vocab_file": "chunk_vocab_combined.json"}
    pretrained_vocab_files_map = {}
    max_model_input_sizes = {"i3": 512}

    def __init__(self, vocab_file=None, **kwargs):
        """
        Args:
            vocab_file: Path to chunk_vocab_combined.json
        """
        # Build the underlying ChunkTokenizer *before* calling the parent
        # constructor: PreTrainedTokenizer.__init__ may query the vocab
        # (e.g. via get_vocab / _convert_token_to_id) while registering
        # special tokens.
        self.chunk_tokenizer = ChunkTokenizer()
        if vocab_file:
            self.chunk_tokenizer.load(vocab_file)
        self.vocab_file = vocab_file
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        return self.chunk_tokenizer.vocab_size

    def get_vocab(self):
        """
        Return the full chunk -> id mapping, including any tokens added
        through the HuggingFace API.
        """
        vocab = dict(self.chunk_tokenizer.chunk_to_idx)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        """
        Convert a text string to a list of token strings (chunks).
        """
        # Encode to indices, then map the indices back to chunk strings.
        indices = self.chunk_tokenizer.encode(text)
        return [self.chunk_tokenizer.idx_to_chunk[i] for i in indices]

    def _convert_token_to_id(self, token):
        """
        Convert a chunk string to its integer ID (UNK id for unknown chunks).
        """
        return self.chunk_tokenizer.chunk_to_idx.get(token, self.chunk_tokenizer.unk_idx)

    def _convert_id_to_token(self, index):
        """
        Convert an integer ID back to its chunk string (UNK token for unknown ids).
        """
        return self.chunk_tokenizer.idx_to_chunk.get(int(index), self.chunk_tokenizer.unk_token)

    def encode(self, text, **kwargs):
        """
        Convert a text string to a list of indices.

        Delegates directly to ChunkTokenizer and bypasses the standard
        HuggingFace special-token handling.
        """
        return self.chunk_tokenizer.encode(text)

    def decode(self, token_ids, **kwargs):
        """
        Convert a list of indices back to a text string.
        """
        return self.chunk_tokenizer.decode(token_ids)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary file to a directory.

        Accepts the optional filename_prefix that save_pretrained() passes
        in recent transformers versions.
        """
        os.makedirs(save_directory, exist_ok=True)
        filename = "chunk_vocab_combined.json"
        if filename_prefix:
            filename = f"{filename_prefix}-{filename}"
        save_path = os.path.join(save_directory, filename)
        self.chunk_tokenizer.save(save_path)
        return (save_path,)
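

# ======================================================================
# Example usage (illustrative sketch, not part of the library API).
# Assumes a trained vocabulary file "chunk_vocab_combined.json" exists on
# disk and that ChunkTokenizer exposes encode/decode/load/save as used
# above; the paths below are placeholders.
# ======================================================================
if __name__ == "__main__":
    # Load the wrapper from a previously trained chunk vocabulary.
    tokenizer = I3Tokenizer(vocab_file="chunk_vocab_combined.json")

    # Round-trip a sample string through encode/decode.
    ids = tokenizer.encode("hello world")
    print("ids:", ids)
    print("decoded:", tokenizer.decode(ids))

    # Persist the vocabulary alongside other tokenizer files.
    tokenizer.save_vocabulary("./i3_tokenizer")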