FlameF0X committed
Commit 7cdda9f · verified · Parent: 420490d

Create tokenizer_i3.py

Files changed (1):
  tokenizer_i3.py (+76, −0)
tokenizer_i3.py ADDED
@@ -0,0 +1,76 @@
# tokenizer_i3.py
import os

from transformers import PreTrainedTokenizer

from i3_model import ChunkTokenizer


# ======================================================================
# HuggingFace Tokenizer Wrapper for ChunkTokenizer
# ======================================================================
class I3Tokenizer(PreTrainedTokenizer):
    """
    HuggingFace-compatible tokenizer for the i3 model, wrapping ChunkTokenizer.
    """

    vocab_files_names = {"vocab_file": "chunk_vocab_combined.json"}
    pretrained_vocab_files_map = {}
    max_model_input_sizes = {"i3": 512}

    def __init__(self, vocab_file=None, **kwargs):
        """
        Args:
            vocab_file: Path to chunk_vocab_combined.json
        """
        # Set up the underlying chunk tokenizer *before* calling
        # super().__init__(), which may query vocab_size / get_vocab()
        # during initialization.
        self.chunk_tokenizer = ChunkTokenizer()
        if vocab_file:
            self.chunk_tokenizer.load(vocab_file)
        self.vocab_file = vocab_file
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        return self.chunk_tokenizer.vocab_size

    def get_vocab(self):
        # Required by PreTrainedTokenizer: mapping of token string -> id.
        return dict(self.chunk_tokenizer.chunk_to_idx)

    def _tokenize(self, text, **kwargs):
        """
        Convert a text string to a list of token strings (chunks).
        """
        # Encode to indices, then map each index back to its chunk string.
        indices = self.chunk_tokenizer.encode(text)
        return [self.chunk_tokenizer.idx_to_chunk[i] for i in indices]

    def _convert_token_to_id(self, token):
        """
        Convert a chunk string to its integer ID.
        """
        return self.chunk_tokenizer.chunk_to_idx.get(token, self.chunk_tokenizer.unk_idx)

    def _convert_id_to_token(self, index):
        """
        Convert an integer ID back to its chunk string.
        """
        return self.chunk_tokenizer.idx_to_chunk.get(int(index), self.chunk_tokenizer.unk_token)

    def encode(self, text, **kwargs):
        """
        Convert a text string to a list of indices.
        """
        return self.chunk_tokenizer.encode(text)

    def decode(self, token_ids, **kwargs):
        """
        Convert a list of indices back to a text string.
        """
        return self.chunk_tokenizer.decode(token_ids)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary to a directory.
        """
        os.makedirs(save_directory, exist_ok=True)
        filename = "chunk_vocab_combined.json"
        if filename_prefix:
            filename = f"{filename_prefix}-{filename}"
        save_path = os.path.join(save_directory, filename)
        self.chunk_tokenizer.save(save_path)
        return (save_path,)
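
A minimal usage sketch (not part of the commit): it assumes a vocab file produced by ChunkTokenizer.save() exists at the path shown, and that ChunkTokenizer.encode/decode behave as defined in i3_model. The sample text and output directory are placeholders.

    from tokenizer_i3 import I3Tokenizer

    # Load the chunk vocabulary built during training.
    tokenizer = I3Tokenizer(vocab_file="chunk_vocab_combined.json")

    text = "hello world"
    ids = tokenizer.encode(text)   # delegates to ChunkTokenizer.encode
    print(ids)
    print(tokenizer.decode(ids))   # round-trip fidelity depends on ChunkTokenizer

    # Persist the vocabulary alongside other model files.
    paths = tokenizer.save_vocabulary("./i3-tokenizer")
    print(paths)                   # ('./i3-tokenizer/chunk_vocab_combined.json',)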