from tokenizers import ByteLevelBPETokenizer

paths = ['train_code.txt', 'train_doc.txt']

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=32000, min_frequency=3, special_tokens=[
    "<pad>",
    "<s>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save files to disk
tokenizer.save_model("./salesforce", "codet5")

print(
    tokenizer.encode("<s> hello <unk> Don't you love 🤗 Transformers <mask> yes . </s>").tokens
)
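
# Optional follow-up (a sketch, not part of the original snippet): reload the trained
# tokenizer from the files written by save_model(). With the "codet5" prefix used above,
# the tokenizers library is expected to write "codet5-vocab.json" and "codet5-merges.txt"
# into ./salesforce; the exact file names are assumed from the library's default naming.
from tokenizers import ByteLevelBPETokenizer

reloaded = ByteLevelBPETokenizer(
    "./salesforce/codet5-vocab.json",
    "./salesforce/codet5-merges.txt",
)

# Quick check that the reloaded tokenizer produces subword tokens for code-like input
print(reloaded.encode("def add(a, b): return a + b").tokens)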