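"""SpeechT5 tokenizer for Japanese TTS that replaces the SentencePiece
backend of SpeechT5Tokenizer with pyopenjtalk grapheme-to-phoneme
conversion and a plain JSON phoneme vocabulary."""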

import json
import logging
import re
from itertools import chain
from pathlib import Path
from typing import List, Optional

from transformers import SpeechT5Tokenizer
from transformers.models.speecht5.tokenization_speecht5 import (
    PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES,
)

logger = logging.getLogger(__name__)

# Characters passed through as-is instead of being converted to phonemes:
# ASCII punctuation plus the corresponding full-width (zenkaku) forms.
NP_CHARACTERS = " !\"#$%&'()=~|`{+*}<>?_-^\\@[;:],./ !”#$%&’()=~|`{+*}<>?_ー^¥@「;:」、。・`"


def _g2p_with_np(text: str, np_list: str) -> List[str]:
    """Convert text to phonemes with pyopenjtalk, passing the characters
    in ``np_list`` through unchanged."""
    from pyopenjtalk import g2p

    np_pattern = re.compile(f"([{re.escape(np_list)}])")
    return list(
        chain.from_iterable(
            (segment,) if segment in np_list else g2p(segment, kana=False, join=False)
            for segment in np_pattern.split(text)
            if len(segment) > 0
        )
    )
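
# Illustrative example (assumes pyopenjtalk is installed; the exact phoneme
# output depends on the pyopenjtalk version and dictionary):
#   _g2p_with_np("こんにちは。", NP_CHARACTERS)
#   -> ["k", "o", "N", "n", "i", "ch", "i", "w", "a", "。"]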

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "esnya/japanese_speecht5_tts": "https://huggingface.co/esnya/japanese_speecht5_tts/resolve/main/vocab.json",
    },
}


class SpeechT5OpenjtalkTokenizer(SpeechT5Tokenizer):
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        unk_token: str = "<unk>",
        pad_token: str = "<pad>",
        non_phoneme_characters: str = NP_CHARACTERS,
        **kwargs,
    ):
        # The parent class expects a SentencePiece model file; passing
        # vocab_file=None makes that load fail, which is tolerated here
        # because this tokenizer uses pyopenjtalk plus a JSON vocabulary.
        try:
            super().__init__(
                vocab_file=None,
                bos_token=bos_token,
                eos_token=eos_token,
                unk_token=unk_token,
                pad_token=pad_token,
                **kwargs,
            )
        except TypeError:
            pass

        self.non_phoneme_characters = non_phoneme_characters
        self.vocab_file = vocab_file
        self._load_vocab()

    def _load_vocab(self):
        if isinstance(self.vocab_file, str) and self.vocab_file.endswith(".json"):
            with open(self.vocab_file, encoding="utf-8") as f:
                self.label2id = json.load(f)
            self.id2label = {v: k for k, v in self.label2id.items()}

    @property
    def bos_token_id(self) -> Optional[int]:
        return super().bos_token_id

    @property
    def vocab_size(self):
        return len(self.label2id)

    def get_vocab(self):
        return self.label2id

    def __getstate__(self):
        state = super().__getstate__()
        # The SentencePiece model is unused and may be absent; drop it safely.
        state.pop("sp_model", None)
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self._load_vocab()

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ):
        save_path = Path(save_directory)
        if not save_path.is_dir():
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        # Follow the transformers convention: an optional prefix is joined
        # to the standard vocabulary file name with a dash.
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        vocab_path = save_path / f"{prefix}{VOCAB_FILES_NAMES['vocab_file']}"
        with open(vocab_path, "w", encoding="utf-8") as f:
            json.dump(self.label2id, f, ensure_ascii=False, indent=2)
        return (str(vocab_path),)

    def _tokenize(self, text: str) -> List[str]:
        return _g2p_with_np(text, self.non_phoneme_characters)

    def _convert_token_to_id(self, token):
        return self.label2id.get(token, self.label2id.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return self.id2label.get(index, self.unk_token)
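

# Minimal usage sketch, not part of the original file: it assumes pyopenjtalk
# is installed and that the referenced checkpoint ships the vocab.json this
# tokenizer expects.
if __name__ == "__main__":
    tokenizer = SpeechT5OpenjtalkTokenizer.from_pretrained(
        "esnya/japanese_speecht5_tts"
    )
    encoded = tokenizer("こんにちは、世界。")
    print(encoded["input_ids"])
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))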