""" |
|
Credits |
|
This code is modified from https://github.com/GitYCC/g2pW |
|
""" |
|
|
|
import os |
|
import re |
|
|
|
|
|
def wordize_and_map(text: str):
    """Split ``text`` into words and build index maps between characters and words.

    A "word" is either a maximal run of ASCII letters/digits or a single
    other character (typically a CJK character); spaces map to ``None``.
    """
    words = []
    index_map_from_text_to_word = []
    index_map_from_word_to_text = []
    while len(text) > 0:
        # Skip leading spaces; each space position maps to no word.
        match_space = re.match(r"^ +", text)
        if match_space:
            space_str = match_space.group(0)
            index_map_from_text_to_word += [None] * len(space_str)
            text = text[len(space_str) :]
            continue

        # A run of English letters or digits is treated as a single word.
        match_en = re.match(r"^[a-zA-Z0-9]+", text)
        if match_en:
            en_word = match_en.group(0)

            word_start_pos = len(index_map_from_text_to_word)
            word_end_pos = word_start_pos + len(en_word)
            index_map_from_word_to_text.append((word_start_pos, word_end_pos))

            index_map_from_text_to_word += [len(words)] * len(en_word)

            words.append(en_word)
            text = text[len(en_word) :]
        else:
            # Any other single character is its own word.
            word_start_pos = len(index_map_from_text_to_word)
            word_end_pos = word_start_pos + 1
            index_map_from_word_to_text.append((word_start_pos, word_end_pos))

            index_map_from_text_to_word += [len(words)]

            words.append(text[0])
            text = text[1:]
    return words, index_map_from_text_to_word, index_map_from_word_to_text
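

# Illustrative example (not part of the original module): mixed English/CJK
# input is split into one word per English run and one word per other character.
#
#     words, text2word, word2text = wordize_and_map("abc 你好")
#     # words     -> ["abc", "你", "好"]
#     # text2word -> [0, 0, 0, None, 1, 2]   (the space maps to None)
#     # word2text -> [(0, 3), (4, 5), (5, 6)]

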
def tokenize_and_map(tokenizer, text: str):
    """Tokenize ``text`` and build index maps between characters and tokens."""
    words, text2word, word2text = wordize_and_map(text=text)

    tokens = []
    index_map_from_token_to_text = []
    for word, (word_start, word_end) in zip(words, word2text):
        word_tokens = tokenizer.tokenize(word)

        if len(word_tokens) == 0 or word_tokens == ["[UNK]"]:
            # Unknown word: keep a single [UNK] token spanning the whole word.
            index_map_from_token_to_text.append((word_start, word_end))
            tokens.append("[UNK]")
        else:
            # Distribute the word's character span over its sub-word tokens,
            # ignoring the WordPiece "##" continuation prefix when measuring length.
            current_word_start = word_start
            for word_token in word_tokens:
                word_token_len = len(re.sub(r"^##", "", word_token))
                index_map_from_token_to_text.append(
                    (current_word_start, current_word_start + word_token_len))
                current_word_start = current_word_start + word_token_len
                tokens.append(word_token)

    # Character positions currently map to word indices; overwrite them with
    # token indices using the token spans computed above.
    index_map_from_text_to_token = text2word
    for i, (token_start, token_end) in enumerate(index_map_from_token_to_text):
        for token_pos in range(token_start, token_end):
            index_map_from_text_to_token[token_pos] = i

    return tokens, index_map_from_text_to_token, index_map_from_token_to_text
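

# Illustrative usage (a sketch, not part of the original module): any object
# exposing a BERT-style ``.tokenize(str)`` method works; for example, with the
# HuggingFace `transformers` package and the default "bert-base-chinese" vocabulary:
#
#     from transformers import BertTokenizer
#     tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
#     tokens, text2token, token2text = tokenize_and_map(tokenizer, "你好 world")
#     # token2text[i] is the (start, end) character span of tokens[i] in the
#     # original text; text2token[j] is the index of the token covering
#     # character j, or None for spaces.

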
def _load_config(config_path: os.PathLike):
    """Execute a Python config file and return it as a module object."""
    import importlib.util

    spec = importlib.util.spec_from_file_location("__init__", config_path)
    config = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config)
    return config
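

# Illustrative config file (hypothetical path and content, not shipped with the
# module): a config is just a Python module whose top-level names match the
# keys of `default_config_dict` below.
#
#     # saved_models/config.py
#     model_source = "bert-base-chinese"
#     window_size = 32
#     use_pos = True
#     param_pos = {"weight": 0.1, "pos_joint_training": True}

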
default_config_dict = {
    "manual_seed": 1313,
    "model_source": "bert-base-chinese",
    "window_size": 32,
    "num_workers": 2,
    "use_mask": True,
    "use_char_phoneme": False,
    "use_conditional": True,
    "param_conditional": {
        "affect_location": "softmax",
        "bias": True,
        "char-linear": True,
        "pos-linear": False,
        "char+pos-second": True,
        "char+pos-second_lowrank": False,
        "lowrank_size": 0,
        "char+pos-second_fm": False,
        "fm_size": 0,
        "fix_mode": None,
        "count_json": "train.count.json",
    },
    "lr": 5e-5,
    "val_interval": 200,
    "num_iter": 10000,
    "use_focal": False,
    "param_focal": {"alpha": 0.0, "gamma": 0.7},
    "use_pos": True,
    "param_pos": {
        "weight": 0.1,
        "pos_joint_training": True,
        "train_pos_path": "train.pos",
        "valid_pos_path": "dev.pos",
        "test_pos_path": "test.pos",
    },
}
def load_config(config_path: os.PathLike, use_default: bool = False):
    """Load a config module, optionally filling gaps from ``default_config_dict``."""
    config = _load_config(config_path)
    if use_default:
        for attr, val in default_config_dict.items():
            if not hasattr(config, attr):
                # The config file omits this option entirely: use the default.
                setattr(config, attr, val)
            elif isinstance(val, dict):
                # The config file defines this dict option: fill in only the
                # keys it does not set.
                d = getattr(config, attr)
                for dict_k, dict_v in val.items():
                    if dict_k not in d:
                        d[dict_k] = dict_v
    return config
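

# Illustrative usage (the path is hypothetical): with use_default=True, every
# option missing from the file, and every missing key inside a dict-valued
# option, falls back to `default_config_dict`.
#
#     config = load_config("saved_models/config.py", use_default=True)
#     config.model_source                         # whatever the file sets, else the default
#     config.param_conditional["affect_location"]  # "softmax" unless overridden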