Ming-Lite-Omni / sentence_manager /text_norm /utils /string_operator.py

LandyGuo

update 20250516 version

81a8221 about 1 month ago

5.02 kB

	# -*- encoding: utf-8 -*-
	"""
	@Auth: xuelyuxin.xlx
	@Time: 2023/03/07 11:00:59
	@Desc: A collection of helpers for string-related operations
	"""

	import re
	from typing import Union

	# Standard Chinese punctuation. Punctuation in Chinese TTS text is generally
	# mapped onto this unified set.
	PUNC_STANDARD = "。！？，"

	PUNC_PRE_NORM = ""

	# Character-class bodies (to be placed inside "[...]" in a regex).
	# The CJK range skips the stroke characters \u4E28, \u4E3F, \u4E40 and \u4E41.
	REGEX_CN = "\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4"
	REGEX_EN = "a-zA-Z"
	REGEX_NUM = "0-9"

	# Tab, newline, carriage return and form feed.
	BLANK_CHAR = "\t\n\r\f"

	# English punctuation -> Chinese punctuation.
	# The two-character ellipsis is listed before the single one so that it is
	# handled first when the mapping is applied in order.
	PUNC_MAP_EN2CN = {
	    "……": "。",
	    "…": "。",
	    "!": "！",
	    "?": "？",
	    ";": "；",
	    ":": "：",
	    ",": "，",
	    "(": "（",
	    ")": "）",
	}

	# Other punctuation variants -> Chinese punctuation.
	PUNC_MAP_OTHER2CN = {
	    "﹐": "，",
	    "﹔": "；",
	    "｡": "。",
	}

	# Chinese punctuation -> the punctuation used in PUNC_STANDARD.
	PUNC_MAP_STANDARD = {
	    "；": "。",
	    "：": "，",
	    "、": "，",
	}


	class StringOperator:
	    """Collection of stateless string helpers, all exposed as classmethods."""

	    @classmethod
	    def replace_punc_en2cn(cls, string: str) -> str:
	        """Replace English punctuation with its Chinese counterpart.

	        The mapping is the module-level ``PUNC_MAP_EN2CN``. "." is not
	        replaced, because it may be a decimal point or a date separator
	        (e.g. 12.30) and is normally not mistaken for "。". Paired straight
	        double quotes are rewritten as “ ”.

	        Args:
	            string: original string

	        Returns:
	            The string with English punctuation replaced.
	        """
	        # The mapping keys are literal characters and several of them
	        # ("?", "(", ")") are regex metacharacters. Feeding them to re.sub
	        # raised re.error, so they are replaced literally instead.
	        for source, target in PUNC_MAP_EN2CN.items():
	            string = string.replace(source, target)
	        # Non-greedy, so each pair of quotes is converted independently.
	        return re.sub(r'"(.*?)"', r"“\1”", string)

	    @classmethod
	    def replace(cls, string: str, map_dict: dict) -> str:
	        """Apply every substitution of `map_dict` to `string`.

	        Keys are interpreted as regular expressions (they are handed to
	        ``re.sub``), so a key that should match literally must be escaped
	        by the caller. Substitutions run one after another in the dict's
	        iteration order.

	        Args:
	            string: original string
	            map_dict: mapping of regex pattern -> replacement

	        Returns:
	            The string after all substitutions.
	        """
	        for pattern, target in map_dict.items():
	            string = re.sub(pattern, target, string)
	        return string

	    @classmethod
	    def delete(cls, string: str, delete: Union[str, re.Pattern]) -> str:
	        """Remove every match of `delete` from `string`.

	        Args:
	            string: original string
	            delete: regex source string or compiled pattern to remove

	        Returns:
	            The string with all matches removed.
	        """
	        # re.sub accepts both pattern strings and compiled patterns.
	        return re.sub(delete, "", string)

	    @classmethod
	    def delete_space(cls, string) -> str:
	        """Delete spaces that are not separating English words.

	        1. A run of spaces directly after or directly before a character
	           that is not an ASCII letter is removed.
	        2. A run of spaces between two ASCII letters is collapsed to a
	           single space.
	        """
	        string = re.sub(r"(?<=[^a-zA-Z])[ ]+", "", string)
	        string = re.sub(r"[ ]+(?=[^a-zA-Z])", "", string)
	        string = re.sub(r"(?<=[a-zA-Z])[ ]+(?=[a-zA-Z])", " ", string)
	        return string

	    @classmethod
	    def replace_2u(cls, string: str) -> str:
	        """TODO: convert to unicode characters.

	        Not implemented yet: the method currently does nothing and
	        returns None.
	        """
	        pass

	    @classmethod
	    def delete_comma_in_number(cls, string: str) -> str:
	        """Delete the commas used as digit-group separators inside numbers.

	        A comma is removed when it directly follows a digit and is
	        directly followed by at least three digits.

	        eg. xxx12,345,678.123,xxx -> xxx12345678.123,xxx
	        (the last comma is kept because no digits follow it)
	        """
	        return re.sub(r"(?<=\d),(?=\d{3})", "", string)

	    @classmethod
	    def replace_F2H(cls, string: str) -> str:
	        """Convert full-width characters to half-width ones.

	        Args:
	            string: unicode string

	        Returns:
	            The string with every convertible character replaced;
	            all other characters are left untouched.
	        """

	        def to_halfwidth(char):
	            """Convert a single unicode character from full to half width."""
	            code = ord(char)
	            if code == 0x3000:
	                # The ideographic space maps to the ASCII space.
	                return chr(0x0020)
	            code -= 0xFEE0
	            if 0x0020 <= code <= 0x7E:
	                return chr(code)
	            # The shifted value is not a half-width character: keep the original.
	            return char

	        return "".join(to_halfwidth(char) for char in string)

	    @classmethod
	    def split(cls, pattern: str, text: str) -> list:
	        """Split `text` after every match of `pattern`.

	        Unlike ``re.split``, the matched text is kept: each chunk ends with
	        the match that terminated it, so joining the chunks gives back
	        `text`.

	        Args:
	            pattern: regex that marks the end of a chunk
	            text: string to split

	        Returns:
	            List of non-empty chunks, in order.
	        """
	        chunks = []
	        start = 0
	        for match in re.finditer(pattern, text):
	            end = match.end()
	            # A zero-width match at the current position would only add
	            # an empty chunk, so it is skipped.
	            if end > start:
	                chunks.append(text[start:end])
	                start = end
	        if start != len(text):
	            chunks.append(text[start:])
	        return chunks

	    @classmethod
	    def is_cn(cls, text):
	        """Return True if `text` consists only of Chinese characters.

	        "Chinese" is the character range of the module-level ``REGEX_CN``.
	        """
	        return bool(re.match(rf"[{REGEX_CN}]+$", text))

	    @classmethod
	    def is_en(cls, text):
	        """Return True if `text` is a pure English word.

	        Accepted forms are a run of letters from the module-level
	        ``REGEX_EN``, or two such runs joined by one apostrophe
	        (e.g. "it's").
	        """
	        # The two forms are alternatives; an escaped "\|" would demand a
	        # literal pipe after "$" and could never match.
	        pattern = rf"([{REGEX_EN}]+$)|([{REGEX_EN}]+['][{REGEX_EN}]+$)"
	        return bool(re.match(pattern, text))

	    @classmethod
	    def is_num(cls, text):
	        """Return True if `text` consists only of digits."""
	        return bool(re.match(r"[\d]+$", text))


	if __name__ == "__main__":
	    # Manual smoke test: run the sample substitutions through
	    # StringOperator.replace and log the input and the result.
	    from solutions.multimodal.adaspeech.recipes.asr.tools.get_logger import get_logger

	    logger = get_logger("string_operator")
	    # Each sample is (operation name, input string, mapping dict).
	    samples = [
	        ("replace", "123", {"2": "1", "3": "1"}),
	        # Raw string: "\d" in a normal literal is an invalid escape sequence.
	        ("replace", "123", {r"\d": "1"}),
	    ]
	    for operation, raw, mapping in samples:
	        if operation == "replace":
	            logger.debug(f"raw: {raw}")
	            logger.debug(f"replace: {StringOperator.replace(string=raw, map_dict=mapping)}")