File size: 5,023 Bytes
81a8221 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 |
# -*- encoding: utf-8 -*-
"""
@Auth: xuelyuxin.xlx
@Time: 2023/03/07 11:00:59
@Desc: A tool to perform string-related method
"""
import re
from typing import Union
# 定义标准中文标点,tts中文文本中的标点一般统一映射为标准中文标点
PUNC_STANDARD = "。!?,"
PUNC_PRE_NORM = ""
REGEX_CN = "\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4" # 跳过笔画\u4E28\u4E3F\u4E40\u4E41
REGEX_EN = "a-zA-Z"
REGEX_NUM = "0-9"
BLANK_CHAR = "\t\n\r\f"
PUNC_MAP_EN2CN = {
"……": "。",
"…" : "。",
"!" : "!",
"?" : "?",
";" : ";",
":" : ":",
"," : ",",
"(" : "(",
")" : ")"
}
PUNC_MAP_OTHER2CN = {
"﹐" : ",",
"﹔" : ";",
"。" : "。"
}
PUNC_MAP_STANDARD = {
";" : "。",
":" : ",",
"、" : ","
}
class StringOperator:
@classmethod
def replace_punc_en2cn(cls, string: str) -> str:
"""replace english punctuations with chinese punctuations
"." is not replaced, because "." could represent decimal or date, eg. 12.30,
and normally would not be mistaken with "。"
"""
string = cls.replace(string, PUNC_MAP_EN2CN)
string = re.sub(r"(\")(.*?)(\")", r"“\2”", string)
return string
@classmethod
def replace(cls, string: str, map_dict: dict) -> str:
"""replace chars in `string` based on `map_dict`
Args:
string: original string
map_dict: the mapping dict used to perform replacement
"""
for pattern, target in map_dict.items():
string = re.sub(f"{pattern}", target, string)
return string
@classmethod
def delete(cls, string: str, delete: Union[str, re.Pattern]) -> str:
"""delete chars in `string` matched by `delete`
"""
if isinstance(delete, str):
delete = re.compile(delete)
string = re.sub(delete, "", string)
return string
@classmethod
def delete_space(cls, string) -> str:
"""delete space in string
1. 把除了英文之间的空格去掉,即去除非英文的前后的空格
2. 英文之间的多个空格换成单个空格
"""
string = re.sub("(?<=[^a-zA-Z])[ ]+", "", string)
string = re.sub("[ ]+(?=[^a-zA-Z])", "", string)
string = re.sub("(?<=[a-zA-Z])[ ]+(?=[a-zA-Z])", " ", string)
return string
@classmethod
def replace_2u(cls, string: str) -> str:
"""TODO 转unicode字符"""
pass
@classmethod
def delete_comma_in_number(cls, string: str) -> str:
"""delete comma of number in string
eg. xxx12,345,678.123,xxx -> xxx12345678.123xxx
"""
string = re.sub(r"(?<=\d),(?=\d{3})", "", string)
return string
@classmethod
def replace_F2H(cls, string: str) -> str:
"""全角转半角
Args:
string: unicode字符串
"""
# 单个unicode字符 全角转半角
def F2H(char):
inside_code = ord(char)
if inside_code == 0x3000:
inside_code = 0x0020
else:
inside_code -= 0xfee0
if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符则返回原来的字符
return char
return chr(inside_code)
return "".join([F2H(char) for char in string])
@classmethod
def split(cls, pattern: str, text: int) -> list:
"""split text with matched $pattern
different with re.split, the matched string is reserved
"""
output = []
start = 0
for match in re.finditer(pattern, text):
end = match.span()[1]
output.append(text[start:end])
start = end
if start != len(text):
output.append(text[start:])
return output
@classmethod
def is_cn(cls, text):
"""判断text是否是纯中文
"""
if re.match(f"[{REGEX_CN}]+$", text):
return True
else:
return False
@classmethod
def is_en(cls, text):
"""判断text是否是纯英文
"""
if re.match(f"([{REGEX_EN}]+$)|([{REGEX_EN}]+['][{REGEX_EN}]+$)", text):
return True
else:
return False
@classmethod
def is_num(cls, text):
"""判断text是否是纯数字
"""
if re.match(f"[\d]+$", text):
return True
else:
return False
if __name__ == "__main__":
from solutions.multimodal.adaspeech.recipes.asr.tools.get_logger import get_logger
logger = get_logger("string_operator")
samples = [
("replace", "123", {"2": "1", "3": "1"}),
("replace", "123", {"\d": "1"})
]
for s in samples:
if s[0] == "replace":
logger.debug(f"raw: {s[1]}")
logger.debug(f"replace: {StringOperator.replace(string=s[1], map_dict=s[2])}") |