|
|
|
""" |
|
@Auth: xuelyuxin.xlx |
|
@Time: 2023/03/07 11:00:59 |
|
@Desc: Utilities for string normalization (punctuation mapping, space clean-up, full-width to half-width conversion, splitting and script checks)
|
""" |
|
|
|
import re |
|
from typing import Union |
|
|
|
|
|
# ---------------------------------------------------------------------------
# Punctuation and character-class tables.
#
# Full-width variants of ASCII punctuation (U+FF01-U+FF5E) are written as
# ``\uXXXX`` escapes: they are easy to confuse with their ASCII look-alikes
# and a width-folding tool would otherwise turn an EN->CN entry into an
# identity mapping without anyone noticing.
# ---------------------------------------------------------------------------

# The standard Chinese marks: full stop, exclamation mark, question mark, comma.
PUNC_STANDARD = "。\uff01\uff1f\uff0c"

PUNC_PRE_NORM = ""

# Character-class bodies; embed them in brackets, e.g. f"[{REGEX_CN}]+".
# REGEX_CN spans U+4E00-U+9FA4 but skips U+4E28 and U+4E3F-U+4E41.
REGEX_CN = "\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4"
REGEX_EN = "a-zA-Z"
REGEX_NUM = "0-9"

# Tab, line feed, carriage return and form feed (the plain space is absent).
BLANK_CHAR = "\t\n\r\f"

# English punctuation -> Chinese punctuation.
# Keys are literal characters, NOT regular expressions: "?", "(" and ")" are
# regex metacharacters and must be escaped before being given to ``re``.
# The two-character ellipsis comes first so it is consumed before the
# single-character one.
PUNC_MAP_EN2CN = {
    "……": "。",
    "…": "。",
    "!": "\uff01",  # FULLWIDTH EXCLAMATION MARK
    "?": "\uff1f",  # FULLWIDTH QUESTION MARK
    ";": "\uff1b",  # FULLWIDTH SEMICOLON
    ":": "\uff1a",  # FULLWIDTH COLON
    ",": "\uff0c",  # FULLWIDTH COMMA
    "(": "\uff08",  # FULLWIDTH LEFT PARENTHESIS
    ")": "\uff09",  # FULLWIDTH RIGHT PARENTHESIS
}

# Other punctuation variants -> Chinese punctuation.
PUNC_MAP_OTHER2CN = {
    "﹐": "\uff0c",  # FULLWIDTH COMMA
    "﹔": "\uff1b",  # FULLWIDTH SEMICOLON
    # NOTE(review): key and value of this entry are the same character, so it
    # is a no-op; the key was presumably meant to be another full-stop
    # variant -- confirm and correct.
    "。": "。",
}

# Collapse the remaining marks onto the ones listed in PUNC_STANDARD.
# Both the ASCII and the full-width spelling of each key are listed so the map
# works whether or not the text has already been through PUNC_MAP_EN2CN.
PUNC_MAP_STANDARD = {
    ";": "。",
    "\uff1b": "。",  # FULLWIDTH SEMICOLON
    ":": "\uff0c",
    "\uff1a": "\uff0c",  # FULLWIDTH COLON
    "、": "\uff0c",
}
|
|
|
|
|
class StringOperator:
    """Stateless helpers for cleaning and inspecting strings.

    Every method is a ``classmethod``; nothing is stored on the class or on
    instances, so the methods can be called directly on ``StringOperator``.
    """

    # Code-point table for replace_F2H: U+3000 becomes a plain space and
    # U+FF00-U+FF5E are shifted down by 0xFEE0 onto 0x20-0x7E.
    _F2H_TABLE = {
        0x3000: 0x0020,
        **{cp: cp - 0xFEE0 for cp in range(0xFF00, 0xFF5F)},
    }

    @classmethod
    def replace_punc_en2cn(cls, string: str) -> str:
        """Replace English punctuation with Chinese punctuation.

        Every key of ``PUNC_MAP_EN2CN`` is replaced by its value, in the
        dict's insertion order, then each pair of straight double quotes is
        rewritten as a pair of curly quotes.

        "." is not replaced, because it may be a decimal point or a date
        separator (e.g. 12.30) and is normally not mistaken for "。".

        Args:
            string: text to convert.

        Returns:
            The converted text.
        """
        # The keys are literal characters and some of them ("?", "(", ")")
        # are regex metacharacters, so plain substring replacement is used
        # instead of re.sub, which would raise re.error on them.
        for source, target in PUNC_MAP_EN2CN.items():
            string = string.replace(source, target)
        # Non-greedy, so `"a" and "b"` yields two separate quoted spans.
        return re.sub(r"\"(.*?)\"", r"“\1”", string)

    @classmethod
    def replace(cls, string: str, map_dict: dict) -> str:
        """Apply every ``pattern -> replacement`` pair of ``map_dict``.

        Pairs are applied in the dict's iteration order, each one on the
        output of the previous one.

        Args:
            string: original string.
            map_dict: maps a regular expression to its replacement; both are
                interpreted by ``re.sub``, so literal metacharacters in a key
                must be escaped by the caller.

        Returns:
            The string after all replacements.
        """
        for pattern, target in map_dict.items():
            string = re.sub(pattern, target, string)
        return string

    @classmethod
    def delete(cls, string: str, delete: Union[str, re.Pattern]) -> str:
        """Remove every match of ``delete`` from ``string``.

        Args:
            string: original string.
            delete: regular expression, given as text or already compiled.

        Returns:
            ``string`` without the matched parts.
        """
        # re.sub accepts both pattern text and compiled patterns.
        return re.sub(delete, "", string)

    @classmethod
    def delete_space(cls, string) -> str:
        """Remove spaces, keeping a single space between English letters.

        1. Every run of spaces that is not directly between two English
           letters is removed, including runs at the start or end of the
           string.
        2. A run of spaces between two English letters becomes one space.
        """
        # Negative lookarounds also succeed at the string boundaries, where
        # there is no neighbouring character at all.
        string = re.sub(r"(?<![a-zA-Z])[ ]+", "", string)
        string = re.sub(r"[ ]+(?![a-zA-Z])", "", string)
        string = re.sub(r"(?<=[a-zA-Z])[ ]+(?=[a-zA-Z])", " ", string)
        return string

    @classmethod
    def replace_2u(cls, string: str) -> str:
        """TODO: convert to unicode characters.

        Not implemented yet: the method currently returns ``None``.
        """
        pass

    @classmethod
    def delete_comma_in_number(cls, string: str) -> str:
        """Delete the commas used as digit-group separators.

        Only a comma that directly follows a digit and is directly followed
        by at least three digits is removed.

        eg. xxx12,345,678.123,xxx -> xxx12345678.123,xxx
        """
        return re.sub(r"(?<=\d),(?=\d{3})", "", string)

    @classmethod
    def replace_F2H(cls, string: str) -> str:
        """Convert full-width characters to half-width characters.

        The ideographic space U+3000 becomes an ASCII space and the
        code points U+FF00-U+FF5E are mapped onto 0x20-0x7E; every other
        character is returned unchanged.

        Args:
            string: unicode string.

        Returns:
            The converted string.
        """
        return string.translate(cls._F2H_TABLE)

    @classmethod
    def split(cls, pattern: str, text: str) -> list:
        """Split ``text`` after every match of ``pattern``.

        Unlike ``re.split``, the matched text is kept: each piece ends with
        the match that terminated it.

        Args:
            pattern: regular expression marking the end of a piece.
            text: string to split.

        Returns:
            The pieces in order; the text after the last match is the final
            piece when it is not empty.
        """
        pieces = []
        start = 0
        for match in re.finditer(pattern, text):
            end = match.end()
            pieces.append(text[start:end])
            start = end
        if start != len(text):
            pieces.append(text[start:])
        return pieces

    @classmethod
    def is_cn(cls, text):
        """Return True if ``text`` is non-empty and purely Chinese.

        "Chinese" means the characters described by ``REGEX_CN``.
        """
        # fullmatch, not match + "$": "$" also matches before a final "\n".
        return bool(re.fullmatch(f"[{REGEX_CN}]+", text))

    @classmethod
    def is_en(cls, text):
        """Return True if ``text`` is a purely English word.

        Accepted forms are a run of the letters in ``REGEX_EN``, optionally
        followed by one apostrophe and another run of letters (e.g. "don't").
        """
        return bool(re.fullmatch(f"[{REGEX_EN}]+(?:'[{REGEX_EN}]+)?", text))

    @classmethod
    def is_num(cls, text):
        """Return True if ``text`` is non-empty and consists only of digits."""
        return bool(re.fullmatch(r"\d+", text))
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: log the input and the output of
    # StringOperator.replace for a literal mapping and a regex mapping.
    from solutions.multimodal.adaspeech.recipes.asr.tools.get_logger import get_logger

    logger = get_logger("string_operator")
    # Each sample is (method name, input string, mapping).
    samples = [
        ("replace", "123", {"2": "1", "3": "1"}),
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        ("replace", "123", {r"\d": "1"}),
    ]
    for method, raw, mapping in samples:
        if method == "replace":
            logger.debug(f"raw: {raw}")
            logger.debug(f"replace: {StringOperator.replace(string=raw, map_dict=mapping)}")