File size: 5,023 Bytes

81a8221

# -*- encoding: utf-8 -*-
"""
@Auth: xuelyuxin.xlx
@Time: 2023/03/07 11:00:59
@Desc: A tool to perform string-related method
"""

import re
from typing import Union

# 定义标准中文标点，tts中文文本中的标点一般统一映射为标准中文标点
PUNC_STANDARD = "。！？，"

PUNC_PRE_NORM = ""
REGEX_CN = "\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4"  # 跳过笔画\u4E28\u4E3F\u4E40\u4E41
REGEX_EN = "a-zA-Z"
REGEX_NUM = "0-9"

BLANK_CHAR = "\t\n\r\f"
PUNC_MAP_EN2CN = {
    "……": "。",
    "…" : "。",
    "!" : "！",
    "?" : "？",
    ";" : "；",
    ":" : "：",
    "," : "，",
    "(" : "（",
    ")" : "）"
}
PUNC_MAP_OTHER2CN = {
    "﹐" : "，",
    "﹔" : "；",
    "｡"  : "。"
}
PUNC_MAP_STANDARD = {
    "；" : "。",
    "：" : "，",
    "、" : "，"
}


class StringOperator:
    @classmethod
    def replace_punc_en2cn(cls, string: str) -> str:
        """replace english punctuations with chinese punctuations

        "." is not replaced, because "." could represent decimal or date, eg. 12.30, 
        and normally would not be mistaken with "。"
        """
        string = cls.replace(string, PUNC_MAP_EN2CN)
        string = re.sub(r"(\")(.*?)(\")", r"“\2”", string)
        return string

    @classmethod
    def replace(cls, string: str, map_dict: dict) -> str:
        """replace chars in `string` based on `map_dict`
        Args:
            string: original string
            map_dict: the mapping dict used to perform replacement
        """
        for pattern, target in map_dict.items():
            string = re.sub(f"{pattern}", target, string)
        return string

    @classmethod
    def delete(cls, string: str, delete: Union[str, re.Pattern]) -> str:
        """delete chars in `string` matched by `delete`
        """
        if isinstance(delete, str): 
            delete = re.compile(delete)
        string = re.sub(delete, "", string)
        return string

    @classmethod
    def delete_space(cls, string) -> str:
        """delete space in string
        1. 把除了英文之间的空格去掉，即去除非英文的前后的空格
        2. 英文之间的多个空格换成单个空格
        """
        string = re.sub("(?<=[^a-zA-Z])[ ]+", "", string)
        string = re.sub("[ ]+(?=[^a-zA-Z])", "", string)
        string = re.sub("(?<=[a-zA-Z])[ ]+(?=[a-zA-Z])", " ", string)
        return string

    @classmethod
    def replace_2u(cls, string: str) -> str:
        """TODO 转unicode字符"""
        pass

    @classmethod
    def delete_comma_in_number(cls, string: str) -> str:
        """delete comma of number in string

        eg. xxx12,345,678.123,xxx -> xxx12345678.123xxx
        """
        string = re.sub(r"(?<=\d),(?=\d{3})", "", string)
        return string

    @classmethod
    def replace_F2H(cls, string: str) -> str:
        """全角转半角

        Args:
            string: unicode字符串
        """
        # 单个unicode字符 全角转半角
        def F2H(char):
            inside_code = ord(char)
            if inside_code == 0x3000:
                inside_code = 0x0020
            else:
                inside_code -= 0xfee0
            if inside_code < 0x0020 or inside_code > 0x7e:  # 转完之后不是半角字符则返回原来的字符
                return char
            return chr(inside_code)

        return "".join([F2H(char) for char in string])
    
    @classmethod
    def split(cls, pattern: str, text: int) -> list:
        """split text with matched $pattern

        different with re.split, the matched string is reserved
        """
        output = []
        start = 0
        for match in re.finditer(pattern, text):
            end = match.span()[1]
            output.append(text[start:end])
            start = end
        if start != len(text):
            output.append(text[start:])
        return output
    
    @classmethod
    def is_cn(cls, text):
        """判断text是否是纯中文
        """         
        if re.match(f"[{REGEX_CN}]+$", text):
            return True
        else:
            return False
    
    @classmethod
    def is_en(cls, text):
        """判断text是否是纯英文
        """         
        if re.match(f"([{REGEX_EN}]+$)|([{REGEX_EN}]+['][{REGEX_EN}]+$)", text):
            return True
        else:
            return False
    
    @classmethod
    def is_num(cls, text):
        """判断text是否是纯数字
        """
        if re.match(f"[\d]+$", text):
            return True
        else:
            return False


if __name__ == "__main__":

    from solutions.multimodal.adaspeech.recipes.asr.tools.get_logger import get_logger
    logger = get_logger("string_operator")
    samples = [
        ("replace", "123", {"2": "1", "3": "1"}),
        ("replace", "123", {"\d": "1"})
    ]
    for s in samples:
        if s[0] == "replace":
            logger.debug(f"raw: {s[1]}")
            logger.debug(f"replace: {StringOperator.replace(string=s[1], map_dict=s[2])}")