File size: 5,023 Bytes
81a8221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# -*- encoding: utf-8 -*-
"""
@Auth: xuelyuxin.xlx
@Time: 2023/03/07 11:00:59
@Desc: A collection of string-manipulation helper methods
"""

import re
from typing import Union

# Standard Chinese punctuation; punctuation in Chinese TTS text is generally
# normalized to these marks.
PUNC_STANDARD = "。!?,"

PUNC_PRE_NORM = ""
# Character-class fragments: StringOperator.is_cn / is_en interpolate them
# inside "[...]" when building their regular expressions.
REGEX_CN = "\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4"  # skips the stroke characters \u4E28, \u4E3F, \u4E40, \u4E41
REGEX_EN = "a-zA-Z"
REGEX_NUM = "0-9"

BLANK_CHAR = "\t\n\r\f"
# English punctuation -> Chinese punctuation; consumed by
# StringOperator.replace_punc_en2cn. Insertion order matters: the double
# ellipsis is listed before the single one so it is replaced as one unit.
# NOTE(review): most keys and values below look identical; confirm the values
# are the intended full-width forms and were not altered by an encoding step.
PUNC_MAP_EN2CN = {
    "……": "。",
    "…" : "。",
    "!" : "!",
    "?" : "?",
    ";" : ";",
    ":" : ":",
    "," : ",",
    "(" : "(",
    ")" : ")"
}
# Other punctuation variants (small-form / half-width) -> Chinese punctuation.
PUNC_MAP_OTHER2CN = {
    "﹐" : ",",
    "﹔" : ";",
    "。"  : "。"
}
# Folds remaining punctuation into marks that appear in PUNC_STANDARD.
PUNC_MAP_STANDARD = {
    ";" : "。",
    ":" : ",",
    "、" : ","
}


class StringOperator:
    """Namespace of stateless string helpers; every helper is a classmethod."""

    # Code point translation table for `replace_F2H`: the ideographic space
    # U+3000 becomes an ASCII space, and U+FF00..U+FF5E are shifted down by
    # 0xFEE0 onto 0x20..0x7E. Every other character is left untouched.
    _FULL_TO_HALF = {
        0x3000: 0x0020,
        **{code: code - 0xFEE0 for code in range(0xFF00, 0xFF5F)},
    }

    @classmethod
    def replace_punc_en2cn(cls, string: str) -> str:
        """Replace English punctuation with Chinese punctuation.

        Every key of ``PUNC_MAP_EN2CN`` is replaced by its value, then each
        pair of straight double quotes on one line is turned into a pair of
        curly quotes.

        "." is not replaced, because it could be a decimal point or a date
        separator (e.g. 12.30) and is normally not mistaken for "。".

        Args:
            string: text to convert.

        Returns:
            The converted text.
        """
        # `replace` treats keys as regular expressions, while the map keys are
        # literal punctuation. Unescaped "?", "(" and ")" would make re.sub
        # raise re.error, so escape every key first (order is preserved).
        literal_map = {
            re.escape(source): target
            for source, target in PUNC_MAP_EN2CN.items()
        }
        string = cls.replace(string, literal_map)
        return re.sub(r"(\")(.*?)(\")", r"“\2”", string)

    @classmethod
    def replace(cls, string: str, map_dict: dict) -> str:
        """Apply every substitution of `map_dict` to `string`, in dict order.

        Args:
            string: original string.
            map_dict: mapping of regular-expression pattern -> replacement.
                Keys are interpreted as regex patterns (not literal text) and
                values as ``re.sub`` replacement templates.

        Returns:
            The string after all substitutions have been applied in sequence;
            later patterns see the output of earlier ones.
        """
        for pattern, target in map_dict.items():
            # Keys are coerced to str, matching the previous f-string behaviour.
            string = re.sub(str(pattern), target, string)
        return string

    @classmethod
    def delete(cls, string: str, delete: Union[str, re.Pattern]) -> str:
        """Remove every match of `delete` from `string`.

        Args:
            string: original string.
            delete: regex pattern, either as a string or already compiled.

        Returns:
            `string` with all matches removed.
        """
        # re.sub accepts both pattern strings and compiled patterns.
        return re.sub(delete, "", string)

    @classmethod
    def delete_space(cls, string: str) -> str:
        """Remove spaces, keeping single spaces between English words.

        1. Spaces that follow or precede a non-English character are removed.
        2. Runs of spaces between two English letters collapse to one space.

        Only the ASCII space character is affected. A space at the very start
        or end of the string that touches an English letter has no
        non-English neighbour and is therefore kept.

        Args:
            string: original string.

        Returns:
            The string with the spaces removed/collapsed as described.
        """
        string = re.sub(r"(?<=[^a-zA-Z])[ ]+", "", string)
        string = re.sub(r"[ ]+(?=[^a-zA-Z])", "", string)
        string = re.sub(r"(?<=[a-zA-Z])[ ]+(?=[a-zA-Z])", " ", string)
        return string

    @classmethod
    def replace_2u(cls, string: str) -> str:
        """TODO: convert to unicode characters.

        Not implemented yet; currently does nothing and returns None.
        """
        pass

    @classmethod
    def delete_comma_in_number(cls, string: str) -> str:
        """Delete thousands-separator commas inside numbers.

        A comma is removed only when it directly follows a digit and is
        directly followed by at least three digits; other commas are kept.

        eg. xxx12,345,678.123,xxx -> xxx12345678.123,xxx

        Args:
            string: original string.

        Returns:
            The string without the in-number commas.
        """
        return re.sub(r"(?<=\d),(?=\d{3})", "", string)

    @classmethod
    def replace_F2H(cls, string: str) -> str:
        """Convert full-width characters to their half-width counterparts.

        U+3000 becomes an ASCII space and U+FF00..U+FF5E are mapped onto
        0x20..0x7E; any character outside those ranges is returned unchanged.

        Args:
            string: unicode string.

        Returns:
            The converted string.
        """
        return string.translate(cls._FULL_TO_HALF)

    @classmethod
    def split(cls, pattern: str, text: str) -> list:
        """Split `text` after each match of `pattern`, keeping the matches.

        Unlike ``re.split``, the matched text is not dropped: each piece ends
        with the delimiter that terminated it.

        Args:
            pattern: regex pattern marking the end of a piece.
            text: string to split.

        Returns:
            List of pieces in order; any text after the last match is
            appended as a final piece. Empty `text` yields an empty list.
        """
        pieces = []
        start = 0
        for match in re.finditer(pattern, text):
            end = match.end()
            pieces.append(text[start:end])
            start = end
        if start != len(text):
            pieces.append(text[start:])
        return pieces

    @classmethod
    def is_cn(cls, text: str) -> bool:
        """Return True if `text` is non-empty and purely Chinese characters.

        The accepted characters are those described by ``REGEX_CN``.
        """
        # fullmatch instead of match + "$": "$" also matches just before a
        # trailing newline, which would wrongly accept e.g. "中文\n".
        return re.fullmatch(f"[{REGEX_CN}]+", text) is not None

    @classmethod
    def is_en(cls, text: str) -> bool:
        """Return True if `text` is a purely English word.

        Accepts one or more letters from ``REGEX_EN``, optionally with a
        single apostrophe between two letter runs (e.g. "don't").
        """
        # fullmatch instead of match + "$" so a trailing newline is rejected.
        word = f"[{REGEX_EN}]+"
        return re.fullmatch(f"{word}(?:'{word})?", text) is not None

    @classmethod
    def is_num(cls, text: str) -> bool:
        """Return True if `text` is non-empty and consists only of digits.

        The digit class used is Unicode-aware, so non-ASCII decimal digits
        are accepted as well.
        """
        # fullmatch instead of match + "$" so that "123\n" is rejected.
        return re.fullmatch(r"\d+", text) is not None


if __name__ == "__main__":
    # Manual smoke test: log what StringOperator.replace returns for a few
    # sample inputs.
    from solutions.multimodal.adaspeech.recipes.asr.tools.get_logger import get_logger

    logger = get_logger("string_operator")
    # Each sample is (operation name, input string, mapping dict).
    samples = [
        ("replace", "123", {"2": "1", "3": "1"}),
        # Raw string: "\d" in a normal literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        ("replace", "123", {r"\d": "1"}),
    ]
    for operation, raw, mapping in samples:
        if operation == "replace":
            logger.debug(f"raw: {raw}")
            logger.debug(f"replace: {StringOperator.replace(string=raw, map_dict=mapping)}")