|
|
|
""" |
|
@Auth: xuelyuxin.xlx |
|
@Time: 2023/05/12 14:55:44 |
|
@Desc: 正则模块 |
|
`Normalizer.normalize_regular`输出内容只包含简体中文、英文、tts相关标点的文本 |
|
@Usge: null |
|
""" |
|
|
|
import re |
|
from typing import List, Dict |
|
from .utils.chronology import RE_DATE |
|
from .utils.chronology import RE_DATE2 |
|
from .utils.chronology import RE_TIME, RE_TIME_2, RE_TIME_3 |
|
from .utils.chronology import RE_TIME_RANGE |
|
from .utils.chronology import replace_date |
|
from .utils.chronology import replace_date2 |
|
from .utils.chronology import replace_time, replace_time_nohour |
|
from .utils.num import RE_DECIMAL_NUM |
|
from .utils.num import RE_DEFAULT_NUM |
|
from .utils.num import RE_FRAC |
|
from .utils.num import RE_INTEGER |
|
from .utils.num import RE_NUMBER |
|
from .utils.num import RE_PERCENTAGE |
|
from .utils.num import RE_POSITIVE_QUANTIFIERS |
|
from .utils.num import RE_POSITIVE_QUANTIFIERS_2 |
|
from .utils.num import RE_RANGE |
|
from .utils.num import RE_DIGITS |
|
from .utils.num import RE_LICENSE_PLATE |
|
from .utils.num import replace_default_num_with_altone, replace_default_num_without_altone |
|
from .utils.num import replace_frac |
|
from .utils.num import replace_negative_num |
|
from .utils.num import replace_number |
|
from .utils.num import replace_percentage |
|
from .utils.num import replace_positive_quantifier |
|
from .utils.num import replace_positive_quantifier_2 |
|
from .utils.num import replace_range |
|
from .utils.num import replace_license_plate |
|
from .utils.phonecode import RE_MOBILE_PHONE |
|
from .utils.phonecode import RE_NATIONAL_UNIFORM_NUMBER |
|
from .utils.phonecode import RE_TELEPHONE |
|
from .utils.phonecode import replace_mobile |
|
from .utils.phonecode import replace_phone |
|
from .utils.quantifier import RE_TEMPERATURE |
|
from .utils.quantifier import replace_temperature |
|
from .utils.address import RE_ADDRESS_room, RE_ADDRESS |
|
from .utils.address import replace_address_room, replace_address |
|
from .utils.currency import RE_CURRENCY, RE_CURRENCY_2 |
|
from .utils.currency import replace_currency, replace_currency_2 |
|
from .utils.en_num import normalize_numbers as en_normalize_numbers |
|
from .utils.string_operator import BLANK_CHAR, PUNC_MAP_OTHER2CN, PUNC_MAP_STANDARD, PUNC_STANDARD, REGEX_CN |
|
from .utils.string_operator import StringOperator as stringop |
|
|
|
|
|
def add_blank(match_obj): |
|
return " ".join(list(match_obj.group(0))) |
|
|
|
|
|
def convert_date(string): |
|
nums = re.findall("[\d]+", string) |
|
if len(nums) == 3: |
|
year, month, day = [RE_NUMBER.sub(replace_number, w) for w in nums] |
|
return f"{year}年{month}月{day}日" |
|
elif len(nums) == 2: |
|
month, day = [RE_NUMBER.sub(replace_number, w) for w in nums] |
|
return f"{month}月{day}日" |
|
else: |
|
return string |
|
|
|
|
|
class Normalizer: |
|
"""文本正则 |
|
""" |
|
|
|
@classmethod |
|
def substitute(cls, pattern: re.Pattern, replace_func, text: str, trace: list): |
|
for matchobj in pattern.finditer(text): |
|
origin_word = matchobj.group(0) |
|
new_word = replace_func(matchobj) |
|
trace.append({"origin_word": origin_word, "new_word": new_word}) |
|
text = text.replace(origin_word, new_word) |
|
return text |
|
|
|
@classmethod |
|
def preprocess(cls, text: str) -> str: |
|
"""正则前的预处理 |
|
|
|
1. 繁体转简体 |
|
2. 过滤不影响正则的标点符号等,包括 |
|
a. 数字间的逗号 |
|
b. 空格(除英文之间外) |
|
c. 空白字符\t\n\r\f |
|
3. 英文转小写 |
|
""" |
|
text = stringop.replace_F2H(text) |
|
text = stringop.delete_comma_in_number(text) |
|
|
|
text = re.sub(rf"[{BLANK_CHAR}]", ",", text) |
|
|
|
|
|
text = re.sub(r"㎡", "m²", text) |
|
text = text.replace("㎡", "m²") |
|
text = text.replace("cm²", "平方厘米") |
|
text = text.replace("m²", "平方米") |
|
|
|
|
|
|
|
|
|
|
|
text = re.sub(r">(\d)", r"大于\1", text) |
|
text = re.sub(r"<(\d)", r"小于\1", text) |
|
text = re.sub(r"=", "等于", text) |
|
text = re.sub(r"(?<=\d)ml(?![a-zA-Z])", "毫升", text) |
|
text = re.sub(r"(?<=\d)mmHg(?![a-zA-Z])", "毫米汞柱", text) |
|
text = re.sub(r"([0-9.]+元)(-)([0-9.]+元)", "\\1至\\3", text) |
|
return text |
|
|
|
@classmethod |
|
def postprocess(cls, text: str, custom: List[Dict] = None) -> str: |
|
"""正则后处理,只包括删除和替换操作 |
|
正则后的文本中的字符类型只包括: |
|
1. 中文 2. 英文 3. 标点`,。!?` |
|
""" |
|
if custom is not None: |
|
for map_dict in custom: |
|
text = stringop.replace(text, map_dict) |
|
return text |
|
|
|
text = stringop.replace_punc_en2cn(text) |
|
text = stringop.replace(text, PUNC_MAP_OTHER2CN) |
|
|
|
text = stringop.replace(text, PUNC_MAP_STANDARD) |
|
|
|
text = re.sub(r"。+", "。", text) |
|
|
|
text = re.sub("/", "每", text) |
|
|
|
text = re.sub(r"~+", "~", text) |
|
text = re.sub(r"~+", "~", text) |
|
|
|
text = re.sub(r"[~~]", "。", text) |
|
|
|
text = stringop.delete(text, f"[^{PUNC_STANDARD}{REGEX_CN}A-Za-z0-9@]") |
|
text = stringop.delete(text, "@(?!break)") |
|
return text |
|
|
|
@classmethod |
|
def custom(cls, text: str, *, interpret_as: str) -> str: |
|
text = cls.preprocess(text) |
|
if not text: |
|
return "" |
|
text = cls.normalize_custom(text, interpret_as=interpret_as) |
|
text = cls.postprocess(text) |
|
return text |
|
|
|
@classmethod |
|
def regular(cls, text: str) -> str: |
|
text = cls.preprocess(text) |
|
if not text: |
|
return "" |
|
text = cls.normalize_regular(text) |
|
text = cls.postprocess(text) |
|
return text |
|
|
|
@classmethod |
|
def normalize_custom(cls, text: str, *, interpret_as: str) -> str: |
|
"""指定正则 |
|
|
|
仅针对`interpret_as`进行正则匹配 |
|
|
|
Args: |
|
interpret_as: string in ['cardinal', 'currency', 'digits', 'telephone', 'address', 'date', 'time', 'id'] |
|
""" |
|
assert interpret_as in [ |
|
"cardinal", |
|
"currency", |
|
"digits", |
|
"telephone", |
|
"address", |
|
"date", |
|
"time", |
|
"id", |
|
"measure", |
|
"punctuation" |
|
], f""" |
|
interpret_as:{interpret_as} not supported |
|
""".strip() |
|
|
|
if interpret_as == "cardinal": |
|
text = text.replace(",", "") |
|
text = RE_NUMBER.sub(replace_number, text) |
|
text = RE_FRAC.sub(replace_frac, text) |
|
text = RE_PERCENTAGE.sub(replace_percentage, text) |
|
elif interpret_as == "currency": |
|
text = RE_CURRENCY.sub(replace_currency, text) |
|
text = RE_CURRENCY_2.sub(replace_currency_2, text) |
|
text = text.replace(",", "") |
|
text = RE_NUMBER.sub(replace_number, text) |
|
text = RE_FRAC.sub(replace_frac, text) |
|
elif interpret_as == "digits": |
|
text = RE_DIGITS.sub(replace_default_num_without_altone, text) |
|
elif interpret_as == "telephone": |
|
text = RE_MOBILE_PHONE.sub(replace_mobile, text) |
|
text = RE_TELEPHONE.sub(replace_phone, text) |
|
text = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, text) |
|
text = RE_DIGITS.sub(replace_default_num_with_altone, text) |
|
elif interpret_as == "address": |
|
text = text.replace("-", "杠") |
|
text = RE_ADDRESS_room.sub(replace_address_room, text) |
|
text = RE_ADDRESS.sub(replace_address, text) |
|
elif interpret_as == "date": |
|
text = RE_DATE.sub(replace_date, text) |
|
text = RE_DATE2.sub(replace_date2, text) |
|
text = convert_date(text) |
|
text = text.replace("-", "至") |
|
elif interpret_as == "time": |
|
text = RE_TIME_RANGE.sub(replace_time, text) |
|
text = RE_TIME.sub(replace_time, text) |
|
elif interpret_as == "id": |
|
text = RE_DIGITS.sub(replace_default_num_with_altone, text) |
|
text = text.replace("_", "下划线").replace("-", "杠").upper() |
|
text = re.sub("[a-zA-Z]+", add_blank, text) |
|
elif interpret_as == "measure": |
|
text = text.replace("㎡", "m²") |
|
text = text.replace("cm²", "平方厘米") |
|
text = text.replace("m²", "平方米") |
|
text = text.replace("cm", "厘米") |
|
text = text.replace("mm", "毫米") |
|
text = text.replace("m", "米") |
|
text = text.replace("kg", "千克") |
|
text = text.replace("g", "克") |
|
elif interpret_as == "punctuation": |
|
text = re.sub("…+", "省略号", text) |
|
text = re.sub("\"|“|”", "双引号", text) |
|
text = re.sub("'|‘|’", "单引号", text) |
|
text = re.sub("(|\\(", "左括号", text) |
|
text = re.sub(")|\\)", "右括号", text) |
|
text = re.sub("!|!", "叹号", text) |
|
for sym, txt in zip( |
|
['……', '…', '!', '"', '#', '$', '%', '&', '‘', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_'], |
|
['省略号', '省略号', '叹号', '双引号', '井号', 'dollar', '百分号', 'and', '单引号', '左括号', '右括号', '星号', '加号', '逗号', '杠', '点', '斜杠', '冒号', '分号', '小于', '等号', '大于', '问号', 'at', '左方括号', '反斜线', '右方括号', '脱字符', '下划线'] |
|
): |
|
text = text.replace(sym, txt) |
|
return text |
|
|
|
@classmethod |
|
def normalize_regular(cls, text: str, is_en: bool = False, return_details: bool = False) -> str: |
|
"""通用正则 |
|
|
|
包含了所有可能情况的正则匹配 |
|
输出的文本中只包含中文、英文、tts所需的标点(",。!?") |
|
""" |
|
trace = [] |
|
if is_en is True: |
|
text = en_normalize_numbers(text) |
|
text = text.replace(".", "。") |
|
text = text.replace(",", ",") |
|
else: |
|
text = cls.substitute(RE_DATE, replace_date, text, trace) |
|
text = cls.substitute(RE_DATE2, replace_date2, text, trace) |
|
text = re.sub(r"(?<=[\d%])[-~](?=\d)", "至", text) |
|
text = cls.substitute(RE_TIME_RANGE, replace_time, text, trace) |
|
text = cls.substitute(RE_TIME, replace_time, text, trace) |
|
text = cls.substitute(RE_CURRENCY, replace_currency, text, trace) |
|
text = cls.substitute(RE_CURRENCY_2, replace_currency_2, text, trace) |
|
text = cls.substitute(RE_TEMPERATURE, replace_temperature, text, trace) |
|
text = cls.substitute(RE_LICENSE_PLATE, replace_license_plate, text, trace) |
|
text = cls.substitute(RE_FRAC, replace_frac, text, trace) |
|
text = cls.substitute(RE_PERCENTAGE, replace_percentage, text, trace) |
|
text = cls.substitute(RE_MOBILE_PHONE, replace_mobile, text, trace) |
|
text = cls.substitute(RE_TELEPHONE, replace_phone, text, trace) |
|
text = cls.substitute(RE_NATIONAL_UNIFORM_NUMBER, replace_phone, text, trace) |
|
text = cls.substitute(RE_RANGE, replace_range, text, trace) |
|
text = cls.substitute(RE_INTEGER, replace_negative_num, text, trace) |
|
text = cls.substitute(RE_DECIMAL_NUM, replace_number, text, trace) |
|
text = cls.substitute(RE_POSITIVE_QUANTIFIERS_2, replace_positive_quantifier_2, text, trace) |
|
text = cls.substitute(RE_POSITIVE_QUANTIFIERS, replace_positive_quantifier, text, trace) |
|
text = cls.substitute(RE_DEFAULT_NUM, replace_default_num_with_altone, text, trace) |
|
text = cls.substitute(RE_NUMBER, replace_number, text, trace) |
|
|
|
return text, trace |
|
|