Ming-Lite-Omni / sentence_manager /text_norm /normalizer.py

LandyGuo

update 20250516 version

81a8221 2 months ago

12.9 kB

	# -- encoding: utf-8 --
	"""
	@Auth: xuelyuxin.xlx
	@Time: 2023/05/12 14:55:44
	@Desc: 正则模块
	`Normalizer.normalize_regular`输出内容只包含简体中文、英文、tts相关标点的文本
	@Usge: null
	"""

	import re
	from typing import List, Dict
	from .utils.chronology import RE_DATE
	from .utils.chronology import RE_DATE2
	from .utils.chronology import RE_TIME, RE_TIME_2, RE_TIME_3
	from .utils.chronology import RE_TIME_RANGE
	from .utils.chronology import replace_date
	from .utils.chronology import replace_date2
	from .utils.chronology import replace_time, replace_time_nohour
	from .utils.num import RE_DECIMAL_NUM
	from .utils.num import RE_DEFAULT_NUM
	from .utils.num import RE_FRAC
	from .utils.num import RE_INTEGER
	from .utils.num import RE_NUMBER
	from .utils.num import RE_PERCENTAGE
	from .utils.num import RE_POSITIVE_QUANTIFIERS
	from .utils.num import RE_POSITIVE_QUANTIFIERS_2
	from .utils.num import RE_RANGE
	from .utils.num import RE_DIGITS
	from .utils.num import RE_LICENSE_PLATE
	from .utils.num import replace_default_num_with_altone, replace_default_num_without_altone
	from .utils.num import replace_frac
	from .utils.num import replace_negative_num
	from .utils.num import replace_number
	from .utils.num import replace_percentage
	from .utils.num import replace_positive_quantifier
	from .utils.num import replace_positive_quantifier_2
	from .utils.num import replace_range
	from .utils.num import replace_license_plate
	from .utils.phonecode import RE_MOBILE_PHONE
	from .utils.phonecode import RE_NATIONAL_UNIFORM_NUMBER
	from .utils.phonecode import RE_TELEPHONE
	from .utils.phonecode import replace_mobile
	from .utils.phonecode import replace_phone
	from .utils.quantifier import RE_TEMPERATURE
	from .utils.quantifier import replace_temperature
	from .utils.address import RE_ADDRESS_room, RE_ADDRESS
	from .utils.address import replace_address_room, replace_address
	from .utils.currency import RE_CURRENCY, RE_CURRENCY_2
	from .utils.currency import replace_currency, replace_currency_2
	from .utils.en_num import normalize_numbers as en_normalize_numbers
	from .utils.string_operator import BLANK_CHAR, PUNC_MAP_OTHER2CN, PUNC_MAP_STANDARD, PUNC_STANDARD, REGEX_CN # noqa
	from .utils.string_operator import StringOperator as stringop


	def add_blank(match_obj):
	return " ".join(list(match_obj.group(0)))


	def convert_date(string):
	nums = re.findall("[\d]+", string)
	if len(nums) == 3:
	year, month, day = [RE_NUMBER.sub(replace_number, w) for w in nums]
	return f"{year}年{month}月{day}日"
	elif len(nums) == 2:
	month, day = [RE_NUMBER.sub(replace_number, w) for w in nums]
	return f"{month}月{day}日"
	else:
	return string


	class Normalizer:
	"""文本正则
	"""

	@classmethod
	def substitute(cls, pattern: re.Pattern, replace_func, text: str, trace: list):
	for matchobj in pattern.finditer(text):
	origin_word = matchobj.group(0)
	new_word = replace_func(matchobj)
	trace.append({"origin_word": origin_word, "new_word": new_word})
	text = text.replace(origin_word, new_word)
	return text

	@classmethod
	def preprocess(cls, text: str) -> str:
	"""正则前的预处理

	1. 繁体转简体
	2. 过滤不影响正则的标点符号等，包括
	a. 数字间的逗号
	b. 空格（除英文之间外）
	c. 空白字符\t\n\r\f
	3. 英文转小写
	"""
	text = stringop.replace_F2H(text) # 统一转半角
	text = stringop.delete_comma_in_number(text) # 去掉数字之间的逗号
	# text = stringop.delete_space(text)
	text = re.sub(rf"[{BLANK_CHAR}]", "，", text) # 去掉不影响正则的字符
	# text = text.lower()
	# 处理特殊符号
	text = re.sub(r"㎡", "m²", text)
	text = text.replace("㎡", "m²")
	text = text.replace("cm²", "平方厘米")
	text = text.replace("m²", "平方米")
	# text = text.replace("<", "<")
	# text = text.replace(">", ">")
	# text = text.replace("&", "&")
	# text = text.replace(""", "\"")
	# text = text.replace("'", "'")
	text = re.sub(r">(\d)", r"大于\1", text)
	text = re.sub(r"<(\d)", r"小于\1", text)
	text = re.sub(r"=", "等于", text)
	text = re.sub(r"(?<=\d)ml(?![a-zA-Z])", "毫升", text)
	text = re.sub(r"(?<=\d)mmHg(?![a-zA-Z])", "毫米汞柱", text)
	text = re.sub(r"([0-9.]+元)(-)([0-9.]+元)", "\\1至\\3", text)
	return text

	@classmethod
	def postprocess(cls, text: str, custom: List[Dict] = None) -> str:
	"""正则后处理，只包括删除和替换操作
	正则后的文本中的字符类型只包括：
	1. 中文 2. 英文 3. 标点`，。！？`
	"""
	if custom is not None:
	for map_dict in custom:
	text = stringop.replace(text, map_dict)
	return text
	# 标点符号统一为中文标点`。！？，`
	text = stringop.replace_punc_en2cn(text)
	text = stringop.replace(text, PUNC_MAP_OTHER2CN)
	# 进一步把中文标点统一为标准中文标点`。！？，`
	text = stringop.replace(text, PUNC_MAP_STANDARD)
	# 处理连续句号"。"
	text = re.sub(r"。+", "。", text)
	# 处理正则后的 "/"
	text = re.sub("/", "每", text)
	# 处理正则后的[~～]+
	text = re.sub(r"～+", "～", text)
	text = re.sub(r"~+", "~", text)
	# text = re.sub(r"[~～](?=[0-9])", "至", text)
	text = re.sub(r"[~～]", "。", text)
	# 删除除了中文、英文、数字、标准中文标点、@break外的其他符号
	text = stringop.delete(text, f"[^{PUNC_STANDARD}{REGEX_CN}A-Za-z0-9@]")
	text = stringop.delete(text, "@(?!break)") # 删除@符号，`@break`除外
	return text

	@classmethod
	def custom(cls, text: str, *, interpret_as: str) -> str:
	text = cls.preprocess(text)
	if not text:
	return "" # 对于预处理后为空的字符串直接返回
	text = cls.normalize_custom(text, interpret_as=interpret_as)
	text = cls.postprocess(text)
	return text

	@classmethod
	def regular(cls, text: str) -> str:
	text = cls.preprocess(text)
	if not text:
	return "" # 对于预处理后为空的字符串直接返回
	text = cls.normalize_regular(text)
	text = cls.postprocess(text)
	return text

	@classmethod
	def normalize_custom(cls, text: str, *, interpret_as: str) -> str:
	"""指定正则

	仅针对`interpret_as`进行正则匹配

	Args:
	interpret_as: string in ['cardinal', 'currency', 'digits', 'telephone', 'address', 'date', 'time', 'id']
	"""
	assert interpret_as in [
	"cardinal",
	"currency",
	"digits",
	"telephone",
	"address",
	"date",
	"time",
	"id",
	"measure",
	"punctuation"
	], f"""
	interpret_as:{interpret_as} not supported
	""".strip()

	if interpret_as == "cardinal":
	text = text.replace(",", "")
	text = RE_NUMBER.sub(replace_number, text) # 正负整数小数
	text = RE_FRAC.sub(replace_frac, text)
	text = RE_PERCENTAGE.sub(replace_percentage, text)
	elif interpret_as == "currency":
	text = RE_CURRENCY.sub(replace_currency, text)
	text = RE_CURRENCY_2.sub(replace_currency_2, text)
	text = text.replace(",", "")
	text = RE_NUMBER.sub(replace_number, text) # 正负整数小数
	text = RE_FRAC.sub(replace_frac, text)
	elif interpret_as == "digits":
	text = RE_DIGITS.sub(replace_default_num_without_altone, text)
	elif interpret_as == "telephone":
	text = RE_MOBILE_PHONE.sub(replace_mobile, text)
	text = RE_TELEPHONE.sub(replace_phone, text)
	text = RE_NATIONAL_UNIFORM_NUMBER.sub(replace_phone, text)
	text = RE_DIGITS.sub(replace_default_num_with_altone, text)
	elif interpret_as == "address":
	text = text.replace("-", "杠")
	text = RE_ADDRESS_room.sub(replace_address_room, text)
	text = RE_ADDRESS.sub(replace_address, text)
	elif interpret_as == "date":
	text = RE_DATE.sub(replace_date, text) # 年月日
	text = RE_DATE2.sub(replace_date2, text) # YY/MM/DD 或者 YY-MM-DD
	text = convert_date(text)
	text = text.replace("-", "至")
	elif interpret_as == "time":
	text = RE_TIME_RANGE.sub(replace_time, text) # 8:30-12:30
	text = RE_TIME.sub(replace_time, text) # 12:30:58
	elif interpret_as == "id":
	text = RE_DIGITS.sub(replace_default_num_with_altone, text)
	text = text.replace("_", "下划线").replace("-", "杠").upper()
	text = re.sub("[a-zA-Z]+", add_blank, text)
	elif interpret_as == "measure":
	text = text.replace("㎡", "m²")
	text = text.replace("cm²", "平方厘米")
	text = text.replace("m²", "平方米")
	text = text.replace("cm", "厘米")
	text = text.replace("mm", "毫米")
	text = text.replace("m", "米")
	text = text.replace("kg", "千克")
	text = text.replace("g", "克")
	elif interpret_as == "punctuation":
	text = re.sub("…+", "省略号", text)
	text = re.sub("\"\|“\|”", "双引号", text)
	text = re.sub("'\|‘\|’", "单引号", text)
	text = re.sub("（\|\\(", "左括号", text)
	text = re.sub("）\|\\)", "右括号", text)
	text = re.sub("!\|！", "叹号", text)
	for sym, txt in zip(
	['……', '…', '!', '"', '#', '$', '%', '&', '‘', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_'], # noqa
	['省略号', '省略号', '叹号', '双引号', '井号', 'dollar', '百分号', 'and', '单引号', '左括号', '右括号', '星号', '加号', '逗号', '杠', '点', '斜杠', '冒号', '分号', '小于', '等号', '大于', '问号', 'at', '左方括号', '反斜线', '右方括号', '脱字符', '下划线'] # noqa
	):
	text = text.replace(sym, txt)
	return text

	@classmethod
	def normalize_regular(cls, text: str, is_en: bool = False, return_details: bool = False) -> str:
	"""通用正则

	包含了所有可能情况的正则匹配
	输出的文本中只包含中文、英文、tts所需的标点("，。！？")
	"""
	trace = []
	if is_en is True:
	text = en_normalize_numbers(text)
	text = text.replace(".", "。")
	text = text.replace(",", "，")
	else:
	text = cls.substitute(RE_DATE, replace_date, text, trace)
	text = cls.substitute(RE_DATE2, replace_date2, text, trace)
	text = re.sub(r"(?<=[\d%])[-~](?=\d)", "至", text) # 先判断日期（2023-01-02），并跟减号负号区分
	text = cls.substitute(RE_TIME_RANGE, replace_time, text, trace)
	text = cls.substitute(RE_TIME, replace_time, text, trace)
	text = cls.substitute(RE_CURRENCY, replace_currency, text, trace)
	text = cls.substitute(RE_CURRENCY_2, replace_currency_2, text, trace)
	text = cls.substitute(RE_TEMPERATURE, replace_temperature, text, trace)
	text = cls.substitute(RE_LICENSE_PLATE, replace_license_plate, text, trace)
	text = cls.substitute(RE_FRAC, replace_frac, text, trace)
	text = cls.substitute(RE_PERCENTAGE, replace_percentage, text, trace)
	text = cls.substitute(RE_MOBILE_PHONE, replace_mobile, text, trace)
	text = cls.substitute(RE_TELEPHONE, replace_phone, text, trace)
	text = cls.substitute(RE_NATIONAL_UNIFORM_NUMBER, replace_phone, text, trace)
	text = cls.substitute(RE_RANGE, replace_range, text, trace)
	text = cls.substitute(RE_INTEGER, replace_negative_num, text, trace)
	text = cls.substitute(RE_DECIMAL_NUM, replace_number, text, trace)
	text = cls.substitute(RE_POSITIVE_QUANTIFIERS_2, replace_positive_quantifier_2, text, trace)
	text = cls.substitute(RE_POSITIVE_QUANTIFIERS, replace_positive_quantifier, text, trace)
	text = cls.substitute(RE_DEFAULT_NUM, replace_default_num_with_altone, text, trace)
	text = cls.substitute(RE_NUMBER, replace_number, text, trace)

	return text, trace