# Post-processing rules for text normalization.
#
# `postprocess` is a list of rule groups. Each group is a mapping of
# pattern -> replacement; several patterns use regex syntax (escapes,
# quantifiers, look-around), so keys are regexes rather than literals.
# NOTE(review): later groups depend on the output of earlier ones, so the
# consumer presumably applies groups top to bottom and keeps key order
# within a group — confirm against the loader.
# NOTE(review): nesting was rebuilt from a flattened copy; confirm that
# `split_token` and `split_cn_length` belong under `text_norm`.
text_norm:
  postprocess:
    # EN2CN: ASCII / Western punctuation -> full-width Chinese punctuation.
    - "…": "。"
      "!": "!"
      "\\?": "?"
      ";": ";"
      ":": ":"
      ",": ","
      "\\(": "("
      "\\)": ")"
    # EN2CN: double ellipsis -> full stop.
    # NOTE(review): the group above already rewrites every single "…", so
    # this rule looks unreachable; the end result is the same either way.
    - "……": "。"
    # OTHER2CN: small / half-width variants -> standard Chinese punctuation.
    # NOTE(review): the source of the third rule is assumed to be the
    # half-width ideographic full stop (U+FF61) — confirm.
    - "﹐": ","
      "﹔": ";"
      "。": "。"
      # CN2CN: reduce Chinese punctuation to full stop and comma only.
      ";": "。"
      ":": ","
      "、": ","
    # Collapse a run of full stops into a single one.
    - "。+": "。"
    # A "/" left over from the regex pass is read as "每" (per).
    - "/": "每"
    # Underscores become spaces.
    - "_": " "
    # Tildes left over from the regex pass: collapse each run (ASCII and
    # full-width separately), then turn the remaining tilde into "。".
    # The earlier comment also mentioned "至" depending on position, but
    # only the "。" replacement is implemented here.
    - "~+": "~"
      "~+": "~"
      "[~~]": "。"
    # Runs of "-" or "'" that are preceded by, or followed by, a character
    # that is not an English letter become a comma.
    - "(?<=[^a-zA-Z])[-']+": ","
      "[-']+(?=[^a-zA-Z])": ","
    # Delete everything except 。!?,, the listed CJK ranges, English
    # letters and spaces.
    # NOTE(review): the earlier comment also listed "-", "'" and digits as
    # kept, but the character class does not include them — confirm intent.
    - "[^。!?,\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4a-zA-Z ]": ""
    # Collapse a run of commas into a single one.
    - ",+": ","
    # Collapse a run of spaces into a single one.
    - " +": " "
  # Punctuation marks listed as split tokens; both survive the rules above.
  split_token: ["。", ","]
  # No value is configured for this option.
  split_cn_length: null