# Text-normalization settings.
#
# `postprocess` is an ordered list of rule groups. Each group maps a pattern
# (the keys are written as regular expressions, e.g. "\\?" and lookarounds)
# to its replacement string. Order is kept exactly as authored, both between
# groups and inside each group.
#
# NOTE(review): several rules below map a character to itself (for example
# "!" -> "!" and "。" -> "。"). Presumably the replacement side was meant to be
# the full-width / CJK form of the punctuation mark -- confirm against the
# code that consumes this file before relying on these rules.
text_norm:
  postprocess:
    # EN2CN: English punctuation to Chinese punctuation.
    - "…": "。"
      "!": "!"
      "\\?": "?"
      ";": ";"
      ":": ":"
      ",": ","
      "\\(": "("
      "\\)": ")"

    # EN2CN: a doubled ellipsis becomes a full stop.
    - "……": "。"

    # OTHER2CN: other punctuation variants to Chinese punctuation.
    - "﹐": ","
      "﹔": ";"
      "。": "。"
      # CN2CN: reduce the remaining punctuation to full stop / comma.
      ";": "。"
      ":": ","
      "、": ","

    # Collapse consecutive full stops "。" into one.
    - "。+": "。"

    # "/" left over after the earlier normalization pass is read as "每" (per).
    - "/": "每"

    # Replace "_" with a space.
    - "_": " "

    # Handle runs of "~" left over after the earlier normalization pass.
    # The original note says they become "。" or "至" depending on whether
    # they end a sentence.
    # NOTE(review): the rules here only ever produce "。"; no rule emits "至".
    # NOTE(review): this group used to list the key "~+" twice with the same
    # value. Duplicate mapping keys are invalid YAML, so one copy was removed.
    # If the second entry was meant to match the full-width tilde, re-add it
    # with that character.
    - "~+": "~"
      "[~~]": "。"

    # "-" and "'" that are not inside an English word are replaced with ",".
    - "(?<=[^a-zA-Z])[-']+": ","
      "[-']+(?=[^a-zA-Z])": ","

    # Delete every character that is not in the keep-list. The CJK range
    # skips U+4E28 and U+4E3F through U+4E41.
    # NOTE(review): the original note says digits, "-" and "’" are kept too,
    # but the character class below does not include them -- confirm.
    - "[^。!?,\u4e00-\u4E27\u4E29-\u4E3E\u4E42-\u9fa4a-zA-Z ]": ""

    # Collapse consecutive commas into one.
    - ",+": ","

    # Collapse consecutive spaces into one.
    - " +": " "

  # NOTE(review): the nesting level of the two keys below was reconstructed
  # from a file whose indentation was lost; they are assumed to be siblings
  # of `postprocess` -- confirm against the loader.
  split_token: ["。", ","]
  split_cn_length: null