# Spaces: Runtime error
| import re | |
def extract_language_and_text_updated(speaker, dialogue):
    """Parse one speaker's dialogue into language-tagged text segments.

    Args:
        speaker: Speaker label including its surrounding delimiters, e.g.
            ``"[name]"``; the first and last characters are dropped.
        dialogue: Text made of ``<lang>text`` segments.

    Returns:
        A list of ``(LANG, text)`` tuples in order of appearance, with the
        language tag upper-cased and the text stripped of surrounding
        whitespace, followed by the bare speaker name as the final element.
    """
    # A tag in angle brackets followed by everything up to the next "<".
    tagged_segment = re.compile(r"<(\S+?)>([^<]+)", re.DOTALL)

    segments = []
    for found in tagged_segment.finditer(dialogue):
        language, content = found.groups()
        segments.append((language.upper(), content.strip()))

    # The speaker name (without its first and last character) goes last.
    segments.append(speaker[1:-1])
    return segments
def validate_text(input_text):
    """Check that the input contains at least one well-formed speaker block.

    A speaker block is a bracketed speaker name followed by one or more
    ``<lang>text`` segments, e.g. ``[name]<zh>...<jp>...``.

    Args:
        input_text: Raw script text to validate.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is ``False`` with an error message
        when no speaker block is found or when a speaker block yields no
        language segments; otherwise ``(True, "Input is valid.")``.
    """
    # Speaker label followed by at least one <lang>text segment.
    pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    # Nothing in the input looks like "[speaker]<lang>text".
    if not matches:
        return (
            False,
            "Error: No valid speaker format detected. Please check your input.",
        )

    for speaker, dialogue in matches:
        parsed = extract_language_and_text_updated(speaker, dialogue)
        # The helper appends the speaker name as the final element, so the
        # list itself is never empty; only the entries before it are
        # language segments, and those are what must be present.
        if not parsed[:-1]:
            return (
                False,
                "Error: Invalid format detected in dialogue content. Please check your input.",
            )

    return True, "Input is valid."
def text_matching(text: str) -> list:
    """Split text into speaker blocks and parse each block's dialogue.

    Args:
        text: Script text containing ``[speaker]`` labels, each followed by
            that speaker's dialogue.

    Returns:
        One parsed entry per speaker block, in order of appearance, as
        produced by ``extract_language_and_text_updated``.
    """
    # A speaker label, then the shortest run of text that ends right before
    # the next speaker label or at the end of the input.
    speaker_blocks = re.compile(r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)", re.DOTALL)
    return [
        extract_language_and_text_updated(found.group(1), found.group(2))
        for found in speaker_blocks.finditer(text)
    ]
def cut_para(text):
    """Split text into paragraphs, one per non-blank line.

    Args:
        text: Input text with paragraphs separated by newlines.

    Returns:
        The lines of ``text`` with surrounding whitespace removed, skipping
        lines that are empty after stripping.
    """
    paragraphs = []
    for line in re.split("[\n]", text):
        trimmed = line.strip()
        # Drop blank lines.
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs
def cut_sent(para):
    """Split a paragraph into sentences at sentence-ending punctuation.

    Both the ASCII and the fullwidth forms of ``!``, ``?`` and ``;`` are
    treated as terminators, together with ``。``, a run of six dots and a
    double ellipsis character. A terminator that is immediately followed by
    a closing quote (``”`` or ``’``) stays attached to the quote; the split
    then happens after the quote instead.

    Args:
        para: Paragraph text.

    Returns:
        A list of sentence strings. Trailing whitespace of the paragraph is
        removed before splitting, and an empty paragraph yields ``[""]``.
    """
    # Patterns are raw strings so the regex escapes are not read as (invalid)
    # string escapes.
    # Single-character terminators not followed by a closing quote.
    para = re.sub(r"([。!！;；?？])([^”’])", r"\1\n\2", para)
    # Six-dot ASCII ellipsis.
    para = re.sub(r"(\.{6})([^”’])", r"\1\n\2", para)
    # Double "…" ellipsis.
    para = re.sub(r"(…{2})([^”’])", r"\1\n\2", para)
    # Terminator plus closing quote: split after the quote unless more
    # punctuation follows directly.
    para = re.sub(r"([。!！?？][”’])([^,，。!！?？])", r"\1\n\2", para)
    # Remove any trailing newline/whitespace so no empty last sentence appears.
    para = para.rstrip()
    return para.split("\n")
if __name__ == "__main__":
    # Sample with a speaker that has no dialogue and speakers that switch
    # languages several times; the parsed result is not used.
    multi_speaker_sample = """
[说话人1]
[说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
[说话人3]<zh>谢谢。<jp>どういたしまして。
"""
    text_matching(multi_speaker_sample)

    # Second sample: parse it, then validate it and print the verdict.
    validation_sample = """
[说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
[说话人2]<zh>你好吗?
"""
    text_matching(validation_sample)
    verdict = validate_text(validation_sample)
    print(verdict)