Spaces:
Running
Running
| import regex as re | |
# Choose the language-identification backend. The project configuration is
# optional: if it cannot be imported or lacks the expected attribute, fall
# back to "langid" so this module still works on its own.
try:
    from config import config

    LANGUAGE_IDENTIFICATION_LIBRARY = (
        config.webui_config.language_identification_library
    )
except Exception:
    # Best-effort by design, but unlike a bare ``except:`` this lets
    # KeyboardInterrupt and SystemExit propagate.
    LANGUAGE_IDENTIFICATION_LIBRARY = "langid"

# Lower-cased backend name; classify_language() compares against this value.
module = LANGUAGE_IDENTIFICATION_LIBRARY.lower()
# Two-letter language codes used by classify_language() to filter the
# caller's candidate list before it is handed to langid.set_languages().
# Kept in alphabetical order; ``.split()`` turns the text into a list of str.
langid_languages = (
    "af am an ar as az be bg bn br "
    "bs ca cs cy da de dz el en eo "
    "es et eu fa fi fo fr ga gl gu "
    "he hi hr ht hu hy id is it ja "
    "jv ka kk km kn ko ku ky la lb "
    "lo lt lv mg mk ml mn mr ms mt "
    "nb ne nl nn no oc or pa pl ps "
    "pt qu ro ru rw se si sk sl sq "
    "sr sv sw ta te th tl tr ug uk "
    "ur vi vo wa xh zh zu"
).split()
def classify_language(text: str, target_languages: list = None) -> str:
    """Identify the language of ``text`` with the configured backend.

    The backend is chosen by the module-level ``module`` value: "fastlid"
    or "fasttext" uses the ``fastlid`` package, "langid" uses ``langid``.
    The backend package is imported lazily, inside the call.

    Args:
        text: The text to classify.
        target_languages: Optional list of candidate language codes. Codes
            the selected backend does not list as supported are dropped
            before the restriction is applied. When ``None``, no
            restriction is set by this call.

    Returns:
        The first element of the backend's result for ``text``
        (presumably the language code of a ``(code, score)`` pair).

    Raises:
        ValueError: If ``module`` is not one of the known backend names.
    """
    if module in ("fastlid", "fasttext"):
        from fastlid import fastlid, supported_langs

        classifier = fastlid
        if target_languages is not None:
            target_languages = [
                lang for lang in target_languages if lang in supported_langs
            ]
            # NOTE(review): assigned rather than called, unlike
            # langid.set_languages() below — presumably fastlid's API; confirm.
            fastlid.set_languages = target_languages
    elif module == "langid":
        import langid

        classifier = langid.classify
        if target_languages is not None:
            target_languages = [
                lang for lang in target_languages if lang in langid_languages
            ]
            # NOTE(review): this configures the langid module itself, so the
            # restriction presumably persists into later calls that pass
            # target_languages=None — confirm that is intended.
            langid.set_languages(target_languages)
    else:
        raise ValueError(f"Wrong module {module}")

    return classifier(text)[0]
def classify_zh_ja(text: str) -> str:
    """Distinguish Japanese from Chinese text by the presence of kana.

    Args:
        text: The text to inspect.

    Returns:
        "ja" if any character of ``text`` lies in U+3040-U+30FF, otherwise
        "zh". Text with no such character (including the empty string or
        text with no CJK characters at all) is therefore reported as "zh".
    """
    # U+3040-U+309F and U+30A0-U+30FF are adjacent, so a single range test
    # covers both. No separate look-ahead after an ideograph is needed:
    # the character following it is itself visited by this scan.
    if any(0x3040 <= ord(char) <= 0x30FF for char in text):
        return "ja"
    return "zh"
def split_alpha_nonalpha(text, mode=1):
    """Split ``text`` at the boundaries between Latin-script and CJK runs.

    The split points are zero-width, so no characters are removed. The
    patterns use ``\\p{Latin}``, which relies on ``re`` being the ``regex``
    package as imported at the top of this file.

    Args:
        text: The string to split.
        mode: Decides which side digits belong to.
            1 - digits are grouped with the CJK characters.
            2 - digits are grouped with the Latin letters.

    Returns:
        The list of pieces produced by ``re.split``.

    Raises:
        ValueError: If ``mode`` is neither 1 nor 2.
    """
    if mode == 1:
        digits_with_cjk = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\d\s])(?=[\p{Latin}])|(?<=[\p{Latin}\s])(?=[\u4e00-\u9fff\u3040-\u30FF\d])"
        return re.split(digits_with_cjk, text)

    if mode == 2:
        digits_with_latin = r"(?<=[\u4e00-\u9fff\u3040-\u30FF\s])(?=[\p{Latin}\d])|(?<=[\p{Latin}\d\s])(?=[\u4e00-\u9fff\u3040-\u30FF])"
        return re.split(digits_with_latin, text)

    raise ValueError("Invalid mode. Supported modes are 1 and 2.")
if __name__ == "__main__":
    # Manual demo: run both classifiers on a Chinese and a Japanese sample.
    # classify_zh_ja is expected to print "zh" for the first and "ja" for
    # the second.
    for sample in ("这是一个测试文本", "これはテストテキストです"):
        print(classify_language(sample))
        print(classify_zh_ja(sample))

    # Manual demo: split mixed Latin/CJK text in both modes, first without
    # and then with spaces around the Latin words.
    # Outputs recorded alongside the original calls, in print order:
    # ['vits', '和', 'Bert-VITS', '2是', 'tts', '模型。花费3', 'days.花费3天。Take 3 days']
    # ['vits', '和', 'Bert-VITS2', '是', 'tts', '模型。花费', '3days.花费', '3', '天。Take 3 days']
    # ['vits ', '和 ', 'Bert-VITS', '2 ', '是 ', 'tts ', '模型。花费3', 'days.花费3天。Take ', '3 ', 'days']
    # ['vits ', '和 ', 'Bert-VITS2 ', '是 ', 'tts ', '模型。花费', '3days.花费', '3', '天。Take ', '3 ', 'days']
    for sample in (
        "vits和Bert-VITS2是tts模型。花费3days.花费3天。Take 3 days",
        "vits 和 Bert-VITS2 是 tts 模型。花费3days.花费3天。Take 3 days",
    ):
        for split_mode in (1, 2):
            print(split_alpha_nonalpha(sample, mode=split_mode))