File size: 1,541 Bytes
227479d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import re

def japanese_cleaners(text):
    """Romanize ``text`` and terminate it with a period when needed.

    The input is passed through ``japanese_to_romaji_with_accent``. A ``'.'``
    is appended to the result when it is empty or when its last character is
    an ASCII letter; otherwise the romanized string is returned as-is.
    """
    # Imported at call time rather than at module import time.
    from text.japanese import japanese_to_romaji_with_accent

    romaji = japanese_to_romaji_with_accent(text)

    # Nothing came back: the result is just the terminator.
    if not romaji:
        return romaji + '.'

    # A trailing letter means the utterance has no closing punctuation yet.
    if re.match('[A-Za-z]', romaji[-1]):
        return romaji + '.'

    return romaji


def japanese_cleaners2(text):
    """Run ``japanese_cleaners`` with extra symbol normalisation around it.

    Before cleaning, a run of three middle dots (``・・・``) becomes an ellipsis
    and any remaining single middle dot becomes a space. After cleaning,
    ``ts`` is rewritten to ``ʦ``, ``...`` to ``…``, round/square/curly
    brackets are removed and ``*`` is turned into a space.
    """
    prepared = text.replace('・・・', '…').replace('・', ' ')
    cleaned = japanese_cleaners(prepared)

    # Applied strictly in this order, one substitution at a time.
    substitutions = (
        ('ts', 'ʦ'),
        ('...', '…'),
        ('(', ''),
        (')', ''),
        ('[', ''),
        (']', ''),
        ('*', ' '),
        ('{', ''),
        ('}', ''),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned


def ko2kata(text):
    """Return ``text`` unchanged.

    NOTE(review): the name suggests a Korean-to-katakana conversion, but no
    conversion is implemented here; confirm whether this stub is intentional.
    """
    return text

def en2kata(text):
    """Return ``text`` unchanged.

    NOTE(review): the name suggests an English-to-katakana conversion, but no
    conversion is implemented here; confirm whether this stub is intentional.
    """
    return text



def jke_cleaners(text):
    """Clean text containing ``[JA]``, ``[KO]`` and ``[EN]`` tagged spans.

    Each ``[XX]...[XX]`` span is replaced by its inner text followed by a
    space. Korean spans go through ``ko2kata`` and English spans through
    ``en2kata`` first; Japanese spans are inserted as-is. The whole string is
    then passed to ``japanese_cleaners2``, its final character is dropped, and
    a ``'.'`` is appended unless the string already ends in one of
    ``. , ! ? - … ~``.

    An input that cleans down to an empty string yields ``'.'`` instead of
    raising ``IndexError``.
    """
    # Tag name -> converter for the inner text (None: use the text unchanged).
    converters = (
        ('JA', None),
        ('KO', ko2kata),
        ('EN', en2kata),
    )

    # Collect every tagged span from the untouched input before rewriting
    # anything, so earlier replacements cannot affect later matches.
    tagged = [
        (re.findall(r'\[%s\].*?\[%s\]' % (tag, tag), text), convert)
        for tag, convert in converters
    ]

    for spans, convert in tagged:
        for span in spans:
            # Strip the 4-character opening and closing tags.
            inner = span[4:-4]
            if convert is not None:
                inner = convert(inner)
            text = text.replace(span, inner + ' ', 1)

    text = japanese_cleaners2(text)

    # Drop the final character of the cleaned string.
    # NOTE(review): presumably this removes the terminator/space left by the
    # steps above; confirm against japanese_to_romaji_with_accent's output.
    text = text[:-1]

    # Guard the empty case before indexing; an empty result gets a bare
    # period, matching how japanese_cleaners treats empty text.
    if not text or re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text