hynt committed on
Commit
d99103c
·
verified ·
1 Parent(s): 5e8f0ac

change chunk sentence method

Browse files
Files changed (1) hide show
  1. f5_tts/infer/utils_infer.py +45 -23
f5_tts/infer/utils_infer.py CHANGED
@@ -67,33 +67,55 @@ fix_duration = None
67
 
68
 
69
  def chunk_text(text, max_chars=135):
70
- """
71
- Splits the input text into chunks, each with a maximum number of characters.
72
 
73
- Args:
74
- text (str): The text to be split.
75
- max_chars (int): The maximum number of characters per chunk.
76
-
77
- Returns:
78
- List[str]: A list of text chunks.
79
- """
80
- chunks = []
81
- current_chunk = ""
82
- # Split the text into sentences based on punctuation followed by whitespace
83
- sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)
84
-
85
- for sentence in sentences:
86
- if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
87
- current_chunk += sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
 
 
 
88
  else:
89
- if current_chunk:
90
- chunks.append(current_chunk.strip())
91
- current_chunk = sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
92
 
93
- if current_chunk:
94
- chunks.append(current_chunk.strip())
95
 
96
- return chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
98
 
99
  # load vocoder
 
67
 
68
 
69
  def chunk_text(text, max_chars=135):
 
 
70
 
71
+ # print(text)
72
+
73
+ # Bước 1: Tách câu theo dấu ". "
74
+ sentences = [s.strip() for s in text.split('. ') if s.strip()]
75
+
76
+ # Ghép câu ngắn hơn 4 từ với câu liền kề
77
+ i = 0
78
+ while i < len(sentences):
79
+ if len(sentences[i].split()) < 4:
80
+ if i == 0:
81
+ # Ghép với câu sau
82
+ sentences[i + 1] = sentences[i] + ', ' + sentences[i + 1]
83
+ del sentences[i]
84
+ else:
85
+ # Ghép với câu trước
86
+ sentences[i - 1] = sentences[i - 1] + ', ' + sentences[i]
87
+ del sentences[i]
88
+ i -= 1
89
  else:
90
+ i += 1
 
 
91
 
92
+ # print(sentences)
 
93
 
94
+ # Bước 2: Tách phần quá dài trong câu theo dấu ", "
95
+ final_sentences = []
96
+ for sentence in sentences:
97
+ parts = [p.strip() for p in sentence.split(', ')]
98
+ buffer = []
99
+ for part in parts:
100
+ buffer.append(part)
101
+ total_words = sum(len(p.split()) for p in buffer)
102
+ if total_words > 20:
103
+ # Tách câu ra
104
+ long_part = ', '.join(buffer)
105
+ final_sentences.append(long_part)
106
+ buffer = []
107
+ if buffer:
108
+ final_sentences.append(', '.join(buffer))
109
+
110
+ # print(final_sentences)
111
+
112
+ if len(final_sentences[-1].split()) < 4 and len(final_sentences) >= 2:
113
+ final_sentences[-2] = final_sentences[-2] + ", " + final_sentences[-1]
114
+ final_sentences = final_sentences[0:-1]
115
+
116
+ # print(final_sentences)
117
+
118
+ return final_sentences
119
 
120
 
121
  # load vocoder