Spaces:

hynt
/

F5-TTS-Vietnamese-100h

Running

App Files Files Community

hynt committed on Apr 23

Commit

d99103c

verified ·

1 Parent(s): 5e8f0ac

change chunk sentence method

Browse files

Files changed (1) hide show

f5_tts/infer/utils_infer.py +45 -23

f5_tts/infer/utils_infer.py CHANGED Viewed

@@ -67,33 +67,55 @@ fix_duration = None
 def chunk_text(text, max_chars=135):
-    """
-    Splits the input text into chunks, each with a maximum number of characters.
-    Args:
-        text (str): The text to be split.
-        max_chars (int): The maximum number of characters per chunk.
-    Returns:
-        List[str]: A list of text chunks.
-    """
-    chunks = []
-    current_chunk = ""
-    # Split the text into sentences based on punctuation followed by whitespace
-    sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[；：，。！？])", text)
-    for sentence in sentences:
-        if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
-            current_chunk += sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
         else:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-    return chunks
 # load vocoder

 def chunk_text(text, max_chars=135):
+    # print(text)
+    # Bước 1: Tách câu theo dấu ". "
+    sentences = [s.strip() for s in text.split('. ') if s.strip()]
+    # Ghép câu ngắn hơn 4 từ với câu liền kề
+    i = 0
+    while i < len(sentences):
+        if len(sentences[i].split()) < 4:
+            if i == 0:
+                # Ghép với câu sau
+                sentences[i + 1] = sentences[i] + ', ' + sentences[i + 1]
+                del sentences[i]
+            else:
+                # Ghép với câu trước
+                sentences[i - 1] = sentences[i - 1] + ', ' + sentences[i]
+                del sentences[i]
+                i -= 1
         else:
+            i += 1
+    # print(sentences)
+    # Bước 2: Tách phần quá dài trong câu theo dấu ", "
+    final_sentences = []
+    for sentence in sentences:
+        parts = [p.strip() for p in sentence.split(', ')]
+        buffer = []
+        for part in parts:
+            buffer.append(part)
+            total_words = sum(len(p.split()) for p in buffer)
+            if total_words > 20:
+                # Tách câu ra
+                long_part = ', '.join(buffer)
+                final_sentences.append(long_part)
+                buffer = []
+        if buffer:
+            final_sentences.append(', '.join(buffer))
+    # print(final_sentences)
+    if len(final_sentences[-1].split()) < 4 and len(final_sentences) >= 2:
+        final_sentences[-2] = final_sentences[-2] + ", " + final_sentences[-1]
+        final_sentences = final_sentences[0:-1]
+    # print(final_sentences)
+    return final_sentences
 # load vocoder