Spaces:

longvnhue1
/

finetune-deploy1

Sleeping

App Files Files Community

longvnhue1 committed on Jun 7

Commit

18f57a5

verified ·

1 Parent(s): 0382d49

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -3

app.py CHANGED Viewed

@@ -105,27 +105,56 @@ def startup_event():
     print("Model loaded and ready.")
 # Split `text` into a list of chunk strings, preferring to end each chunk on
 # a token that ends with '.'. This is the pre-commit side of the diff; lines
 # prefixed with '-' are the ones the commit removed.
 def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
     words = re.findall(r'\S+|\n', text)  # keep \n as its own "word"
     chunks = []
     start = 0
     while start < len(words):
         # `end` is the exclusive upper bound of the current window of tokens.
         end = min(start + max_words, len(words))
         dot_idx = -1
         # Scan tokens in [start + min_words, end); `dot_idx` is overwritten on
         # every hit, so it ends up holding the LAST token that ends with '.'.
-        for i in range(start + min_words, min(start + max_words, len(words))):
-            if words[i] == '.' or (words[i].endswith('.') and words[i] != '\n'):
                 dot_idx = i
         if dot_idx != -1:
             # Cut right after the sentence-ending token.
             chunk_end = dot_idx + 1
         elif end - start > fallback_words:
             # No '.' found and the window is longer than the fallback size:
             # cut after exactly `fallback_words` tokens.
             chunk_end = start + fallback_words
         else:
             # Otherwise take the whole window.
             chunk_end = end
         # Join tokens with single spaces, then drop the spaces that the join
         # put next to newline tokens (the inner list comprehension returns
         # every token unchanged).
-        chunk = ' '.join([w if w != '\n' else '\n' for w in words[start:chunk_end]]).replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
         chunks.append(chunk.strip())
         start = chunk_end
     return chunks
 class TranslateRequest(BaseModel):
     text: str
     source_lang: str

     print("Model loaded and ready.")
+# def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
+#     words = re.findall(r'\S+|\n', text)  # giữ nguyên \n như một "từ"
+#     chunks = []
+#     start = 0
+#     while start < len(words):
+#         end = min(start + max_words, len(words))
+#         dot_idx = -1
+#         for i in range(start + min_words, min(start + max_words, len(words))):
+#             if words[i] == '.' or (words[i].endswith('.') and words[i] != '\n'):
+#                 dot_idx = i
+#         if dot_idx != -1:
+#             chunk_end = dot_idx + 1
+#         elif end - start > fallback_words:
+#             chunk_end = start + fallback_words
+#         else:
+#             chunk_end = end
+#         chunk = ' '.join([w if w != '\n' else '\n' for w in words[start:chunk_end]]).replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
+#         chunks.append(chunk.strip())
+#         start = chunk_end
+#     return chunks
 def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
     """Split *text* into chunks of whole tokens, preferring sentence ends.

     The text is tokenized into runs of non-whitespace characters, with each
     newline kept as a token of its own. Chunks are then cut greedily from
     the front:

     * Look at a window of at most ``max_words`` tokens. If a token ending
       in ``.``, ``?`` or ``!`` exists at offset ``min_words`` or later
       inside the window, cut right after the last such token.
     * Otherwise, if the window holds more than ``fallback_words`` tokens,
       cut after exactly ``fallback_words`` tokens.
     * Otherwise take the whole window.

     Tokens are re-joined with single spaces, spaces next to newlines are
     removed, and each chunk is stripped of surrounding whitespace.

     Args:
         text: The string to split.
         min_words: Minimum number of tokens before a sentence end is
             accepted as a cut point. Negative values are treated as 0.
         max_words: Maximum number of tokens per chunk; must be >= 1.
         fallback_words: Chunk size used when no sentence end is found in
             the window; must be >= 1.

     Returns:
         A list of chunk strings, in original order. Empty for empty text.

     Raises:
         ValueError: If ``max_words`` or ``fallback_words`` is smaller than
             1 (such values could never advance through the text).
     """
     import re  # function-scope import keeps this block self-contained

     # A non-positive window or fallback size yields zero-length chunks, so
     # the loop below would never make progress.
     if max_words < 1 or fallback_words < 1:
         raise ValueError("max_words and fallback_words must be >= 1")
     # A negative offset would make the scan look before `start`.
     min_words = max(min_words, 0)

     words = re.findall(r'\S+|\n', text)  # keep "\n" as its own token
     total = len(words)
     chunks = []
     start = 0
     while start < total:
         end = min(start + max_words, total)

         # Walk backwards so the first hit is the last sentence end in
         # [start + min_words, end). A "\n" token never matches endswith.
         dot_idx = -1
         for i in range(end - 1, start + min_words - 1, -1):
             if words[i].endswith(('.', '?', '!')):
                 dot_idx = i
                 break

         if dot_idx != -1:
             chunk_end = dot_idx + 1
         elif end - start > fallback_words:
             chunk_end = start + fallback_words
         else:
             chunk_end = end

         # Joining puts a space on each side of every newline token; remove
         # those so line breaks survive without stray padding.
         chunk = ' '.join(words[start:chunk_end])
         chunk = chunk.replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
         chunks.append(chunk.strip())
         start = chunk_end
     return chunks
 class TranslateRequest(BaseModel):
     text: str
     source_lang: str