longvnhue1 committed on
Commit
8346e2e
·
1 Parent(s): cf13205
Files changed (2) hide show
  1. app.py +31 -21
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,30 +4,40 @@ import time
4
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
  import torch
6
  import re
7
- # fastapi
 
 
 
8
  app = FastAPI()
9
 
10
def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
    """Split *text* into chunks of roughly ``min_words``–``max_words`` tokens.

    Tokens are whitespace-separated words plus standalone newline characters
    (kept as their own tokens so paragraph breaks survive re-joining).
    Each chunk preferably ends just after the LAST token ending with ``'.'``
    found in the ``[start + min_words, start + max_words)`` window; if no such
    token exists and more than ``fallback_words`` tokens remain in the window,
    the chunk is cut at ``fallback_words``; otherwise it runs to the window end.

    Returns a list of stripped chunk strings.
    """
    # NOTE: relies on the module-level `import re`; the previous function-local
    # re-import was redundant and has been removed.
    words = re.findall(r'\S+|\n', text)
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + max_words, len(words))
        # Find the LAST sentence-final token inside the preferred window.
        # `'\n'` can never end with '.', so a plain endswith() test suffices.
        dot_idx = -1
        for i in range(start + min_words, end):
            if words[i].endswith('.'):
                dot_idx = i
        if dot_idx != -1:
            chunk_end = dot_idx + 1  # cut just after the sentence dot
        elif end - start > fallback_words:
            chunk_end = start + fallback_words  # no dot: hard cut at fallback
        else:
            chunk_end = end
        # Re-join, then collapse the spaces that ' '.join() put around the
        # standalone newline tokens.
        chunk = ' '.join(words[start:chunk_end])
        chunk = chunk.replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
        chunks.append(chunk.strip())
        start = chunk_end
    return chunks
32
 
33
  # Load model
@@ -66,7 +76,7 @@ class TranslateRequest(BaseModel):
66
  @app.post("/translate")
67
  def translate_text(req: TranslateRequest):
68
  tokenizer.src_lang = req.source_lang
69
- text_chunks = split_by_words_and_dot(req.text, min_words=150, max_words=200, fallback_words=180)
70
 
71
  translated_chunks = []
72
  total_gen_time = 0
 
4
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
  import torch
6
  import re
7
+ import nltk
8
+ nltk.download('punkt')
9
+ from nltk.tokenize import sent_tokenize
10
+ #fastapi
11
  app = FastAPI()
12
 
13
def split_text_by_sentences(text, min_words=120, max_words=160, fallback_words=150):
    """Greedily pack NLTK sentences into chunks of at most ``max_words`` words.

    Sentences come from ``nltk.tokenize.sent_tokenize`` (requires the 'punkt'
    data downloaded at module import; newer NLTK releases may also need
    'punkt_tab' — TODO confirm against the installed NLTK version). A sentence
    is appended to the current chunk while the running word count stays within
    ``max_words``; otherwise the current chunk is flushed and a new one starts
    with that sentence. A single sentence longer than ``max_words`` becomes its
    own chunk.

    ``min_words`` and ``fallback_words`` are kept for interface compatibility
    with the previous word-based splitter; ``fallback_words`` is currently
    unused and ``min_words`` no longer affects the result (the two branches it
    selected between performed the same flush-and-restart actions).

    Returns a list of chunk strings; empty chunks are never emitted.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_word_count = 0
    for sentence in sentences:
        word_count = len(sentence.split())
        if current_word_count + word_count <= max_words:
            # The sentence still fits in the current chunk.
            current_chunk.append(sentence)
            current_word_count += word_count
        else:
            # Flush the current chunk (if non-empty — avoids emitting '' when
            # the very first sentence alone exceeds max_words) and restart.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = word_count
    # Flush the trailing partial chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
42
 
43
  # Load model
 
76
  @app.post("/translate")
77
  def translate_text(req: TranslateRequest):
78
  tokenizer.src_lang = req.source_lang
79
+ text_chunks = split_text_by_sentences(req.text, min_words=150, max_words=200, fallback_words=180)
80
 
81
  translated_chunks = []
82
  total_gen_time = 0
requirements.txt CHANGED
@@ -2,4 +2,5 @@ fastapi
2
  uvicorn
3
  torch
4
  transformers
5
- sentencepiece
 
 
2
  uvicorn
3
  torch
4
  transformers
5
+ sentencepiece
6
+ nltk