Spaces:

longvnhue1
/

finetune-deploy1

Sleeping

longvnhue1 committed on May 30

Commit

41ef6d8

1 Parent(s): 8abb8a3

fix20

Files changed (2) hide show

app.py CHANGED Viewed

@@ -5,7 +5,11 @@ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 import torch
 import re
 import nltk
-nltk.download('punkt')
 from nltk.tokenize import sent_tokenize
 #fastapi
 app = FastAPI()
@@ -17,24 +21,27 @@ def split_text_by_sentences(text, min_words=120, max_words=160, fallback_words=1
     current_word_count = 0
     for sentence in sentences:
-        word_count = len(sentence.split())
-        # Nếu thêm câu này vẫn còn trong giới hạn max_words, thì thêm vào
-        if current_word_count + word_count <= max_words:
             current_chunk.append(sentence)
-            current_word_count += word_count
         else:
-            # Nếu chưa đạt min_words, nhưng đã vượt max_words, ép buộc kết thúc đoạn
             if current_word_count >= min_words:
                 chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
-                current_word_count = word_count
             else:
-                # Trường hợp đặc biệt: không đạt min nhưng cũng không được vượt quá nhiều
                 if current_chunk:
                     chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
-                current_word_count = word_count
     if current_chunk:
         chunks.append(' '.join(current_chunk))

 import torch
 import re
 import nltk
+from langdetect import detect
+from pyvi.ViTokenizer import tokenize as vi_tokenize
+nltk.data.path.append('/tmp/nltk_data')
+nltk.download('punkt', download_dir='/tmp/nltk_data')
 from nltk.tokenize import sent_tokenize
 #fastapi
 app = FastAPI()
     current_word_count = 0
     for sentence in sentences:
+        sentence_word_count = len(sentence.split())
+        # Nếu thêm vào mà không vượt quá giới hạn max
+        if current_word_count + sentence_word_count <= max_words:
             current_chunk.append(sentence)
+            current_word_count += sentence_word_count
         else:
+            # Nếu chunk hiện tại đạt đủ min_words, ta đóng chunk lại
             if current_word_count >= min_words:
                 chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
+                current_word_count = sentence_word_count
             else:
+                # Nếu chunk hiện tại chưa đạt min_words, nhưng không thể thêm nữa vì vượt max
+                # Ta vẫn đóng lại để tránh vượt giới hạn quá nhiều
                 if current_chunk:
                     chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
+                current_word_count = sentence_word_count
+    # Thêm đoạn cuối cùng nếu còn sót lại
     if current_chunk:
         chunks.append(' '.join(current_chunk))

requirements.txt CHANGED Viewed

@@ -3,4 +3,6 @@ uvicorn
 torch
 transformers
 sentencepiece
-nltk

 torch
 transformers
 sentencepiece
+nltk
+pyvi
+langdetect