Spaces:

longvnhue1
/

finetune-deploy1

Sleeping

App Files Files Community

longvnhue1 committed on May 30

Commit

e263c34

1 Parent(s): 41ef6d8

fix23

Browse files

Files changed (2) hide show

app.py +16 -19
requirements.txt +0 -3

app.py CHANGED Viewed

@@ -4,44 +4,41 @@ import time
 from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 import torch
 import re
-import nltk
-from langdetect import detect
-from pyvi.ViTokenizer import tokenize as vi_tokenize
-nltk.data.path.append('/tmp/nltk_data')
-nltk.download('punkt', download_dir='/tmp/nltk_data')
-from nltk.tokenize import sent_tokenize
 #fastapi
 app = FastAPI()
-def split_text_by_sentences(text, min_words=120, max_words=160, fallback_words=150):
-    sentences = sent_tokenize(text)
     chunks = []
     current_chunk = []
     current_word_count = 0
     for sentence in sentences:
-        sentence_word_count = len(sentence.split())
-        # Nếu thêm vào mà không vượt quá giới hạn max
-        if current_word_count + sentence_word_count <= max_words:
             current_chunk.append(sentence)
-            current_word_count += sentence_word_count
         else:
-            # Nếu chunk hiện tại đạt đủ min_words, ta đóng chunk lại
             if current_word_count >= min_words:
                 chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
-                current_word_count = sentence_word_count
             else:
-                # Nếu chunk hiện tại chưa đạt min_words, nhưng không thể thêm nữa vì vượt max
-                # Ta vẫn đóng lại để tránh vượt giới hạn quá nhiều
                 if current_chunk:
                     chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
-                current_word_count = sentence_word_count
-    # Thêm đoạn cuối cùng nếu còn sót lại
     if current_chunk:
         chunks.append(' '.join(current_chunk))

 from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 import torch
 import re
 #fastapi
 app = FastAPI()
+def simple_sentence_tokenize(text):
+    # Tách câu theo dấu chấm, hỏi, chấm than, theo sau là khoảng trắng hoặc xuống dòng
+    sentence_endings = re.compile(r'(?<=[.!?])\s+')
+    return sentence_endings.split(text)
+def split_text_by_sentences(text, min_words=150, max_words=200, fallback_words=180):
+    sentences = simple_sentence_tokenize(text)
     chunks = []
     current_chunk = []
     current_word_count = 0
     for sentence in sentences:
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+        word_count = len(sentence.split())
+        if current_word_count + word_count <= max_words:
             current_chunk.append(sentence)
+            current_word_count += word_count
         else:
             if current_word_count >= min_words:
                 chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
+                current_word_count = word_count
             else:
                 if current_chunk:
                     chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
+                current_word_count = word_count
     if current_chunk:
         chunks.append(' '.join(current_chunk))

requirements.txt CHANGED Viewed

@@ -3,6 +3,3 @@ uvicorn
 torch
 transformers
 sentencepiece
-nltk
-pyvi
-langdetect

 torch
 transformers
 sentencepiece