Spaces: Sleeping
Commit · 41ef6d8
1 Parent(s): 8abb8a3
fix20
Browse files
- app.py +16 -9
- requirements.txt +3 -1
app.py
CHANGED
@@ -5,7 +5,11 @@ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
|
|
5 |
import torch
|
6 |
import re
|
7 |
import nltk
|
8 |
-
|
|
|
|
|
|
|
|
|
9 |
from nltk.tokenize import sent_tokenize
|
10 |
#fastapi
|
11 |
app = FastAPI()
|
@@ -17,24 +21,27 @@ def split_text_by_sentences(text, min_words=120, max_words=160, fallback_words=1
|
|
17 |
current_word_count = 0
|
18 |
|
19 |
for sentence in sentences:
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
23 |
current_chunk.append(sentence)
|
24 |
-
current_word_count +=
|
25 |
else:
|
26 |
-
# Nếu
|
27 |
if current_word_count >= min_words:
|
28 |
chunks.append(' '.join(current_chunk))
|
29 |
current_chunk = [sentence]
|
30 |
-
current_word_count =
|
31 |
else:
|
32 |
-
#
|
|
|
33 |
if current_chunk:
|
34 |
chunks.append(' '.join(current_chunk))
|
35 |
current_chunk = [sentence]
|
36 |
-
current_word_count =
|
37 |
|
|
|
38 |
if current_chunk:
|
39 |
chunks.append(' '.join(current_chunk))
|
40 |
|
|
|
5 |
import torch
|
6 |
import re
|
7 |
import nltk
|
8 |
+
from langdetect import detect
|
9 |
+
from pyvi.ViTokenizer import tokenize as vi_tokenize
|
10 |
+
|
11 |
+
nltk.data.path.append('/tmp/nltk_data')
|
12 |
+
nltk.download('punkt', download_dir='/tmp/nltk_data')
|
13 |
from nltk.tokenize import sent_tokenize
|
14 |
#fastapi
|
15 |
app = FastAPI()
|
|
|
21 |
current_word_count = 0
|
22 |
|
23 |
for sentence in sentences:
|
24 |
+
sentence_word_count = len(sentence.split())
|
25 |
+
|
26 |
+
# Nếu thêm vào mà không vượt quá giới hạn max
|
27 |
+
if current_word_count + sentence_word_count <= max_words:
|
28 |
current_chunk.append(sentence)
|
29 |
+
current_word_count += sentence_word_count
|
30 |
else:
|
31 |
+
# Nếu chunk hiện tại đạt đủ min_words, ta đóng chunk lại
|
32 |
if current_word_count >= min_words:
|
33 |
chunks.append(' '.join(current_chunk))
|
34 |
current_chunk = [sentence]
|
35 |
+
current_word_count = sentence_word_count
|
36 |
else:
|
37 |
+
# Nếu chunk hiện tại chưa đạt min_words, nhưng không thể thêm nữa vì vượt max
|
38 |
+
# Ta vẫn đóng lại để tránh vượt giới hạn quá nhiều
|
39 |
if current_chunk:
|
40 |
chunks.append(' '.join(current_chunk))
|
41 |
current_chunk = [sentence]
|
42 |
+
current_word_count = sentence_word_count
|
43 |
|
44 |
+
# Thêm đoạn cuối cùng nếu còn sót lại
|
45 |
if current_chunk:
|
46 |
chunks.append(' '.join(current_chunk))
|
47 |
|
requirements.txt
CHANGED
@@ -3,4 +3,6 @@ uvicorn
|
|
3 |
torch
|
4 |
transformers
|
5 |
sentencepiece
|
6 |
-
nltk
|
|
|
|
|
|
3 |
torch
|
4 |
transformers
|
5 |
sentencepiece
|
6 |
+
nltk
|
7 |
+
pyvi
|
8 |
+
langdetect
|