Spaces:
Sleeping
Sleeping
Commit
·
8346e2e
1
Parent(s):
cf13205
fix10
Browse files- app.py +31 -21
- requirements.txt +2 -1
app.py
CHANGED
@@ -4,30 +4,40 @@ import time
|
|
4 |
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
|
5 |
import torch
|
6 |
import re
|
7 |
-
|
|
|
|
|
|
|
8 |
app = FastAPI()
|
9 |
|
10 |
-
def
|
11 |
-
|
12 |
-
words = re.findall(r'\S+|\n', text)
|
13 |
chunks = []
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
chunk_end = dot_idx + 1
|
24 |
-
elif end - start > fallback_words:
|
25 |
-
chunk_end = start + fallback_words
|
26 |
else:
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
return chunks
|
32 |
|
33 |
# Load model
|
@@ -66,7 +76,7 @@ class TranslateRequest(BaseModel):
|
|
66 |
@app.post("/translate")
|
67 |
def translate_text(req: TranslateRequest):
|
68 |
tokenizer.src_lang = req.source_lang
|
69 |
-
text_chunks =
|
70 |
|
71 |
translated_chunks = []
|
72 |
total_gen_time = 0
|
|
|
4 |
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
|
5 |
import torch
|
6 |
import re
|
7 |
+
import nltk
|
8 |
+
nltk.download('punkt')
|
9 |
+
from nltk.tokenize import sent_tokenize
|
10 |
+
#fastapi
|
11 |
app = FastAPI()
|
12 |
|
13 |
+
def split_text_by_sentences(text, min_words=120, max_words=160, fallback_words=150):
    """Split *text* into word-bounded chunks along sentence boundaries.

    Sentences (from NLTK's ``sent_tokenize``; the file imports it at module
    level and downloads the ``punkt`` model) are packed greedily into the
    current chunk while the running word count stays at or below
    ``max_words``. When the next sentence would overflow, the accumulated
    chunk is flushed and a new chunk starts with that sentence. A single
    sentence longer than ``max_words`` therefore becomes its own chunk.

    Parameters
    ----------
    text : str
        Input text to split.
    max_words : int
        Hard upper bound on words per chunk (except for a lone oversized
        sentence, which is emitted unsplit).
    min_words : int
        Kept for backward compatibility. In the original implementation it
        only selected between two branches whose actions were identical, so
        it has no effect on the output; the dead branch is removed here.
    fallback_words : int
        Kept for backward compatibility; currently unused.

    Returns
    -------
    list[str]
        Chunks in original sentence order, sentences joined by single spaces.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        word_count = len(sentence.split())
        if current_word_count + word_count <= max_words:
            # Still within the limit: keep accumulating.
            current_chunk.append(sentence)
            current_word_count += word_count
        else:
            # Overflow: flush what we have (if anything) and start a new
            # chunk with the current sentence. The original code branched on
            # `current_word_count >= min_words` here, but both branches
            # performed the same flush-and-reset, so they are merged.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = word_count

    # Flush the trailing partial chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
|
42 |
|
43 |
# Load model
|
|
|
76 |
@app.post("/translate")
|
77 |
def translate_text(req: TranslateRequest):
|
78 |
tokenizer.src_lang = req.source_lang
|
79 |
+
text_chunks = split_text_by_sentences(req.text, min_words=150, max_words=200, fallback_words=180)
|
80 |
|
81 |
translated_chunks = []
|
82 |
total_gen_time = 0
|
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ fastapi
|
|
2 |
uvicorn
|
3 |
torch
|
4 |
transformers
|
5 |
-
sentencepiece
|
|
|
|
2 |
uvicorn
|
3 |
torch
|
4 |
transformers
|
5 |
+
sentencepiece
|
6 |
+
nltk
|