longvnhue1 committed on
Commit
8346e2e
·
1 Parent(s): cf13205
Files changed (2) hide show
  1. app.py +31 -21
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,30 +4,40 @@ import time
4
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
  import torch
6
  import re
7
- # fastapi
 
 
 
8
  app = FastAPI()
9
 
10
def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
    """Split *text* into chunks of roughly ``min_words``–``max_words`` tokens.

    Tokens are whitespace-separated words plus standalone newline characters
    (kept as their own tokens so paragraph breaks survive re-joining).
    Each chunk preferably ends just after the LAST token ending with ``'.'``
    found in the ``[start + min_words, start + max_words)`` window; if no such
    token exists and more than ``fallback_words`` tokens remain in the window,
    the chunk is cut at ``fallback_words``; otherwise it runs to the window end.

    Returns a list of stripped chunk strings.
    """
    # NOTE: relies on the module-level `import re`; the previous function-local
    # re-import was redundant and has been removed.
    words = re.findall(r'\S+|\n', text)
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + max_words, len(words))
        # Find the LAST sentence-final token inside the preferred window.
        # `'\n'` can never end with '.', so a plain endswith() test suffices.
        dot_idx = -1
        for i in range(start + min_words, end):
            if words[i].endswith('.'):
                dot_idx = i
        if dot_idx != -1:
            chunk_end = dot_idx + 1  # cut just after the sentence dot
        elif end - start > fallback_words:
            chunk_end = start + fallback_words  # no dot: hard cut at fallback
        else:
            chunk_end = end
        # Re-join, then collapse the spaces that ' '.join() put around the
        # standalone newline tokens.
        chunk = ' '.join(words[start:chunk_end])
        chunk = chunk.replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
        chunks.append(chunk.strip())
        start = chunk_end
    return chunks
32
 
33
  # Load model
@@ -66,7 +76,7 @@ class TranslateRequest(BaseModel):
66
  @app.post("/translate")
67
  def translate_text(req: TranslateRequest):
68
  tokenizer.src_lang = req.source_lang
69
- text_chunks = split_by_words_and_dot(req.text, min_words=150, max_words=200, fallback_words=180)
70
 
71
  translated_chunks = []
72
  total_gen_time = 0
 
4
  from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
5
  import torch
6
  import re
7
+ import nltk
8
+ nltk.download('punkt')
9
+ from nltk.tokenize import sent_tokenize
10
+ #fastapi
11
  app = FastAPI()
12
 
13
def split_text_by_sentences(text, min_words=120, max_words=160, fallback_words=150):
    """Greedily pack NLTK sentences into chunks of at most ``max_words`` words.

    Sentences come from ``nltk.tokenize.sent_tokenize`` (requires the 'punkt'
    data downloaded at module import; newer NLTK releases may also need
    'punkt_tab' — TODO confirm against the installed NLTK version). A sentence
    is appended to the current chunk while the running word count stays within
    ``max_words``; otherwise the current chunk is flushed and a new one starts
    with that sentence. A single sentence longer than ``max_words`` becomes its
    own chunk.

    ``min_words`` and ``fallback_words`` are kept for interface compatibility
    with the previous word-based splitter; ``fallback_words`` is currently
    unused and ``min_words`` no longer affects the result (the two branches it
    selected between performed the same flush-and-restart actions).

    Returns a list of chunk strings; empty chunks are never emitted.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_word_count = 0
    for sentence in sentences:
        word_count = len(sentence.split())
        if current_word_count + word_count <= max_words:
            # The sentence still fits in the current chunk.
            current_chunk.append(sentence)
            current_word_count += word_count
        else:
            # Flush the current chunk (if non-empty — avoids emitting '' when
            # the very first sentence alone exceeds max_words) and restart.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_word_count = word_count
    # Flush the trailing partial chunk.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
42
 
43
  # Load model
 
76
  @app.post("/translate")
77
  def translate_text(req: TranslateRequest):
78
  tokenizer.src_lang = req.source_lang
79
+ text_chunks = split_text_by_sentences(req.text, min_words=150, max_words=200, fallback_words=180)
80
 
81
  translated_chunks = []
82
  total_gen_time = 0
requirements.txt CHANGED
@@ -2,4 +2,5 @@ fastapi
2
  uvicorn
3
  torch
4
  transformers
5
- sentencepiece
 
 
2
  uvicorn
3
  torch
4
  transformers
5
+ sentencepiece
6
+ nltk