longvnhue1 committed
Commit e263c34 · 1 Parent(s): 41ef6d8
Files changed (2)
  1. app.py +16 -19
  2. requirements.txt +0 -3
app.py CHANGED
@@ -4,44 +4,41 @@ import time
 from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 import torch
 import re
-import nltk
-from langdetect import detect
-from pyvi.ViTokenizer import tokenize as vi_tokenize
-
-nltk.data.path.append('/tmp/nltk_data')
-nltk.download('punkt', download_dir='/tmp/nltk_data')
-from nltk.tokenize import sent_tokenize
 #fastapi
 app = FastAPI()
 
-def split_text_by_sentences(text, min_words=120, max_words=160, fallback_words=150):
-    sentences = sent_tokenize(text)
+def simple_sentence_tokenize(text):
+    # Split sentences at a period, question mark, or exclamation mark followed by whitespace or a newline
+    sentence_endings = re.compile(r'(?<=[.!?])\s+')
+    return sentence_endings.split(text)
+
+def split_text_by_sentences(text, min_words=150, max_words=200, fallback_words=180):
+    sentences = simple_sentence_tokenize(text)
     chunks = []
     current_chunk = []
     current_word_count = 0
 
     for sentence in sentences:
-        sentence_word_count = len(sentence.split())
+        sentence = sentence.strip()
+        if not sentence:
+            continue
+
+        word_count = len(sentence.split())
 
-        # If adding this sentence would not exceed the max limit
-        if current_word_count + sentence_word_count <= max_words:
+        if current_word_count + word_count <= max_words:
             current_chunk.append(sentence)
-            current_word_count += sentence_word_count
+            current_word_count += word_count
         else:
-            # If the current chunk has already reached min_words, close it
             if current_word_count >= min_words:
                 chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
-                current_word_count = sentence_word_count
+                current_word_count = word_count
             else:
-                # The current chunk has not reached min_words, but nothing more fits without exceeding max
-                # Close it anyway to avoid overshooting the limit too far
                 if current_chunk:
                     chunks.append(' '.join(current_chunk))
                 current_chunk = [sentence]
-                current_word_count = sentence_word_count
+                current_word_count = word_count
 
-    # Append the final chunk if anything is left over
     if current_chunk:
         chunks.append(' '.join(current_chunk))
 
requirements.txt CHANGED
@@ -3,6 +3,3 @@ uvicorn
 torch
 transformers
 sentencepiece
-nltk
-pyvi
-langdetect
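
One behavioral trade-off of dropping nltk: punkt typically recognizes common English abbreviations, while the new regex split does not, so a period inside an abbreviation now starts a new "sentence". A quick hypothetical check of the same regex used in app.py:

import re

def simple_sentence_tokenize(text):
    # Same regex as the new helper in app.py.
    return re.compile(r'(?<=[.!?])\s+').split(text)

print(simple_sentence_tokenize("Dr. Smith arrived. He left."))
# ['Dr.', 'Smith arrived.', 'He left.'] -- punkt would typically keep 'Dr. Smith' together

For the chunker this is mostly harmless, since chunks are rejoined with spaces, but very short fragments like 'Dr.' can shift where chunk boundaries fall.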