longvnhue1 committed on
Commit
18f57a5
·
verified ·
1 Parent(s): 0382d49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -3
app.py CHANGED
@@ -105,27 +105,56 @@ def startup_event():
105
 
106
  print("Model loaded and ready.")
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
 
109
  words = re.findall(r'\S+|\n', text) # giữ nguyên \n như một "từ"
110
  chunks = []
111
  start = 0
 
112
  while start < len(words):
113
  end = min(start + max_words, len(words))
114
  dot_idx = -1
115
- for i in range(start + min_words, min(start + max_words, len(words))):
116
- if words[i] == '.' or (words[i].endswith('.') and words[i] != '\n'):
 
117
  dot_idx = i
 
118
  if dot_idx != -1:
119
  chunk_end = dot_idx + 1
120
  elif end - start > fallback_words:
121
  chunk_end = start + fallback_words
122
  else:
123
  chunk_end = end
124
- chunk = ' '.join([w if w != '\n' else '\n' for w in words[start:chunk_end]]).replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
 
 
125
  chunks.append(chunk.strip())
126
  start = chunk_end
 
127
  return chunks
128
 
 
129
  class TranslateRequest(BaseModel):
130
  text: str
131
  source_lang: str
 
105
 
106
  print("Model loaded and ready.")
107
 
108
+ # def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
109
+ # words = re.findall(r'\S+|\n', text) # giữ nguyên \n như một "từ"
110
+ # chunks = []
111
+ # start = 0
112
+ # while start < len(words):
113
+ # end = min(start + max_words, len(words))
114
+ # dot_idx = -1
115
+ # for i in range(start + min_words, min(start + max_words, len(words))):
116
+ # if words[i] == '.' or (words[i].endswith('.') and words[i] != '\n'):
117
+ # dot_idx = i
118
+ # if dot_idx != -1:
119
+ # chunk_end = dot_idx + 1
120
+ # elif end - start > fallback_words:
121
+ # chunk_end = start + fallback_words
122
+ # else:
123
+ # chunk_end = end
124
+ # chunk = ' '.join([w if w != '\n' else '\n' for w in words[start:chunk_end]]).replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
125
+ # chunks.append(chunk.strip())
126
+ # start = chunk_end
127
+ # return chunks
128
+
129
  def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
130
+ import re
131
  words = re.findall(r'\S+|\n', text) # giữ nguyên \n như một "từ"
132
  chunks = []
133
  start = 0
134
+
135
  while start < len(words):
136
  end = min(start + max_words, len(words))
137
  dot_idx = -1
138
+
139
+ for i in range(start + min_words, end):
140
+ if words[i] in ['.', '?', '!'] or (words[i].endswith(('.', '?', '!')) and words[i] != '\n'):
141
  dot_idx = i
142
+
143
  if dot_idx != -1:
144
  chunk_end = dot_idx + 1
145
  elif end - start > fallback_words:
146
  chunk_end = start + fallback_words
147
  else:
148
  chunk_end = end
149
+
150
+ chunk = ' '.join([w if w != '\n' else '\n' for w in words[start:chunk_end]])
151
+ chunk = chunk.replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
152
  chunks.append(chunk.strip())
153
  start = chunk_end
154
+
155
  return chunks
156
 
157
+
158
  class TranslateRequest(BaseModel):
159
  text: str
160
  source_lang: str