Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -105,27 +105,56 @@ def startup_event():
|
|
105 |
|
106 |
print("Model loaded and ready.")
|
107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
|
|
|
109 |
words = re.findall(r'\S+|\n', text) # giữ nguyên \n như một "từ"
|
110 |
chunks = []
|
111 |
start = 0
|
|
|
112 |
while start < len(words):
|
113 |
end = min(start + max_words, len(words))
|
114 |
dot_idx = -1
|
115 |
-
|
116 |
-
|
|
|
117 |
dot_idx = i
|
|
|
118 |
if dot_idx != -1:
|
119 |
chunk_end = dot_idx + 1
|
120 |
elif end - start > fallback_words:
|
121 |
chunk_end = start + fallback_words
|
122 |
else:
|
123 |
chunk_end = end
|
124 |
-
|
|
|
|
|
125 |
chunks.append(chunk.strip())
|
126 |
start = chunk_end
|
|
|
127 |
return chunks
|
128 |
|
|
|
129 |
class TranslateRequest(BaseModel):
|
130 |
text: str
|
131 |
source_lang: str
|
|
|
105 |
|
106 |
print("Model loaded and ready.")
|
107 |
|
108 |
+
# def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
|
109 |
+
# words = re.findall(r'\S+|\n', text) # giữ nguyên \n như một "từ"
|
110 |
+
# chunks = []
|
111 |
+
# start = 0
|
112 |
+
# while start < len(words):
|
113 |
+
# end = min(start + max_words, len(words))
|
114 |
+
# dot_idx = -1
|
115 |
+
# for i in range(start + min_words, min(start + max_words, len(words))):
|
116 |
+
# if words[i] == '.' or (words[i].endswith('.') and words[i] != '\n'):
|
117 |
+
# dot_idx = i
|
118 |
+
# if dot_idx != -1:
|
119 |
+
# chunk_end = dot_idx + 1
|
120 |
+
# elif end - start > fallback_words:
|
121 |
+
# chunk_end = start + fallback_words
|
122 |
+
# else:
|
123 |
+
# chunk_end = end
|
124 |
+
# chunk = ' '.join([w if w != '\n' else '\n' for w in words[start:chunk_end]]).replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
|
125 |
+
# chunks.append(chunk.strip())
|
126 |
+
# start = chunk_end
|
127 |
+
# return chunks
|
128 |
+
|
129 |
def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
    """Split *text* into chunks that preferably end on a sentence boundary.

    The text is tokenized into whitespace-free "words"; each newline
    character is kept as a token of its own so line breaks survive the
    split. Starting from the current position, a window of at most
    ``max_words`` tokens is examined:

    * if a token ending in ``.``, ``?`` or ``!`` exists at offset
      ``min_words`` or later inside the window, the chunk ends right after
      the LAST such token;
    * otherwise, if the window holds more than ``fallback_words`` tokens,
      the chunk is cut after ``fallback_words`` tokens;
    * otherwise the whole window becomes the chunk.

    Args:
        text: The text to split.
        min_words: Minimum number of tokens before a sentence ending is
            accepted as a cut point. Must be >= 0.
        max_words: Maximum number of tokens per chunk. Must be >= 1.
        fallback_words: Chunk size used when no sentence ending is found in
            the window. Must be >= 1.

    Returns:
        A list of chunk strings. Tokens are joined by single spaces, spaces
        next to a newline are removed, and each chunk is stripped of leading
        and trailing whitespace. An empty *text* yields an empty list.

    Raises:
        ValueError: If a limit is out of range. Such values would otherwise
            keep the loop from advancing (or index from the end of the token
            list) and hang the caller.
    """
    import re

    if min_words < 0 or max_words < 1 or fallback_words < 1:
        raise ValueError(
            "min_words must be >= 0 and max_words/fallback_words must be >= 1"
        )

    sentence_endings = ('.', '?', '!')
    words = re.findall(r'\S+|\n', text)  # keep each "\n" as a "word" of its own
    total = len(words)
    chunks = []
    start = 0

    while start < total:
        end = min(start + max_words, total)

        # Look for the last sentence-ending token in [start + min_words, end).
        # Scanning backwards lets us stop at the first hit.
        dot_idx = -1
        for i in range(end - 1, start + min_words - 1, -1):
            if words[i].endswith(sentence_endings):
                dot_idx = i
                break

        if dot_idx != -1:
            chunk_end = dot_idx + 1
        elif end - start > fallback_words:
            chunk_end = start + fallback_words
        else:
            chunk_end = end

        chunk = ' '.join(words[start:chunk_end])
        # The join puts a space on either side of newline tokens; drop them
        # so line breaks are not surrounded by stray spaces.
        chunk = re.sub(r' ?\n ?', '\n', chunk)
        chunks.append(chunk.strip())
        start = chunk_end

    return chunks
|
156 |
|
157 |
+
|
158 |
class TranslateRequest(BaseModel):
|
159 |
text: str
|
160 |
source_lang: str
|