Kevin Hu
committed on
Commit
·
da58b16
1
Parent(s):
0a9da14
accelerate tokenize (#3244)
Browse files
### What problem does this PR solve?
### Type of change
- [x] Performance Improvement
- rag/nlp/rag_tokenizer.py +40 -25
rag/nlp/rag_tokenizer.py
CHANGED
@@ -281,34 +281,49 @@ class RagTokenizer:
|
|
281 |
print("[FW]", tks, s)
|
282 |
print("[BW]", tks1, s1)
|
283 |
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
i =
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
|
|
307 |
tkslist = []
|
308 |
-
self.dfs_("".join(tks[
|
309 |
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
310 |
|
311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
312 |
|
313 |
res = " ".join(self.english_normalize_(res))
|
314 |
if self.DEBUG:
|
|
|
281 |
print("[FW]", tks, s)
|
282 |
print("[BW]", tks1, s1)
|
283 |
|
284 |
+
i, j, _i, _j = 0, 0, 0, 0
|
285 |
+
same = 0
|
286 |
+
while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
|
287 |
+
same += 1
|
288 |
+
if same > 0: res.append(" ".join(tks[j: j + same]))
|
289 |
+
_i = i + same
|
290 |
+
_j = j + same
|
291 |
+
j = _j + 1
|
292 |
+
i = _i + 1
|
293 |
+
|
294 |
+
while i < len(tks1) and j < len(tks):
|
295 |
+
tk1, tk = "".join(tks1[_i:i]), "".join(tks[_j:j])
|
296 |
+
if tk1 != tk:
|
297 |
+
if len(tk1) > len(tk):
|
298 |
+
j += 1
|
299 |
+
else:
|
300 |
+
i += 1
|
301 |
+
continue
|
302 |
+
|
303 |
+
if tks1[i] != tks[j]:
|
304 |
+
i += 1
|
305 |
+
j += 1
|
306 |
+
continue
|
307 |
+
# backward tokens from _i to i are different from forward tokens from _j to j.
|
308 |
tkslist = []
|
309 |
+
self.dfs_("".join(tks[_j:j]), 0, [], tkslist)
|
310 |
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
311 |
|
312 |
+
same = 1
|
313 |
+
while i + same < len(tks1) and j + same < len(tks) and tks1[i + same] == tks[j + same]:
|
314 |
+
same += 1
|
315 |
+
res.append(" ".join(tks[j: j + same]))
|
316 |
+
_i = i + same
|
317 |
+
_j = j + same
|
318 |
+
j = _j + 1
|
319 |
+
i = _i + 1
|
320 |
+
|
321 |
+
if _i < len(tks1):
|
322 |
+
assert _j < len(tks)
|
323 |
+
assert "".join(tks1[_i:]) == "".join(tks[_j:])
|
324 |
+
tkslist = []
|
325 |
+
self.dfs_("".join(tks[_j:]), 0, [], tkslist)
|
326 |
+
res.append(" ".join(self.sortTks_(tkslist)[0][0]))
|
327 |
|
328 |
res = " ".join(self.english_normalize_(res))
|
329 |
if self.DEBUG:
|