Kevin Hu
committed on
Commit
·
13b285d
1
Parent(s):
d2e049e
optimize text parser (#2144)
Browse files

### What problem does this PR solve?
### Type of change
- [x] Performance Improvement
- deepdoc/parser/txt_parser.py +26 -10
deepdoc/parser/txt_parser.py
CHANGED
@@ -33,14 +33,30 @@ class RAGFlowTxtParser:
|
|
33 |
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
|
34 |
if type(txt) != str:
|
35 |
raise TypeError("txt type should be str!")
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
else:
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
33 |
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
|
34 |
if type(txt) != str:
|
35 |
raise TypeError("txt type should be str!")
|
36 |
+
cks = [""]
|
37 |
+
tk_nums = [0]
|
38 |
+
|
39 |
+
def add_chunk(t):
|
40 |
+
nonlocal cks, tk_nums, delimiter
|
41 |
+
tnum = num_tokens_from_string(t)
|
42 |
+
if tnum < 8:
|
43 |
+
pos = ""
|
44 |
+
if tk_nums[-1] > chunk_token_num:
|
45 |
+
cks.append(t)
|
46 |
+
tk_nums.append(tnum)
|
47 |
+
else:
|
48 |
+
cks[-1] += t
|
49 |
+
tk_nums[-1] += tnum
|
50 |
+
|
51 |
+
s, e = 0, 1
|
52 |
+
while e < len(txt):
|
53 |
+
if txt[e] in delimiter:
|
54 |
+
add_chunk(txt[s: e + 1])
|
55 |
+
s = e + 1
|
56 |
+
e = s + 1
|
57 |
else:
|
58 |
+
e += 1
|
59 |
+
if s < e:
|
60 |
+
add_chunk(txt[s: e + 1])
|
61 |
+
|
62 |
+
return [[c,""] for c in cks]
|