Kevin Hu committed on
Commit
13b285d
·
1 Parent(s): d2e049e

optimize text parser (#2144)

Browse files

### What problem does this PR solve?


### Type of change

- [x] Performance Improvement

Files changed (1) hide show
  1. deepdoc/parser/txt_parser.py +26 -10
deepdoc/parser/txt_parser.py CHANGED
@@ -33,14 +33,30 @@ class RAGFlowTxtParser:
33
  def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
34
  if type(txt) != str:
35
  raise TypeError("txt type should be str!")
36
- sections = []
37
- for sec in re.split(r"[%s]+"%delimiter, txt):
38
- if sections and sec in delimiter:
39
- sections[-1][0] += sec
40
- continue
41
- if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
42
- sections.append([sec[: int(len(sec) / 2)], ""])
43
- sections.append([sec[int(len(sec) / 2) :], ""])
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  else:
45
- sections.append([sec, ""])
46
- return sections
 
 
 
 
33
  def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
34
  if type(txt) != str:
35
  raise TypeError("txt type should be str!")
36
+ cks = [""]
37
+ tk_nums = [0]
38
+
39
+ def add_chunk(t):
40
+ nonlocal cks, tk_nums, delimiter
41
+ tnum = num_tokens_from_string(t)
42
+ if tnum < 8:
43
+ pos = ""
44
+ if tk_nums[-1] > chunk_token_num:
45
+ cks.append(t)
46
+ tk_nums.append(tnum)
47
+ else:
48
+ cks[-1] += t
49
+ tk_nums[-1] += tnum
50
+
51
+ s, e = 0, 1
52
+ while e < len(txt):
53
+ if txt[e] in delimiter:
54
+ add_chunk(txt[s: e + 1])
55
+ s = e + 1
56
+ e = s + 1
57
  else:
58
+ e += 1
59
+ if s < e:
60
+ add_chunk(txt[s: e + 1])
61
+
62
+ return [[c,""] for c in cks]