yqkcn Kevin Hu committed on
Commit 82adb1c · 1 Parent(s): 0622917

Add get_txt function (#2639)


### What problem does this PR solve?

Add a `get_txt` helper in `deepdoc/parser/utils.py` to replace the text-reading/decoding loop duplicated across `deepdoc/parser/txt_parser.py` and the `rag/app` chunkers (book, laws, one, qa, table).
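
Every plain-text branch below previously inlined the same eleven-line read-or-decode loop; with the helper, each branch collapses to a single call. A minimal usage sketch (the file name is hypothetical):

```python
from deepdoc.parser.utils import get_txt

# From a path on disk:
txt = get_txt("sample.txt")

# From bytes already in memory: the encoding is detected via rag.nlp.find_codec
# and decode errors are ignored, matching the old inlined loops.
with open("sample.txt", "rb") as f:
    txt = get_txt("sample.txt", f.read())
```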

### Type of change

- [x] Refactoring

---------

Co-authored-by: Kevin Hu <[email protected]>

deepdoc/parser/txt_parser.py CHANGED
@@ -10,28 +10,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-from rag.nlp import find_codec,num_tokens_from_string
-import re
+from deepdoc.parser.utils import get_txt
+from rag.nlp import num_tokens_from_string
 
 
 class RAGFlowTxtParser:
     def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;!?"):
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(fnm, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(fnm, binary)
         return self.parser_txt(txt, chunk_token_num, delimiter)
 
     @classmethod
     def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
-        if type(txt) != str:
+        if not isinstance(txt, str):
            raise TypeError("txt type should be str!")
        cks = [""]
        tk_nums = [0]
deepdoc/parser/utils.py ADDED
@@ -0,0 +1,29 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from rag.nlp import find_codec
+
+
+def get_txt(fnm: str, binary=None) -> str:
+    txt = ""
+    if binary:
+        encoding = find_codec(binary)
+        txt = binary.decode(encoding, errors="ignore")
+    else:
+        with open(fnm, "r") as f:
+            while True:
+                line = f.readline()
+                if not line:
+                    break
+                txt += line
+    return txt
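
The `readline()` loop in the fallback branch accumulates the entire file, so it is behaviorally equivalent to a single `f.read()`. A compact, hypothetical sketch of the same contract, shown only for comparison (not what this PR ships):

```python
from rag.nlp import find_codec


def get_txt_compact(fnm: str, binary=None) -> str:
    # Same behavior as get_txt above: decode in-memory bytes if given,
    # otherwise read the whole file in text mode.
    if binary:
        return binary.decode(find_codec(binary), errors="ignore")
    with open(fnm, "r") as f:
        return f.read()
```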
rag/app/book.py CHANGED
@@ -15,6 +15,7 @@ from tika import parser
 import re
 from io import BytesIO
 
+from deepdoc.parser.utils import get_txt
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
     tokenize_chunks, find_codec
@@ -88,17 +89,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [(l, "") for l in sections if l]
         remove_contents_table(sections, eng=is_english(
rag/app/laws.py CHANGED
@@ -17,6 +17,7 @@ from io import BytesIO
 from docx import Document
 
 from api.db import ParserType
+from deepdoc.parser.utils import get_txt
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title, add_positions, tokenize_chunks, find_codec, docx_question_level
 from rag.nlp import rag_tokenizer
@@ -165,17 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [l for l in sections if l]
         callback(0.8, "Finish parsing.")
rag/app/naive.py CHANGED
@@ -169,7 +169,6 @@ class Markdown(MarkdownParser):
         return sections, tbls
 
 
-
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -190,7 +189,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
-    sections = []
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
@@ -222,13 +220,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
         if parser_config.get("html4excel"):
-            sections = [(l, "") for l in excel_parser.html(binary, 12) if l]
+            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
         else:
-            sections = [(l, "") for l in excel_parser(binary) if l]
+            sections = [(_, "") for _ in excel_parser(binary) if _]
 
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections = TxtParser()(filename,binary,
+        sections = TxtParser()(filename, binary,
                                parser_config.get("chunk_token_num", 128),
                                parser_config.get("delimiter", "\n!?;。;!?"))
         callback(0.8, "Finish parsing.")
@@ -242,13 +240,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = HtmlParser()(filename, binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.json$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections = JsonParser(int(parser_config.get("chunk_token_num", 128)))(binary)
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
@@ -256,7 +254,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         binary = BytesIO(binary)
         doc_parsed = parser.from_buffer(binary)
         sections = doc_parsed['content'].split('\n')
-        sections = [(l, "") for l in sections if l]
+        sections = [(_, "") for _ in sections if _]
         callback(0.8, "Finish parsing.")
 
     else:
rag/app/one.py CHANGED
@@ -13,6 +13,8 @@
 from tika import parser
 from io import BytesIO
 import re
+
+from deepdoc.parser.utils import get_txt
 from rag.app import laws
 from rag.nlp import rag_tokenizer, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser
@@ -82,17 +84,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.(txt|md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         sections = txt.split("\n")
         sections = [s for s in sections if s]
         callback(0.8, "Finish parsing.")
rag/app/qa.py CHANGED
@@ -16,6 +16,8 @@ from io import BytesIO
 from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
+
+from deepdoc.parser.utils import get_txt
 from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
@@ -305,17 +307,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         comma, tab = 0, 0
         for l in lines:
@@ -358,17 +350,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         return res
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         last_question, last_answer = "", ""
         question_stack, level_stack = [], []
rag/app/table.py CHANGED
@@ -20,6 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse
 
 from api.db.services.knowledgebase_service import KnowledgebaseService
+from deepdoc.parser.utils import get_txt
 from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser
 
@@ -146,17 +147,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
                   callback=callback)
     elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
+        txt = get_txt(filename, binary)
         lines = txt.split("\n")
         fails = []
         headers = lines[0].split(kwargs.get("delimiter", "\t"))