KevinHuSh committed on
Commit
977d825
·
1 Parent(s): 8c06509

Fix table-handling bug in docx (#510)

Browse files

### What problem does this PR solve?
#509
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Files changed (2) hide show
  1. rag/app/book.py +1 -0
  2. rag/app/naive.py +4 -4
rag/app/book.py CHANGED
@@ -76,6 +76,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
76
  binary if binary else filename, from_page=from_page, to_page=to_page)
77
  remove_contents_table(sections, eng=is_english(
78
  random_choices([t for t, _ in sections], k=200)))
 
79
  callback(0.8, "Finish parsing.")
80
 
81
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
 
76
  binary if binary else filename, from_page=from_page, to_page=to_page)
77
  remove_contents_table(sections, eng=is_english(
78
  random_choices([t for t, _ in sections], k=200)))
79
+ tbls = [((None, lns), None) for lns in tbls]
80
  callback(0.8, "Finish parsing.")
81
 
82
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
rag/app/naive.py CHANGED
@@ -13,6 +13,7 @@
13
  from tika import parser
14
  from io import BytesIO
15
  from docx import Document
 
16
  import re
17
  from deepdoc.parser.pdf_parser import PlainParser
18
  from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
@@ -67,7 +68,6 @@ class Docx(DocxParser):
67
  class Pdf(PdfParser):
68
  def __call__(self, filename, binary=None, from_page=0,
69
  to_page=100000, zoomin=3, callback=None):
70
- from timeit import default_timer as timer
71
  start = timer()
72
  callback(msg="OCR is running...")
73
  self.__images__(
@@ -83,7 +83,6 @@ class Pdf(PdfParser):
83
  start = timer()
84
  self._layouts_rec(zoomin)
85
  callback(0.63, "Layout analysis finished.")
86
- print("layouts:", timer() - start)
87
  self._table_transformer_job(zoomin)
88
  callback(0.65, "Table analysis finished.")
89
  self._text_merge()
@@ -93,8 +92,7 @@ class Pdf(PdfParser):
93
  self._concat_downward()
94
  #self._filter_forpages()
95
 
96
- cron_logger.info("layouts: {}".format(
97
- (timer() - start) / (self.total_page + 0.1)))
98
  return [(b["text"], self._line_tag(b, zoomin))
99
  for b in self.boxes], tbls
100
 
@@ -167,12 +165,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
167
  raise NotImplementedError(
168
  "file type not supported yet(doc, docx, pdf, txt supported)")
169
 
 
170
  chunks = naive_merge(
171
  sections, parser_config.get(
172
  "chunk_token_num", 128), parser_config.get(
173
  "delimiter", "\n!?。;!?"))
174
 
175
  res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
 
176
  return res
177
 
178
 
 
13
  from tika import parser
14
  from io import BytesIO
15
  from docx import Document
16
+ from timeit import default_timer as timer
17
  import re
18
  from deepdoc.parser.pdf_parser import PlainParser
19
  from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
 
68
  class Pdf(PdfParser):
69
  def __call__(self, filename, binary=None, from_page=0,
70
  to_page=100000, zoomin=3, callback=None):
 
71
  start = timer()
72
  callback(msg="OCR is running...")
73
  self.__images__(
 
83
  start = timer()
84
  self._layouts_rec(zoomin)
85
  callback(0.63, "Layout analysis finished.")
 
86
  self._table_transformer_job(zoomin)
87
  callback(0.65, "Table analysis finished.")
88
  self._text_merge()
 
92
  self._concat_downward()
93
  #self._filter_forpages()
94
 
95
+ cron_logger.info("layouts: {}".format(timer() - start))
 
96
  return [(b["text"], self._line_tag(b, zoomin))
97
  for b in self.boxes], tbls
98
 
 
165
  raise NotImplementedError(
166
  "file type not supported yet(doc, docx, pdf, txt supported)")
167
 
168
+ st = timer()
169
  chunks = naive_merge(
170
  sections, parser_config.get(
171
  "chunk_token_num", 128), parser_config.get(
172
  "delimiter", "\n!?。;!?"))
173
 
174
  res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
175
+ cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
176
  return res
177
 
178