Support displaying images in the chunks of docx files when using general parser (#1253)
Browse files

### What problem does this PR solve?
Support displaying images in chunks of docx files when using general
parser
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- rag/app/naive.py +53 -6
- rag/app/qa.py +3 -21
- rag/nlp/__init__.py +65 -1
rag/app/naive.py
CHANGED
|
@@ -16,16 +16,28 @@ from docx import Document
|
|
| 16 |
from timeit import default_timer as timer
|
| 17 |
import re
|
| 18 |
from deepdoc.parser.pdf_parser import PlainParser
|
| 19 |
-
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
|
| 20 |
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
|
| 21 |
from rag.settings import cron_logger
|
| 22 |
from rag.utils import num_tokens_from_string
|
| 23 |
-
|
|
|
|
| 24 |
|
| 25 |
class Docx(DocxParser):
|
| 26 |
def __init__(self):
|
| 27 |
pass
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def __clean(self, line):
|
| 30 |
line = re.sub(r"\u3000", " ", line).strip()
|
| 31 |
return line
|
|
@@ -35,17 +47,41 @@ class Docx(DocxParser):
|
|
| 35 |
filename) if not binary else Document(BytesIO(binary))
|
| 36 |
pn = 0
|
| 37 |
lines = []
|
|
|
|
| 38 |
for p in self.doc.paragraphs:
|
| 39 |
if pn > to_page:
|
| 40 |
break
|
| 41 |
-
if from_page <= pn < to_page
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
for run in p.runs:
|
| 44 |
if 'lastRenderedPageBreak' in run._element.xml:
|
| 45 |
pn += 1
|
| 46 |
continue
|
| 47 |
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
|
| 48 |
pn += 1
|
|
|
|
| 49 |
tbls = []
|
| 50 |
for tb in self.doc.tables:
|
| 51 |
html= "<table>"
|
|
@@ -64,7 +100,7 @@ class Docx(DocxParser):
|
|
| 64 |
html += "</tr>"
|
| 65 |
html += "</table>"
|
| 66 |
tbls.append(((None, html), ""))
|
| 67 |
-
return
|
| 68 |
|
| 69 |
|
| 70 |
class Pdf(PdfParser):
|
|
@@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 123 |
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
| 124 |
callback(0.1, "Start to parse.")
|
| 125 |
sections, tbls = Docx()(filename, binary)
|
| 126 |
-
res = tokenize_table(tbls, doc, eng)
|
|
|
|
| 127 |
callback(0.8, "Finish parsing.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
| 130 |
pdf_parser = Pdf(
|
|
|
|
| 16 |
from timeit import default_timer as timer
|
| 17 |
import re
|
| 18 |
from deepdoc.parser.pdf_parser import PlainParser
|
| 19 |
+
from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
|
| 20 |
from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
|
| 21 |
from rag.settings import cron_logger
|
| 22 |
from rag.utils import num_tokens_from_string
|
| 23 |
+
from PIL import Image
|
| 24 |
+
from functools import reduce
|
| 25 |
|
| 26 |
class Docx(DocxParser):
|
| 27 |
def __init__(self):
|
| 28 |
pass
|
| 29 |
|
| 30 |
+
def get_picture(self, document, paragraph):
|
| 31 |
+
img = paragraph._element.xpath('.//pic:pic')
|
| 32 |
+
if not img:
|
| 33 |
+
return None
|
| 34 |
+
img = img[0]
|
| 35 |
+
embed = img.xpath('.//a:blip/@r:embed')[0]
|
| 36 |
+
related_part = document.part.related_parts[embed]
|
| 37 |
+
image = related_part.image
|
| 38 |
+
image = Image.open(BytesIO(image.blob)).convert('RGB')
|
| 39 |
+
return image
|
| 40 |
+
|
| 41 |
def __clean(self, line):
|
| 42 |
line = re.sub(r"\u3000", " ", line).strip()
|
| 43 |
return line
|
|
|
|
| 47 |
filename) if not binary else Document(BytesIO(binary))
|
| 48 |
pn = 0
|
| 49 |
lines = []
|
| 50 |
+
last_image = None
|
| 51 |
for p in self.doc.paragraphs:
|
| 52 |
if pn > to_page:
|
| 53 |
break
|
| 54 |
+
if from_page <= pn < to_page:
|
| 55 |
+
current_image = None
|
| 56 |
+
if p.text.strip():
|
| 57 |
+
if p.style.name == 'Caption':
|
| 58 |
+
former_image = None
|
| 59 |
+
if lines and lines[-1][1] and lines[-1][2] != 'Caption':
|
| 60 |
+
former_image = lines[-1][1].pop()
|
| 61 |
+
elif last_image:
|
| 62 |
+
former_image = last_image
|
| 63 |
+
last_image = None
|
| 64 |
+
lines.append((self.__clean(p.text), [former_image], p.style.name))
|
| 65 |
+
else:
|
| 66 |
+
current_image = self.get_picture(self.doc, p)
|
| 67 |
+
image_list = [current_image]
|
| 68 |
+
if last_image:
|
| 69 |
+
image_list.insert(0, last_image)
|
| 70 |
+
last_image = None
|
| 71 |
+
lines.append((self.__clean(p.text), image_list, p.style.name))
|
| 72 |
+
else:
|
| 73 |
+
if current_image := self.get_picture(self.doc, p):
|
| 74 |
+
if lines:
|
| 75 |
+
lines[-1][1].append(current_image)
|
| 76 |
+
else:
|
| 77 |
+
last_image = current_image
|
| 78 |
for run in p.runs:
|
| 79 |
if 'lastRenderedPageBreak' in run._element.xml:
|
| 80 |
pn += 1
|
| 81 |
continue
|
| 82 |
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
|
| 83 |
pn += 1
|
| 84 |
+
new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
|
| 85 |
tbls = []
|
| 86 |
for tb in self.doc.tables:
|
| 87 |
html= "<table>"
|
|
|
|
| 100 |
html += "</tr>"
|
| 101 |
html += "</table>"
|
| 102 |
tbls.append(((None, html), ""))
|
| 103 |
+
return new_line, tbls
|
| 104 |
|
| 105 |
|
| 106 |
class Pdf(PdfParser):
|
|
|
|
| 159 |
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
| 160 |
callback(0.1, "Start to parse.")
|
| 161 |
sections, tbls = Docx()(filename, binary)
|
| 162 |
+
res = tokenize_table(tbls, doc, eng) # just for table
|
| 163 |
+
|
| 164 |
callback(0.8, "Finish parsing.")
|
| 165 |
+
st = timer()
|
| 166 |
+
|
| 167 |
+
chunks, images = naive_merge_docx(
|
| 168 |
+
sections, int(parser_config.get(
|
| 169 |
+
"chunk_token_num", 128)), parser_config.get(
|
| 170 |
+
"delimiter", "\n!?。;!?"))
|
| 171 |
+
|
| 172 |
+
res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
|
| 173 |
+
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
| 174 |
+
return res
|
| 175 |
|
| 176 |
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
| 177 |
pdf_parser = Pdf(
|
rag/app/qa.py
CHANGED
|
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
|
|
| 17 |
from nltk import word_tokenize
|
| 18 |
from openpyxl import load_workbook
|
| 19 |
from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
|
| 20 |
-
from rag.nlp import rag_tokenizer, tokenize_table
|
| 21 |
from rag.settings import cron_logger
|
| 22 |
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
| 23 |
from docx import Document
|
|
@@ -174,26 +174,8 @@ class Docx(DocxParser):
|
|
| 174 |
embed = img.xpath('.//a:blip/@r:embed')[0]
|
| 175 |
related_part = document.part.related_parts[embed]
|
| 176 |
image = related_part.image
|
| 177 |
-
image = Image.open(BytesIO(image.blob))
|
| 178 |
return image
|
| 179 |
-
def concat_img(self, img1, img2):
|
| 180 |
-
if img1 and not img2:
|
| 181 |
-
return img1
|
| 182 |
-
if not img1 and img2:
|
| 183 |
-
return img2
|
| 184 |
-
if not img1 and not img2:
|
| 185 |
-
return None
|
| 186 |
-
width1, height1 = img1.size
|
| 187 |
-
width2, height2 = img2.size
|
| 188 |
-
|
| 189 |
-
new_width = max(width1, width2)
|
| 190 |
-
new_height = height1 + height2
|
| 191 |
-
new_image = Image.new('RGB', (new_width, new_height))
|
| 192 |
-
|
| 193 |
-
new_image.paste(img1, (0, 0))
|
| 194 |
-
new_image.paste(img2, (0, height1))
|
| 195 |
-
|
| 196 |
-
return new_image
|
| 197 |
|
| 198 |
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
|
| 199 |
self.doc = Document(
|
|
@@ -211,7 +193,7 @@ class Docx(DocxParser):
|
|
| 211 |
if not question_level or question_level > 6: # not a question
|
| 212 |
last_answer = f'{last_answer}\n{p_text}'
|
| 213 |
current_image = self.get_picture(self.doc, p)
|
| 214 |
-
last_image =
|
| 215 |
else: # is a question
|
| 216 |
if last_answer or last_image:
|
| 217 |
sum_question = '\n'.join(question_stack)
|
|
|
|
| 17 |
from nltk import word_tokenize
|
| 18 |
from openpyxl import load_workbook
|
| 19 |
from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
|
| 20 |
+
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
|
| 21 |
from rag.settings import cron_logger
|
| 22 |
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
| 23 |
from docx import Document
|
|
|
|
| 174 |
embed = img.xpath('.//a:blip/@r:embed')[0]
|
| 175 |
related_part = document.part.related_parts[embed]
|
| 176 |
image = related_part.image
|
| 177 |
+
image = Image.open(BytesIO(image.blob)).convert('RGB')
|
| 178 |
return image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
|
| 181 |
self.doc = Document(
|
|
|
|
| 193 |
if not question_level or question_level > 6: # not a question
|
| 194 |
last_answer = f'{last_answer}\n{p_text}'
|
| 195 |
current_image = self.get_picture(self.doc, p)
|
| 196 |
+
last_image = concat_img(last_image, current_image)
|
| 197 |
else: # is a question
|
| 198 |
if last_answer or last_image:
|
| 199 |
sum_question = '\n'.join(question_stack)
|
rag/nlp/__init__.py
CHANGED
|
@@ -24,6 +24,7 @@ import copy
|
|
| 24 |
import roman_numbers as r
|
| 25 |
from word2number import w2n
|
| 26 |
from cn2an import cn2an
|
|
|
|
| 27 |
|
| 28 |
all_codecs = [
|
| 29 |
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
|
|
@@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
|
|
| 246 |
return res
|
| 247 |
|
| 248 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
def tokenize_table(tbls, doc, eng, batch_size=10):
|
| 250 |
res = []
|
| 251 |
# add tables
|
|
@@ -504,4 +518,54 @@ def docx_question_level(p):
|
|
| 504 |
if p.style.name.startswith('Heading'):
|
| 505 |
return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
|
| 506 |
else:
|
| 507 |
-
return 0, re.sub(r"\u3000", " ", p.text).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
import roman_numbers as r
|
| 25 |
from word2number import w2n
|
| 26 |
from cn2an import cn2an
|
| 27 |
+
from PIL import Image
|
| 28 |
|
| 29 |
all_codecs = [
|
| 30 |
'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
|
|
|
|
| 247 |
return res
|
| 248 |
|
| 249 |
|
| 250 |
+
def tokenize_chunks_docx(chunks, doc, eng, images):
|
| 251 |
+
res = []
|
| 252 |
+
# wrap up as es documents
|
| 253 |
+
for ck, image in zip(chunks, images):
|
| 254 |
+
if len(ck.strip()) == 0:continue
|
| 255 |
+
print("--", ck)
|
| 256 |
+
d = copy.deepcopy(doc)
|
| 257 |
+
d["image"] = image
|
| 258 |
+
tokenize(d, ck, eng)
|
| 259 |
+
res.append(d)
|
| 260 |
+
return res
|
| 261 |
+
|
| 262 |
+
|
| 263 |
def tokenize_table(tbls, doc, eng, batch_size=10):
|
| 264 |
res = []
|
| 265 |
# add tables
|
|
|
|
| 518 |
if p.style.name.startswith('Heading'):
|
| 519 |
return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
|
| 520 |
else:
|
| 521 |
+
return 0, re.sub(r"\u3000", " ", p.text).strip()
|
| 522 |
+
|
| 523 |
+
def concat_img(img1, img2):
|
| 524 |
+
if img1 and not img2:
|
| 525 |
+
return img1
|
| 526 |
+
if not img1 and img2:
|
| 527 |
+
return img2
|
| 528 |
+
if not img1 and not img2:
|
| 529 |
+
return None
|
| 530 |
+
width1, height1 = img1.size
|
| 531 |
+
width2, height2 = img2.size
|
| 532 |
+
|
| 533 |
+
new_width = max(width1, width2)
|
| 534 |
+
new_height = height1 + height2
|
| 535 |
+
new_image = Image.new('RGB', (new_width, new_height))
|
| 536 |
+
|
| 537 |
+
new_image.paste(img1, (0, 0))
|
| 538 |
+
new_image.paste(img2, (0, height1))
|
| 539 |
+
|
| 540 |
+
return new_image
|
| 541 |
+
|
| 542 |
+
def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
| 543 |
+
if not sections:
|
| 544 |
+
return []
|
| 545 |
+
|
| 546 |
+
cks = [""]
|
| 547 |
+
images = [None]
|
| 548 |
+
tk_nums = [0]
|
| 549 |
+
|
| 550 |
+
def add_chunk(t, image, pos=""):
|
| 551 |
+
nonlocal cks, tk_nums, delimiter
|
| 552 |
+
tnum = num_tokens_from_string(t)
|
| 553 |
+
if tnum < 8:
|
| 554 |
+
pos = ""
|
| 555 |
+
if tk_nums[-1] > chunk_token_num:
|
| 556 |
+
if t.find(pos) < 0:
|
| 557 |
+
t += pos
|
| 558 |
+
cks.append(t)
|
| 559 |
+
images.append(image)
|
| 560 |
+
tk_nums.append(tnum)
|
| 561 |
+
else:
|
| 562 |
+
if cks[-1].find(pos) < 0:
|
| 563 |
+
t += pos
|
| 564 |
+
cks[-1] += t
|
| 565 |
+
images[-1] = concat_img(images[-1], image)
|
| 566 |
+
tk_nums[-1] += tnum
|
| 567 |
+
|
| 568 |
+
for sec, image in sections:
|
| 569 |
+
add_chunk(sec, image, '')
|
| 570 |
+
|
| 571 |
+
return cks, images
|