Support displaying images in the chunks of docx files when using general parser (#1253)

### What problem does this PR solve?

Support displaying images in the chunks of docx files when using the general parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Changed files:

- rag/app/naive.py (+53 -6)
- rag/app/qa.py (+3 -21)
- rag/nlp/__init__.py (+65 -1)
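
For context, a minimal sketch of how the general parser's entry point is exercised on a docx file; the file name and the progress callback are placeholders, and the call assumes it runs inside a configured ragflow checkout:

```python
from rag.app.naive import chunk

def progress(prog=None, msg=""):
    # stand-in for the task executor's progress callback
    print(prog, msg)

# Hypothetical input file. With this PR, each chunk produced from a
# docx also carries an "image" field holding the stitched PIL image
# of the figures that appeared alongside that chunk's text.
docs = chunk("sample.docx", callback=progress)
```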

### rag/app/naive.py

```diff
@@ -16,16 +16,28 @@ from docx import Document
 from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
-
+from PIL import Image
+from functools import reduce
 
 class Docx(DocxParser):
     def __init__(self):
         pass
 
+    def get_picture(self, document, paragraph):
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
+        return image
+
     def __clean(self, line):
         line = re.sub(r"\u3000", " ", line).strip()
         return line
@@ -35,17 +47,41 @@ class Docx(DocxParser):
             filename) if not binary else Document(BytesIO(binary))
         pn = 0
         lines = []
+        last_image = None
         for p in self.doc.paragraphs:
             if pn > to_page:
                 break
-            if from_page <= pn < to_page and p.text.strip():
-                lines.append(self.__clean(p.text))
+            if from_page <= pn < to_page:
+                current_image = None
+                if p.text.strip():
+                    if p.style.name == 'Caption':
+                        former_image = None
+                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
+                            former_image = lines[-1][1].pop()
+                        elif last_image:
+                            former_image = last_image
+                            last_image = None
+                        lines.append((self.__clean(p.text), [former_image], p.style.name))
+                    else:
+                        current_image = self.get_picture(self.doc, p)
+                        image_list = [current_image]
+                        if last_image:
+                            image_list.insert(0, last_image)
+                            last_image = None
+                        lines.append((self.__clean(p.text), image_list, p.style.name))
+                else:
+                    if current_image := self.get_picture(self.doc, p):
+                        if lines:
+                            lines[-1][1].append(current_image)
+                        else:
+                            last_image = current_image
             for run in p.runs:
                 if 'lastRenderedPageBreak' in run._element.xml:
                     pn += 1
                     continue
                 if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                     pn += 1
+        new_line = [(line[0], reduce(concat_img, line[1])) for line in lines]
         tbls = []
         for tb in self.doc.tables:
             html= "<table>"
@@ -64,7 +100,7 @@ class Docx(DocxParser):
             html += "</tr>"
         html += "</table>"
         tbls.append(((None, html), ""))
-        return [l for l in lines if l], tbls
+        return new_line, tbls
 
 
 class Pdf(PdfParser):
@@ -123,8 +159,19 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         sections, tbls = Docx()(filename, binary)
-        res = tokenize_table(tbls, doc, eng)
+        res = tokenize_table(tbls, doc, eng)  # just for table
+
         callback(0.8, "Finish parsing.")
+        st = timer()
+
+        chunks, images = naive_merge_docx(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+
+        res.extend(tokenize_chunks_docx(chunks, doc, eng, images))
+        cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+        return res
 
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf(
```
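
The per-paragraph image lists built in `__call__` can contain `None` entries (a paragraph with no picture, or a Caption with no image to claim). Since `concat_img`, added in rag/nlp/__init__.py below, passes a sole non-`None` operand through and returns `None` for two `None`s, the `reduce(concat_img, line[1])` step collapses each list to a single vertically stitched image, or `None` when the paragraph had no figures. A small sketch of that behavior (run inside the repo; the toy images are placeholders):

```python
from functools import reduce
from PIL import Image
from rag.nlp import concat_img  # helper added by this PR

red = Image.new('RGB', (10, 5), 'red')
blue = Image.new('RGB', (20, 5), 'blue')

# None operands are passed through; real images are stacked vertically.
merged = reduce(concat_img, [None, red, None, blue])
print(merged.size)                 # (20, 10): max width, summed heights
print(reduce(concat_img, [None]))  # None: paragraph had no figures
```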

### rag/app/qa.py

```diff
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
 from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
-from rag.nlp import rag_tokenizer, tokenize_table
+from rag.nlp import rag_tokenizer, tokenize_table, concat_img
 from rag.settings import cron_logger
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from docx import Document
@@ -174,26 +174,8 @@ class Docx(DocxParser):
         embed = img.xpath('.//a:blip/@r:embed')[0]
         related_part = document.part.related_parts[embed]
         image = related_part.image
-        image = Image.open(BytesIO(image.blob))
+        image = Image.open(BytesIO(image.blob)).convert('RGB')
         return image
-    def concat_img(self, img1, img2):
-        if img1 and not img2:
-            return img1
-        if not img1 and img2:
-            return img2
-        if not img1 and not img2:
-            return None
-        width1, height1 = img1.size
-        width2, height2 = img2.size
-
-        new_width = max(width1, width2)
-        new_height = height1 + height2
-        new_image = Image.new('RGB', (new_width, new_height))
-
-        new_image.paste(img1, (0, 0))
-        new_image.paste(img2, (0, height1))
-
-        return new_image
 
     def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
         self.doc = Document(
@@ -211,7 +193,7 @@ class Docx(DocxParser):
             if not question_level or question_level > 6: # not a question
                 last_answer = f'{last_answer}\n{p_text}'
                 current_image = self.get_picture(self.doc, p)
-                last_image = self.concat_img(last_image, current_image)
+                last_image = concat_img(last_image, current_image)
             else: # is a question
                 if last_answer or last_image:
                     sum_question = '\n'.join(question_stack)
```
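
The `.convert('RGB')` added to `get_picture` in both parsers is what lets `concat_img` paste any extracted picture onto its `'RGB'` canvas: blobs pulled from the docx relationship part may be palette or RGBA images. A one-off illustration of the normalization (the image object is hypothetical):

```python
from PIL import Image

rgba = Image.new('RGBA', (8, 8), (255, 0, 0, 128))
rgb = rgba.convert('RGB')  # normalize mode before stacking
print(rgba.mode, '->', rgb.mode)  # RGBA -> RGB
```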

### rag/nlp/__init__.py

```diff
@@ -24,6 +24,7 @@ import copy
 import roman_numbers as r
 from word2number import w2n
 from cn2an import cn2an
+from PIL import Image
 
 all_codecs = [
     'utf-8', 'gb2312', 'gbk', 'utf_16', 'ascii', 'big5', 'big5hkscs',
@@ -246,6 +247,19 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser):
     return res
 
 
+def tokenize_chunks_docx(chunks, doc, eng, images):
+    res = []
+    # wrap up as es documents
+    for ck, image in zip(chunks, images):
+        if len(ck.strip()) == 0:continue
+        print("--", ck)
+        d = copy.deepcopy(doc)
+        d["image"] = image
+        tokenize(d, ck, eng)
+        res.append(d)
+    return res
+
+
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
@@ -504,4 +518,54 @@ def docx_question_level(p):
     if p.style.name.startswith('Heading'):
         return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
     else:
-        return 0, re.sub(r"\u3000", " ", p.text).strip()
+        return 0, re.sub(r"\u3000", " ", p.text).strip()
+
+def concat_img(img1, img2):
+    if img1 and not img2:
+        return img1
+    if not img1 and img2:
+        return img2
+    if not img1 and not img2:
+        return None
+    width1, height1 = img1.size
+    width2, height2 = img2.size
+
+    new_width = max(width1, width2)
+    new_height = height1 + height2
+    new_image = Image.new('RGB', (new_width, new_height))
+
+    new_image.paste(img1, (0, 0))
+    new_image.paste(img2, (0, height1))
+
+    return new_image
+
+def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
+    if not sections:
+        return []
+
+    cks = [""]
+    images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if tnum < 8:
+            pos = ""
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            images[-1] = concat_img(images[-1], image)
+            tk_nums[-1] += tnum
+
+    for sec, image in sections:
+        add_chunk(sec, image, '')
+
+    return cks, images
```
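
To see how `naive_merge_docx` keeps text chunks and their images aligned, here is a minimal sketch with hypothetical sections (run inside the repo; the texts and the `chunk_token_num` value are made up):

```python
from PIL import Image
from rag.nlp import naive_merge_docx

figure = Image.new('RGB', (4, 4), 'green')
sections = [
    ("First paragraph.", None),
    ("Second paragraph, with a figure.", figure),
    ("Third paragraph.", None),
]

chunks, images = naive_merge_docx(sections, chunk_token_num=128)
# All three sections fit within one 128-token chunk, so its image
# slot holds the single figure (concat_img skips the None entries).
print(len(chunks), images[0].size if images[0] else None)  # 1 (4, 4)
```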