Spaces:

retopara
/

ragflow

Build error

Kevin Hu committed on Oct 15, 2024

Commit

64508f3

1 Parent(s): e9c1552

let presentation do raptor (#2838)

### What problem does this PR solve?

#2837

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Files changed (2) hide show

api/apps/document_app.py +3 -2
rag/app/qa.py +10 -1

api/apps/document_app.py CHANGED Viewed

@@ -439,8 +439,9 @@ def change_parser():
             else:
                 return get_json_result(data=True)
-        if doc.type == FileType.VISUAL or re.search(
-                r"\.(ppt|pptx|pages)$", doc.name):
             return get_data_error_result(retmsg="Not supported yet!")
         e = DocumentService.update_by_id(doc.id,

             else:
                 return get_json_result(data=True)
+        if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
+                or (re.search(
+                    r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
             return get_data_error_result(retmsg="Not supported yet!")
         e = DocumentService.update_by_id(doc.id,

rag/app/qa.py CHANGED Viewed

@@ -68,6 +68,7 @@ class Excel(ExcelParser):
             [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
         return res
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
@@ -155,6 +156,7 @@ class Pdf(PdfParser):
         if last_q:
             qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
         return qai_list, tbls
     def get_tbls_info(self, tbls, tbl_index):
         if tbl_index >= len(tbls):
             return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
@@ -166,10 +168,13 @@ class Pdf(PdfParser):
         tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
             .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
         tbl_text = ''.join(tbls[tbl_index][0][1])
-        return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text
 class Docx(DocxParser):
     def __init__(self):
         pass
     def get_picture(self, document, paragraph):
         img = paragraph._element.xpath('.//pic:pic')
         if not img:
@@ -242,6 +247,7 @@ class Docx(DocxParser):
             tbls.append(((None, html), ""))
         return qai_list, tbls
 def rmPrefix(txt):
     return re.sub(
         r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:： ]+", "", txt.strip(), flags=re.IGNORECASE)
@@ -258,6 +264,7 @@ def beAdocPdf(d, q, a, eng, image, poss):
     add_positions(d, poss)
     return d
 def beAdocDocx(d, q, a, eng, image):
     qprefix = "Question: " if eng else "问题："
     aprefix = "Answer: " if eng else "回答："
@@ -268,6 +275,7 @@ def beAdocDocx(d, q, a, eng, image):
     d["image"] = image
     return d
 def beAdoc(d, q, a, eng):
     qprefix = "Question: " if eng else "问题："
     aprefix = "Answer: " if eng else "回答："
@@ -282,6 +290,7 @@ def mdQuestionLevel(s):
     match = re.match(r'#*', s)
     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
 def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.

             [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
         return res
 class Pdf(PdfParser):
     def __call__(self, filename, binary=None, from_page=0,
                  to_page=100000, zoomin=3, callback=None):
         if last_q:
             qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
         return qai_list, tbls
     def get_tbls_info(self, tbls, tbl_index):
         if tbl_index >= len(tbls):
             return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
         tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
             .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
         tbl_text = ''.join(tbls[tbl_index][0][1])
+        return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text
 class Docx(DocxParser):
     def __init__(self):
         pass
     def get_picture(self, document, paragraph):
         img = paragraph._element.xpath('.//pic:pic')
         if not img:
             tbls.append(((None, html), ""))
         return qai_list, tbls
 def rmPrefix(txt):
     return re.sub(
         r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:： ]+", "", txt.strip(), flags=re.IGNORECASE)
     add_positions(d, poss)
     return d
 def beAdocDocx(d, q, a, eng, image):
     qprefix = "Question: " if eng else "问题："
     aprefix = "Answer: " if eng else "回答："
     d["image"] = image
     return d
 def beAdoc(d, q, a, eng):
     qprefix = "Question: " if eng else "问题："
     aprefix = "Answer: " if eng else "回答："
     match = re.match(r'#*', s)
     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
 def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.