Kevin Hu
		
	committed on
		
		
					Commit 
							
							·
						
						64508f3
	
1
								Parent(s):
							
							e9c1552
								
let presentation do raptor (#2838)
Browse files
### What problem does this PR solve?
#2837
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
- api/apps/document_app.py +3 -2
- rag/app/qa.py +10 -1
    	
        api/apps/document_app.py
    CHANGED
    
    | @@ -439,8 +439,9 @@ def change_parser(): | |
| 439 | 
             
                        else:
         | 
| 440 | 
             
                            return get_json_result(data=True)
         | 
| 441 |  | 
| 442 | 
            -
                    if doc.type == FileType.VISUAL  | 
| 443 | 
            -
                             | 
|  | |
| 444 | 
             
                        return get_data_error_result(retmsg="Not supported yet!")
         | 
| 445 |  | 
| 446 | 
             
                    e = DocumentService.update_by_id(doc.id,
         | 
|  | |
| 439 | 
             
                        else:
         | 
| 440 | 
             
                            return get_json_result(data=True)
         | 
| 441 |  | 
| 442 | 
            +
                    if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
         | 
| 443 | 
            +
                            or (re.search(
         | 
| 444 | 
            +
                                r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
         | 
| 445 | 
             
                        return get_data_error_result(retmsg="Not supported yet!")
         | 
| 446 |  | 
| 447 | 
             
                    e = DocumentService.update_by_id(doc.id,
         | 
    	
        rag/app/qa.py
    CHANGED
    
    | @@ -68,6 +68,7 @@ class Excel(ExcelParser): | |
| 68 | 
             
                        [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
         | 
| 69 | 
             
                    return res
         | 
| 70 |  | 
|  | |
| 71 | 
             
            class Pdf(PdfParser):
         | 
| 72 | 
             
                def __call__(self, filename, binary=None, from_page=0,
         | 
| 73 | 
             
                             to_page=100000, zoomin=3, callback=None):
         | 
| @@ -155,6 +156,7 @@ class Pdf(PdfParser): | |
| 155 | 
             
                    if last_q:
         | 
| 156 | 
             
                        qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
         | 
| 157 | 
             
                    return qai_list, tbls
         | 
|  | |
| 158 | 
             
                def get_tbls_info(self, tbls, tbl_index):
         | 
| 159 | 
             
                    if tbl_index >= len(tbls):
         | 
| 160 | 
             
                        return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
         | 
| @@ -166,10 +168,13 @@ class Pdf(PdfParser): | |
| 166 | 
             
                    tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
         | 
| 167 | 
             
                        .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
         | 
| 168 | 
             
                    tbl_text = ''.join(tbls[tbl_index][0][1])
         | 
| 169 | 
            -
                    return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, | 
|  | |
|  | |
| 170 | 
             
            class Docx(DocxParser):
         | 
| 171 | 
             
                def __init__(self):
         | 
| 172 | 
             
                    pass
         | 
|  | |
| 173 | 
             
                def get_picture(self, document, paragraph):
         | 
| 174 | 
             
                    img = paragraph._element.xpath('.//pic:pic')
         | 
| 175 | 
             
                    if not img:
         | 
| @@ -242,6 +247,7 @@ class Docx(DocxParser): | |
| 242 | 
             
                        tbls.append(((None, html), ""))
         | 
| 243 | 
             
                    return qai_list, tbls
         | 
| 244 |  | 
|  | |
| 245 | 
             
            def rmPrefix(txt):
         | 
| 246 | 
             
                return re.sub(
         | 
| 247 | 
             
                    r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
         | 
| @@ -258,6 +264,7 @@ def beAdocPdf(d, q, a, eng, image, poss): | |
| 258 | 
             
                add_positions(d, poss)
         | 
| 259 | 
             
                return d
         | 
| 260 |  | 
|  | |
| 261 | 
             
            def beAdocDocx(d, q, a, eng, image):
         | 
| 262 | 
             
                qprefix = "Question: " if eng else "问题:"
         | 
| 263 | 
             
                aprefix = "Answer: " if eng else "回答:"
         | 
| @@ -268,6 +275,7 @@ def beAdocDocx(d, q, a, eng, image): | |
| 268 | 
             
                d["image"] = image
         | 
| 269 | 
             
                return d
         | 
| 270 |  | 
|  | |
| 271 | 
             
            def beAdoc(d, q, a, eng):
         | 
| 272 | 
             
                qprefix = "Question: " if eng else "问题:"
         | 
| 273 | 
             
                aprefix = "Answer: " if eng else "回答:"
         | 
| @@ -282,6 +290,7 @@ def mdQuestionLevel(s): | |
| 282 | 
             
                match = re.match(r'#*', s)
         | 
| 283 | 
             
                return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
         | 
| 284 |  | 
|  | |
| 285 | 
             
            def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         | 
| 286 | 
             
                """
         | 
| 287 | 
             
                    Excel and csv(txt) format files are supported.
         | 
|  | |
| 68 | 
             
                        [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
         | 
| 69 | 
             
                    return res
         | 
| 70 |  | 
| 71 | 
            +
             | 
| 72 | 
             
            class Pdf(PdfParser):
         | 
| 73 | 
             
                def __call__(self, filename, binary=None, from_page=0,
         | 
| 74 | 
             
                             to_page=100000, zoomin=3, callback=None):
         | 
|  | |
| 156 | 
             
                    if last_q:
         | 
| 157 | 
             
                        qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
         | 
| 158 | 
             
                    return qai_list, tbls
         | 
| 159 | 
            +
             | 
| 160 | 
             
                def get_tbls_info(self, tbls, tbl_index):
         | 
| 161 | 
             
                    if tbl_index >= len(tbls):
         | 
| 162 | 
             
                        return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
         | 
|  | |
| 168 | 
             
                    tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
         | 
| 169 | 
             
                        .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
         | 
| 170 | 
             
                    tbl_text = ''.join(tbls[tbl_index][0][1])
         | 
| 171 | 
            +
                    return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,
         | 
| 172 | 
            +
             | 
| 173 | 
            +
             | 
| 174 | 
             
            class Docx(DocxParser):
         | 
| 175 | 
             
                def __init__(self):
         | 
| 176 | 
             
                    pass
         | 
| 177 | 
            +
             | 
| 178 | 
             
                def get_picture(self, document, paragraph):
         | 
| 179 | 
             
                    img = paragraph._element.xpath('.//pic:pic')
         | 
| 180 | 
             
                    if not img:
         | 
|  | |
| 247 | 
             
                        tbls.append(((None, html), ""))
         | 
| 248 | 
             
                    return qai_list, tbls
         | 
| 249 |  | 
| 250 | 
            +
             | 
| 251 | 
             
            def rmPrefix(txt):
         | 
| 252 | 
             
                return re.sub(
         | 
| 253 | 
             
                    r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
         | 
|  | |
| 264 | 
             
                add_positions(d, poss)
         | 
| 265 | 
             
                return d
         | 
| 266 |  | 
| 267 | 
            +
             | 
| 268 | 
             
            def beAdocDocx(d, q, a, eng, image):
         | 
| 269 | 
             
                qprefix = "Question: " if eng else "问题:"
         | 
| 270 | 
             
                aprefix = "Answer: " if eng else "回答:"
         | 
|  | |
| 275 | 
             
                d["image"] = image
         | 
| 276 | 
             
                return d
         | 
| 277 |  | 
| 278 | 
            +
             | 
| 279 | 
             
            def beAdoc(d, q, a, eng):
         | 
| 280 | 
             
                qprefix = "Question: " if eng else "问题:"
         | 
| 281 | 
             
                aprefix = "Answer: " if eng else "回答:"
         | 
|  | |
| 290 | 
             
                match = re.match(r'#*', s)
         | 
| 291 | 
             
                return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
         | 
| 292 |  | 
| 293 | 
            +
             | 
| 294 | 
             
            def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
         | 
| 295 | 
             
                """
         | 
| 296 | 
             
                    Excel and csv(txt) format files are supported.
         |