| """ | |
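User input:
    - the model array, where each element represents one page
    - the S3 path of the PDF
    - the S3 location where screenshots are saved
Then:
    1) Based on the S3 path, call the Spark cluster API to obtain ak, sk and endpoint, and construct the s3PDFReader.
    2) Based on the user-supplied S3 address, call the Spark cluster API to obtain ak, sk and endpoint, and construct the s3ImageWriter.
Everything else (constructing the s3cli, obtaining ak/sk) is implemented in code-clean. Do not introduce a reverse dependency!!!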
| """ | |
| import re | |
| from loguru import logger | |
| from magic_pdf.libs.version import __version__ | |
| from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze | |
| from magic_pdf.rw import AbsReaderWriter | |
| from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr | |
| from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt | |
# Values written to the "_parse_type" key of a parse result by the functions
# in this module, recording which parser produced that result.
PARSE_TYPE_TXT = "txt"
PARSE_TYPE_OCR = "ocr"
def parse_txt_pdf(
    pdf_bytes: bytes,
    pdf_models: list,
    imageWriter: AbsReaderWriter,
    is_debug=False,
    start_page=0,
    *args,
    **kwargs
):
    """
    Parse a text-based PDF.

    Delegates to ``parse_pdf_by_txt`` and tags the result it returns with the
    parse type (``PARSE_TYPE_TXT``) and the package version before handing it
    back. Any extra positional or keyword arguments are accepted and ignored.
    """
    parser_options = {
        "start_page_id": start_page,
        "debug_mode": is_debug,
    }
    result = parse_pdf_by_txt(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Stamp the result with how (and by which version) it was produced.
    for key, value in (
        ("_parse_type", PARSE_TYPE_TXT),
        ("_version_name", __version__),
    ):
        result[key] = value
    return result
def parse_ocr_pdf(
    pdf_bytes: bytes,
    pdf_models: list,
    imageWriter: AbsReaderWriter,
    is_debug=False,
    start_page=0,
    *args,
    **kwargs
):
    """
    Parse a PDF through the OCR pipeline.

    Delegates to ``parse_pdf_by_ocr`` and tags the result it returns with the
    parse type (``PARSE_TYPE_OCR``) and the package version before handing it
    back. Any extra positional or keyword arguments are accepted and ignored.
    """
    parser_options = {
        "start_page_id": start_page,
        "debug_mode": is_debug,
    }
    result = parse_pdf_by_ocr(pdf_bytes, pdf_models, imageWriter, **parser_options)

    # Stamp the result with how (and by which version) it was produced.
    for key, value in (
        ("_parse_type", PARSE_TYPE_OCR),
        ("_version_name", __version__),
    ):
        result[key] = value
    return result
def parse_union_pdf(
    pdf_bytes: bytes,
    pdf_models: list,
    imageWriter: AbsReaderWriter,
    is_debug=False,
    start_page=0,
    input_model_is_empty: bool = False,
    *args,
    **kwargs
):
    """
    Parse a PDF that may mix text pages and OCR pages.

    The text parser (``parse_pdf_by_txt``) is tried first. If it raises (the
    exception is logged and swallowed), returns ``None``, or returns a result
    whose ``"_need_drop"`` entry is truthy, the OCR parser
    (``parse_pdf_by_ocr``) is tried instead. When ``input_model_is_empty`` is
    true, the models are regenerated with ``doc_analyze(pdf_bytes, ocr=True)``
    before the OCR attempt.

    The returned result is tagged with ``"_parse_type"`` (the parser that
    produced it) and ``"_version_name"``. Any extra positional or keyword
    arguments are accepted and ignored.

    Raises:
        Exception: if the OCR fallback also produces no result.
    """

    def _attempt(parser):
        # ``pdf_models`` is read from the enclosing scope at call time, so the
        # rebinding done before the OCR fallback is seen by the second attempt.
        try:
            outcome = parser(
                pdf_bytes,
                pdf_models,
                imageWriter,
                start_page_id=start_page,
                debug_mode=is_debug,
            )
        except Exception as err:
            logger.exception(err)
            outcome = None
        return outcome

    parsed = _attempt(parse_pdf_by_txt)
    txt_usable = parsed is not None and not parsed.get("_need_drop", False)

    if txt_usable:
        parse_type = PARSE_TYPE_TXT
    else:
        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
        if input_model_is_empty:
            # No usable models were supplied: build OCR models for the fallback.
            pdf_models = doc_analyze(pdf_bytes, ocr=True)
        parsed = _attempt(parse_pdf_by_ocr)
        if parsed is None:
            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
        parse_type = PARSE_TYPE_OCR

    parsed["_parse_type"] = parse_type
    parsed["_version_name"] = __version__
    return parsed