#!/usr/bin/env python3 """A command line tool for extracting text and images from PDF and output it to plain text, html, xml or tags. """ from __future__ import annotations import argparse import logging import sys from string import Template from typing import List, Optional from pdf2zh import __version__, log from pdf2zh.high_level import translate, download_remote_fonts from pdf2zh.doclayout import OnnxModel, ModelInstance import os from pdf2zh.config import ConfigManager from yadt.translation_config import TranslationConfig as YadtConfig from yadt.high_level import translate as yadt_translate def create_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=__doc__, add_help=True) parser.add_argument( "files", type=str, default=None, nargs="*", help="One or more paths to PDF files.", ) parser.add_argument( "--version", "-v", action="version", version=f"pdf2zh v{__version__}", ) parser.add_argument( "--debug", "-d", default=False, action="store_true", help="Use debug logging level.", ) parse_params = parser.add_argument_group( "Parser", description="Used during PDF parsing", ) parse_params.add_argument( "--pages", "-p", type=str, help="The list of page numbers to parse.", ) parse_params.add_argument( "--vfont", "-f", type=str, default="", help="The regex to math font name of formula.", ) parse_params.add_argument( "--vchar", "-c", type=str, default="", help="The regex to math character of formula.", ) parse_params.add_argument( "--lang-in", "-li", type=str, default="en", help="The code of source language.", ) parse_params.add_argument( "--lang-out", "-lo", type=str, default="zh", help="The code of target language.", ) parse_params.add_argument( "--service", "-s", type=str, default="google", help="The service to use for translation.", ) parse_params.add_argument( "--output", "-o", type=str, default="", help="Output directory for files.", ) parse_params.add_argument( "--thread", "-t", type=int, default=4, help="The number of threads to execute translation.", ) parse_params.add_argument( "--interactive", "-i", action="store_true", help="Interact with GUI.", ) parse_params.add_argument( "--share", action="store_true", help="Enable Gradio Share", ) parse_params.add_argument( "--flask", action="store_true", help="flask", ) parse_params.add_argument( "--celery", action="store_true", help="celery", ) parse_params.add_argument( "--authorized", type=str, nargs="+", help="user name and password.", ) parse_params.add_argument( "--prompt", type=str, help="user custom prompt.", ) parse_params.add_argument( "--compatible", "-cp", action="store_true", help="Convert the PDF file into PDF/A format to improve compatibility.", ) parse_params.add_argument( "--onnx", type=str, help="custom onnx model path.", ) parse_params.add_argument( "--serverport", type=int, help="custom WebUI port.", ) parse_params.add_argument( "--dir", action="store_true", help="translate directory.", ) parse_params.add_argument( "--config", type=str, help="config file.", ) parse_params.add_argument( "--yadt", default=False, action="store_true", help="Use experimental backend yadt.", ) return parser def parse_args(args: Optional[List[str]]) -> argparse.Namespace: parsed_args = create_parser().parse_args(args=args) if parsed_args.pages: pages = [] for p in parsed_args.pages.split(","): if "-" in p: start, end = p.split("-") pages.extend(range(int(start) - 1, int(end))) else: pages.append(int(p) - 1) parsed_args.raw_pages = parsed_args.pages parsed_args.pages = pages return parsed_args def find_all_files_in_directory(directory_path): """ Recursively search all PDF files in the given directory and return their paths as a list. :param directory_path: str, the path to the directory to search :return: list of PDF file paths """ # Check if the provided path is a directory if not os.path.isdir(directory_path): raise ValueError(f"The provided path '{directory_path}' is not a directory.") file_paths = [] # Walk through the directory recursively for root, _, files in os.walk(directory_path): for file in files: # Check if the file is a PDF if file.lower().endswith(".pdf"): # Append the full file path to the list file_paths.append(os.path.join(root, file)) return file_paths def main(args: Optional[List[str]] = None) -> int: logging.basicConfig() parsed_args = parse_args(args) if parsed_args.config: ConfigManager.custome_config(parsed_args.config) if parsed_args.debug: log.setLevel(logging.DEBUG) if parsed_args.onnx: ModelInstance.value = OnnxModel(parsed_args.onnx) else: ModelInstance.value = OnnxModel.load_available() if parsed_args.interactive: from pdf2zh.gui import setup_gui if parsed_args.serverport: setup_gui( parsed_args.share, parsed_args.authorized, int(parsed_args.serverport) ) else: setup_gui(parsed_args.share, parsed_args.authorized) return 0 if parsed_args.flask: from pdf2zh.backend import flask_app flask_app.run(port=11008) return 0 if parsed_args.celery: from pdf2zh.backend import celery_app celery_app.start(argv=sys.argv[2:]) return 0 if parsed_args.prompt: try: with open(parsed_args.prompt, "r", encoding="utf-8") as file: content = file.read() parsed_args.prompt = Template(content) except Exception: raise ValueError("prompt error.") print(parsed_args) if parsed_args.yadt: return yadt_main(parsed_args) if parsed_args.dir: untranlate_file = find_all_files_in_directory(parsed_args.files[0]) parsed_args.files = untranlate_file translate(model=ModelInstance.value, **vars(parsed_args)) return 0 translate(model=ModelInstance.value, **vars(parsed_args)) return 0 def yadt_main(parsed_args) -> int: if parsed_args.dir: untranlate_file = find_all_files_in_directory(parsed_args.files[0]) else: untranlate_file = parsed_args.files lang_in = parsed_args.lang_in lang_out = parsed_args.lang_out outputdir = None if parsed_args.output: outputdir = parsed_args.output font_path = download_remote_fonts(lang_out.lower()) param = parsed_args.service.split(":", 1) service_name = param[0] service_model = param[1] if len(param) > 1 else None envs = {} prompt = [] if parsed_args.prompt: try: with open(parsed_args.prompt, "r", encoding="utf-8") as file: content = file.read() prompt = Template(content) except Exception: raise ValueError("prompt error.") from pdf2zh.translator import ( AzureOpenAITranslator, GoogleTranslator, BingTranslator, DeepLTranslator, DeepLXTranslator, OllamaTranslator, OpenAITranslator, ZhipuTranslator, ModelScopeTranslator, SiliconTranslator, GeminiTranslator, AzureTranslator, TencentTranslator, DifyTranslator, AnythingLLMTranslator, XinferenceTranslator, ArgosTranslator, GorkTranslator, GroqTranslator, DeepseekTranslator, OpenAIlikedTranslator, ) for translator in [ GoogleTranslator, BingTranslator, DeepLTranslator, DeepLXTranslator, OllamaTranslator, XinferenceTranslator, AzureOpenAITranslator, OpenAITranslator, ZhipuTranslator, ModelScopeTranslator, SiliconTranslator, GeminiTranslator, AzureTranslator, TencentTranslator, DifyTranslator, AnythingLLMTranslator, ArgosTranslator, GorkTranslator, GroqTranslator, DeepseekTranslator, OpenAIlikedTranslator, ]: if service_name == translator.name: translator = translator( lang_in, lang_out, service_model, envs=envs, prompt=prompt ) break else: raise ValueError("Unsupported translation service") for file in untranlate_file: file = file.strip("\"'") yadt_config = YadtConfig( input_file=file, font=font_path, pages=",".join((str(x) for x in parsed_args.raw_pages)), output_dir=outputdir, translator=translator, debug=parsed_args.debug, lang_in=lang_in, lang_out=lang_out, no_dual=False, no_mono=False, qps=parsed_args.thread, ) yadt_translate(yadt_config) return 0 if __name__ == "__main__": sys.exit(main())