|
|
|
"""A command line tool for extracting text and images from PDF and |
|
output it to plain text, html, xml or tags. |
|
""" |
|
|
|
from __future__ import annotations |
|
|
|
import argparse |
|
import logging |
|
import sys |
|
from string import Template |
|
from typing import List, Optional |
|
|
|
from pdf2zh import __version__, log |
|
from pdf2zh.high_level import translate, download_remote_fonts |
|
from pdf2zh.doclayout import OnnxModel, ModelInstance |
|
import os |
|
|
|
from pdf2zh.config import ConfigManager |
|
from yadt.translation_config import TranslationConfig as YadtConfig |
|
from yadt.high_level import translate as yadt_translate |
|
|
|
|
|
def create_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the pdf2zh command line interface.

    The positional ``files`` argument and the ``--version`` / ``--debug``
    flags are registered on the parser itself; every other option is
    registered on an argument group titled "Parser".

    :return: the fully configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser(description=__doc__, add_help=True)

    # Each entry is (option strings, keyword arguments for add_argument).
    # Registration order is preserved because it determines the help layout.
    top_level_options = (
        (
            ("files",),
            {
                "type": str,
                "default": None,
                "nargs": "*",
                "help": "One or more paths to PDF files.",
            },
        ),
        (
            ("--version", "-v"),
            {"action": "version", "version": f"pdf2zh v{__version__}"},
        ),
        (
            ("--debug", "-d"),
            {
                "default": False,
                "action": "store_true",
                "help": "Use debug logging level.",
            },
        ),
    )
    for flags, options in top_level_options:
        cli.add_argument(*flags, **options)

    group = cli.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )
    grouped_options = (
        (
            ("--pages", "-p"),
            {"type": str, "help": "The list of page numbers to parse."},
        ),
        (
            ("--vfont", "-f"),
            {
                "type": str,
                "default": "",
                "help": "The regex to math font name of formula.",
            },
        ),
        (
            ("--vchar", "-c"),
            {
                "type": str,
                "default": "",
                "help": "The regex to math character of formula.",
            },
        ),
        (
            ("--lang-in", "-li"),
            {
                "type": str,
                "default": "en",
                "help": "The code of source language.",
            },
        ),
        (
            ("--lang-out", "-lo"),
            {
                "type": str,
                "default": "zh",
                "help": "The code of target language.",
            },
        ),
        (
            ("--service", "-s"),
            {
                "type": str,
                "default": "google",
                "help": "The service to use for translation.",
            },
        ),
        (
            ("--output", "-o"),
            {
                "type": str,
                "default": "",
                "help": "Output directory for files.",
            },
        ),
        (
            ("--thread", "-t"),
            {
                "type": int,
                "default": 4,
                "help": "The number of threads to execute translation.",
            },
        ),
        (
            ("--interactive", "-i"),
            {"action": "store_true", "help": "Interact with GUI."},
        ),
        (
            ("--share",),
            {"action": "store_true", "help": "Enable Gradio Share"},
        ),
        (
            ("--flask",),
            {"action": "store_true", "help": "flask"},
        ),
        (
            ("--celery",),
            {"action": "store_true", "help": "celery"},
        ),
        (
            ("--authorized",),
            {"type": str, "nargs": "+", "help": "user name and password."},
        ),
        (
            ("--prompt",),
            {"type": str, "help": "user custom prompt."},
        ),
        (
            ("--compatible", "-cp"),
            {
                "action": "store_true",
                "help": "Convert the PDF file into PDF/A format to improve compatibility.",
            },
        ),
        (
            ("--onnx",),
            {"type": str, "help": "custom onnx model path."},
        ),
        (
            ("--serverport",),
            {"type": int, "help": "custom WebUI port."},
        ),
        (
            ("--dir",),
            {"action": "store_true", "help": "translate directory."},
        ),
        (
            ("--config",),
            {"type": str, "help": "config file."},
        ),
        (
            ("--yadt",),
            {
                "default": False,
                "action": "store_true",
                "help": "Use experimental backend yadt.",
            },
        ),
    )
    for flags, options in grouped_options:
        group.add_argument(*flags, **options)

    return cli
|
|
|
|
|
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse command line arguments and expand the ``--pages`` option.

    When ``--pages`` is given, the comma-separated specification is expanded
    into a list of zero-based page indices. A ``start-end`` item covers both
    ends (``"2-4"`` becomes ``[1, 2, 3]``); a single number ``n`` becomes
    ``n - 1``. The untouched string is kept on the namespace as
    ``raw_pages``. When ``--pages`` is absent the namespace is returned
    unchanged and has no ``raw_pages`` attribute.

    :param args: argument strings to parse, passed through to argparse.
    :return: the parsed namespace.
    :raises ValueError: if an item is not a number or a ``start-end`` pair.
    """
    namespace = create_parser().parse_args(args=args)

    page_spec = namespace.pages
    if not page_spec:
        return namespace

    zero_based = []
    for token in page_spec.split(","):
        if "-" not in token:
            zero_based.append(int(token) - 1)
            continue
        first, last = token.split("-")
        # User-facing pages are 1-based and inclusive at both ends.
        zero_based += range(int(first) - 1, int(last))

    namespace.raw_pages = page_spec
    namespace.pages = zero_based
    return namespace
|
|
|
|
|
def find_all_files_in_directory(directory_path):
    """
    Collect the paths of all PDF files below a directory, recursively.

    A file counts as a PDF when its name ends with ".pdf", compared
    case-insensitively. Paths are returned in the order ``os.walk`` yields
    them.

    :param directory_path: str, the path to the directory to search
    :return: list of PDF file paths
    :raises ValueError: if ``directory_path`` is not a directory
    """
    if os.path.isdir(directory_path):
        return [
            os.path.join(parent, name)
            for parent, _subdirs, names in os.walk(directory_path)
            for name in names
            if name.lower().endswith(".pdf")
        ]

    raise ValueError(f"The provided path '{directory_path}' is not a directory.")
|
|
|
|
|
def main(args: Optional[List[str]] = None) -> int:
    """Run the pdf2zh command line interface.

    Parses ``args``, applies the optional custom config and debug logging,
    installs the layout model on ``ModelInstance.value``, and then dispatches
    to exactly one mode: the GUI (``--interactive``), the Flask backend
    (``--flask``), the Celery app (``--celery``), the yadt backend
    (``--yadt``) or the default ``translate`` call.

    :param args: argument strings to parse; ``None`` makes argparse read
        them from ``sys.argv``.
    :return: ``0`` for the built-in modes, or whatever ``yadt_main`` returns
        when ``--yadt`` is given.
    :raises ValueError: if the prompt file cannot be read, if ``--dir`` is
        given without a path, or if that path is not a directory.
    """
    logging.basicConfig()

    parsed_args = parse_args(args)

    if parsed_args.config:
        ConfigManager.custome_config(parsed_args.config)

    if parsed_args.debug:
        log.setLevel(logging.DEBUG)

    if parsed_args.onnx:
        ModelInstance.value = OnnxModel(parsed_args.onnx)
    else:
        ModelInstance.value = OnnxModel.load_available()

    if parsed_args.interactive:
        from pdf2zh.gui import setup_gui

        if parsed_args.serverport:
            setup_gui(
                parsed_args.share, parsed_args.authorized, int(parsed_args.serverport)
            )
        else:
            setup_gui(parsed_args.share, parsed_args.authorized)
        return 0

    if parsed_args.flask:
        from pdf2zh.backend import flask_app

        flask_app.run(port=11008)
        return 0

    if parsed_args.celery:
        from pdf2zh.backend import celery_app

        # NOTE(review): this forwards the process argv, not ``args``; confirm
        # that is intended when main() is called programmatically.
        celery_app.start(argv=sys.argv[2:])
        return 0

    # yadt_main() opens the prompt file itself, so the path must stay a plain
    # string on that route. Converting it to a Template here made every
    # ``--yadt --prompt`` run fail with "prompt error.".
    if parsed_args.prompt and not parsed_args.yadt:
        try:
            with open(parsed_args.prompt, "r", encoding="utf-8") as file:
                content = file.read()
            parsed_args.prompt = Template(content)
        except Exception as err:
            # Keep the original message and type, but preserve the cause.
            raise ValueError("prompt error.") from err

    print(parsed_args)

    if parsed_args.yadt:
        return yadt_main(parsed_args)

    if parsed_args.dir:
        if not parsed_args.files:
            raise ValueError("--dir requires the path of a directory to translate.")
        # With --dir the first positional argument is a directory; replace the
        # file list with every PDF found below it.
        parsed_args.files = find_all_files_in_directory(parsed_args.files[0])

    translate(model=ModelInstance.value, **vars(parsed_args))
    return 0
|
|
|
|
|
def yadt_main(parsed_args) -> int:
    """Translate the requested PDF files with the experimental yadt backend.

    Resolves the input files (a directory scan when ``--dir`` is set),
    fetches the font for the target language, instantiates the translator
    named by ``--service`` (``name`` or ``name:model``) and runs
    ``yadt_translate`` once per input file.

    :param parsed_args: the namespace produced by ``parse_args``.
    :return: ``0`` after all files have been processed.
    :raises ValueError: if the prompt file cannot be read, the service name
        matches no known translator, ``--dir`` is given without a path, or
        that path is not a directory.
    """
    if parsed_args.dir:
        if not parsed_args.files:
            raise ValueError("--dir requires the path of a directory to translate.")
        untranlate_file = find_all_files_in_directory(parsed_args.files[0])
    else:
        untranlate_file = parsed_args.files
    lang_in = parsed_args.lang_in
    lang_out = parsed_args.lang_out
    outputdir = parsed_args.output if parsed_args.output else None
    font_path = download_remote_fonts(lang_out.lower())

    # "--service name:model" selects a model; without ":" the model is None.
    service_name, separator, model_part = parsed_args.service.partition(":")
    service_model = model_part if separator else None

    envs = {}
    prompt = []

    if isinstance(parsed_args.prompt, Template):
        # main() may already have loaded the prompt file into a Template;
        # trying to open() that object again always failed.
        prompt = parsed_args.prompt
    elif parsed_args.prompt:
        try:
            with open(parsed_args.prompt, "r", encoding="utf-8") as file:
                content = file.read()
            prompt = Template(content)
        except Exception as err:
            # Keep the original message and type, but preserve the cause.
            raise ValueError("prompt error.") from err

    from pdf2zh.translator import (
        AzureOpenAITranslator,
        GoogleTranslator,
        BingTranslator,
        DeepLTranslator,
        DeepLXTranslator,
        OllamaTranslator,
        OpenAITranslator,
        ZhipuTranslator,
        ModelScopeTranslator,
        SiliconTranslator,
        GeminiTranslator,
        AzureTranslator,
        TencentTranslator,
        DifyTranslator,
        AnythingLLMTranslator,
        XinferenceTranslator,
        ArgosTranslator,
        GorkTranslator,
        GroqTranslator,
        DeepseekTranslator,
        OpenAIlikedTranslator,
    )

    # Lookup order is kept from the original loop: the first class whose
    # ``name`` equals the requested service wins.
    translator_classes = (
        GoogleTranslator,
        BingTranslator,
        DeepLTranslator,
        DeepLXTranslator,
        OllamaTranslator,
        XinferenceTranslator,
        AzureOpenAITranslator,
        OpenAITranslator,
        ZhipuTranslator,
        ModelScopeTranslator,
        SiliconTranslator,
        GeminiTranslator,
        AzureTranslator,
        TencentTranslator,
        DifyTranslator,
        AnythingLLMTranslator,
        ArgosTranslator,
        GorkTranslator,
        GroqTranslator,
        DeepseekTranslator,
        OpenAIlikedTranslator,
    )
    translator_class = next(
        (cls for cls in translator_classes if cls.name == service_name), None
    )
    if translator_class is None:
        raise ValueError("Unsupported translation service")
    translator = translator_class(
        lang_in, lang_out, service_model, envs=envs, prompt=prompt
    )

    # parse_args() stores the untouched ``--pages`` string as ``raw_pages``
    # and only when ``--pages`` was given. Joining that string character by
    # character turned "1-3" into "1,-,3", and a missing attribute raised
    # AttributeError, so pass the string through and default to "".
    # NOTE(review): assumes yadt treats an empty ``pages`` as "all pages" -
    # confirm against the yadt TranslationConfig documentation.
    raw_pages = getattr(parsed_args, "raw_pages", None)
    if not raw_pages:
        pages = ""
    elif isinstance(raw_pages, str):
        pages = raw_pages
    else:
        pages = ",".join(str(page) for page in raw_pages)

    for file in untranlate_file:
        # Drop quote characters that may surround a pasted path.
        file = file.strip("\"'")
        yadt_config = YadtConfig(
            input_file=file,
            font=font_path,
            pages=pages,
            output_dir=outputdir,
            translator=translator,
            debug=parsed_args.debug,
            lang_in=lang_in,
            lang_out=lang_out,
            no_dual=False,
            no_mono=False,
            # The --thread value is handed to yadt as its ``qps`` setting.
            qps=parsed_args.thread,
        )
        yadt_translate(yadt_config)
    return 0
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
|
|