pz / pdf2zh /pdf2zh.py
github-actions[bot]
GitHub deploy: 430ada5a033af12b22377dc4dedd36c9b82e0183
35bba99
#!/usr/bin/env python3
"""A command line tool for extracting text and images from PDF and
output it to plain text, html, xml or tags.
"""
from __future__ import annotations
import argparse
import logging
import sys
from string import Template
from typing import List, Optional
from pdf2zh import __version__, log
from pdf2zh.high_level import translate, download_remote_fonts
from pdf2zh.doclayout import OnnxModel, ModelInstance
import os
from pdf2zh.config import ConfigManager
from yadt.translation_config import TranslationConfig as YadtConfig
from yadt.high_level import translate as yadt_translate
def create_parser() -> argparse.ArgumentParser:
    """Build the argparse parser for the pdf2zh command line.

    The module docstring is used as the program description. Three arguments
    (``files``, ``--version``, ``--debug``) are registered on the parser
    itself; every other option lives in the "Parser" argument group.

    :return: a fully configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(description=__doc__, add_help=True)

    # (flags, add_argument keyword options), listed in help-output order.
    top_level_specs = (
        (
            ("files",),
            {
                "type": str,
                "default": None,
                "nargs": "*",
                "help": "One or more paths to PDF files.",
            },
        ),
        (
            ("--version", "-v"),
            {"action": "version", "version": f"pdf2zh v{__version__}"},
        ),
        (
            ("--debug", "-d"),
            {
                "default": False,
                "action": "store_true",
                "help": "Use debug logging level.",
            },
        ),
    )
    for flags, options in top_level_specs:
        parser.add_argument(*flags, **options)

    parse_params = parser.add_argument_group(
        "Parser",
        description="Used during PDF parsing",
    )

    # Same (flags, options) layout as above; order is preserved on purpose
    # because it determines the order shown in ``--help``.
    group_specs = (
        (
            ("--pages", "-p"),
            {"type": str, "help": "The list of page numbers to parse."},
        ),
        (
            ("--vfont", "-f"),
            {
                "type": str,
                "default": "",
                "help": "The regex to math font name of formula.",
            },
        ),
        (
            ("--vchar", "-c"),
            {
                "type": str,
                "default": "",
                "help": "The regex to math character of formula.",
            },
        ),
        (
            ("--lang-in", "-li"),
            {"type": str, "default": "en", "help": "The code of source language."},
        ),
        (
            ("--lang-out", "-lo"),
            {"type": str, "default": "zh", "help": "The code of target language."},
        ),
        (
            ("--service", "-s"),
            {
                "type": str,
                "default": "google",
                "help": "The service to use for translation.",
            },
        ),
        (
            ("--output", "-o"),
            {"type": str, "default": "", "help": "Output directory for files."},
        ),
        (
            ("--thread", "-t"),
            {
                "type": int,
                "default": 4,
                "help": "The number of threads to execute translation.",
            },
        ),
        (
            ("--interactive", "-i"),
            {"action": "store_true", "help": "Interact with GUI."},
        ),
        (
            ("--share",),
            {"action": "store_true", "help": "Enable Gradio Share"},
        ),
        (
            ("--flask",),
            {"action": "store_true", "help": "flask"},
        ),
        (
            ("--celery",),
            {"action": "store_true", "help": "celery"},
        ),
        (
            ("--authorized",),
            {"type": str, "nargs": "+", "help": "user name and password."},
        ),
        (
            ("--prompt",),
            {"type": str, "help": "user custom prompt."},
        ),
        (
            ("--compatible", "-cp"),
            {
                "action": "store_true",
                "help": "Convert the PDF file into PDF/A format to improve compatibility.",
            },
        ),
        (
            ("--onnx",),
            {"type": str, "help": "custom onnx model path."},
        ),
        (
            ("--serverport",),
            {"type": int, "help": "custom WebUI port."},
        ),
        (
            ("--dir",),
            {"action": "store_true", "help": "translate directory."},
        ),
        (
            ("--config",),
            {"type": str, "help": "config file."},
        ),
        (
            ("--yadt",),
            {
                "default": False,
                "action": "store_true",
                "help": "Use experimental backend yadt.",
            },
        ),
    )
    for flags, options in group_specs:
        parse_params.add_argument(*flags, **options)

    return parser
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
    """Parse command line arguments and normalise the ``--pages`` option.

    ``--pages`` is a comma-separated list of 1-based page numbers and
    inclusive ``start-end`` ranges (e.g. ``"1,3-5"``). It is converted to a
    flat list of 0-based page indices stored in ``pages``; the untouched
    user string is kept in ``raw_pages``.

    :param args: argument list to parse; ``None`` makes argparse read
        ``sys.argv``.
    :return: the parsed namespace. ``raw_pages`` is always present: it holds
        the original ``--pages`` string, or ``""`` when the option was not
        given. ``pages`` is a list of ints when a page spec was given and is
        otherwise left as argparse produced it.
    :raises ValueError: if a page token is not a number or a ``start-end``
        range.
    """
    parsed_args = create_parser().parse_args(args=args)

    # Define raw_pages unconditionally so downstream code can read it without
    # hitting AttributeError when --pages was omitted.
    raw_pages = parsed_args.pages or ""
    parsed_args.raw_pages = raw_pages

    if raw_pages:
        pages = []
        for token in raw_pages.split(","):
            try:
                if "-" in token:
                    start, end = token.split("-")
                    # User input is 1-based and inclusive; range() is
                    # 0-based and end-exclusive, hence only start shifts.
                    pages.extend(range(int(start) - 1, int(end)))
                else:
                    pages.append(int(token) - 1)
            except ValueError as err:
                raise ValueError(
                    f"Invalid page specification {token!r} in --pages "
                    f"{raw_pages!r}; expected a page number or a "
                    "'start-end' range."
                ) from err
        parsed_args.pages = pages

    return parsed_args
def find_all_files_in_directory(directory_path):
    """
    Collect the paths of every PDF file found under a directory tree.

    The extension check is case-insensitive, so ``.pdf``, ``.PDF`` and
    mixed-case variants all match.

    :param directory_path: str, the directory to walk recursively
    :return: list of PDF file paths, each joined onto the folder it was found in
    :raises ValueError: if ``directory_path`` is not an existing directory
    """
    # Guard clause: refuse anything that is not a directory.
    if not os.path.isdir(directory_path):
        raise ValueError(f"The provided path '{directory_path}' is not a directory.")

    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(directory_path)
        for entry in entries
        if entry.lower().endswith(".pdf")
    ]
def main(args: Optional[List[str]] = None) -> int:
    """Run the pdf2zh command line interface.

    After parsing arguments and applying config, logging and layout-model
    setup, exactly one mode runs: the GUI (``--interactive``), the Flask
    backend (``--flask``), the Celery worker (``--celery``), the yadt
    backend (``--yadt``), or the default ``translate`` pipeline.

    :param args: argument list to parse; ``None`` makes argparse read
        ``sys.argv``.
    :return: process exit code; 0 on success, or the value returned by
        ``yadt_main`` when ``--yadt`` is used.
    :raises ValueError: if the prompt file cannot be read, or if ``--dir``
        is given without a directory path (or with a non-directory path).
    """
    logging.basicConfig()

    parsed_args = parse_args(args)

    if parsed_args.config:
        ConfigManager.custome_config(parsed_args.config)

    if parsed_args.debug:
        log.setLevel(logging.DEBUG)

    # A user-supplied ONNX model takes precedence over the default one.
    if parsed_args.onnx:
        ModelInstance.value = OnnxModel(parsed_args.onnx)
    else:
        ModelInstance.value = OnnxModel.load_available()

    if parsed_args.interactive:
        from pdf2zh.gui import setup_gui

        if parsed_args.serverport:
            setup_gui(
                parsed_args.share, parsed_args.authorized, int(parsed_args.serverport)
            )
        else:
            setup_gui(parsed_args.share, parsed_args.authorized)
        return 0

    if parsed_args.flask:
        from pdf2zh.backend import flask_app

        flask_app.run(port=11008)
        return 0

    if parsed_args.celery:
        from pdf2zh.backend import celery_app

        # NOTE(review): forwards the real process argv (minus its first two
        # entries) rather than ``args`` - confirm this is intended when main()
        # is called programmatically.
        celery_app.start(argv=sys.argv[2:])
        return 0

    if parsed_args.prompt:
        # --prompt is a path; the file content becomes a string.Template.
        # open()/read() fail with OSError, or ValueError (which includes
        # UnicodeDecodeError); keep the original cause attached.
        try:
            with open(parsed_args.prompt, "r", encoding="utf-8") as file:
                content = file.read()
        except (OSError, ValueError) as err:
            raise ValueError("prompt error.") from err
        parsed_args.prompt = Template(content)

    print(parsed_args)

    # --dir reads the directory from files[0]; fail with a clear message
    # instead of an IndexError when no path was supplied.
    if parsed_args.dir and not parsed_args.files:
        raise ValueError("--dir requires a directory path.")

    if parsed_args.yadt:
        # yadt_main expands --dir itself, so files must not be rewritten yet.
        return yadt_main(parsed_args)

    if parsed_args.dir:
        parsed_args.files = find_all_files_in_directory(parsed_args.files[0])

    translate(model=ModelInstance.value, **vars(parsed_args))
    return 0
def _load_yadt_prompt(prompt_arg):
    """Return the prompt object to hand to the translator.

    ``prompt_arg`` may already be a ``string.Template`` (``main`` converts the
    ``--prompt`` file before dispatching to the yadt backend) or a path to a
    prompt file. When no prompt is set an empty list is returned, which is
    the value this backend has always passed in that case.
    """
    if not prompt_arg:
        return []
    if isinstance(prompt_arg, Template):
        # Already loaded by the caller; re-opening it as a path would fail.
        return prompt_arg
    try:
        with open(prompt_arg, "r", encoding="utf-8") as file:
            return Template(file.read())
    except Exception as err:
        raise ValueError("prompt error.") from err


def _format_yadt_pages(pages) -> str:
    """Render a page selection as a comma-separated list of 1-based numbers.

    ``pages`` is normally the list of 0-based indices produced by
    ``parse_args``; a plain string is passed through unchanged and an empty
    or missing selection yields ``""``.
    """
    if not pages:
        return ""
    if isinstance(pages, str):
        return pages
    return ",".join(str(index + 1) for index in pages)


def yadt_main(parsed_args) -> int:
    """Translate the requested files with the experimental yadt backend.

    :param parsed_args: namespace produced by ``parse_args`` (optionally with
        ``prompt`` already converted to a ``string.Template`` by ``main``).
    :return: 0 once every file has been handed to ``yadt_translate``.
    :raises ValueError: if ``--dir`` is set without a directory path, the
        prompt file cannot be read, or the service name matches no translator.
    """
    if parsed_args.dir:
        if not parsed_args.files:
            raise ValueError("--dir requires a directory path.")
        untranlate_file = find_all_files_in_directory(parsed_args.files[0])
    else:
        untranlate_file = parsed_args.files

    lang_in = parsed_args.lang_in
    lang_out = parsed_args.lang_out
    # An empty --output means "no explicit output directory".
    outputdir = parsed_args.output or None

    font_path = download_remote_fonts(lang_out.lower())

    # --service is "name" or "name:model".
    param = parsed_args.service.split(":", 1)
    service_name = param[0]
    service_model = param[1] if len(param) > 1 else None

    envs = {}
    prompt = _load_yadt_prompt(parsed_args.prompt)

    # Build the page string from the parsed 0-based indices, converted back
    # to the user's 1-based numbering. Joining the characters of the raw
    # --pages string mangled it ("12" -> "1,2", "1-3" -> "1,-,3").
    # NOTE(review): assumes YadtConfig treats "" as "all pages" - confirm.
    pages_arg = _format_yadt_pages(parsed_args.pages)

    from pdf2zh.translator import (
        AzureOpenAITranslator,
        GoogleTranslator,
        BingTranslator,
        DeepLTranslator,
        DeepLXTranslator,
        OllamaTranslator,
        OpenAITranslator,
        ZhipuTranslator,
        ModelScopeTranslator,
        SiliconTranslator,
        GeminiTranslator,
        AzureTranslator,
        TencentTranslator,
        DifyTranslator,
        AnythingLLMTranslator,
        XinferenceTranslator,
        ArgosTranslator,
        GorkTranslator,
        GroqTranslator,
        DeepseekTranslator,
        OpenAIlikedTranslator,
    )

    # First class whose ``name`` equals the requested service wins.
    for translator_cls in (
        GoogleTranslator,
        BingTranslator,
        DeepLTranslator,
        DeepLXTranslator,
        OllamaTranslator,
        XinferenceTranslator,
        AzureOpenAITranslator,
        OpenAITranslator,
        ZhipuTranslator,
        ModelScopeTranslator,
        SiliconTranslator,
        GeminiTranslator,
        AzureTranslator,
        TencentTranslator,
        DifyTranslator,
        AnythingLLMTranslator,
        ArgosTranslator,
        GorkTranslator,
        GroqTranslator,
        DeepseekTranslator,
        OpenAIlikedTranslator,
    ):
        if service_name == translator_cls.name:
            translator = translator_cls(
                lang_in, lang_out, service_model, envs=envs, prompt=prompt
            )
            break
    else:
        raise ValueError("Unsupported translation service")

    for file in untranlate_file:
        # Drop quotes that may surround the path.
        file = file.strip("\"'")
        yadt_config = YadtConfig(
            input_file=file,
            font=font_path,
            pages=pages_arg,
            output_dir=outputdir,
            translator=translator,
            debug=parsed_args.debug,
            lang_in=lang_in,
            lang_out=lang_out,
            no_dual=False,
            no_mono=False,
            qps=parsed_args.thread,
        )
        yadt_translate(yadt_config)
    return 0
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())