ctranslate2-4you committed on
Commit
3fdaa7b
·
verified ·
1 Parent(s): 3829982

Create pymupdf_test_WIP_2.py

Browse files
Files changed (1) hide show
  1. pymupdf_test_WIP_2.py +318 -0
pymupdf_test_WIP_2.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import os
5
+ import sys
6
+ import json
7
+ import math
8
+ import logging
9
+ import queue
10
+ import argparse
11
+ import threading
12
+ import multiprocessing as mp
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+
16
+ import numpy as np
17
+ import fitz
18
+ import tkinter as tk
19
+ from tkinter import filedialog, messagebox
20
+ from joblib import Parallel, cpu_count, delayed
21
+
22
# Configure root logging once at import time: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)

# Logger shared by every helper in this module.
_LOGGER = logging.getLogger(__name__)
24
+
25
@dataclass
class Config:
    """Runtime settings shared by the parsing helpers.

    ``main`` overwrites ``parallel_threshold``, ``line_tolerance`` and
    ``cores`` on the module-level instance with the command-line values.
    """

    # Set from ``--threshold``; no code in this file reads it.
    parallel_threshold: int = 16
    # Largest gap between neighbouring ruling-line coordinates that are still
    # merged into one row/column position.
    line_tolerance: float = 1.0
    # Candidate rectangles with a smaller area are discarded.
    min_rect_area: float = 1e4
    # Upper bound for the number of joblib workers: two fewer than the CPU
    # count reported by joblib, but never less than one.
    cores: int = max(1, cpu_count() - 2)


# Single shared settings instance used throughout the module.
_CFG = Config()
33
+
34
def _cluster(vals, tol):
    """Group 1-D coordinates into clusters of near-equal values.

    The distinct values are put in ascending order and split wherever two
    neighbours differ by more than *tol*.

    Args:
        vals: Sequence of numbers; may be empty.
        tol: Largest gap between consecutive sorted values that still keeps
            them in the same cluster.

    Returns:
        A ``(clusters, mapping)`` pair.  ``clusters`` is a list of ascending
        NumPy arrays; ``mapping`` maps each value, rounded to two decimals,
        to the index of the cluster that contains it.  Both are empty when
        *vals* is empty.
    """
    if not vals:
        return [], {}

    # np.unique already returns the distinct values in ascending order.
    ordered = np.unique(np.array(vals, dtype=float))
    if ordered.size == 1:
        return [ordered], {round(float(ordered[0]), 2): 0}

    gaps = np.diff(ordered)
    # A cluster boundary sits right after every gap larger than the tolerance.
    boundaries = np.where(gaps > tol)[0] + 1
    groups = np.split(ordered, boundaries)

    lookup = {}
    for group_index, group in enumerate(groups):
        for value in group:
            lookup[round(float(value), 2)] = group_index
    return groups, lookup
44
+
45
def _detect_table_bboxes(page):
    """Return the grid rectangles formed by thin ruling lines on *page*.

    Thin stroked vector paths are classified as horizontal or vertical rules.
    Their coordinates are clustered into row and column positions, and every
    pair of adjacent row/column positions forms one candidate rectangle.

    Args:
        page: PyMuPDF page (anything exposing ``get_drawings()``).

    Returns:
        List of ``fitz.Rect`` whose area is at least ``_CFG.min_rect_area``.
        Empty when the page has no horizontal or no vertical rules.
    """
    horiz, vert = [], []
    for d in page.get_drawings():
        # PyMuPDF describes each path with a *string* type ("f" fill, "s"
        # stroke, "fs" both), the stroke width under "width" and the covered
        # area under "rect".  The earlier test (`type != 1`, "linewidth",
        # "bbox") rejected every path, so no table was ever found.
        # Only stroked paths are considered; rules drawn as thin filled
        # rectangles are not picked up.
        if d.get("type") not in ("s", "fs"):
            continue
        width = d.get("width")
        if width is not None and width > 2:
            continue
        x0, y0, x1, y1 = d["rect"]
        if abs(y1 - y0) < 2:
            horiz.append(((y0 + y1) / 2, x0, x1))
        elif abs(x1 - x0) < 2:
            vert.append(((x0 + x1) / 2, y0, y1))
    if not horiz or not vert:
        return []

    _, row_map = _cluster([y for y, _, _ in horiz], _CFG.line_tolerance)
    _, col_map = _cluster([x for x, _, _ in vert], _CFG.line_tolerance)

    # Collect the coordinates belonging to each cluster so that one averaged
    # position per row/column can be computed.
    rows = {}
    for y, _, _ in horiz:
        rows.setdefault(row_map[round(y, 2)], []).append(y)
    cols = {}
    for x, _, _ in vert:
        cols.setdefault(col_map[round(x, 2)], []).append(x)
    row_pos = sorted(sum(v) / len(v) for v in rows.values())
    col_pos = sorted(sum(v) / len(v) for v in cols.values())

    # One rectangle per pair of adjacent rows and adjacent columns; each Rect
    # is built once and kept only if it is large enough.
    rects = []
    for r0, r1 in zip(row_pos, row_pos[1:]):
        for c0, c1 in zip(col_pos, col_pos[1:]):
            cell = fitz.Rect(c0, r0, c1, r1)
            if cell.get_area() >= _CFG.min_rect_area:
                rects.append(cell)

    # Drop any rectangle that contains, or is contained in, one already kept.
    uniq = []
    for r in rects:
        if not any(u.contains(r) or r.contains(u) for u in uniq):
            uniq.append(r)
    return uniq
78
+
79
def _clean(text):
    """Normalise extracted text into a single line.

    A hyphen directly followed by a newline is removed so the two word halves
    are rejoined; every remaining run of whitespace, newlines included,
    becomes one space.  Anything that is not a ``str`` yields ``""``.
    """
    if not isinstance(text, str):
        return ""
    dehyphenated = text.replace("-\n", "")
    # str.split() without arguments splits on any whitespace, so remaining
    # newlines are collapsed together with spaces and tabs.
    return " ".join(dehyphenated.split())
81
+
82
def _extract_table(words, rect):
    """Turn the words lying inside *rect* into a list of row dictionaries.

    Words are grouped into visual rows by their top coordinate and into
    columns by clustering their horizontal centres.  The first row supplies
    the column headers; every later row becomes one ``{header: cell}`` dict.

    Args:
        words: Word tuples as returned by ``page.get_text("words")``; indices
            0-3 are read as ``x0, y0, x1, y1`` and index 4 as the word text.
        rect: Object with ``x0``, ``y0``, ``x1`` and ``y1`` attributes.

    Returns:
        One dict per data row, or ``[]`` when the rectangle holds no words or
        fewer than two rows.
    """
    inside = [w for w in words if rect.x0 <= w[0] <= rect.x1 and rect.y0 <= w[1] <= rect.y1]
    if not inside:
        return []
    inside.sort(key=lambda w: (w[1], w[0]))

    # Columns: words whose horizontal centres lie within 10 coordinate units
    # of a neighbouring centre share a column.
    clusters, col_map = _cluster([(w[0] + w[2]) / 2 for w in inside], 10)
    n_cols = len(clusters)

    # Rows: a new row starts when a word's top is more than 5 units away from
    # the top of the current row's first word.
    rows, cur_y, cur = [], None, []
    for w in inside:
        if cur_y is None or abs(w[1] - cur_y) > 5:
            if cur:
                rows.append(cur)
            cur = [w]
            cur_y = w[1]
        else:
            cur.append(w)
    if cur:
        rows.append(cur)

    def _row_to_cells(r):
        cells = [""] * n_cols
        # Words of one row may differ slightly in y, so the (y, x) sort above
        # does not guarantee reading order; restore left-to-right order here
        # so multi-word cells are joined correctly.
        for w in sorted(r, key=lambda w: w[0]):
            col = col_map[round((w[0] + w[2]) / 2, 2)]
            cells[col] += (" " if cells[col] else "") + w[4]
        return [_clean(c) for c in cells]

    table_matrix = [_row_to_cells(r) for r in rows]
    if len(table_matrix) < 2:
        return []
    headers = _unique_headers(table_matrix[0])
    return [dict(zip(headers, row)) for row in table_matrix[1:]]


def _unique_headers(headers):
    """Return *headers* with empty and repeated names made unique.

    Used as dict keys, identical header cells would overwrite each other and
    silently drop columns.  Empty names become ``column_<position>`` and
    repeats get a numeric suffix (``name_2``, ``name_3``, ...).
    """
    used = set()
    unique = []
    for position, name in enumerate(headers, 1):
        base = name or f"column_{position}"
        key, n = base, 1
        while key in used:
            n += 1
            key = f"{base}_{n}"
        used.add(key)
        unique.append(key)
    return unique
112
+
113
def _process_page(page, page_number):
    """Render one page as plain text followed by its tables as JSON.

    Args:
        page: PyMuPDF page providing ``get_text("words")`` and
            ``get_drawings()``.
        page_number: Zero-based page index; printed one-based in the heading.

    Returns:
        A string made of a ``Page N`` heading, the text lines found outside
        the extracted tables, and one ``"table i":`` entry per table.
    """
    words = page.get_text("words")

    # Extract every candidate rectangle once and keep only those that yielded
    # rows.  Words inside a rectangle that produced no table therefore stay
    # in the running text instead of disappearing from the output.
    tables = []
    for rect in _detect_table_bboxes(page):
        rows = _extract_table(words, rect)
        if rows:
            tables.append((rect, rows))

    def _in_table(w):
        return any(r.x0 <= w[0] <= r.x1 and r.y0 <= w[1] <= r.y1 for r, _ in tables)

    outside = sorted((w for w in words if not _in_table(w)), key=lambda w: (w[1], w[0]))

    # A new text line starts when a word's top is more than 10 units away
    # from the top of the current line's first word.
    lines, cur_y, cur = [], None, []
    for w in outside:
        if cur_y is None or abs(w[1] - cur_y) > 10:
            if cur:
                lines.append(cur)
            cur, cur_y = [w], w[1]
        else:
            cur.append(w)
    if cur:
        lines.append(cur)

    # Within a line the (y, x) sort can interleave words whose tops differ
    # slightly, so order each line left to right before joining.
    text_lines = [
        " ".join(w[4] for w in sorted(line, key=lambda w: w[0]))
        for line in lines
    ]

    parts = [f"Page {page_number + 1}\n", "\n".join(text_lines), "\n"]
    for idx, (_, rows) in enumerate(tables, 1):
        tbl = json.dumps(rows, ensure_ascii=False, indent=1)
        parts.append(f'"table {idx}":\n{tbl}\n')
    return "".join(parts)
137
+
138
def _write_output(base_path, content):
    """Write *content* as UTF-8 to *base_path* with its suffix set to ``.txt``.

    Args:
        base_path: Path of the source document; only its suffix is changed.
        content: Text to store.
    """
    target = Path(base_path).with_suffix(".txt")
    target.write_text(content, encoding="utf-8")
    _LOGGER.info("Wrote %s", target)
142
+
143
def _process_document(path):
    """Parse every page of the PDF at *path* and write the combined text.

    Failures are logged instead of raised, so one unreadable file does not
    abort a batch.

    Args:
        path: Location of the PDF; the output is written next to it.
    """
    try:
        with fitz.open(path) as pdf:
            pages = [
                _process_page(pdf.load_page(index), index)
                for index in range(pdf.page_count)
            ]
        _write_output(path, "\n".join(pages))
    except fitz.FileDataError as exc:
        _LOGGER.error("Cannot open %s: %s", path, exc)
    except Exception as exc:
        # Boundary handler: report the problem and carry on with other files.
        _LOGGER.error("Error processing %s: %s", path, exc)
154
+
155
def _run_cli(files):
    """Validate *files* and process every usable PDF.

    Each path is opened once as a check.  Unreadable and password-protected
    documents are logged and skipped.  The remaining files are processed
    sequentially when there is only one file or one worker, otherwise in
    parallel through joblib.

    Args:
        files: Iterable of PDF paths given on the command line.
    """
    pdfs = []
    for p in files:
        try:
            # Open only to validate.  The context manager closes the document
            # again; previously one open handle per file was leaked.
            with fitz.open(p) as doc:
                needs_password = bool(doc.needs_pass)
        except fitz.FileDataError as e:
            # FileDataError signals an invalid or damaged file, not encryption.
            _LOGGER.error("Invalid or damaged PDF skipped: %s (%s)", p, e)
            continue
        except Exception as e:
            _LOGGER.error("Error opening %s: %s", p, e)
            continue
        if needs_password:
            # Encrypted documents open without error; they are flagged by
            # ``needs_pass`` and cannot be read without authentication.
            _LOGGER.error("Password-protected PDF skipped: %s", p)
            continue
        pdfs.append(p)

    if not pdfs:
        return
    if len(pdfs) == 1 or _CFG.cores == 1:
        for p in pdfs:
            _process_document(p)
    else:
        # Never start more workers than there are files.
        Parallel(n_jobs=min(_CFG.cores, len(pdfs)))(delayed(_process_document)(p) for p in pdfs)
172
+
173
class GUI:
    """Tkinter front end for selecting PDF files and running the parser.

    Creating an instance builds the window and enters the Tk main loop, so
    the constructor returns only after the window has been closed.  Parsing
    runs on a background thread that reports progress through a queue which
    the GUI thread polls.
    """

    def __init__(self):
        self.root = tk.Tk()
        self.root.title("Parser-Sevenof9 — PyMuPDF")
        self.queue = queue.Queue()  # progress messages: worker -> GUI thread
        self.stop_event = threading.Event()
        self.files = []
        self.thread = None
        self._build_widgets()
        self.root.protocol("WM_DELETE_WINDOW", self._on_close)
        self._poll()
        self.root.mainloop()

    def _build_widgets(self):
        """Create the file list, the control buttons and the progress box."""
        tk.Label(self.root, text="Selected PDF files:").pack(pady=5)
        list_frame = tk.Frame(self.root)
        list_frame.pack(pady=5)
        sb = tk.Scrollbar(list_frame)
        self.listbox = tk.Listbox(list_frame, selectmode=tk.MULTIPLE, width=80, height=6, yscrollcommand=sb.set)
        sb.config(command=self.listbox.yview)
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sb.pack(side=tk.RIGHT, fill=tk.Y)

        btn_frame = tk.Frame(self.root)
        btn_frame.pack(pady=10)
        tk.Button(btn_frame, text="Add Folder", command=self._add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Select Files", command=self._add_files).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove Selected", command=self._remove_sel).pack(side=tk.LEFT, padx=5)
        tk.Button(btn_frame, text="Remove All", command=self._remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(self.root, text="Start Parser", command=self._start_parser).pack(pady=5)
        tk.Button(self.root, text="Stop", command=self._stop_parser).pack(pady=5)

        prog_frame = tk.Frame(self.root)
        prog_frame.pack(padx=10, pady=5)
        sbp = tk.Scrollbar(prog_frame)
        # The progress box is kept read-only; _append unlocks it briefly.
        self.prog = tk.Text(prog_frame, height=15, width=100, state=tk.DISABLED, yscrollcommand=sbp.set)
        sbp.config(command=self.prog.yview)
        self.prog.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        sbp.pack(side=tk.RIGHT, fill=tk.Y)

    def _add_path(self, path):
        """Add *path* to the file list and list box unless already present."""
        # The file dialog can return forward slashes while os.path.join uses
        # the platform separator; normalising makes both spellings of one
        # file compare equal in the duplicate check.
        path = os.path.normpath(path)
        if path not in self.files:
            self.files.append(path)
            self.listbox.insert(tk.END, path)

    def _add_folder(self):
        """Ask for a folder and add every ``.pdf`` file found beneath it."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, fs in os.walk(folder):
            for f in fs:
                if f.lower().endswith(".pdf"):
                    self._add_path(os.path.join(root, f))

    def _add_files(self):
        """Ask for individual PDF files and add them to the list."""
        for p in filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")]):
            self._add_path(p)

    def _remove_sel(self):
        """Remove the entries currently selected in the list box."""
        sel = self.listbox.curselection()
        # Delete from the end so the remaining indices stay valid.
        for idx in reversed(sel):
            self.listbox.delete(idx)
            del self.files[idx]

    def _remove_all(self):
        """Clear the list box and the file list."""
        self.listbox.delete(0, tk.END)
        self.files.clear()

    def _start_parser(self):
        """Start the background worker unless one is already running."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        if self.thread and self.thread.is_alive():
            messagebox.showinfo("Running", "Parser is already running.")
            return
        self.stop_event.clear()
        # Give the worker its own copy of the list so that adding or removing
        # entries in the GUI while it runs cannot disturb its iteration.
        self.thread = threading.Thread(target=self._worker, args=(list(self.files),), daemon=True)
        self.thread.start()

    def _stop_parser(self):
        """Ask a running worker to stop before it starts the next file."""
        if self.thread and self.thread.is_alive():
            self.stop_event.set()

    def _worker(self, files=None):
        """Process *files* one by one, reporting progress through the queue.

        Runs on the background thread.  When *files* is omitted, a snapshot
        of ``self.files`` taken at call time is used.  The stop flag is
        checked before each file, so a file already in progress is finished.
        """
        pending = list(self.files) if files is None else files
        for f in pending:
            if self.stop_event.is_set():
                self.queue.put("[INFO] Parser stopped.\n")
                return
            self.queue.put(f"[INFO] Processing {Path(f).name}\n")
            _process_document(f)
        self.queue.put("[INFO] Parser finished.\n")

    def _poll(self):
        """Move queued worker messages into the progress box, then reschedule."""
        try:
            while True:
                msg = self.queue.get_nowait()
                self._append(msg)
        except queue.Empty:
            pass
        self.root.after(100, self._poll)

    def _append(self, txt):
        """Append *txt* to the read-only progress box and scroll to the end."""
        self.prog.config(state=tk.NORMAL)
        self.prog.insert(tk.END, txt)
        self.prog.see(tk.END)
        self.prog.config(state=tk.DISABLED)

    def _on_close(self):
        """Signal the worker to stop and close the window."""
        self._stop_parser()
        self.root.destroy()
281
+
282
def _positive_int(v):
    """argparse ``type`` callback accepting integers greater than zero.

    Args:
        v: Raw command-line value.

    Returns:
        The value converted with ``int``.

    Raises:
        argparse.ArgumentTypeError: If the converted value is zero or negative.
        ValueError: If *v* cannot be converted by ``int``.
    """
    number = int(v)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("value must be positive")
287
+
288
def _positive_float(v):
    """argparse ``type`` callback accepting finite floats greater than zero.

    Args:
        v: Raw command-line value.

    Returns:
        The value converted with ``float``.

    Raises:
        argparse.ArgumentTypeError: If the value is NaN, infinite, zero or
            negative.
        ValueError: If *v* cannot be converted by ``float``.
    """
    fv = float(v)
    # "nan" and "inf" parse as floats and slip past a plain `<= 0` test
    # (NaN compares false to everything), so reject them explicitly.
    if not math.isfinite(fv):
        raise argparse.ArgumentTypeError("value must be a finite number")
    if fv <= 0:
        raise argparse.ArgumentTypeError("value must be positive")
    return fv
293
+
294
def _parse_args():
    """Parse the command line.

    Returns:
        ``argparse.Namespace`` with ``pdfs`` (list of paths, possibly empty),
        ``gui`` (bool), ``threshold``, ``tol``, ``cores`` and ``log`` (an
        upper-case logging level name).
    """
    # Level names understood by ``Logger.setLevel``.  Validating here gives
    # a normal usage error instead of a ValueError traceback later on.
    log_levels = ("CRITICAL", "FATAL", "ERROR", "WARNING", "WARN", "INFO", "DEBUG", "NOTSET")

    p = argparse.ArgumentParser()
    p.add_argument("pdfs", nargs="*", help="PDF files")
    p.add_argument("--gui", action="store_true", help="Launch GUI")
    p.add_argument("--threshold", type=_positive_int, default=_CFG.parallel_threshold)
    p.add_argument("--tol", type=_positive_float, default=_CFG.line_tolerance)
    p.add_argument("--cores", type=_positive_int, default=_CFG.cores)
    # str.upper keeps the option case-insensitive, as before.
    p.add_argument("--log", type=str.upper, choices=log_levels, default="INFO", help="Logging level")
    return p.parse_args()
303
+
304
def main():
    """Program entry point.

    Copies the command-line overrides into the shared configuration, applies
    the requested log level, then starts the GUI when ``--gui`` was given or
    no PDF paths were supplied; otherwise runs the command-line pipeline.
    """
    options = _parse_args()

    _CFG.parallel_threshold = options.threshold
    _CFG.line_tolerance = options.tol
    _CFG.cores = options.cores
    _LOGGER.setLevel(options.log.upper())

    # Without any file arguments there is nothing for the CLI to do, so the
    # GUI is the fallback as well as the explicit choice.
    wants_gui = options.gui or not options.pdfs
    if not wants_gui:
        _run_cli(options.pdfs)
        return
    GUI()
314
+
315
if __name__ == "__main__":
    # The multiprocessing docs ask for freeze_support() to be called straight
    # after the main guard, so it now comes first.
    mp.freeze_support()
    # Force the "spawn" start method, overriding any platform default.
    mp.set_start_method("spawn", force=True)
    main()