kalle07 commited on
Commit
f5a830e
·
verified ·
1 Parent(s): c4e2dc8

Upload 3 files

Browse files
.gitattributes CHANGED
@@ -39,3 +39,4 @@ parser_sevenof9_v1_1_en.exe filter=lfs diff=lfs merge=lfs -text
39
  parser_sevenof9_v1_2.exe filter=lfs diff=lfs merge=lfs -text
40
  PDF[[:space:]]Parser[[:space:]]-[[:space:]]Sevenof9_v7d.exe filter=lfs diff=lfs merge=lfs -text
41
  PDF[[:space:]]Parser[[:space:]]-[[:space:]]Sevenof9_v7e.exe filter=lfs diff=lfs merge=lfs -text
 
 
39
  parser_sevenof9_v1_2.exe filter=lfs diff=lfs merge=lfs -text
40
  PDF[[:space:]]Parser[[:space:]]-[[:space:]]Sevenof9_v7d.exe filter=lfs diff=lfs merge=lfs -text
41
  PDF[[:space:]]Parser[[:space:]]-[[:space:]]Sevenof9_v7e.exe filter=lfs diff=lfs merge=lfs -text
42
+ PDF_Parser-Sevenof9_v7f.exe filter=lfs diff=lfs merge=lfs -text
PDF_Parser-Sevenof9_v7f.exe ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae4637ad213df05bea040748eb46b131cb7fbe400f94a4d67bab6130796d982d
3
+ size 40811690
PDF_Parser-Sevenof9_v7f.py ADDED
@@ -0,0 +1,1149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Standard library
2
+ import os
3
+ import re
4
+ import sys
5
+ import time
6
+ import json
7
+ import platform
8
+ import threading
9
+ import logging
10
+ import subprocess
11
+ from typing import Any, Dict, Iterable, List, Sequence, Tuple, ClassVar
12
+ from dataclasses import dataclass, field, replace
13
+ import math
14
+
15
+ # Third-party libraries
16
+ import wx
17
+ import pdfplumber
18
+ import psutil
19
+ from pdfminer.pdfparser import PDFParser, PDFSyntaxError
20
+ from pdfminer.pdfdocument import PDFDocument, PDFEncryptionError, PDFPasswordIncorrect
21
+ from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
22
+ from pdfminer.pdfinterp import PDFResourceManager
23
+ from rtree import index
24
+ import numpy as np
25
+
26
+ # Concurrency
27
+ import concurrent.futures
28
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
29
+ import multiprocessing
30
+
31
# --------------------------------------------------------------
# 1. Configuration & compiled regexes
# --------------------------------------------------------------
# Pages per file before switching to parallel processing.
# NOTE(review): `Config.PARALLEL_THRESHOLD` below holds the same value;
# confirm which of the two is actually read before removing either.
PARALLEL_THRESHOLD = 16
35
+
36
+
37
@dataclass(frozen=True)
class Config:
    """Immutable parser configuration: thresholds, pdfplumber text-extraction
    settings, and page-margin percentages used for cropping."""

    PARALLEL_THRESHOLD: int = 16  # pages per file before we switch to parallel mode

    # Class-level constant - accessible via Config.TEXT_EXTRACT_SETTINGS;
    # passed verbatim to pdfplumber's extract_words()/extract_tables().
    TEXT_EXTRACT_SETTINGS: ClassVar[Dict[str, Any]] = {
        "x_tolerance": 1.5,
        "y_tolerance": 2.5,
        "keep_blank_chars": False,
        "use_text_flow": False,
    }

    LEFT_RIGHT_MARGIN_PCT: float = 5.3  # percent of the page cropped on each side
    TOP_BOTTOM_MARGIN_PCT: float = 6.0  # percent of the page cropped top and bottom
51
+
52
+
53
+
54
# Matches pdfminer's "(cid:NNN)" placeholders emitted for unmapped glyphs.
CID_PATTERN = re.compile(r"\(cid:\d+\)")
# Control characters plus zero-width / invisible Unicode code points.
NON_PRINTABLE_RE = re.compile(r"[\x00-\x1F\x7F\u200B-\u200D\uFEFF]")


def clean_cell_text(text: str) -> str:
    """Normalize an extracted string: re-join words hyphenated across line
    breaks, flatten remaining newlines to spaces, and strip CID placeholders
    and invisible characters. Non-strings yield ""."""
    if not isinstance(text, str):
        return ""
    # Re-join hyphenated line endings first, then flatten the rest.
    flattened = text.replace("-\n", "").replace("\n", " ")
    # Remove "(cid:NNN)" artifacts, then invisible/control characters.
    flattened = NON_PRINTABLE_RE.sub("", CID_PATTERN.sub("", flattened))
    return flattened.strip()
70
+
71
+
72
+ # --------------------------------------------------------------
73
+ # 2. Small utilities
74
+ # --------------------------------------------------------------
75
+
76
def get_physical_cores():
    """Return the number of physical CPU cores; at least 1 even when
    psutil cannot determine the count."""
    physical = psutil.cpu_count(logical=False)
    if not physical:
        return 1  # psutil returned None/0: fall back to a single core
    return max(1, physical)

# Module-wide core count used to size process pools.
cores = get_physical_cores()
80
+
81
+ # GUI update interval
82
def throttle_callback(callback, interval_ms=1):
    """Wrap *callback* so it fires at most once per *interval_ms* ms.

    Calls arriving inside the throttle window are silently dropped; used to
    keep GUI progress updates from flooding the event loop.
    """
    last_fire = 0

    def wrapper(status):
        nonlocal last_fire
        now_ms = time.time() * 1000  # wall clock in milliseconds
        if now_ms - last_fire < interval_ms:
            return  # still inside the throttle window: drop this update
        last_fire = now_ms
        callback(status)

    return wrapper
92
+
93
+
94
def clamp_bbox(bbox: Tuple[float, float, float, float], w: float, h: float) -> Tuple[int, int, int, int]:
    """Clamp a bbox (x0, top, x1, bottom) into the page rectangle
    [0, w] x [0, h] and round each coordinate to the nearest integer.

    Fix: the original clamped x1/bottom only from above, so negative values
    leaked through unclamped; all four coordinates are now clamped on both
    sides, matching the documented contract.
    """
    x0, top, x1, bottom = bbox
    return (
        round(max(0, min(x0, w))),
        round(max(0, min(top, h))),
        round(max(0, min(x1, w))),
        round(max(0, min(bottom, h))),
    )
103
+
104
+
105
def is_valid_cell(cell: Any) -> bool:
    """Return True if a cell contains something meaningful, i.e. more than
    one character after stripping surrounding whitespace.

    Fix: the original stripped the cell twice and wrapped a redundant
    truthiness check (len > 1 already implies non-empty); this computes
    the stripped string once.
    """
    return len(str(cell).strip()) > 1
108
+
109
+
110
+
111
# pdfminer logs very verbosely at DEBUG/INFO; raise all of its loggers
# to ERROR so only real failures surface.
def suppress_pdfminer_logging():
    """Set every known pdfminer logger to ERROR level."""
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        logging.getLogger(name).setLevel(logging.ERROR)

suppress_pdfminer_logging()
126
+
127
class StatusTracker:
    """Tracks page-processing progress and derives throughput statistics."""

    def __init__(self, total_pages):
        self.start_time = time.time()
        self.total_pages = total_pages
        self.processed_pages = 0

    def update(self, n=1):
        """Record *n* additional processed pages."""
        self.processed_pages += n

    def get_status(self):
        """Return a progress snapshot.

        Keys: processed_pages, total_pages, pages_per_sec (rounded),
        elapsed_time and est_time (both in minutes, one decimal;
        est_time is inf while throughput is still zero).
        """
        elapsed = time.time() - self.start_time
        if elapsed > 0:
            pages_per_sec = round(self.processed_pages / elapsed)
        else:
            pages_per_sec = 0
        remaining = self.total_pages - self.processed_pages
        if pages_per_sec > 0:
            est_minutes = (remaining / pages_per_sec) / 60
        else:
            est_minutes = float('inf')
        return {
            "processed_pages": self.processed_pages,
            "total_pages": self.total_pages,
            "pages_per_sec": pages_per_sec,
            "elapsed_time": round(elapsed / 60, 1),
            "est_time": round(est_minutes, 1),
        }
148
+
149
+ # --------------------------------------------------------------
150
+ # 3. Data models
151
+ # --------------------------------------------------------------
152
+
153
@dataclass(frozen=True)
class Word:
    """A single extracted word with its bounding box and font attributes.

    Coordinates follow pdfplumber's top-based page space: y0 is the word's
    "top" and y1 its "bottom" (see _build_word_info, which maps them so).
    """
    text: str
    x0: float  # left edge
    y0: float  # top edge
    x1: float  # right edge
    y1: float  # bottom edge
    font_size: float  # max character size found inside the word's bbox
    font_name: str    # font of the first character, or "Unknown"
    bold: bool        # bold-ish font-name heuristic
163
+
164
+
165
@dataclass
class Block:
    """A cluster of words treated as one logical text block."""

    words: List[Word] = field(default_factory=list)

    def bbox(self) -> Tuple[float, float, float, float]:
        """Return the (x0, y0, x1, y1) box enclosing all words.

        An empty block yields all zeros.
        """
        if not self.words:
            return 0.0, 0.0, 0.0, 0.0
        xs0, ys0, xs1, ys1 = zip(*((w.x0, w.y0, w.x1, w.y1) for w in self.words))
        return (min(xs0), min(ys0), max(xs1), max(ys1))
177
+
178
+
179
@dataclass
class ImageInfo:
    """Location and raw metadata of an image found on a page."""
    bbox: Tuple[float, float, float, float]  # (x0, y0, x1, y1) in page coordinates
    obj: Any  # raw image dictionary from pdfplumber
183
+
184
+
185
+ # --------------------------------------------------------------
186
+ # 4. Union‑Find clustering
187
+ # --------------------------------------------------------------
188
+
189
+ class _UnionFind:
190
+ def __init__(self, n: int):
191
+ self.parent = list(range(n))
192
+ self.rank = [0] * n
193
+
194
+ def find(self, x: int) -> int:
195
+ if self.parent[x] != x:
196
+ self.parent[x] = self.find(self.parent[x])
197
+ return self.parent[x]
198
+
199
+ def union(self, a: int, b: int) -> None:
200
+ ra, rb = self.find(a), self.find(b)
201
+ if ra == rb:
202
+ return
203
+ if self.rank[ra] < self.rank[rb]:
204
+ ra, rb = rb, ra
205
+ self.parent[rb] = ra
206
+ if self.rank[ra] == self.rank[rb]:
207
+ self.rank[ra] += 1
208
+
209
def cluster_words(words: Sequence[Word], max_dx: int, max_dy: int) -> List[Block]:
    """Group words into blocks based on spatial proximity.

    Two words are neighbors when the horizontal gap between their boxes is
    <= max_dx and the vertical gap is <= max_dy; transitively connected
    words form one Block via union-find.

    Fix: the vertical gap was computed as ``word2.y0 - word1.y0``
    (top-to-top) instead of ``word2.y0 - word1.y1`` (true gap between the
    boxes), which overstated the distance whenever word2 sat below word1
    and broke valid blocks apart. Also, every previously-found neighbor was
    re-unioned each time a new one appeared; each pair is now unioned once.
    """
    n = len(words)
    if n == 0:
        return []

    uf = _UnionFind(n)

    def interval_gap(lo_a: float, hi_a: float, lo_b: float, hi_b: float) -> float:
        # Distance between intervals [lo_a, hi_a] and [lo_b, hi_b]; 0 on overlap.
        return max(0.0, lo_a - hi_b, lo_b - hi_a)

    def is_neighbor(a: Word, b: Word) -> bool:
        dx = interval_gap(a.x0, a.x1, b.x0, b.x1)
        dy = interval_gap(a.y0, a.y1, b.y0, b.y1)
        return dx <= max_dx and dy <= max_dy

    for i in range(n):
        neighbor_count = 0
        for j in range(n):
            if i == j:
                continue
            if is_neighbor(words[i], words[j]):
                uf.union(i, j)
                neighbor_count += 1
                # A word can touch at most ~4 distinct sides in a 2D layout;
                # beyond that, further checks add no new connectivity.
                if neighbor_count >= 4:
                    break

    # Gather words by their set representative.
    clusters: Dict[int, List[Word]] = {}
    for idx in range(n):
        clusters.setdefault(uf.find(idx), []).append(words[idx])

    return [Block(wlist) for wlist in clusters.values()]
266
+
267
+
268
+
269
+ # --------------------------------------------------------------
270
+ # 5. Character index (vectorised)
271
+ # --------------------------------------------------------------
272
+
273
@dataclass
class CharIndex:
    """Column-oriented view of a page's characters for fast bbox queries."""

    xs0: np.ndarray
    xs1: np.ndarray
    tops: np.ndarray
    bottoms: np.ndarray
    texts: List[str]
    fonts: List[str]
    sizes: np.ndarray

    @classmethod
    def build(cls, chars: Sequence[Dict[str, Any]]) -> "CharIndex":
        """Build the index from pdfplumber character dicts in one pass."""
        xs0, xs1, tops, bottoms, sizes = [], [], [], [], []
        texts, fonts = [], []
        for c in chars:
            xs0.append(float(c["x0"]))
            xs1.append(float(c["x1"]))
            tops.append(float(c["top"]))
            bottoms.append(float(c["bottom"]))
            sizes.append(float(c.get("size", 0)))
            texts.append(c.get("text", ""))
            fonts.append(c.get("fontname", ""))
        return cls(
            xs0=np.array(xs0),
            xs1=np.array(xs1),
            tops=np.array(tops),
            bottoms=np.array(bottoms),
            texts=texts,
            fonts=fonts,
            sizes=np.array(sizes),
        )

    def inside(self, x0: float, x1: float, y0: float, y1: float) -> np.ndarray:
        """Boolean mask of characters fully contained in the given bbox."""
        horizontal = (self.xs0 >= x0) & (self.xs1 <= x1)
        vertical = (self.tops >= y0) & (self.bottoms <= y1)
        return horizontal & vertical
302
+
303
+
304
+ # --------------------------------------------------------------
305
+ # 6. Core extraction helpers
306
+ # --------------------------------------------------------------
307
+
308
def _extract_tables(page: pdfplumber.page.Page) -> List[str]:
    """Extract tables from *page* and render each as a JSON string.

    Two table shapes are recognized:
      * top-left corner cell empty  -> both a header row and a header column;
        rendered as a dict of row-header -> {col-header: cell}.
      * otherwise                   -> a plain top header row; rendered as a
        list of {header: cell} dicts (rows with a different column count
        than the header are dropped).
    Tables that are empty after cell cleaning are skipped.
    """
    suppress_pdfminer_logging()
    raw_tables = page.extract_tables({"text_x_tolerance": Config.TEXT_EXTRACT_SETTINGS["x_tolerance"]})
    jsons = []

    for tbl in raw_tables:
        # skip completely empty raw tables (rare)
        if not tbl:
            continue

        cleaned = [[clean_cell_text(c) for c in row] for row in tbl]

        # reject tables that are effectively empty after cleaning
        if (
            not cleaned
            or len(cleaned) < 1
            or len(cleaned[0]) < 1
            or all(all(not is_valid_cell(cell) for cell in row) for row in cleaned)
        ):
            continue

        header = cleaned[0]

        if header[0].strip() == "":
            # Corner cell empty: first row = column headers, first column =
            # row headers; build a nested dict keyed by the row header.
            col_headers = header[1:]
            row_headers = [row[0] for row in cleaned[1:]]
            data_rows = cleaned[1:]

            table_dict = {}
            for rh, row in zip(row_headers, data_rows):
                table_dict[rh] = dict(zip(col_headers, row[1:]))
        else:
            # Normal table: only a top header row; one dict per data row.
            headers = header
            data_rows = cleaned[1:]
            table_dict = [dict(zip(headers, row)) for row in data_rows if len(row) == len(headers)]

        jsons.append(json.dumps(table_dict, indent=1, ensure_ascii=False))
    return jsons
349
+
350
+
351
def _filter_words(
    words: List[Dict[str, Any]],
    tables_bboxes: List[Tuple[int, int, int, int]],
) -> List[Dict[str, Any]]:
    """Drop words whose top-left corner lies inside any table bbox and
    clean the text of the surviving words in place."""

    def overlaps_table(x: float, y: float) -> bool:
        # Point-in-rectangle test against every table box.
        return any(
            bx0 <= x <= bx1 and by0 <= y <= by1
            for bx0, by0, bx1, by1 in tables_bboxes
        )

    kept: List[Dict[str, Any]] = []
    for word in words:
        if overlaps_table(float(word["x0"]), float(word["top"])):
            continue  # word belongs to a table; tables are emitted separately
        word["text"] = clean_cell_text(word["text"])
        kept.append(word)
    return kept
366
+
367
+
368
def _build_word_info(
    words: List[Dict[str, Any]],
    char_index: CharIndex,
) -> List[Word]:
    """Convert raw pdfplumber word dicts into Word instances.

    Each Word gets the maximum character size inside its bbox, the first
    font name found (or "Unknown"), and a bold flag derived from the font
    names of its characters.

    Fix: the bold flag was ``bool(bolds)``, which is True whenever the word
    contains ANY characters at all, bold or not; it now uses ``any(...)``
    so only genuinely bold-named fonts set the flag.
    """
    def is_bold(name: str) -> bool:
        # Heuristic on the font name, e.g. "Arial-BoldMT", "Helvetica-Black".
        n = name.lower()
        return "bold" in n or "bd" in n or "black" in n

    word_objs: List[Word] = []
    for w in words:
        x0, y0, x1, y1 = map(float, (w["x0"], w["top"], w["x1"], w["bottom"]))
        mask = char_index.inside(x0, x1, y0, y1)
        sizes = char_index.sizes[mask]
        fonts = [char_index.fonts[i] for i in np.nonzero(mask)[0]]

        word_objs.append(
            Word(
                text=w["text"],
                x0=x0,
                y0=y0,
                x1=x1,
                y1=y1,
                font_size=float(sizes.max()) if sizes.size else 0.0,
                font_name=fonts[0] if fonts else "Unknown",
                bold=any(is_bold(f) for f in fonts),  # was bool(bolds): truthy for any chars
            )
        )
    return word_objs
399
+
400
+
401
def _group_blocks(
    words: List[Word],
    page_width: float,
    page_height: float,
) -> List[Block]:
    """Cluster words into logical blocks via Union-Find, after merging
    hyphen-split word pairs and stripping trailing hyphens.

    Blocks whose combined text has at most one non-whitespace character
    are discarded.
    """

    merged_words = []
    skip_next = False  # set when the following word was consumed by a merge

    for i, word in enumerate(words):
        if skip_next:
            skip_next = False
            continue

        text = word.text.strip()

        # If word ends with a hyphen (likely a line break), merge with the
        # next word. NOTE: the merged Word keeps the FIRST word's bbox —
        # `replace` only swaps the text.
        if text.endswith('-') and i + 1 < len(words):
            next_word = words[i + 1]
            merged_text = re.sub(r'-\s*$', '', text) + next_word.text.lstrip()
            merged_word = replace(word, text=merged_text)
            merged_words.append(merged_word)
            skip_next = True
        else:
            # Clean trailing hyphens and surrounding spaces (no merge).
            cleaned_text = re.sub(r'-\s*$', '', text).strip()
            if cleaned_text != text:
                word = replace(word, text=cleaned_text)
            merged_words.append(word)

    # Clustering thresholds in pixels, derived from page-size percentages
    # (~1.4% horizontally, ~1.2% vertically).
    max_dx = int(round(page_width * 0.014))
    max_dy = int(round(page_height * 0.012))
    blocks = cluster_words(merged_words, max_dx, max_dy)

    # Filter out empty blocks and blocks with a single printable character.
    filtered_blocks = []
    for block in blocks:
        combined_text = " ".join(w.text for w in block.words)
        stripped_text = combined_text.strip()

        if stripped_text and len(stripped_text) > 1:
            printable_chars = ''.join(c for c in stripped_text if not c.isspace())
            if len(printable_chars) > 1:
                filtered_blocks.append(block)

    return filtered_blocks
449
+
450
+
451
+
452
+ # --------------------------------------------------------------
453
+ # 7. Page worker – orchestrator
454
+ # --------------------------------------------------------------
455
+
456
def process_page_worker(args: Tuple[int, str]) -> Tuple[int, str]:
    """Extract one PDF page to text.

    Output contains, in order: a page banner, word blocks in reading order
    (optionally prefixed with [CHAPTER]/[IMPORTANT] labels), tables rendered
    as JSON, and trailing small text snippets (likely captions).

    Parameters
    ----------
    args : (page_no, path) — zero-based page index and PDF file path.

    Returns
    -------
    (page_no, rendered_text); on any failure the text is an
    "[ERROR] ..." message instead.

    Fixes: a local named ``re`` shadowed the imported ``re`` module (renamed
    to ``pad``); loop variables named ``w`` shadowed the page width (renamed
    to ``wd``); dead commented-out visualization/image code removed.
    """
    try:
        page_no, path = args

        with pdfplumber.open(path) as pdf:
            page = pdf.pages[page_no]
            w, h = page.width, page.height

            # Crop away the configured margins (percent of page size).
            margin_x = w * Config.LEFT_RIGHT_MARGIN_PCT / 100.0
            margin_y = h * Config.TOP_BOTTOM_MARGIN_PCT / 100.0
            cropped_page = page.crop((margin_x, margin_y, w - margin_x, h - margin_y))

            # ---------- Tables ----------
            tables_json = _extract_tables(cropped_page)

            # ---------- Words ----------
            table_bboxes = [clamp_bbox(t.bbox, w, h) for t in cropped_page.find_tables()]
            raw_words = cropped_page.extract_words(**Config.TEXT_EXTRACT_SETTINGS)
            # Clean line-break artifacts and drop words that sit inside tables.
            filtered_raw = _filter_words(raw_words, table_bboxes)
            char_index = CharIndex.build(cropped_page.chars)

            words = _build_word_info(filtered_raw, char_index)
            avg_font_size = float(np.mean([wd.font_size for wd in words])) if words else 0.0

            # ---------- Blocks ----------
            blocks = _group_blocks(words, w, h)

            # ---------- Sorting (reading order) ----------
            def reading_score(block: Block) -> Tuple[float, float]:
                # Mostly top-to-bottom (y), slightly left-to-right (x); a
                # small area bonus lets bigger blocks lead near-ties.
                x0, y0, x1, y1 = block.bbox()
                area_log = math.log1p((x1 - x0) * (y1 - y0))
                return (y0 * 0.6 + x0 * 0.4 - area_log * 0.05, y0)

            blocks.sort(key=reading_score)

            # ---------- Assemble output ----------
            lines: List[str] = [f"\n\n--- Page {page_no + 1} ---\n\n"]

            # ---------- Classify blocks into small / large by area ----------
            large_blocks: List[Block] = []
            small_blocks: List[Block] = []
            for block in blocks:
                x0, y0, x1, y1 = block.bbox()
                area = (x1 - x0) * (y1 - y0)
                if area < 2000:
                    small_blocks.append(block)
                else:
                    large_blocks.append(block)

            # Promote small blocks that lie within `pad` px of a large block;
            # the rest are treated as stand-alone snippets (captions etc.).
            remaining_small_blocks: List[Block] = []
            for sblk in small_blocks:
                x0_s, y0_s, x1_s, y1_s = sblk.bbox()
                pad = 12  # expansion radius in pixels (was `re`, shadowing the module)
                x0_e, y0_e = x0_s - pad, y0_s - pad
                x1_e, y1_e = x1_s + pad, y1_s + pad

                promoted = False
                for lblk in large_blocks:
                    x0_l, y0_l, x1_l, y1_l = lblk.bbox()
                    # Axis-aligned rectangle overlap test on the expanded box.
                    if not (x1_e < x0_l or x1_l < x0_e or y1_e < y0_l or y1_l < y0_e):
                        large_blocks.append(sblk)  # promote; size class unchanged
                        promoted = True
                        break
                if not promoted:
                    remaining_small_blocks.append(sblk)
            small_blocks = remaining_small_blocks

            # ---------- Process regular (non-small) blocks ----------
            for block in blocks:
                if any(block is small_block for small_block in small_blocks):
                    continue  # emitted later in the snippet section

                # Bucket words into visual lines using a y-tolerance.
                y_tolerance = 1.5  # pixels
                lines_dict: Dict[float, List[Word]] = {}
                for wd in sorted(block.words, key=lambda wd: wd.y0):
                    placed = False
                    for key in lines_dict:
                        if abs(wd.y0 - key) <= y_tolerance:
                            lines_dict[key].append(wd)
                            placed = True
                            break
                    if not placed:
                        lines_dict[wd.y0] = [wd]

                # Left-to-right within a line, top-to-bottom across lines.
                combined_lines = []
                for line_words in sorted(lines_dict.values(), key=lambda lw: lw[0].y0):
                    ordered = sorted(line_words, key=lambda wd: wd.x0)
                    combined_lines.append(" ".join(wd.text for wd in ordered))
                combined_text = " ".join(combined_lines)

                # Heading heuristics: count oversized and bold-ish words.
                chapter_hits = 0
                important_hits = 0
                for wobj in block.words:
                    # Skip short tokens with no letters (page numbers etc.).
                    if len(wobj.text) < 4 and not any(c.isalpha() for c in wobj.text):
                        continue
                    size_ratio = wobj.font_size / avg_font_size if avg_font_size else 0.0
                    if size_ratio >= 1.15:
                        chapter_hits += 1
                    elif wobj.bold and size_ratio >= 1.08:
                        important_hits += 1

                label: str | None = None
                hits = chapter_hits + important_hits
                if hits > 1 or (hits == 1 and chapter_hits):
                    label = "CHAPTER" if chapter_hits else "IMPORTANT"

                lines.append(f"[{label}] {combined_text}" if label else combined_text)
                lines.append("")  # blank line after every text block

            # ---------- Tables ----------
            for idx, tbl_json in enumerate(tables_json, 1):
                lines.append(f'"table {idx}":\n{tbl_json}')

            # ---------- Leftover small blocks (captions, stray labels) ----------
            if small_blocks:
                lines.append(
                    "\n--- Small text snippets far away from large text blocks, possibly descriptions of image or diagram or table. ---"
                )
                for i, blk in enumerate(small_blocks, 1):
                    lines.append(f"Block {i}:")

                    # Same line-bucketing as for regular blocks above.
                    y_tolerance = 1.5
                    local_dict = {}
                    for wd in sorted(blk.words, key=lambda wd: wd.y0):
                        placed = False
                        for key in local_dict:
                            if abs(wd.y0 - key) <= y_tolerance:
                                local_dict[key].append(wd)
                                placed = True
                                break
                        if not placed:
                            local_dict[wd.y0] = [wd]

                    for line_words in sorted(local_dict.values(), key=lambda lw: lw[0].y0):
                        txt = " ".join(wd.text for wd in sorted(line_words, key=lambda wd: wd.x0))
                        lines.append(txt)

            return page_no, "\n".join(lines)

    except Exception as exc:  # pragma: no cover
        err_msg = f"[ERROR] Seite {page_no + 1}: {exc.__class__.__name__}: {exc}"
        logging.exception(err_msg)
        return page_no, err_msg
680
+
681
+
682
+
683
+ # Processing part
684
def run_serial(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process pages 0..page_number-1 of *path* one after another.

    Checks *stop_flag* before each page and reports progress through
    *tracker* / *progress_callback* after each one.
    """
    results = []
    for page_no in range(page_number):
        if stop_flag and stop_flag.is_set():
            break  # user requested cancellation
        results.append(process_page_worker((page_no, path)))
        if tracker is not None:
            tracker.update()
            if progress_callback:
                report_status(tracker, progress_callback)
    return results
696
+
697
+
698
+
699
def run_parallel(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process all pages of *path* in a process pool.

    Results are collected as futures complete but returned in page order;
    pages without a result (None slots) are dropped.
    """
    results = [None] * page_number

    def record(result):
        # Store by page index and push a progress update to the GUI.
        if result is None:
            return
        page_no, _ = result
        results[page_no] = result
        if tracker is not None:
            tracker.update()
            if progress_callback:
                report_status(tracker, progress_callback)

    worker_count = min(page_number, get_physical_cores())
    with concurrent.futures.ProcessPoolExecutor(max_workers=worker_count) as pool:
        pending = [
            pool.submit(process_page_worker, (page_no, path))
            for page_no in range(page_number)
        ]
        for future in concurrent.futures.as_completed(pending):
            record(future.result())

    return [r for r in results if r]
720
+
721
+
722
+
723
+
724
def report_status(tracker, progress_callback=None):
    """Forward the tracker's status dict to *progress_callback*, or print it.

    Fixes in the fallback print: a stray empty format spec (``{...:}``),
    a missing separator between the elapsed and estimate fields, unbalanced
    parentheses, and the unit label — StatusTracker.get_status() returns
    elapsed_time/est_time in minutes, not seconds.
    """
    status = tracker.get_status()
    if progress_callback:
        progress_callback(status)
    else:
        print(
            f"[STATUS] {status['processed_pages']}/{status['total_pages']} Seiten "
            f"({status['pages_per_sec']} Seiten/s, "
            f"Elapsed: {status['elapsed_time']} Min., "
            f"Est Time: {status['est_time']} Min.)"
        )
733
+
734
+
735
def save_pdf(path, page_number, tracker=None, parallel=False, progress_callback=None, stop_flag=None):
    """Extract *page_number* pages from *path* and write the combined text
    to a sibling .txt file.

    Returns *page_number*, or 0 when *stop_flag* was already set.
    """
    if stop_flag and stop_flag.is_set():
        return 0

    runner = run_parallel if parallel else run_serial
    results = runner(path, page_number, tracker, progress_callback, stop_flag)

    # Drop empty entries (left behind when a stop was requested) and
    # restore page order before joining.
    ordered = sorted((r for r in results if r), key=lambda item: item[0])
    text_output = "\n".join(text for _, text in ordered)

    out_path = os.path.splitext(path)[0] + ".txt"
    with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(text_output)

    return page_number
754
+
755
+
756
+
757
def _process_single_pdf(path):
    """Open *path* with pdfminer and count its pages.

    Returns a (path, page_count, error_message) tuple: on success
    error_message is None; on failure page_count is 0 and error_message
    describes the problem (password protection, invalid syntax, extraction
    forbidden, or any other exception).
    """
    suppress_pdfminer_logging()
    try:
        with open(path, "rb") as f:
            parser = PDFParser(f)
            document = PDFDocument(parser)

            # Refuse documents that forbid text extraction outright.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed("Text-Extraktion nicht erlaubt")

            pages = list(PDFPage.create_pages(document))
            return (path, len(pages), None)

    except (PDFEncryptionError, PDFPasswordIncorrect) as e:
        # Password-protected file.
        return (path, 0, f"[ERROR] Datei passwortgeschützt: {path} ({type(e).__name__}: {e})\n")
    except PDFSyntaxError as e:
        # Structurally broken PDF.
        return (path, 0, f"[ERROR] Ungültige PDF-Syntax: {path} ({type(e).__name__}: {e})\n")
    except PDFTextExtractionNotAllowed as e:
        # Extraction explicitly disallowed by the document.
        return (path, 0, f"[ERROR] Text-Extraktion nicht erlaubt: {path} ({type(e).__name__}: {e})\n")
    except Exception as e:
        # Catch-all so one bad file never aborts the whole batch.
        return (path, 0, f"[ERROR] Fehler bei Datei {path}: {type(e).__name__}: {e}\n")
778
+
779
def get_total_pages(pdf_files, error_callback=None, progress_callback=None):
    """Count pages across *pdf_files*.

    Returns (page_info, total) where page_info lists (path, count) for every
    readable file. Errors go to *error_callback* (or are printed). More than
    14 files triggers a process pool for the counting.
    """
    suppress_pdfminer_logging()
    total = 0
    page_info = []

    def handle_result(path, count, error):
        nonlocal total
        if error:
            # Broken/locked file: report it and keep going.
            if error_callback:
                error_callback(error)
            else:
                print(error, end="")
            return
        page_info.append((path, count))
        total += count
        if progress_callback:
            progress_callback(total)  # push the running total to the GUI

    if len(pdf_files) > 14:
        with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as executor:
            for path, count, error in executor.map(_process_single_pdf, pdf_files):
                handle_result(path, count, error)
    else:
        for pdf_path in pdf_files:
            handle_result(*_process_single_pdf(pdf_path))

    return page_info, total
808
+
809
+
810
+
811
+
812
+ # -------------------- GUI --------------------
813
+ class FileManager(wx.Frame):
814
    def __init__(self, parent):
        """Create the main window, build the widgets, and prepare the
        cancellation flag checked by the parser loops."""
        super().__init__(parent, title="PDF Parser - Sevenof9_v7f", size=(1000, 800))
        self.files = []  # absolute paths of the PDFs currently listed in the GUI
        self.InitUI()
        self.stop_flag = threading.Event()  # set to request parser cancellation (see run_serial/save_pdf)
819
+
820
+ def InitUI(self):
821
+ panel = wx.Panel(self)
822
+ vbox = wx.BoxSizer(wx.VERTICAL)
823
+
824
+ hbox_lbl1 = wx.BoxSizer(wx.HORIZONTAL)
825
+
826
+ lbl1 = wx.StaticText(panel, label="PDF files: (with right mouse you can remove and open)")
827
+ hbox_lbl1.Add(lbl1, flag=wx.ALIGN_CENTER_VERTICAL | wx.LEFT, border=10)
828
+
829
+ hbox_lbl1.AddStretchSpacer() # <== schiebt den Button ganz nach rechts
830
+
831
+ help_btn = wx.Button(panel, label="? HELP ?", size=(60, 25))
832
+ help_btn.Bind(wx.EVT_BUTTON, self.ShowHelpText)
833
+ hbox_lbl1.Add(help_btn, flag=wx.RIGHT, border=10)
834
+
835
+ vbox.Add(hbox_lbl1, flag=wx.EXPAND | wx.TOP, border=10)
836
+
837
+
838
+ self.listbox = wx.ListBox(panel, style=wx.LB_EXTENDED)
839
+ self.listbox.Bind(wx.EVT_RIGHT_DOWN, self.OnRightClick)
840
+ self.listbox.Bind(wx.EVT_LISTBOX, self.ShowText)
841
+ vbox.Add(self.listbox, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)
842
+
843
+ self.popup_menu = wx.Menu()
844
+ self.popup_menu.Append(1, "Remove selected")
845
+ self.popup_menu.Append(2, "Open in default PDF app")
846
+ self.popup_menu.Append(3, "Copy File Location")
847
+ self.popup_menu.Append(4, "Open File Location")
848
+ self.Bind(wx.EVT_MENU, self.RemoveFile, id=1)
849
+ self.Bind(wx.EVT_MENU, self.OpenPDF, id=2)
850
+ self.Bind(wx.EVT_MENU, self.CopyFileLocation, id=3)
851
+ self.Bind(wx.EVT_MENU, self.OpenFileLocation, id=4)
852
+
853
+
854
+ btn_panel = wx.Panel(panel)
855
+ btn_sizer = wx.BoxSizer(wx.HORIZONTAL)
856
+ for label, handler in [
857
+ ("Add Folder", self.AddFolder),
858
+ ("Select Files", self.AddFile),
859
+ ("Remove Selected", self.RemoveFile),
860
+ ("Remove All", self.RemoveAll),
861
+ ("Stop Parser", self.StopParser),
862
+ ("Start Parser", self.StartParser)
863
+ ]:
864
+ btn = wx.Button(btn_panel, label=label)
865
+ btn.Bind(wx.EVT_BUTTON, handler)
866
+ if label == "Stop Parser":
867
+ btn.SetBackgroundColour(wx.Colour(255, 180, 180)) # light red
868
+ elif label == "Start Parser":
869
+ btn.SetBackgroundColour(wx.Colour(180, 255, 180)) # light green
870
+ self.start_btn = btn # <-- Referenz merken
871
+ btn_sizer.Add(btn, proportion=1, flag=wx.ALL, border=5)
872
+ btn_panel.SetSizer(btn_sizer)
873
+ vbox.Add(btn_panel, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)
874
+
875
+
876
+ lbl2 = wx.StaticText(panel, label="Text Frame: (choose PDF to see converted text)")
877
+ vbox.Add(lbl2, flag=wx.LEFT, border=10)
878
+
879
+ self.text_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
880
+ self.ShowHelpText(None)
881
+ vbox.Add(self.text_ctrl, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)
882
+
883
+ # Statusanzeige
884
+ stat_grid = wx.FlexGridSizer(1, 5, 5, 55)
885
+ self.lbl_processed_pages = wx.StaticText(panel, label="Processed pages: 0")
886
+ self.lbl_total_pages = wx.StaticText(panel, label="Total pages: 0")
887
+ self.lbl_pages_per_sec = wx.StaticText(panel, label="Pages/sec: 0")
888
+ self.lbl_est_time = wx.StaticText(panel, label="Estimated time (min): 0.0")
889
+ self.lbl_elapsed_time = wx.StaticText(panel, label="Elapsed time: 0.0")
890
+
891
+ for lbl in [self.lbl_processed_pages, self.lbl_total_pages, self.lbl_pages_per_sec, self.lbl_est_time, self.lbl_elapsed_time]:
892
+ stat_grid.Add(lbl)
893
+ vbox.Add(stat_grid, flag=wx.LEFT | wx.TOP, border=10)
894
+
895
+ self.prog_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
896
+ vbox.Add(self.prog_ctrl, proportion=1, flag=wx.EXPAND | wx.ALL, border=10)
897
+
898
+ panel.SetSizer(vbox)
899
+
900
+
901
+ def ShowHelpText(self, event):
902
+ help_text = (
903
+ " This is a small help\n\n"
904
+ " • PRE ALPHA version (for ever) •\n"
905
+ "• The generated TXT file has the same name as the PDF file\n"
906
+ "• The TXT file is created in the same directory as the PDF\n"
907
+ "• Older TXT files will be overwritten without prompting\n"
908
+ "• When selecting a folder, subfolders are also selected\n"
909
+ "If:\n"
910
+ "[INFO] File completed: TEST.pdf (X pages)!\n"
911
+ "[INFO] Processing completed\n"
912
+ "-> This only means that all pages have been processed; it does not mean that the quality is good.\n"
913
+ "• An attempt is made to reproduce the layout of the page in columns from left to right and in blocks from top to bottom\n"
914
+ "• An attempt is made to detect regular tables with lines; headers (top or top and left) are assigned to the cells and stored in JSON format in the text file\n"
915
+ "• Adds the label “Page X” at the beginning of every page (absdlute number)\n"
916
+ "• Adds the label “Chapter” for large font and/or “important” for bold font\n"
917
+ "\n"
918
+ "Stop function becomes effective only after the currently processed file\n"
919
+ "When processing large amounts of data, the following should be noted:\n"
920
+ "First, all PDFs are opened once to determine the number of pages:\n"
921
+ "Then, all small PDFs are processed in parallel:\n"
922
+ "Then, each large PDF is processed page by page in parallel:\n"
923
+ )
924
+ self.text_ctrl.SetValue(help_text)
925
+
926
+
927
+ def AddFolder(self, event):
928
+ dlg = wx.DirDialog(self, "Select Folder")
929
+ if dlg.ShowModal() == wx.ID_OK:
930
+ for root, _, files in os.walk(dlg.GetPath()):
931
+ for f in files:
932
+ if f.lower().endswith(".pdf"):
933
+ path = os.path.normpath(os.path.join(root, f))
934
+ if path not in self.files:
935
+ self.files.append(path)
936
+ self.listbox.Append(path)
937
+ dlg.Destroy()
938
+
939
+ def AddFile(self, event):
940
+ with wx.FileDialog(self, "Select PDF Files", wildcard="PDF files (*.pdf)|*.pdf",
941
+ style=wx.FD_OPEN | wx.FD_MULTIPLE) as dlg:
942
+ if dlg.ShowModal() == wx.ID_OK:
943
+ for path in dlg.GetPaths():
944
+ if path not in self.files:
945
+ self.files.append(path)
946
+ self.listbox.Append(path)
947
+
948
+ def RemoveFile(self, event):
949
+ for i in reversed(self.listbox.GetSelections()):
950
+ self.listbox.Delete(i)
951
+ del self.files[i]
952
+ self.text_ctrl.Clear()
953
+
954
+ def RemoveAll(self, event):
955
+ self.listbox.Clear()
956
+ self.files.clear()
957
+ self.text_ctrl.Clear()
958
+
959
+ def OpenPDF(self, event):
960
+ i = self.listbox.GetSelections()
961
+ if i:
962
+ path = self.files[i[0]]
963
+ if platform.system() == "Windows":
964
+ os.startfile(path)
965
+ elif platform.system() == "Darwin":
966
+ subprocess.call(["open", path])
967
+ else:
968
+ subprocess.call(["xdg-open", path])
969
+
970
+ def CopyFileLocation(self, event):
971
+ sel = self.listbox.GetSelections()
972
+ if sel:
973
+ path = self.files[sel[0]]
974
+ if wx.TheClipboard.Open():
975
+ wx.TheClipboard.SetData(wx.TextDataObject(path))
976
+ wx.TheClipboard.Close()
977
+
978
+ def OpenFileLocation(self, event):
979
+ sel = self.listbox.GetSelections()
980
+ if sel:
981
+ folder = os.path.dirname(self.files[sel[0]])
982
+ if platform.system() == "Windows":
983
+ subprocess.Popen(f'explorer "{folder}"')
984
+ elif platform.system() == "Darwin":
985
+ subprocess.call(["open", folder])
986
+ else:
987
+ subprocess.call(["xdg-open", folder])
988
+
989
+
990
+ def OnRightClick(self, event):
991
+ if self.listbox.GetSelections():
992
+ self.PopupMenu(self.popup_menu, event.GetPosition())
993
+
994
+ def StartParser(self, event):
995
+ if not self.files:
996
+ wx.MessageBox("Please select files first.", "Hinweis", wx.OK | wx.ICON_INFORMATION)
997
+ wx.CallAfter(self.start_btn.Enable) # <-- wieder aktivieren
998
+ return
999
+
1000
+
1001
+ self.start_btn.Disable()
1002
+ self.stop_flag.clear()
1003
+ self.prog_ctrl.Clear()
1004
+
1005
+ def error_callback(msg):
1006
+ wx.CallAfter(self.AppendProg, msg)
1007
+
1008
+ def update_total_pages_live(new_total):
1009
+ wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {new_total}")
1010
+
1011
+
1012
+ page_info, total_pages = get_total_pages(
1013
+ self.files,
1014
+ error_callback=error_callback,
1015
+ progress_callback=update_total_pages_live
1016
+ )
1017
+
1018
+ if total_pages == 0:
1019
+ self.AppendProg("[INFO] No pages found.\n")
1020
+ wx.CallAfter(self.start_btn.Enable) # <-- wieder aktivieren
1021
+ return
1022
+
1023
+ tracker = StatusTracker(total_pages)
1024
+
1025
+ def gui_progress_callback(status):
1026
+ wx.CallAfter(self.lbl_processed_pages.SetLabel, f"Processed pages: {status['processed_pages']}")
1027
+ wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {status['total_pages']}")
1028
+ wx.CallAfter(self.lbl_pages_per_sec.SetLabel, f"Pages/sec: {status['pages_per_sec']:}")
1029
+ wx.CallAfter(self.lbl_est_time.SetLabel, f"Estimated time (min): {status['est_time']:}")
1030
+ wx.CallAfter(self.lbl_elapsed_time.SetLabel, f"Elapsed time: {status['elapsed_time']}")
1031
+
1032
+ throttled_gui_callback = throttle_callback(gui_progress_callback, 100)
1033
+
1034
+ def background():
1035
+ small = [p for p in page_info if p[1] <= PARALLEL_THRESHOLD]
1036
+ large = [p for p in page_info if p[1] > PARALLEL_THRESHOLD]
1037
+
1038
+ # Verarbeite kleine Dateien je in einem eigenen Prozess
1039
+ if small:
1040
+ max_workers = max(1, min(len(small), get_physical_cores()))
1041
+ with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
1042
+ futures = {}
1043
+ for path, count in small:
1044
+ if self.stop_flag.is_set():
1045
+ break
1046
+ future = executor.submit(save_pdf, path, count, None, False, None)
1047
+ futures[future] = (path, count)
1048
+
1049
+ for future in concurrent.futures.as_completed(futures):
1050
+ if self.stop_flag.is_set():
1051
+ break
1052
+ path, count = futures[future]
1053
+ try:
1054
+ pages_processed = future.result()
1055
+ tracker.update(pages_processed)
1056
+ throttled_gui_callback(tracker.get_status())
1057
+ wx.CallAfter(self.AppendProg, f"[INFO] File ready: {path} ({pages_processed} Seiten)\n")
1058
+ except Exception as e:
1059
+ wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")
1060
+
1061
+ # Verarbeite große Dateien Seite für Seite parallel
1062
+ for path, count in large:
1063
+ if self.stop_flag.is_set():
1064
+ break
1065
+
1066
+ try:
1067
+ pages_processed = save_pdf(
1068
+ path,
1069
+ count,
1070
+ tracker,
1071
+ parallel=True,
1072
+ progress_callback=throttled_gui_callback,
1073
+ stop_flag=self.stop_flag
1074
+ )
1075
+ if pages_processed:
1076
+ wx.CallAfter(
1077
+ self.AppendProg,
1078
+ f"[INFO] File ready: {path} ({pages_processed} Seiten)\n"
1079
+ )
1080
+ else:
1081
+ wx.CallAfter(
1082
+ self.AppendProg,
1083
+ f"[INFO] Stopped: {path}\n"
1084
+ )
1085
+ except Exception as e:
1086
+ wx.CallAfter(
1087
+ self.AppendProg,
1088
+ f"[ERROR] File {path}: {str(e)}\n"
1089
+ )
1090
+
1091
+
1092
+
1093
+ wx.CallAfter(self.AppendProg, "\n[INFO] Processing completed.\n")
1094
+ wx.CallAfter(self.start_btn.Enable) # <-- wieder aktivieren
1095
+ self.stop_flag.clear()
1096
+
1097
+ threading.Thread(target=background, daemon=True).start()
1098
+
1099
+
1100
+ def StopParser(self, event):
1101
+ self.stop_flag.set()
1102
+ self.AppendProg("[INFO] Processing Stopped...\n")
1103
+
1104
+
1105
+ def ShowText(self, event):
1106
+ sel = self.listbox.GetSelections()
1107
+ if not sel:
1108
+ return
1109
+ txt_path = os.path.splitext(self.files[sel[0]])[0] + ".txt"
1110
+ self.text_ctrl.Clear()
1111
+ if os.path.exists(txt_path):
1112
+ with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
1113
+ self.text_ctrl.SetValue(f.read())
1114
+ else:
1115
+ self.text_ctrl.SetValue("[No .txt file found]")
1116
+
1117
+ def AppendProg(self, text):
1118
+ self.prog_ctrl.AppendText(text)
1119
+
1120
+
1121
# -------------------- Entry point --------------------
1122
def main():
    """Entry point: CLI batch mode when PDF paths are given, GUI mode otherwise.

    In CLI mode each progress status is printed as one JSON line; in GUI mode
    the FileManager window is shown and the wx main loop takes over.
    """
    pdf_paths = sys.argv[1:]
    if not pdf_paths:
        # No arguments: run the interactive GUI.
        app = wx.App(False)
        FileManager(None).Show()
        app.MainLoop()
        return

    page_info, total_pages = get_total_pages(pdf_paths)
    tracker = StatusTracker(total_pages)

    def cli_callback(status):
        print(json.dumps(status))

    for pdf_path, page_count in page_info:
        save_pdf(
            pdf_path,
            page_count,
            tracker,
            parallel=page_count > PARALLEL_THRESHOLD,
            progress_callback=cli_callback,
        )
1138
+
1139
+
1140
if __name__ == "__main__":
    # Required so multiprocessing works inside a frozen (PyInstaller) Windows
    # executable; a no-op when running as a normal script.
    multiprocessing.freeze_support()
    main()
1143
+
1144
+
1145
+
1146
+
1147
+
1148
+
1149
+
build_exe_v7f.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build script: package the PDF parser GUI into a single executable with PyInstaller."""
import sys
import subprocess

# Path of the entry-point script handed to PyInstaller — adjust as needed.
entry_point = "PDF Parser - Sevenof9_v7f_07.py"


# Invoke PyInstaller through the current interpreter (`python -m PyInstaller`).
# This uses the PyInstaller installed for *this* interpreter even when no
# `pyinstaller` launcher script is on PATH, and makes the `sys` import useful.
cmd = [
    sys.executable, "-m", "PyInstaller",
    "--onefile",
    "--noconfirm",
    "--clean",
    "--noconsole",  # no console window (important for GUI programs)

    # External dependencies that need explicit hidden imports
    "--hidden-import", "pdfminer.six",
    "--hidden-import", "joblib",
    "--hidden-import", "joblib.externals.loky.backend.resource_tracker",
    "--hidden-import", "pdfplumber.utils.exceptions",
    "--hidden-import", "pdfminer.layout",
    "--hidden-import", "pdfminer.pdfpage",
    "--hidden-import", "pdfminer.pdfinterp",
    "--hidden-import", "pdfminer.pdfdocument",
    "--hidden-import", "pdfminer.pdfparser",
    "--hidden-import", "psutil",
    "--hidden-import", "multiprocessing",
    "--hidden-import", "rtree",
    "--hidden-import", "numpy",
    "--hidden-import", "concurrent.futures",
    "--hidden-import", "wx",  # this is the correct import name for wxPython
    "--hidden-import", "wx.lib.pubsub",
    "--hidden-import", "wx.lib.pubsub.core",

    entry_point
]

try:
    subprocess.run(cmd, check=True)
    print("Kompilierung abgeschlossen.")
except subprocess.CalledProcessError as e:
    print(f"Fehler bei der Kompilierung: {e}")