kalle07 commited on
Commit
4b74ae5
·
verified ·
1 Parent(s): 83ef09c

Delete PDF Parser - Sevenof9_v7d.py

Browse files
Files changed (1) hide show
  1. PDF Parser - Sevenof9_v7d.py +0 -826
PDF Parser - Sevenof9_v7d.py DELETED
@@ -1,826 +0,0 @@
1
- import os
2
- import sys
3
- import time
4
- import json
5
- import wx
6
- import re
7
- import platform
8
- import subprocess
9
- import threading
10
- import concurrent.futures
11
- import multiprocessing
12
- from concurrent.futures import ProcessPoolExecutor
13
- import pdfplumber
14
- import psutil
15
- import logging
16
- from pdfminer.pdfparser import PDFParser, PDFSyntaxError
17
- from pdfminer.pdfdocument import PDFDocument, PDFEncryptionError, PDFPasswordIncorrect
18
- from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
19
- from pdfminer.pdfinterp import PDFResourceManager
20
-
21
-
22
- # -------------------- Konfiguration --------------------
23
# PDFs with more pages than this are processed page-by-page in parallel;
# PDFs at or below it are handled as one unit each.
PARALLEL_THRESHOLD = 14

# Keyword arguments forwarded to the page's extract_words() call.
TEXT_EXTRACTION_SETTINGS = dict(
    x_tolerance=1.5,
    y_tolerance=2.5,
    keep_blank_chars=False,
    use_text_flow=False,
)
31
-
32
-
33
-
34
- # GUi update intervall
35
def throttle_callback(callback, interval_ms=1):
    """Wrap *callback* so it is forwarded at most once per *interval_ms*.

    Calls that arrive sooner than *interval_ms* milliseconds after the last
    forwarded call are dropped (they are not queued or replayed). The very
    first call is always forwarded.

    Args:
        callback: One-argument callable that receives the status object.
        interval_ms: Minimum spacing between forwarded calls, in milliseconds.

    Returns:
        A one-argument wrapper around *callback*.
    """
    last_called = None  # monotonic timestamp (ms) of the last forwarded call

    def wrapper(status):
        nonlocal last_called
        # A monotonic clock is used instead of wall-clock time so that system
        # clock adjustments cannot make the elapsed interval negative and
        # silently suppress updates.
        now = time.monotonic() * 1000
        if last_called is None or now - last_called >= interval_ms:
            last_called = now
            callback(status)

    return wrapper
45
-
46
-
47
-
48
- # Function to suppress PDFMiner logging, reducing verbosity
49
def suppress_pdfminer_logging():
    """Set the pdfminer loggers to ERROR so lower-level messages are dropped."""
    noisy_loggers = (
        "pdfminer",
        "pdfminer.pdfparser",
        "pdfminer.pdfdocument",
        "pdfminer.pdfpage",
        "pdfminer.converter",
        "pdfminer.layout",
        "pdfminer.cmapdb",
        "pdfminer.utils",
    )
    for name in noisy_loggers:
        pdfminer_logger = logging.getLogger(name)
        pdfminer_logger.setLevel(logging.ERROR)
61
-
62
-
63
# Matches any character outside the range U+0000..U+FFFF.
EUROPEAN_PRINTABLES_PATTERN = re.compile(r"[^\u0000-\uFFFF]", re.DOTALL)
# Matches literal "(cid:<digits>)" markers.
CID_PATTERN = re.compile(r"\(cid:\d+\)")


def clean_cell_text(text):
    """Normalise the text of one table cell.

    Non-string input yields an empty string. For strings, a hyphen followed
    by a newline is removed (joining the split word), remaining newlines
    become spaces, "(cid:N)" markers are stripped, and characters above
    U+FFFF are removed.
    """
    if isinstance(text, str):
        rejoined = text.replace("-\n", "")
        single_line = rejoined.replace("\n", " ")
        without_cid = CID_PATTERN.sub("", single_line)
        return EUROPEAN_PRINTABLES_PATTERN.sub("", without_cid)
    return ""
72
-
73
def clamp_bbox(bbox, page_width, page_height, p=3):
    """Clamp an (x0, top, x1, bottom) box to the page and round it.

    Horizontal values are limited to [0, page_width], vertical values to
    [0, page_height]; every coordinate is then rounded to *p* decimals.
    """

    def _limit(value, upper):
        return max(0, min(value, upper))

    left, upper_edge, right, lower_edge = bbox
    return (
        round(_limit(left, page_width), p),
        round(_limit(upper_edge, page_height), p),
        round(_limit(right, page_width), p),
        round(_limit(lower_edge, page_height), p),
    )
80
-
81
def get_physical_cores():
    """Return the number of physical CPU cores, never less than 1.

    Falls back to 1 when psutil reports no value.
    """
    physical = psutil.cpu_count(logical=False)
    if not physical:
        return 1  # fallback
    return max(1, physical)


# Evaluated once at import time; used as the worker count when counting pages.
cores = get_physical_cores()
85
-
86
-
87
def is_valid_cell(cell):
    """Check whether a cell holds more than whitespace or a single character.

    None is invalid; any other value is converted to a string, stripped, and
    accepted only if more than one character remains.
    """
    return cell is not None and len(str(cell).strip()) > 1
93
-
94
-
95
def block_area(block):
    """Return the area of the bounding box that encloses every word in *block*.

    Each word is a mapping with "x0", "x1", "top" and "bottom" entries.
    """
    lefts = [word["x0"] for word in block]
    rights = [word["x1"] for word in block]
    tops = [word["top"] for word in block]
    bottoms = [word["bottom"] for word in block]
    width = max(rights) - min(lefts)
    height = max(bottoms) - min(tops)
    return width * height
101
-
102
-
103
- suppress_pdfminer_logging()
104
-
105
- # -------------------- Status-Tracking --------------------
106
class StatusTracker:
    """Track processed pages and derive throughput and remaining-time figures."""

    def __init__(self, total_pages):
        """Start the clock for a run covering *total_pages* pages."""
        # Monotonic clock: elapsed time must not be affected by wall-clock
        # adjustments made while a run is in progress.
        self.start_time = time.monotonic()
        self.total_pages = total_pages
        self.processed_pages = 0

    def update(self, n=1):
        """Record that *n* more pages have been processed."""
        self.processed_pages += n

    def get_status(self):
        """Return a snapshot of the current progress.

        Returns:
            dict with keys:
              - "processed_pages": pages processed so far
              - "total_pages": total pages of the run
              - "pages_per_sec": throughput, rounded to an integer
              - "elapsed_time": elapsed time in minutes, one decimal
              - "est_time": estimated remaining time in minutes, one decimal
                (``inf`` while no throughput can be measured)
        """
        elapsed = time.monotonic() - self.start_time
        rate = self.processed_pages / elapsed if elapsed > 0 else 0.0
        # Never report a negative backlog if more pages than expected arrive.
        remaining_pages = max(0, self.total_pages - self.processed_pages)
        # The estimate uses the unrounded rate: dividing by the rounded
        # display value skews the result and yields "inf" below 0.5 pages/s.
        est_time = (remaining_pages / rate) / 60 if rate > 0 else float("inf")
        return {
            "processed_pages": self.processed_pages,
            "total_pages": self.total_pages,
            "pages_per_sec": round(rate),
            "elapsed_time": round(elapsed / 60, 1),
            "est_time": round(est_time, 1),
        }
127
-
128
-
129
- # -------------------- PDF Verarbeitung --------------------
130
def process_page_worker(args):
    """Extract tables and layout-ordered text from a single PDF page.

    Args:
        args: Tuple ``(page_number, path)`` with the zero-based page index
            and the path of the PDF file.

    Returns:
        Tuple ``(page_number, text)``. If any exception occurs, the text is
        an "[ERROR] ..." message for that page instead of its content.
    """
    suppress_pdfminer_logging()
    try:
        page_number, path = args
        with pdfplumber.open(path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height
            # Ignore a 4 % margin on every side of the page.
            margin_x, margin_y = width * 0.04, height * 0.04

            cropped_page = page.crop((margin_x, margin_y, width - margin_x, height - margin_y))
            table_bboxes = [clamp_bbox(t.bbox, width, height) for t in cropped_page.find_tables()]
            extracted_tables = cropped_page.extract_tables({"text_x_tolerance": 1.5})
            tables_json = []

            for raw_table in extracted_tables:
                if not raw_table or len(raw_table) < 2:
                    continue  # fewer than 2 rows

                # Require at least 2 columns
                if all(len(row) < 2 for row in raw_table if row):
                    continue

                # Skip empty or nearly empty tables (only whitespace or 1 character per cell)
                if all(all(not is_valid_cell(cell) for cell in row) for row in raw_table):
                    continue

                cleaned_table = [[clean_cell_text(c) for c in row] for row in raw_table]
                header_row = cleaned_table[0]
                # An empty top-left cell is treated as "headers on top AND on the left".
                is_corner_empty = header_row[0].strip() == ""

                if is_corner_empty:
                    col_headers = cleaned_table[0][1:]
                    row_headers = [row[0] for row in cleaned_table[1:]]
                    data_rows = cleaned_table[1:]

                    # Nested mapping: row header -> {column header -> cell}
                    table_data = {}
                    for row_header, row in zip(row_headers, data_rows):
                        row_dict = {}
                        for col_header, cell in zip(col_headers, row[1:]):
                            row_dict[col_header] = cell
                        table_data[row_header] = row_dict
                else:
                    headers = header_row
                    data_rows = cleaned_table[1:]
                    # List of {header -> cell}; rows whose length differs from
                    # the header row are dropped.
                    table_data = []
                    for row in data_rows:
                        if len(row) == len(headers):
                            table_data.append(dict(zip(headers, row)))

                tables_json.append(json.dumps(table_data, indent=1, ensure_ascii=False))

            words = []
            for w in cropped_page.extract_words(**TEXT_EXTRACTION_SETTINGS):
                x0, top = float(w["x0"]), float(w["top"])
                # Skip words whose top-left corner lies inside a detected table.
                if any(bx0 <= x0 <= bx2 and by0 <= top <= by3 for bx0, by0, bx2, by3 in table_bboxes):
                    continue
                # Skip words containing characters matched by the pattern (above U+FFFF).
                if EUROPEAN_PRINTABLES_PATTERN.search(w["text"]):
                    continue
                words.append(w)

            def is_bold(fontname: str) -> bool:
                """Guess boldness from the font name ("bold", "bd" or "black")."""
                fontname = fontname.lower()
                return "bold" in fontname or "bd" in fontname or "black" in fontname

            word_info = []
            font_sizes = []
            for w in words:
                x0 = float(w["x0"])
                x1 = float(w["x1"])
                top = float(w["top"])
                bottom = float(w["bottom"])
                text = w["text"]

                # Characters whose x0/top fall inside the word's box; note this
                # scans page.chars (the uncropped page) once per word.
                chars = [c for c in page.chars if x0 <= float(c["x0"]) <= x1 and top <= float(c["top"]) <= bottom]
                sizes = [float(c.get("size", 0)) for c in chars if c.get("text", "").strip()]
                fonts = [c.get("fontname", "") for c in chars]
                bold_flags = [is_bold(c.get("fontname", "")) for c in chars]

                font_size = max(sizes) if sizes else 0
                font_sizes.append(font_size)
                font_name = fonts[0] if fonts else "Unknown"
                bold_flag = any(bold_flags)

                word_info.append({
                    "text": text,
                    "top": round(top, 1),
                    "bottom": round(bottom, 1),
                    "font_size": font_size,
                    "font_name": font_name,
                    "bold_flag": bold_flag,
                    "x0": round(x0, 1),
                    "x1": round(x1, 1),
                })

            # Page-wide average, used as the baseline for heading detection below.
            avg_fontsize = sum(font_sizes) / len(font_sizes) if font_sizes else 0

            # Distance thresholds
            MAX_DIST_X = 9
            MAX_DIST_Y = 10

            def are_words_close(w1, w2):
                # Check whether two words are spatially close (gap between
                # their boxes; overlapping boxes give a gap of 0).
                dx = max(0, max(w1["x0"], w2["x0"]) - min(w1["x1"], w2["x1"]))
                dy = max(0, max(w1["top"], w2["top"]) - min(w1["bottom"], w2["bottom"]))
                return dx <= MAX_DIST_X and dy <= MAX_DIST_Y

            def group_into_blocks(words):
                # Flood-fill over the "close to" relation: every block is a set
                # of words reachable from one another through close neighbours.
                blocks = []
                unvisited = set(range(len(words)))
                while unvisited:
                    idx = unvisited.pop()
                    block = {idx}
                    to_visit = {idx}
                    while to_visit:
                        current = to_visit.pop()
                        for other in list(unvisited):
                            if are_words_close(words[current], words[other]):
                                block.add(other)
                                to_visit.add(other)
                                unvisited.remove(other)
                    blocks.append([words[i] for i in block])
                return blocks

            def group_block_into_lines(block, line_tolerance=2.5):
                # Group the words of a block into lines (by Y coordinate);
                # each line is returned sorted left to right.
                sorted_words = sorted(block, key=lambda w: w["top"])
                lines = []
                current_line = [sorted_words[0]]
                current_top = sorted_words[0]["top"]

                for word in sorted_words[1:]:
                    if abs(word["top"] - current_top) <= line_tolerance:
                        current_line.append(word)
                    else:
                        lines.append(sorted(current_line, key=lambda w: w["x0"]))
                        current_line = [word]
                        current_top = word["top"]
                if current_line:
                    lines.append(sorted(current_line, key=lambda w: w["x0"]))
                return lines

            blocks = group_into_blocks(word_info)

            SORT_TOLERANCE = 1  # e.g. 1 point distance

            def round_to_nearest(value, tolerance):
                return round(value / tolerance) * tolerance

            def get_block_reference(block):
                # Sort key: the block's leftmost x0 and topmost top, snapped to
                # the sort tolerance.
                min_x0 = min(w["x0"] for w in block)
                min_top = min(w["top"] for w in block)
                return (
                    round_to_nearest(min_x0, SORT_TOLERANCE),
                    round_to_nearest(min_top, SORT_TOLERANCE),
                )

            # Sort blocks first by x0, then by top (row beginning)
            sorted_blocks = sorted(blocks, key=get_block_reference)

            '''
            # Visualisierung: Blocks als Rechtecke zeichnen
            im = page.to_image(resolution=150) # ggf. Auflösung anpassen

            for block in blocks:
            # Grenzen berechnen
            x0 = min(w["x0"] for w in block)
            top = min(w["top"] for w in block)
            x1 = max(w["x1"] for w in block)
            bottom = max(w["bottom"] for w in block)

            # Rechteck zeichnen (blauer Rahmen, Dicke 1)
            im.draw_rect((x0, top, x1, bottom), stroke="blue", stroke_width=1)

            # Bild speichern – Dateiname z. B. mit Seitenzahl
            im.save(f"page_{page_number + 1}_blocks.png")
            '''

            output_lines = []
            output_lines.append(f"\nPage {page_number + 1}, Seite {page_number + 1}, Página {page_number + 1}\n")  # page number header

            for block_idx, block in enumerate(sorted_blocks, 1):
                lines = group_block_into_lines(block)

                chapter_hits = 0
                important_hits = 0
                block_label = None  # stays None when no rule matches

                # Rule 1: skip words of 5 characters or fewer and pure digit strings.
                # NOTE(review): the original comment said "more than 3 characters"
                # while the code compares against 5 — confirm the intended threshold.
                for w in block:
                    text = w["text"]
                    if len(text) <= 5 or text.isdigit():
                        continue  # rule 1 – skip all other rules

                    size_ratio = w["font_size"] / avg_fontsize if avg_fontsize else 0
                    bold_flag = w["bold_flag"]

                    # Rule 2 – takes precedence: clearly larger than the page average
                    if size_ratio >= 1.15:
                        chapter_hits += 1
                    # Rule 3 – only if rule 2 does not apply: bold and at least average size
                    elif bold_flag and size_ratio >= 1:
                        important_hits += 1

                total_hits = chapter_hits + important_hits

                # Rule 4 – decide based on the number of hits
                if total_hits > 1:
                    block_label = "IMPORTANT"
                elif total_hits == 1:
                    if chapter_hits == 1:
                        block_label = "CHAPTER"
                    elif important_hits == 1:
                        block_label = "IMPORTANT"

                output_lines.append("")  # blank line before each block

                for line_idx, line in enumerate(lines):
                    line_text = " ".join(w["text"] for w in line)
                    # Only the first line of a block carries the label.
                    if line_idx == 0 and block_label:
                        line_text = f"[{block_label}] {line_text}"
                    output_lines.append(line_text)

            # Append the tables after the text blocks
            for idx, tbl in enumerate(tables_json, 1):
                output_lines.append(f'"table {idx}":\n{tbl}')

            return page_number, "\n".join(output_lines)

    except Exception as e:
        # Best effort: a failing page yields an error line instead of aborting the file.
        msg = str(e).strip() or f"{type(e).__name__} (no message)"
        return args[0], f"[ERROR] Seite {args[0]+1}: {msg}"
369
-
370
-
371
-
372
def run_serial(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process the first *page_number* pages of *path* one after another.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (indices 0..page_number-1).
        tracker: Optional progress tracker; updated once per page.
        progress_callback: Optional callable that receives status reports
            (only used when a tracker is given).
        stop_flag: Optional event; when set, processing stops before the
            next page.

    Returns:
        List of per-page results in page order (possibly truncated on stop).
    """
    collected = []
    for page_idx in range(page_number):
        if stop_flag and stop_flag.is_set():
            break
        collected.append(process_page_worker((page_idx, path)))
        if tracker is None:
            continue
        tracker.update()
        if progress_callback:
            report_status(tracker, progress_callback)
    return collected
384
-
385
-
386
-
387
-
388
def run_parallel(path, page_number, tracker=None, progress_callback=None, stop_flag=None):
    """Process the first *page_number* pages of *path* in a process pool.

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process (indices 0..page_number-1).
        tracker: Optional progress tracker; updated once per finished page.
        progress_callback: Optional callable that receives status reports
            (only used when a tracker is given).
        stop_flag: Accepted for signature parity with ``run_serial`` but not
            evaluated here; a started file is always processed completely.

    Returns:
        List of per-page results ordered by page index.
    """
    # An empty job would request a pool with zero workers, which the executor
    # rejects with ValueError; there is nothing to do anyway.
    if page_number <= 0:
        return []

    results = [None] * page_number

    def collect(result):
        if result is None:
            return
        page, _ = result
        results[page] = result
        if tracker is not None:
            tracker.update()
            if progress_callback:
                report_status(tracker, progress_callback)

    worker_count = min(page_number, get_physical_cores())
    with concurrent.futures.ProcessPoolExecutor(max_workers=worker_count) as executor:
        futures = [
            executor.submit(process_page_worker, (page_idx, path))
            for page_idx in range(page_number)
        ]
        # Results arrive in completion order; `collect` files them by page index.
        for future in concurrent.futures.as_completed(futures):
            collect(future.result())

    return [r for r in results if r]
411
-
412
-
413
def report_status(tracker, progress_callback=None):
    """Publish the tracker's current status.

    Args:
        tracker: Object whose ``get_status()`` returns a dict with the keys
            "processed_pages", "total_pages", "pages_per_sec",
            "elapsed_time" and "est_time".
        progress_callback: Optional callable receiving that dict. Without a
            callback, a one-line summary is printed instead.
    """
    status = tracker.get_status()
    if progress_callback:
        progress_callback(status)
        return

    # The tracker reports elapsed and estimated time in minutes, so the
    # summary labels them as minutes and keeps both inside one parenthesis.
    print(
        f"[STATUS] {status['processed_pages']}/{status['total_pages']} Seiten "
        f"({status['pages_per_sec']} Seiten/s, "
        f"Elapsed: {status['elapsed_time']} Min., "
        f"Est Time: {status['est_time']} Min.)"
    )
422
-
423
-
424
def save_pdf(path, page_number, tracker=None, parallel=False, progress_callback=None, stop_flag=None):
    """Convert one PDF to text and write it next to the PDF as "<name>.txt".

    Args:
        path: Path of the PDF file.
        page_number: Number of pages to process.
        tracker: Optional progress tracker passed on to the page runner.
        parallel: Use the process-pool runner instead of the serial one.
        progress_callback: Optional status callback passed on to the runner.
        stop_flag: Optional event; if already set, nothing is processed.

    Returns:
        0 when the stop flag was set on entry, otherwise *page_number*.
    """
    if stop_flag and stop_flag.is_set():
        return 0

    runner = run_parallel if parallel else run_serial
    page_results = runner(path, page_number, tracker, progress_callback, stop_flag)

    # Drop empty entries (left behind on stop) and restore page order.
    ordered = sorted((item for item in page_results if item), key=lambda item: item[0])
    text_output = "\n".join(page_text for _, page_text in ordered)

    out_path = os.path.splitext(path)[0] + ".txt"
    with open(out_path, "w", encoding="utf-8", errors="ignore") as out_file:
        out_file.write(text_output)

    return page_number
443
-
444
-
445
-
446
def _process_single_pdf(path):
    """Count the pages of one PDF.

    Returns:
        ``(path, page_count, None)`` on success, or ``(path, 0, message)``
        where *message* is an "[ERROR] ..." line when the file is encrypted,
        syntactically invalid, not extractable, or fails for any other reason.
    """
    suppress_pdfminer_logging()
    try:
        with open(path, "rb") as handle:
            document = PDFDocument(PDFParser(handle))

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed("Text-Extraktion nicht erlaubt")

            page_count = sum(1 for _ in PDFPage.create_pages(document))
    except (PDFEncryptionError, PDFPasswordIncorrect) as e:
        return (path, 0, f"[ERROR] Datei passwortgeschützt: {path} ({type(e).__name__}: {e})\n")
    except PDFSyntaxError as e:
        return (path, 0, f"[ERROR] Ungültige PDF-Syntax: {path} ({type(e).__name__}: {e})\n")
    except PDFTextExtractionNotAllowed as e:
        return (path, 0, f"[ERROR] Text-Extraktion nicht erlaubt: {path} ({type(e).__name__}: {e})\n")
    except Exception as e:
        return (path, 0, f"[ERROR] Fehler bei Datei {path}: {type(e).__name__}: {e}\n")
    else:
        return (path, page_count, None)
467
-
468
def get_total_pages(pdf_files, error_callback=None, progress_callback=None):
    """Determine the page count of every PDF in *pdf_files*.

    Args:
        pdf_files: Sequence of PDF paths.
        error_callback: Optional callable receiving the error message of each
            unreadable file; without it, messages are printed.
        progress_callback: Optional callable receiving the running page total
            after each readable file.

    Returns:
        Tuple ``(page_info, total)``: a list of ``(path, page_count)`` for
        the readable files and the sum of their page counts.
    """
    suppress_pdfminer_logging()
    page_info = []
    total = 0

    def record(pdf_path, count, error):
        nonlocal total
        if error:
            if error_callback:
                error_callback(error)
            else:
                print(error, end="")
            return
        page_info.append((pdf_path, count))
        total += count
        if progress_callback:
            progress_callback(total)  # feedback for the GUI

    def consume(outcomes):
        for pdf_path, count, error in outcomes:
            record(pdf_path, count, error)

    if len(pdf_files) > 14:
        # Many files: count pages in a process pool.
        with concurrent.futures.ProcessPoolExecutor(max_workers=cores) as pool:
            consume(pool.map(_process_single_pdf, pdf_files))
    else:
        consume(_process_single_pdf(pdf_path) for pdf_path in pdf_files)

    return page_info, total
497
-
498
-
499
-
500
-
501
- # -------------------- GUI --------------------
502
class FileManager(wx.Frame):
    """Main window: manage a list of PDFs, run the parser and show the output."""

    def __init__(self, parent):
        super().__init__(parent, title="PDF Parser - Sevenof9_v7d", size=(1000, 800))
        self.files = []
        # Created before the UI so that every event handler can rely on it.
        self.stop_flag = threading.Event()
        self.InitUI()

    def InitUI(self):
        """Build all widgets and bind their event handlers."""
        panel = wx.Panel(self)
        vbox = wx.BoxSizer(wx.VERTICAL)

        hbox_lbl1 = wx.BoxSizer(wx.HORIZONTAL)

        lbl1 = wx.StaticText(panel, label="Filed PDF files: (with right mouse you can remove and open)")
        hbox_lbl1.Add(lbl1, flag=wx.ALIGN_CENTER_VERTICAL | wx.LEFT, border=10)

        hbox_lbl1.AddStretchSpacer()  # pushes the help button to the far right

        help_btn = wx.Button(panel, label="? HELP ?", size=(60, 25))
        help_btn.Bind(wx.EVT_BUTTON, self.ShowHelpText)
        hbox_lbl1.Add(help_btn, flag=wx.RIGHT, border=10)

        vbox.Add(hbox_lbl1, flag=wx.EXPAND | wx.TOP, border=10)

        self.listbox = wx.ListBox(panel, style=wx.LB_EXTENDED)
        self.listbox.Bind(wx.EVT_RIGHT_DOWN, self.OnRightClick)
        self.listbox.Bind(wx.EVT_LISTBOX, self.ShowText)
        vbox.Add(self.listbox, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        self.popup_menu = wx.Menu()
        self.popup_menu.Append(1, "Remove selected")
        self.popup_menu.Append(2, "Open in default PDF app")
        self.popup_menu.Append(3, "Copy File Location")
        self.popup_menu.Append(4, "Open File Location")
        self.Bind(wx.EVT_MENU, self.RemoveFile, id=1)
        self.Bind(wx.EVT_MENU, self.OpenPDF, id=2)
        self.Bind(wx.EVT_MENU, self.CopyFileLocation, id=3)
        self.Bind(wx.EVT_MENU, self.OpenFileLocation, id=4)

        btn_panel = wx.Panel(panel)
        btn_sizer = wx.BoxSizer(wx.HORIZONTAL)
        for label, handler in [
            ("Add Folder", self.AddFolder),
            ("Select Files", self.AddFile),
            ("Remove Selected", self.RemoveFile),
            ("Remove All", self.RemoveAll),
            ("Stop Parser", self.StopParser),
            ("Start Parser", self.StartParser),
        ]:
            btn = wx.Button(btn_panel, label=label)
            btn.Bind(wx.EVT_BUTTON, handler)
            if label == "Start Parser":
                self.start_btn = btn  # kept so it can be disabled while a run is active
            btn_sizer.Add(btn, proportion=1, flag=wx.ALL, border=5)
        btn_panel.SetSizer(btn_sizer)
        vbox.Add(btn_panel, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        lbl2 = wx.StaticText(panel, label="Text Frame: (choose PDF to see converted text)")
        vbox.Add(lbl2, flag=wx.LEFT, border=10)

        self.text_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        self.ShowHelpText(None)
        vbox.Add(self.text_ctrl, proportion=1, flag=wx.EXPAND | wx.LEFT | wx.RIGHT, border=10)

        # Status display
        stat_grid = wx.FlexGridSizer(1, 5, 5, 55)
        self.lbl_processed_pages = wx.StaticText(panel, label="Processed pages: 0")
        self.lbl_total_pages = wx.StaticText(panel, label="Total pages: 0")
        self.lbl_pages_per_sec = wx.StaticText(panel, label="Pages/sec: 0")
        self.lbl_est_time = wx.StaticText(panel, label="Estimated time (min): 0.0")
        self.lbl_elapsed_time = wx.StaticText(panel, label="Elapsed time: 0.0")

        for lbl in [self.lbl_processed_pages, self.lbl_total_pages, self.lbl_pages_per_sec, self.lbl_est_time, self.lbl_elapsed_time]:
            stat_grid.Add(lbl)
        vbox.Add(stat_grid, flag=wx.LEFT | wx.TOP, border=10)

        self.prog_ctrl = wx.TextCtrl(panel, style=wx.TE_MULTILINE | wx.TE_READONLY)
        vbox.Add(self.prog_ctrl, proportion=1, flag=wx.EXPAND | wx.ALL, border=10)

        panel.SetSizer(vbox)

    def ShowHelpText(self, event):
        """Show the built-in help text in the text frame."""
        help_text = (
            " This is a small help\n\n"
            " • PRE ALPHA version (for ever) •\n"
            "• The generated TXT file has the same name as the PDF file\n"
            "• The TXT file is created in the same directory as the PDF\n"
            "• Older TXT files will be overwritten without prompting\n"
            "• When selecting a folder, subfolders are also selected\n"
            "If:\n"
            "[INFO] File completed: TEST.pdf (X pages)!\n"
            "[INFO] Processing completed\n"
            "-> This only means that all pages have been processed; it does not mean that the quality is good.\n"
            "• An attempt is made to reproduce the layout of the page in columns from left to right and in blocks from top to bottom\n"
            "• An attempt is made to detect regular tables with lines; headers (top or top and left) are assigned to the cells and stored in JSON format in the text file\n"
            "\n"
            "Stop function becomes effective only after the currently processed file\n"
            "When processing large amounts of data, the following should be noted:\n"
            "First, all PDFs are opened once to determine the number of pages:\n"
            "Then, all small PDFs are processed in parallel:\n"
            "Then, each large PDF is processed page by page in parallel:\n"
        )
        self.text_ctrl.SetValue(help_text)

    def AddFolder(self, event):
        """Add every PDF found below a chosen folder (subfolders included)."""
        # Context manager (as in AddFile) so the dialog is destroyed even if
        # walking the directory raises.
        with wx.DirDialog(self, "Select Folder") as dlg:
            if dlg.ShowModal() != wx.ID_OK:
                return
            for root, _, files in os.walk(dlg.GetPath()):
                for f in files:
                    if f.lower().endswith(".pdf"):
                        path = os.path.normpath(os.path.join(root, f))
                        if path not in self.files:
                            self.files.append(path)
                            self.listbox.Append(path)

    def AddFile(self, event):
        """Add individually selected PDF files."""
        with wx.FileDialog(self, "Select PDF Files", wildcard="PDF files (*.pdf)|*.pdf",
                           style=wx.FD_OPEN | wx.FD_MULTIPLE) as dlg:
            if dlg.ShowModal() == wx.ID_OK:
                for path in dlg.GetPaths():
                    if path not in self.files:
                        self.files.append(path)
                        self.listbox.Append(path)

    def RemoveFile(self, event):
        """Remove the selected entries from the list."""
        # Delete from the end so the remaining indices stay valid.
        for i in reversed(self.listbox.GetSelections()):
            self.listbox.Delete(i)
            del self.files[i]
        self.text_ctrl.Clear()

    def RemoveAll(self, event):
        """Remove every entry from the list."""
        self.listbox.Clear()
        self.files.clear()
        self.text_ctrl.Clear()

    def OpenPDF(self, event):
        """Open the first selected PDF with the platform's default application."""
        i = self.listbox.GetSelections()
        if i:
            path = self.files[i[0]]
            if platform.system() == "Windows":
                os.startfile(path)
            elif platform.system() == "Darwin":
                subprocess.call(["open", path])
            else:
                subprocess.call(["xdg-open", path])

    def CopyFileLocation(self, event):
        """Copy the path of the first selected PDF to the clipboard."""
        sel = self.listbox.GetSelections()
        if sel:
            path = self.files[sel[0]]
            if wx.TheClipboard.Open():
                wx.TheClipboard.SetData(wx.TextDataObject(path))
                wx.TheClipboard.Close()

    def OpenFileLocation(self, event):
        """Open the folder of the first selected PDF in the file manager."""
        sel = self.listbox.GetSelections()
        if sel:
            folder = os.path.dirname(self.files[sel[0]])
            if platform.system() == "Windows":
                # Argument list instead of a hand-quoted command string, so
                # quoting is handled by subprocess like in the other branches.
                subprocess.Popen(["explorer", folder])
            elif platform.system() == "Darwin":
                subprocess.call(["open", folder])
            else:
                subprocess.call(["xdg-open", folder])

    def OnRightClick(self, event):
        """Show the context menu when at least one entry is selected."""
        if self.listbox.GetSelections():
            self.PopupMenu(self.popup_menu, event.GetPosition())

    def StartParser(self, event):
        """Count pages and convert all listed PDFs in a background thread."""
        if not self.files:
            wx.MessageBox("Please select files first.", "Hinweis", wx.OK | wx.ICON_INFORMATION)
            return

        self.start_btn.Disable()
        self.stop_flag.clear()
        self.prog_ctrl.Clear()

        # Snapshot: edits to the list during a run must not affect the worker.
        files = list(self.files)

        def error_callback(msg):
            wx.CallAfter(self.AppendProg, msg)

        def update_total_pages_live(new_total):
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {new_total}")

        def gui_progress_callback(status):
            wx.CallAfter(self.lbl_processed_pages.SetLabel, f"Processed pages: {status['processed_pages']}")
            wx.CallAfter(self.lbl_total_pages.SetLabel, f"Total pages: {status['total_pages']}")
            wx.CallAfter(self.lbl_pages_per_sec.SetLabel, f"Pages/sec: {status['pages_per_sec']:}")
            wx.CallAfter(self.lbl_est_time.SetLabel, f"Estimated time (min): {status['est_time']:}")
            wx.CallAfter(self.lbl_elapsed_time.SetLabel, f"Elapsed time: {status['elapsed_time']}")

        throttled_gui_callback = throttle_callback(gui_progress_callback, 100)

        def background():
            try:
                # Page counting runs here rather than in the event handler:
                # on the GUI thread it freezes the window and the queued
                # wx.CallAfter label updates cannot be shown until it ends.
                page_info, total_pages = get_total_pages(
                    files,
                    error_callback=error_callback,
                    progress_callback=update_total_pages_live,
                )

                if total_pages == 0:
                    wx.CallAfter(self.AppendProg, "[INFO] No pages found.\n")
                    return

                tracker = StatusTracker(total_pages)

                small = [p for p in page_info if p[1] <= PARALLEL_THRESHOLD]
                large = [p for p in page_info if p[1] > PARALLEL_THRESHOLD]

                # Process each small file in its own worker process
                if small:
                    max_workers = max(1, min(len(small), get_physical_cores()))
                    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
                        futures = {}
                        for path, count in small:
                            if self.stop_flag.is_set():
                                break
                            future = executor.submit(save_pdf, path, count, None, False, None)
                            futures[future] = (path, count)

                        for future in concurrent.futures.as_completed(futures):
                            if self.stop_flag.is_set():
                                break
                            path, count = futures[future]
                            try:
                                pages_processed = future.result()
                                tracker.update(pages_processed)
                                throttled_gui_callback(tracker.get_status())
                                wx.CallAfter(self.AppendProg, f"[INFO] File ready: {path} ({pages_processed} Seiten)\n")
                            except Exception as e:
                                wx.CallAfter(self.AppendProg, f"[ERROR] File {path}: {str(e)}\n")

                # Process large files one at a time, page by page in parallel
                for path, count in large:
                    if self.stop_flag.is_set():
                        break

                    try:
                        pages_processed = save_pdf(
                            path,
                            count,
                            tracker,
                            parallel=True,
                            progress_callback=throttled_gui_callback,
                            stop_flag=self.stop_flag,
                        )
                        if pages_processed:
                            wx.CallAfter(
                                self.AppendProg,
                                f"[INFO] File ready: {path} ({pages_processed} Seiten)\n"
                            )
                        else:
                            wx.CallAfter(
                                self.AppendProg,
                                f"[INFO] Stopped: {path}\n"
                            )
                    except Exception as e:
                        wx.CallAfter(
                            self.AppendProg,
                            f"[ERROR] File {path}: {str(e)}\n"
                        )

                # The throttle may have dropped the last update; push the
                # final numbers unthrottled so the labels are not stale.
                gui_progress_callback(tracker.get_status())
                wx.CallAfter(self.AppendProg, "\n[INFO] Processing completed.\n")
            except Exception as e:
                # Anything not handled per file (e.g. a pool that cannot start).
                wx.CallAfter(self.AppendProg, f"[ERROR] {type(e).__name__}: {e}\n")
            finally:
                # Always re-enable the button, otherwise an unexpected error
                # would leave the window unable to start another run.
                wx.CallAfter(self.start_btn.Enable)
                self.stop_flag.clear()

        threading.Thread(target=background, daemon=True).start()

    def StopParser(self, event):
        """Ask the background worker to stop."""
        self.stop_flag.set()
        self.AppendProg("[INFO] Processing Stopped...\n")

    def ShowText(self, event):
        """Show the converted text of the first selected PDF, if it exists."""
        sel = self.listbox.GetSelections()
        if not sel:
            return
        txt_path = os.path.splitext(self.files[sel[0]])[0] + ".txt"
        self.text_ctrl.Clear()
        if os.path.exists(txt_path):
            with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                self.text_ctrl.SetValue(f.read())
        else:
            self.text_ctrl.SetValue("[No .txt file found]")

    def AppendProg(self, text):
        """Append *text* to the progress log."""
        self.prog_ctrl.AppendText(text)
803
-
804
-
805
- # -------------------- Einstiegspunkt --------------------
806
def main():
    """Entry point: convert the PDFs given on the command line, else open the GUI."""
    cli_paths = sys.argv[1:]
    if not cli_paths:
        app = wx.App(False)
        frame = FileManager(None)
        frame.Show()
        app.MainLoop()
        return

    page_info, total_pages = get_total_pages(cli_paths)
    tracker = StatusTracker(total_pages)

    def cli_callback(status):
        # One JSON object per status report on stdout.
        print(json.dumps(status))

    for pdf_path, page_count in page_info:
        use_parallel = page_count > PARALLEL_THRESHOLD
        save_pdf(pdf_path, page_count, tracker, parallel=use_parallel, progress_callback=cli_callback)


if __name__ == "__main__":
    multiprocessing.freeze_support()
    main()