File size: 23,936 Bytes
3829982
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
import os  # OS module for interacting with the operating system (file management, etc.)
import sys  # Provides access to system-specific parameters and functions
import tkinter as tk  # GUI module for creating desktop applications
from tkinter import filedialog, messagebox  # Additional tkinter components for file dialogs and message boxes
import subprocess  # Module to run system commands
import threading  # Threading module to run tasks concurrently
import tempfile  # Module to create temporary files and directories
import shutil  # Module for file operations like copy, move, and delete
import json  # JSON module for working with JSON data
import logging  # Logging module for tracking events and errors
import pdfplumber  # Library for extracting text and tables from PDFs
from pdfplumber.utils import get_bbox_overlap, obj_to_bbox  # Helper functions from pdfplumber for working with bounding boxes
from pdfplumber.utils.exceptions import PdfminerException  # Exception related to PDF processing
from joblib import delayed, cpu_count, parallel_backend, Parallel  # Joblib for parallel processing and optimization
import multiprocessing  # Module for parallel processing using multiple CPU cores
from multiprocessing import Pool  # Pool class for parallelizing tasks across multiple processes


# ========================
# Parser Configuration
# ========================

# Keyword arguments forwarded unchanged to page.extract_words() for every page.
TEXT_EXTRACTION_SETTINGS = dict(
    x_tolerance=1,  # Horizontal tolerance for text extraction
    y_tolerance=3,  # Vertical tolerance for text extraction
    keep_blank_chars=False,  # Do not retain blank characters in the extracted text
    use_text_flow=True,  # Use text flow when extracting words
)

# On Windows, send everything written to stderr to the null device so the
# console is not cluttered.
if sys.platform == "win32":
    sys.stderr = open(os.devnull, 'w')

# Page-count cut-off: documents with at most this many pages are handled
# serially, longer ones go through the parallel path.
PARALLEL_THRESHOLD = 16

# Function to suppress PDFMiner logging, reducing verbosity
def suppress_pdfminer_logging():
    """Set the pdfminer loggers to ERROR so lower-severity records are dropped.

    Covers the top-level ``pdfminer`` logger and the listed sub-module loggers.
    """
    submodules = (
        "pdfparser",
        "pdfdocument",
        "pdfpage",
        "converter",
        "layout",
        "cmapdb",
        "utils",
    )
    noisy_loggers = ["pdfminer"] + [f"pdfminer.{sub}" for sub in submodules]
    for noisy in noisy_loggers:
        logging.getLogger(noisy).setLevel(logging.ERROR)

# Function to clean up text by removing unwanted hyphenations and newlines
def clean_cell_text(text):
    """Normalise a table cell's text to a single whitespace-collapsed line.

    A hyphen directly followed by a newline is removed (re-joining a word that
    was split across lines), remaining newlines become spaces, and every run
    of whitespace collapses to one space. Non-string input yields "".
    """
    if isinstance(text, str):
        rejoined = text.replace("-\n", "")
        single_line = rejoined.replace("\n", " ")
        tokens = single_line.split()
        return " ".join(tokens)
    return ""

# Function to safely clean and join row cell data
def safe_join(row):
    """Return one cleaned string per cell of ``row``.

    ``None`` cells become ""; every other cell is converted with ``str`` and
    passed through ``clean_cell_text``.
    """
    cleaned_cells = []
    for cell in row:
        if cell is None:
            cleaned_cells.append("")
        else:
            cleaned_cells.append(clean_cell_text(str(cell)))
    return cleaned_cells

# Function to clamp bounding box coordinates within page boundaries
def clamp_bbox(bbox, page_width, page_height):
    """Clamp a ``(x0, top, x1, bottom)`` box to the page rectangle.

    Horizontal coordinates are limited to ``[0, page_width]`` and vertical
    coordinates to ``[0, page_height]``. Returns the adjusted box as a tuple.
    """
    left, upper, right, lower = bbox

    def _clip(value, limit):
        # Cap at the page limit first, then floor at zero.
        return max(0, min(value, limit))

    return (
        _clip(left, page_width),
        _clip(upper, page_height),
        _clip(right, page_width),
        _clip(lower, page_height),
    )

# Function to process a single PDF page
def _table_to_json(table_data):
    """Serialise extracted table rows as a JSON array of header -> cell objects.

    The first row supplies the keys and each following row becomes one object.
    Returns None when ``table_data`` is empty or None.
    """
    if not table_data:
        return None
    headers = safe_join(table_data[0])
    records = [dict(zip(headers, safe_join(row))) for row in table_data[1:]]
    return json.dumps(records, indent=1, ensure_ascii=False)


def _group_words_into_lines(words, line_gap=10):
    """Join word texts into newline-terminated lines.

    A new line starts whenever a word's ``top`` differs by more than
    ``line_gap`` from the ``top`` of the word that opened the current line.
    """
    lines = []
    line_top = None
    for word in words:
        if line_top is None or abs(word['top'] - line_top) > line_gap:
            lines.append([word['text']])
            line_top = word['top']
        else:
            lines[-1].append(word['text'])
    return "".join(" ".join(parts) + "\n" for parts in lines)


def process_page(args):
    """Extract the text and tables of a single PDF page.

    Args:
        args: Tuple ``(page_number, pdf_path, text_settings)`` where
            ``page_number`` is the zero-based page index and ``text_settings``
            holds the keyword arguments for ``page.extract_words``.

    Returns:
        Tuple ``(page_number, text)``. ``text`` starts with a "Page N" header,
        followed by the words lying outside every table bounding box grouped
        into lines, then one ``"table i":`` JSON section per non-empty table.
        If anything raises, ``text`` is an "[ERROR] ..." message instead.
    """
    suppress_pdfminer_logging()  # Runs in each worker process as well
    try:
        page_number, pdf_path, text_settings = args
        with pdfplumber.open(pdf_path) as pdf:
            page = pdf.pages[page_number]
            width, height = page.width, page.height

            table_bboxes = []  # Clamped bounding box of every detected table
            table_json_outputs = []  # JSON text of every table that has content

            for table in page.find_tables():
                bbox = clamp_bbox(table.bbox, width, height)
                # Recorded even for empty tables so that stray words inside
                # the box are still kept out of the running text.
                table_bboxes.append(bbox)

                if not page.crop(bbox).chars:  # Skip tables without characters
                    continue

                table_json = _table_to_json(table.extract())
                if table_json is not None:
                    table_json_outputs.append(table_json)

            # Keep only words whose top-left corner lies outside every table box.
            words_outside_tables = [
                word for word in page.extract_words(**text_settings)
                if not any(
                    bbox[0] <= float(word['x0']) <= bbox[2] and
                    bbox[1] <= float(word['top']) <= bbox[3]
                    for bbox in table_bboxes
                )
            ]

            text_content = _group_words_into_lines(words_outside_tables)

            parts = [f"Page {page_number + 1}\n", text_content.strip() + "\n"]
            parts.extend(
                f'"table {idx}":\n{table_json}\n'
                for idx, table_json in enumerate(table_json_outputs, start=1)
            )
            return page_number, "".join(parts)

    except Exception as e:
        # Worker boundary: report the failure as this page's text so that one
        # bad page does not abort the whole document.
        return args[0], f"[ERROR] Page {args[0]+1} ({args[1]}): {str(e)}"

# Function to process the entire PDF document
def process_pdf(pdf_path):
    """Parse one PDF and write the result to a ``.txt`` file next to it.

    Pages are processed serially when the document has at most
    ``PARALLEL_THRESHOLD`` pages and through ``run_parallel`` otherwise. The
    per-page results are sorted by page number, joined with newlines, and
    written as UTF-8 to ``<pdf directory>/<pdf base name>.txt``.

    Args:
        pdf_path: Path of the PDF file to parse.

    Returns:
        None on success. Otherwise a string starting with "[ERROR]" or
        "[INFO]" that explains why no output file was written.
    """
    suppress_pdfminer_logging()
    try:
        if not os.path.exists(pdf_path):
            return f"[ERROR] File not found: {pdf_path}"

        print(f"[INFO] Starting processing: {pdf_path}")
        try:
            # Opened here only to count pages; each page re-opens the file itself.
            with pdfplumber.open(pdf_path) as pdf:
                num_pages = len(pdf.pages)
        except PdfminerException as e:
            # Separator added so the path and the error text do not run together.
            return f"[ERROR] Cannot open PDF: {pdf_path}: {e}"
        except Exception as e:
            return f"[ERROR] General error opening PDF: {pdf_path}: {e}"

        pages = [(i, pdf_path, TEXT_EXTRACTION_SETTINGS) for i in range(num_pages)]

        try:
            if num_pages <= PARALLEL_THRESHOLD:
                results = run_serial(pages)
            else:
                results = run_parallel(pages)
        except (EOFError, BrokenPipeError, KeyboardInterrupt):
            return "[INFO] Processing was interrupted."

        # Sort by page number before joining the page texts.
        sorted_results = sorted(results, key=lambda x: x[0])
        final_output = "\n".join(text for _, text in sorted_results)

        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_dir = os.path.dirname(pdf_path)
        output_path = os.path.join(output_dir, f"{base_name}.txt")

        # errors="ignore" drops characters that cannot be encoded instead of failing.
        with open(output_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(final_output)

        print(f"[INFO] Processing complete: {output_path}")

    except (EOFError, BrokenPipeError, KeyboardInterrupt):
        return "[INFO] Processing interrupted by user."
    except Exception as e:
        return f"[ERROR] Unexpected error with '{pdf_path}': {str(e)}"

# Function to run the PDF processing serially (one page at a time)
def run_serial(pages):
    """Run ``process_page`` on every entry of ``pages`` in order, in this process.

    Returns the list of per-page results in input order.
    """
    page_results = []
    for page_args in pages:
        page_results.append(process_page(page_args))
    return page_results

# Function to run the PDF processing in parallel (across multiple cores)
def run_parallel(pages):
    """Run ``process_page`` over ``pages`` using a multiprocessing pool.

    The pool size is the CPU count minus two (but at least one), capped at the
    number of pages. Returns the list produced by ``pool.map``.
    """
    spare_cores = cpu_count() - 2  # Leave two cores to the rest of the system
    worker_count = min(max(1, spare_cores), len(pages))
    print(f"Starting parallel processing with {worker_count} cores...")
    pool = Pool(processes=worker_count)
    with pool:
        page_results = pool.map(process_page, pages)
    return page_results

# Main function to process a list of PDFs
def process_pdfs_main():
    """Command-line entry point: parse every PDF path given in ``sys.argv[1:]``.

    Files are first sorted by page count. PDFs with at most
    ``PARALLEL_THRESHOLD`` pages are processed concurrently, one joblib job per
    file (phase 1). Larger PDFs are processed one after another (phase 2).
    Missing or unopenable files are reported and skipped.

    ``process_pdf`` returns None on success and a message otherwise, so only
    actual messages are printed.
    """
    suppress_pdfminer_logging()
    pdf_files = sys.argv[1:]
    if not pdf_files:
        print("No PDF files provided.")
        return

    small_pdfs = []  # At most PARALLEL_THRESHOLD pages
    large_pdfs = []  # More than PARALLEL_THRESHOLD pages

    for path in pdf_files:
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        try:
            with pdfplumber.open(path) as pdf:
                if len(pdf.pages) <= PARALLEL_THRESHOLD:
                    small_pdfs.append(path)
                else:
                    large_pdfs.append(path)
        except PdfminerException:
            # NOTE(review): the message assumes password protection; this
            # exception may also cover other pdfminer failures — confirm.
            print(f"[ERROR] Password-protected PDF skipped: {path}")
        except Exception as e:
            print(f"[ERROR] Error opening {path}: {str(e)}")

    # Phase 1: small PDFs, several files at once.
    if small_pdfs:
        available_cores = max(1, cpu_count() - 2)
        num_cores = min(available_cores, len(small_pdfs))
        print(f"\n[Phase 1] Starting parallel processing of small PDFs with {num_cores} cores...")
        results = Parallel(n_jobs=num_cores)(
            delayed(process_pdf)(path) for path in small_pdfs
        )
        for message in results:
            if message is not None:  # Success returns None; do not print "None"
                print(message)

    # Phase 2: large PDFs, one file at a time.
    for path in large_pdfs:
        print(f"\n[Phase 2] Processing large PDF: {os.path.basename(path)}")
        message = process_pdf(path)
        if message is not None:
            print(message)


# GUI

class FileManager:
    """Tkinter window for selecting PDFs, running the parser and viewing results.

    The window holds a list of selected PDF paths, buttons to add and remove
    entries, Start/Stop buttons for the parser, a viewer showing the ``.txt``
    file that belongs to the selected PDF, and a read-only progress pane fed
    by the parser's output. The parser itself runs as a child process of this
    same script, started from a background thread.
    """

    def __init__(self, master):
        """Build all widgets inside ``master`` and initialise the state."""
        # Initialize the main window and title
        self.master = master
        self.master.title("Parser-Sevenof9")

        # Internal list to track selected PDF files (parallel to the listbox rows)
        self.files = []
        self.last_selected_index = None  # Anchor index for shift-selection

        # Label for file list
        self.label = tk.Label(master, text="Selected PDF files:")
        self.label.pack(pady=5)

        # Frame to contain the listbox and its scrollbar
        listbox_frame = tk.Frame(master)
        listbox_frame.pack(pady=5)

        # Scrollbar for the listbox
        scrollbar_listbox = tk.Scrollbar(listbox_frame)
        self.listbox = tk.Listbox(
            listbox_frame, selectmode=tk.MULTIPLE, width=80, height=6,
            yscrollcommand=scrollbar_listbox.set
        )
        scrollbar_listbox.config(command=self.listbox.yview)

        # Pack listbox and scrollbar side by side
        self.listbox.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_listbox.pack(side=tk.RIGHT, fill=tk.Y)

        # Bind selection and click events for the listbox
        self.listbox.bind("<<ListboxSelect>>", self.show_text_file)
        self.listbox.bind("<Button-1>", self.on_listbox_click)
        self.listbox.bind("<Shift-Button-1>", self.on_listbox_shift_click)

        # Create a context menu for right-click actions
        self.context_menu = tk.Menu(master, tearoff=0)
        self.context_menu.add_command(label="Remove selected", command=self.remove_file)
        self.listbox.bind("<Button-3>", self.show_context_menu)

        # Frame for action buttons (Add/Remove)
        self.frame = tk.Frame(master)
        self.frame.pack(pady=10)

        # Action buttons
        tk.Button(self.frame, text="Add Folder", command=self.add_folder).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Select Files", command=self.add_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove Selected", command=self.remove_file).pack(side=tk.LEFT, padx=5)
        tk.Button(self.frame, text="Remove All", command=self.remove_all).pack(side=tk.LEFT, padx=5)
        tk.Button(master, text="Stop", command=self.stop_parser).pack(pady=5)

        # Running parser child process (None while idle) and the thread driving it
        self.parser_process = None
        self.parser_thread = None

        # Start button for parsing process
        tk.Button(master, text="Start Parser", command=self.start_parser).pack(pady=10)

        # Text frame to display the contents of the selected .txt file
        text_frame = tk.Frame(master)
        text_frame.pack(padx=10, pady=5)

        scrollbar_text = tk.Scrollbar(text_frame)
        self.text_widget = tk.Text(
            text_frame, height=15, width=100, wrap=tk.WORD,
            yscrollcommand=scrollbar_text.set
        )
        scrollbar_text.config(command=self.text_widget.yview)

        # Pack text viewer and scrollbar
        self.text_widget.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_text.pack(side=tk.RIGHT, fill=tk.Y)

        # Label for progress section
        tk.Label(master, text="Progress:").pack()

        # Frame for progress output
        progress_frame = tk.Frame(master)
        progress_frame.pack(padx=10, pady=5)

        scrollbar_progress = tk.Scrollbar(progress_frame)
        self.progress_text = tk.Text(
            progress_frame, height=8, width=100, state=tk.DISABLED,
            yscrollcommand=scrollbar_progress.set
        )
        scrollbar_progress.config(command=self.progress_text.yview)

        self.progress_text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar_progress.pack(side=tk.RIGHT, fill=tk.Y)

    def on_listbox_click(self, event):
        """Select only the clicked row, remember it as anchor and show its text."""
        index = self.listbox.nearest(event.y)
        self.listbox.selection_clear(0, tk.END)
        self.listbox.selection_set(index)
        self.last_selected_index = index
        self.show_text_file(None)
        return "break"  # Stop the listbox's default click handling

    def on_listbox_shift_click(self, event):
        """Select the range between the anchor row and the shift-clicked row."""
        index = self.listbox.nearest(event.y)
        if self.last_selected_index is None:
            self.last_selected_index = index
        start, end = sorted((self.last_selected_index, index))
        self.listbox.selection_clear(0, tk.END)
        for i in range(start, end + 1):
            self.listbox.selection_set(i)
        return "break"

    def show_context_menu(self, event):
        """Open the right-click menu, but only when something is selected."""
        if self.listbox.curselection():
            self.context_menu.tk_popup(event.x_root, event.y_root)

    def add_folder(self):
        """Ask for a folder and add every ``.pdf`` found in it, recursively."""
        folder = filedialog.askdirectory(title="Select Folder")
        if not folder:
            return
        for root, _, files in os.walk(folder):
            for file in files:
                if file.lower().endswith(".pdf"):
                    path = os.path.join(root, file)
                    if path not in self.files:  # Skip paths already listed
                        self.files.append(path)
                        self.listbox.insert(tk.END, path)

    def add_file(self):
        """Ask for individual PDF files and add the ones not yet listed."""
        paths = filedialog.askopenfilenames(title="Select PDF Files", filetypes=[("PDF Files", "*.pdf")])
        for path in paths:
            if path not in self.files:
                self.files.append(path)
                self.listbox.insert(tk.END, path)

    def remove_file(self):
        """Remove the selected rows from the listbox and from ``self.files``."""
        selection = self.listbox.curselection()
        if not selection:
            messagebox.showwarning("Notice", "Please select an entry to remove.")
            return
        for index in reversed(selection):  # Back to front so indices stay valid
            self.listbox.delete(index)
            del self.files[index]
        self.text_widget.delete(1.0, tk.END)

    def remove_all(self):
        """Clear the listbox, the internal file list and the text viewer."""
        self.listbox.delete(0, tk.END)
        self.files.clear()
        self.text_widget.delete(1.0, tk.END)

    def start_parser(self):
        """Start the parser in a background thread unless one is still running."""
        if not self.files:
            messagebox.showinfo("No Files", "Please select at least one file.")
            return
        # A second run would overwrite self.parser_process, after which Stop
        # could no longer reach the first child process.
        if self.parser_thread is not None and self.parser_thread.is_alive():
            messagebox.showinfo("Parser Running", "The parser is already running.")
            return
        self.progress_text.config(state=tk.NORMAL)
        self.progress_text.delete(1.0, tk.END)
        self.progress_text.insert(tk.END, "Starting parser...\n")
        self.progress_text.config(state=tk.DISABLED)

        # Launch parsing in background to avoid UI freeze
        self.parser_thread = threading.Thread(target=self.run_parser)
        self.parser_thread.start()

    def stop_parser(self):
        """Terminate the parser child process if one is currently running."""
        # Read the attribute once: the worker thread resets it to None when the
        # process ends, which could otherwise happen between check and use.
        proc = self.parser_process
        if proc is not None and proc.poll() is None:
            proc.terminate()
            self.append_progress_text("Parser process was stopped.\n")
        else:
            self.append_progress_text("No active parser process to stop.\n")

    def run_parser(self):
        """Run this script as a child process on ``self.files`` and relay its output.

        Intended to run in the background thread created by ``start_parser``.
        Each output line of the child (stdout and stderr combined) is appended
        to the progress pane; a dialog reports the outcome at the end.
        """
        try:
            proc = subprocess.Popen(
                [sys.executable, __file__] + self.files,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                encoding='utf-8',
                errors='ignore',
                bufsize=4096
            )
            self.parser_process = proc
            for line in proc.stdout:
                self.append_progress_text(line)
            proc.stdout.close()
            proc.wait()

            if proc.returncode == 0:
                self.append_progress_text("\nParser finished successfully.\n")
                self.show_messagebox_threadsafe("Parser Done", "The parser was executed successfully.")
            else:
                self.append_progress_text("\nError while running the parser.\n")
                self.show_messagebox_threadsafe("Error", "Error while running the parser.", kind="error")
        except Exception as e:
            self.append_progress_text(f"Error: {e}\n")
            self.show_messagebox_threadsafe("Error", f"Error during execution:\n{e}", kind="error")
        finally:
            self.parser_process = None

    def append_progress_text(self, text):
        """Schedule ``text`` to be appended to the progress pane by the Tk event loop."""
        # NOTE(review): relies on ``after`` being callable from the worker
        # thread — confirm for the targeted Tcl/Tk build.
        self.progress_text.after(0, lambda: self._insert_text(text))

    def _insert_text(self, text):
        """Append ``text`` to the read-only progress pane and scroll to the end."""
        self.progress_text.config(state=tk.NORMAL)  # Temporarily writable
        self.progress_text.insert(tk.END, text)
        self.progress_text.see(tk.END)
        self.progress_text.config(state=tk.DISABLED)

    def show_messagebox_threadsafe(self, title, message, kind="info"):
        """Schedule a message box to be shown by the Tk event loop.

        Args:
            title: Dialog title.
            message: Dialog text.
            kind: "error" shows an error dialog; anything else an info dialog.
        """
        show = messagebox.showerror if kind == "error" else messagebox.showinfo
        self.master.after(0, lambda: show(title, message))

    def show_text_file(self, event):
        """Show the ``.txt`` file that sits next to the first selected PDF.

        The viewer is cleared first. If the text file is missing or cannot be
        read, a short notice is shown instead. ``event`` is unused.
        """
        selection = self.listbox.curselection()
        if not selection:
            return
        index = selection[0]
        path = self.files[index]
        txt_path = os.path.splitext(path)[0] + ".txt"
        self.text_widget.delete(1.0, tk.END)
        if os.path.exists(txt_path):
            try:
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    self.text_widget.insert(tk.END, f.read())
            except Exception as e:
                self.text_widget.insert(tk.END, f"Error loading text file:\n{e}")
        else:
            self.text_widget.insert(tk.END, "[No corresponding .txt file found]")

# MAIN

if __name__ == "__main__":
    # Must run first: lets a frozen Windows executable spawn multiprocessing workers
    multiprocessing.freeze_support()

    if len(sys.argv) <= 1:
        # No file arguments: open the GUI
        root = tk.Tk()
        app = FileManager(root)
        root.mainloop()
    else:
        # File arguments present (e.g. when started by the GUI as a subprocess):
        # run the parser on them
        process_pdfs_main()