Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

App Files Files Community

drewThomasson committed on Aug 7

Commit

be572f8

verified ·

1 Parent(s): 004ca9f

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -33

app.py CHANGED Viewed

@@ -3,43 +3,80 @@ import pytesseract
 from pdf2image import convert_from_path
 import tempfile
 import os
-import shutil
-def ocr_pdf(file_path):
-    # Temporary directory for processing
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Convert PDF to images
-        images = convert_from_path(file_path, output_folder=temp_dir)
-        # Extract text from each page image
-        extracted_text = ""
-        for i, image in enumerate(images):
-            text = pytesseract.image_to_string(image)
-            extracted_text += f"\n{text}\n\n"
-        # Save the extracted text to a .txt file in a persistent location
-        output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
-        with open(output_txt_path, "w", encoding="utf-8") as f:
-            f.write(extracted_text)
-        # Create a persistent file to serve for download
-        final_output_path = "/tmp/extracted_text.txt"
-        shutil.copy(output_txt_path, final_output_path)  # Copy to a persistent location
-        # Return both: actual text and path (for download)
-        return extracted_text, final_output_path
-# Gradio Interface
 iface = gr.Interface(
-    fn=lambda file: ocr_pdf(file.name),  # Pass file path instead of file object
-    inputs=gr.File(label="Upload PDF File"),
     outputs=[
-        gr.Textbox(label="Extracted Text"),               # Shows text directly
-        gr.File(label="Download Extracted Text (.txt)")   # Optional download
     ],
-    title="PDF to Text OCR",
-    allow_flagging="never"
 )
 if __name__ == "__main__":
     iface.launch()

 from pdf2image import convert_from_path
 import tempfile
 import os
+def ocr_pdf(pdf_file):
+    """
+    Performs OCR on a given PDF file.
+    Args:
+        pdf_file: An uploaded file object from Gradio.
+    Returns:
+        A tuple containing:
+        - The path to the generated .txt file.
+        - The extracted text content as a string.
+    """
+    if pdf_file is None:
+        return None, "Please upload a PDF file first."
+    file_path = pdf_file.name
+    try:
+        # Use a temporary directory that is automatically cleaned up
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Convert PDF pages to images
+            try:
+                images = convert_from_path(file_path, output_folder=temp_dir)
+            except Exception as e:
+                error_message = (
+                    "Failed to convert PDF. Please ensure Poppler is installed and in your system's PATH.\n"
+                    f"Details: {e}"
+                )
+                return None, error_message
+            # Extract text from each page image
+            full_extracted_text = ""
+            for i, image in enumerate(images):
+                text = pytesseract.image_to_string(image, lang='eng')
+                full_extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
+            if not full_extracted_text.strip():
+                full_extracted_text = "No text could be extracted. The PDF might contain only images without text or be empty."
+            # Create a temporary file for the extracted text. Gradio will handle serving it.
+            # We use a NamedTemporaryFile to ensure it has a path and is cleaned up.
+            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt", encoding="utf-8") as f:
+                f.write(full_extracted_text)
+                output_txt_path = f.name
+            # Return both the file path for download and the text for display
+            return output_txt_path, full_extracted_text
+    except Exception as e:
+        return None, f"An unexpected error occurred: {e}"
+# --- Gradio Interface Definition ---
+# One PDF upload is passed to ocr_pdf; the outputs are a downloadable .txt file and a read-only text box, in that order.
 iface = gr.Interface(
+    fn=ocr_pdf,
+    inputs=gr.File(label="Upload PDF File", type="file"),
     outputs=[
+        gr.File(label="Download Extracted Text (.txt)"),
+        gr.Textbox(label="Extracted Text Content", lines=20, interactive=False)
     ],
+    title="📄 PDF to Text Extractor (OCR)",
+    description=(
+        "Upload a PDF file to extract its text. The content will be displayed below, "
+        "and you'll get a link to download it as a .txt file. "
+        "**Note:** This tool relies on OCR and may not be 100% accurate, especially with complex layouts or poor quality scans."
+    ),
+    allow_flagging="never",
+    examples=[
+        # Placeholder: list paths to example PDFs available on the server here, one inner list per example.
+        # ["path/to/your/example.pdf"]
+    ]
 )
+# --- Launch the Application (only when run as a script, not on import) ---
 if __name__ == "__main__":
     iface.launch()