Spaces:

drewThomasson
/

PDF-to-TXT-OCR

Running

App Files Community

drewThomasson committed on Aug 7

Commit

52020a7

verified ·

1 Parent(s): be572f8

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -73

app.py CHANGED Viewed

@@ -3,80 +3,38 @@ import pytesseract
 from pdf2image import convert_from_path
 import tempfile
 import os
-def ocr_pdf(pdf_file):
-    """
-    Performs OCR on a given PDF file.
-    Args:
-        pdf_file: An uploaded file object from Gradio.
-    Returns:
-        A tuple containing:
-        - The path to the generated .txt file.
-        - The extracted text content as a string.
-    """
-    if pdf_file is None:
-        return None, "Please upload a PDF file first."
-    file_path = pdf_file.name
-    try:
-        # Use a temporary directory that is automatically cleaned up
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # Convert PDF pages to images
-            try:
-                images = convert_from_path(file_path, output_folder=temp_dir)
-            except Exception as e:
-                error_message = (
-                    "Failed to convert PDF. Please ensure Poppler is installed and in your system's PATH.\n"
-                    f"Details: {e}"
-                )
-                return None, error_message
-            # Extract text from each page image
-            full_extracted_text = ""
-            for i, image in enumerate(images):
-                text = pytesseract.image_to_string(image, lang='eng')
-                full_extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
-            if not full_extracted_text.strip():
-                full_extracted_text = "No text could be extracted. The PDF might contain only images without text or be empty."
-            # Create a temporary file for the extracted text. Gradio will handle serving it.
-            # We use a NamedTemporaryFile to ensure it has a path and is cleaned up.
-            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt", encoding="utf-8") as f:
-                f.write(full_extracted_text)
-                output_txt_path = f.name
-            # Return both the file path for download and the text for display
-            return output_txt_path, full_extracted_text
-    except Exception as e:
-        return None, f"An unexpected error occurred: {e}"
-# --- Gradio Interface Definition ---
-# We define the user interface for our application.
 iface = gr.Interface(
-    fn=ocr_pdf,
-    inputs=gr.File(label="Upload PDF File", type="file"),
-    outputs=[
-        gr.File(label="Download Extracted Text (.txt)"),
-        gr.Textbox(label="Extracted Text Content", lines=20, interactive=False)
-    ],
-    title="📄 PDF to Text Extractor (OCR)",
-    description=(
-        "Upload a PDF file to extract its text. The content will be displayed below, "
-        "and you'll get a link to download it as a .txt file. "
-        "**Note:** This tool relies on OCR and may not be 100% accurate, especially with complex layouts or poor quality scans."
-    ),
-    allow_flagging="never",
-    examples=[
-        # You can place paths to example PDFs on the server here if you have any.
-        # ["path/to/your/example.pdf"]
-    ]
 )
-# --- Launch the Application ---
 if __name__ == "__main__":
-    iface.launch()

 from pdf2image import convert_from_path
 import tempfile
 import os
+import shutil
+def ocr_pdf(file_path):
+    # Temporary directory for processing
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Convert PDF to images
+        images = convert_from_path(file_path, output_folder=temp_dir)
+        # Extract text from each page image
+        extracted_text = ""
+        for i, image in enumerate(images):
+            text = pytesseract.image_to_string(image)
+            extracted_text += f"\n{text}\n\n"
+        # Save the extracted text to a .txt file in a persistent location
+        output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
+        with open(output_txt_path, "w") as f:
+            f.write(extracted_text)
+        # Create a persistent file to serve for download
+        final_output_path = "/tmp/extracted_text.txt"
+        shutil.copy(output_txt_path, final_output_path)  # Copy to a persistent location
+        return final_output_path
+# Gradio Interface
 iface = gr.Interface(
+    fn=lambda file: ocr_pdf(file.name),  # Pass file path instead of file object
+    inputs=gr.File(label="Upload PDF File"),
+    outputs=gr.File(label="Download Extracted Text (.txt)"),  # Outputs a downloadable .txt file
+    title="PDF to Text OCR"
 )
 if __name__ == "__main__":
+    iface.launch()