drewThomasson committed on
Commit
be572f8
·
verified ·
1 Parent(s): 004ca9f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -33
app.py CHANGED
@@ -3,43 +3,80 @@ import pytesseract
3
  from pdf2image import convert_from_path
4
  import tempfile
5
  import os
6
- import shutil
7
-
8
- def ocr_pdf(file_path):
9
- # Temporary directory for processing
10
- with tempfile.TemporaryDirectory() as temp_dir:
11
- # Convert PDF to images
12
- images = convert_from_path(file_path, output_folder=temp_dir)
13
-
14
- # Extract text from each page image
15
- extracted_text = ""
16
- for i, image in enumerate(images):
17
- text = pytesseract.image_to_string(image)
18
- extracted_text += f"\n{text}\n\n"
19
-
20
- # Save the extracted text to a .txt file in a persistent location
21
- output_txt_path = os.path.join(temp_dir, "extracted_text.txt")
22
- with open(output_txt_path, "w", encoding="utf-8") as f:
23
- f.write(extracted_text)
24
-
25
- # Create a persistent file to serve for download
26
- final_output_path = "/tmp/extracted_text.txt"
27
- shutil.copy(output_txt_path, final_output_path) # Copy to a persistent location
28
-
29
- # Return both: actual text and path (for download)
30
- return extracted_text, final_output_path
31
-
32
- # Gradio Interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  iface = gr.Interface(
34
- fn=lambda file: ocr_pdf(file.name), # Pass file path instead of file object
35
- inputs=gr.File(label="Upload PDF File"),
36
  outputs=[
37
- gr.Textbox(label="Extracted Text"), # Shows text directly
38
- gr.File(label="Download Extracted Text (.txt)") # Optional download
39
  ],
40
- title="PDF to Text OCR",
41
- allow_flagging="never"
 
 
 
 
 
 
 
 
 
42
  )
43
 
 
44
  if __name__ == "__main__":
45
  iface.launch()
 
3
  from pdf2image import convert_from_path
4
  import tempfile
5
  import os
6
+
7
+ def ocr_pdf(pdf_file):
8
+ """
9
+ Performs OCR on a given PDF file.
10
+
11
+ Args:
12
+ pdf_file: An uploaded file object from Gradio.
13
+
14
+ Returns:
15
+ A tuple containing:
16
+ - The path to the generated .txt file.
17
+ - The extracted text content as a string.
18
+ """
19
+ if pdf_file is None:
20
+ return None, "Please upload a PDF file first."
21
+
22
+ file_path = pdf_file.name
23
+
24
+ try:
25
+ # Use a temporary directory that is automatically cleaned up
26
+ with tempfile.TemporaryDirectory() as temp_dir:
27
+ # Convert PDF pages to images
28
+ try:
29
+ images = convert_from_path(file_path, output_folder=temp_dir)
30
+ except Exception as e:
31
+ error_message = (
32
+ "Failed to convert PDF. Please ensure Poppler is installed and in your system's PATH.\n"
33
+ f"Details: {e}"
34
+ )
35
+ return None, error_message
36
+
37
+ # Extract text from each page image
38
+ full_extracted_text = ""
39
+ for i, image in enumerate(images):
40
+ text = pytesseract.image_to_string(image, lang='eng')
41
+ full_extracted_text += f"--- Page {i+1} ---\n{text}\n\n"
42
+
43
+ if not full_extracted_text.strip():
44
+ full_extracted_text = "No text could be extracted. The PDF might contain only images without text or be empty."
45
+
46
+ # Create a temporary file for the extracted text. Gradio will handle serving it.
47
+ # NamedTemporaryFile(delete=False) gives the file a real path and keeps it on disk after the block exits; it is not removed automatically.
48
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt", encoding="utf-8") as f:
49
+ f.write(full_extracted_text)
50
+ output_txt_path = f.name
51
+
52
+ # Return both the file path for download and the text for display
53
+ return output_txt_path, full_extracted_text
54
+
55
+ except Exception as e:
56
+ return None, f"An unexpected error occurred: {e}"
57
+
58
+ # --- Gradio Interface Definition ---
59
+ # We define the user interface for our application.
60
  iface = gr.Interface(
61
+ fn=ocr_pdf,
62
+ inputs=gr.File(label="Upload PDF File", type="file"),
63
  outputs=[
64
+ gr.File(label="Download Extracted Text (.txt)"),
65
+ gr.Textbox(label="Extracted Text Content", lines=20, interactive=False)
66
  ],
67
+ title="📄 PDF to Text Extractor (OCR)",
68
+ description=(
69
+ "Upload a PDF file to extract its text. The content will be displayed below, "
70
+ "and you'll get a link to download it as a .txt file. "
71
+ "**Note:** This tool relies on OCR and may not be 100% accurate, especially with complex layouts or poor quality scans."
72
+ ),
73
+ allow_flagging="never",
74
+ examples=[
75
+ # You can place paths to example PDFs on the server here if you have any.
76
+ # ["path/to/your/example.pdf"]
77
+ ]
78
  )
79
 
80
+ # --- Launch the Application ---
81
  if __name__ == "__main__":
82
  iface.launch()