Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -167,11 +167,14 @@ def extract_text_from_pdf(pdf_path):
|
|
| 167 |
|
| 168 |
for page_index in range(len(pdf_file)):
|
| 169 |
page = pdf_file.load_page(page_index)
|
| 170 |
-
text = page.get_text()
|
| 171 |
-
|
|
|
|
| 172 |
|
| 173 |
pdf_file.close()
|
| 174 |
-
|
|
|
|
|
|
|
| 175 |
|
| 176 |
except Exception as e:
|
| 177 |
print(f"Error extracting text from PDF: {e}")
|
|
@@ -196,6 +199,8 @@ def extract_images_from_pdf(pdf_path):
|
|
| 196 |
images.append(image)
|
| 197 |
|
| 198 |
pdf_file.close()
|
|
|
|
|
|
|
| 199 |
return images
|
| 200 |
|
| 201 |
except Exception as e:
|
|
@@ -212,9 +217,11 @@ def recognize_text(image):
|
|
| 212 |
recognized_text = ""
|
| 213 |
for (bbox, text, prob) in result:
|
| 214 |
if prob > 0.2:
|
| 215 |
-
recognized_text += f'{text}
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
| 218 |
|
| 219 |
except Exception as e:
|
| 220 |
print(f"Error recognizing text from image: {e}")
|
|
@@ -227,15 +234,24 @@ def ocr_text_from_pdf(pdf_path):
|
|
| 227 |
|
| 228 |
for image in images:
|
| 229 |
text = recognize_text(image)
|
| 230 |
-
|
|
|
|
| 231 |
|
| 232 |
-
|
|
|
|
|
|
|
| 233 |
|
| 234 |
def extract_all_text_from_pdf(pdf_path):
|
| 235 |
"""Extract both direct text and OCR text from a PDF."""
|
| 236 |
direct_text = extract_text_from_pdf(pdf_path)
|
| 237 |
ocr_text = ocr_text_from_pdf(pdf_path)
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
|
| 241 |
|
|
@@ -634,12 +650,7 @@ def main():
|
|
| 634 |
file = st.file_uploader("Upload PDF Files")
|
| 635 |
if file is not None:
|
| 636 |
try:
|
| 637 |
-
# pdf_path = "path/to/your/pdf_file.pdf"
|
| 638 |
-
|
| 639 |
-
# Extract text from the PDF
|
| 640 |
text = extract_all_text_from_pdf(file)
|
| 641 |
-
# print(extracted_text)
|
| 642 |
-
|
| 643 |
# text = get_pdf_text(file)
|
| 644 |
except Exception as e:
|
| 645 |
st.error(f"Error reading PDF file: {str(e)}")
|
|
|
|
| 167 |
|
| 168 |
for page_index in range(len(pdf_file)):
|
| 169 |
page = pdf_file.load_page(page_index)
|
| 170 |
+
text = page.get_text("text")
|
| 171 |
+
if text.strip(): # Check if the text is not empty
|
| 172 |
+
all_text += text.replace('\n', ' ') + " "
|
| 173 |
|
| 174 |
pdf_file.close()
|
| 175 |
+
if not all_text.strip():
|
| 176 |
+
print("No direct text found in the PDF.")
|
| 177 |
+
return all_text.strip() # Strip any leading/trailing whitespace
|
| 178 |
|
| 179 |
except Exception as e:
|
| 180 |
print(f"Error extracting text from PDF: {e}")
|
|
|
|
| 199 |
images.append(image)
|
| 200 |
|
| 201 |
pdf_file.close()
|
| 202 |
+
if not images:
|
| 203 |
+
print("No images found in the PDF.")
|
| 204 |
return images
|
| 205 |
|
| 206 |
except Exception as e:
|
|
|
|
| 217 |
recognized_text = ""
|
| 218 |
for (bbox, text, prob) in result:
|
| 219 |
if prob > 0.2:
|
| 220 |
+
recognized_text += f'{text} '
|
| 221 |
+
|
| 222 |
+
if not recognized_text.strip():
|
| 223 |
+
print("No text recognized from the image.")
|
| 224 |
+
return recognized_text.strip() # Strip any leading/trailing whitespace
|
| 225 |
|
| 226 |
except Exception as e:
|
| 227 |
print(f"Error recognizing text from image: {e}")
|
|
|
|
| 234 |
|
| 235 |
for image in images:
|
| 236 |
text = recognize_text(image)
|
| 237 |
+
if text.strip(): # Check if the recognized text is not empty
|
| 238 |
+
all_text += text + " "
|
| 239 |
|
| 240 |
+
if not all_text.strip():
|
| 241 |
+
print("No OCR text found in the PDF images.")
|
| 242 |
+
return all_text.strip() # Strip any leading/trailing whitespace
|
| 243 |
|
| 244 |
def extract_all_text_from_pdf(pdf_path):
|
| 245 |
"""Extract both direct text and OCR text from a PDF."""
|
| 246 |
direct_text = extract_text_from_pdf(pdf_path)
|
| 247 |
ocr_text = ocr_text_from_pdf(pdf_path)
|
| 248 |
+
all_text = direct_text + " " + ocr_text + " "
|
| 249 |
+
if not all_text.strip():
|
| 250 |
+
print("No text extracted from the PDF.")
|
| 251 |
+
return all_text.strip() # Strip any leading/trailing whitespace
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
|
| 255 |
|
| 256 |
|
| 257 |
|
|
|
|
| 650 |
file = st.file_uploader("Upload PDF Files")
|
| 651 |
if file is not None:
|
| 652 |
try:
|
|
|
|
|
|
|
|
|
|
| 653 |
text = extract_all_text_from_pdf(file)
|
|
|
|
|
|
|
| 654 |
# text = get_pdf_text(file)
|
| 655 |
except Exception as e:
|
| 656 |
st.error(f"Error reading PDF file: {str(e)}")
|