Spaces:

GAS17
/

ocr

Sleeping

GAS17 committed on Dec 25, 2024

Commit

b16b8c7

verified ·

1 Parent(s): 4f77700

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,28 +1,22 @@
-import gradio as gr
-import fitz  # PyMuPDF
-def consultar_pdf(pdf_file, consulta):
-    # Abrir el archivo PDF
-    pdf_document = fitz.open(pdf_file.name)
-    # Recorrer todas las páginas y extraer el texto
-    texto_completo = ""
-    for page_num in range(pdf_document.page_count):
-        page = pdf_document.load_page(page_num)
-        texto_completo += page.get_text()
-    # Devolver el texto completo del documento
-    return texto_completo
-# Crear la interfaz de Gradio
-iface = gr.Interface(
-    fn=consultar_pdf,
-    inputs=[
-        gr.File(label="Cargar PDF"),  # Entrada para cargar el archivo PDF
-        gr.Textbox(label="Consulta", placeholder="Escribe tu consulta aquí")  # Entrada para la consulta
-    ],
-    outputs="text"  # Salida de texto con el resultado de la consulta
-)
-# Lanzar la interfaz
-iface.launch()

+import pytesseract
+from pdf2image import convert_from_path
+def pdf_to_text(pdf_path, output_path, *, dpi=300, lang=None):
+    """Run OCR over every page of a PDF and save the recognized text.
+
+    Args:
+        pdf_path: Path of the PDF file to read.
+        output_path: Path of the text file to write (UTF-8); an existing
+            file at this path is overwritten.
+        dpi: Resolution used when rasterizing the PDF pages. Defaults to
+            300, the value that used to be hard-coded.
+        lang: Tesseract language code(s) forwarded to pytesseract, e.g.
+            "spa" or "eng+spa". None keeps pytesseract's own default.
+    """
+    # Rasterize the PDF into one image per page so Tesseract can read it.
+    pages = convert_from_path(pdf_path, dpi=dpi)
+    # Join once instead of growing a string with += on every page.
+    text = "".join(
+        pytesseract.image_to_string(page, lang=lang) for page in pages
+    )
+    with open(output_path, "w", encoding="utf-8") as file:
+        file.write(text)
+    print(f"OCR completed. Text saved to {output_path}")
+# Example run: OCR the relative path 'input.pdf' and store the result in
+# 'output.txt'. Both names stay defined at module level.
+pdf_path, output_path = 'input.pdf', 'output.txt'
+pdf_to_text(pdf_path, output_path)