typhoon-ocr

Sleeping

App Files Files Community

protae5544 committed 21 days ago

Commit

04ed58c

verified ·

1 Parent(s): e8f7a5f

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -143

app.py CHANGED Viewed

@@ -2,23 +2,16 @@ import base64
 from io import BytesIO
 import json
 import os
-import PyPDF2  # เพิ่มไลบรารีสำหรับอ่าน PDF
 from openai import OpenAI
 from dotenv import load_dotenv
 from typhoon_ocr import prepare_ocr_messages
 import gradio as gr
 from PIL import Image
-# โหลด environment variables
 load_dotenv()
-# ตั้งค่า OpenAI API
-openai = OpenAI(
-    base_url=os.getenv("TYPHOON_BASE_URL"),
-    api_key=os.getenv("TYPHOON_API_KEY")
-)
-# ตั้งค่า Theme (เดิม)
 theme = gr.themes.Soft(
     primary_hue=gr.themes.Color(
         c50="#f7f7fd",
@@ -37,131 +30,54 @@ theme = gr.themes.Soft(
     neutral_hue="stone",
 )
-# ตัวแปรสำหรับบันทึกผลลัพธ์
-OUTPUT_FILE = "ocr_results.txt"
-def save_ocr_result(text):
-    """บันทึกผลลัพธ์ OCR แบบต่อเนื่องในไฟล์เดียว"""
-    with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
-        f.write(text + "\n\n")
-    return OUTPUT_FILE
-def clear_output_file():
-    """ล้างไฟล์ผลลัพธ์เก่า"""
-    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
-        f.write("")
-def get_pdf_page_count(pdf_path):
-    """หาจำนวนหน้าของ PDF"""
-    with open(pdf_path, 'rb') as f:
-        reader = PyPDF2.PdfReader(f)
-        return len(reader.pages)
-    return 0
-def process_pdf(pdf_or_image_file, task_type):
     if pdf_or_image_file is None:
         return None, "No file uploaded"
     orig_filename = pdf_or_image_file.name
-    combined_text = ""
-    image_pil = None  # ใช้เก็บภาพหน้าแรกของ PDF
     try:
-        # ตรวจสอบว่าเป็น PDF หรือไม่
-        if orig_filename.lower().endswith(".pdf"):
-            total_pages = get_pdf_page_count(orig_filename)
-            if total_pages == 0:
-                return None, "ไม่สามารถอ่านจำนวนหน้าของ PDF ได้"
-            # ประมวลผลทุกหน้า
-            for page_num in range(1, total_pages + 1):
-                # เตรียมข้อมูลสำหรับ OCR
-                messages = prepare_ocr_messages(
-                    pdf_or_image_path=orig_filename,
-                    task_type=task_type,
-                    target_image_dim=1800,
-                    target_text_length=8000,
-                    page_num=page_num
-                )
-                # ดึงภาพหน้าแรก
-                if page_num == 1:
-                    image_url = messages[0]["content"][1]["image_url"]["url"]
-                    image_base64 = image_url.replace("data:image/png;base64,", "")
-                    image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
-                # ส่งไป API
-                response = openai.chat.completions.create(
-                    model=os.getenv("TYPHOON_OCR_MODEL"),
-                    messages=messages,
-                    max_tokens=16384,
-                    extra_body={
-                        "repetition_penalty": 1.2,
-                        "temperature": 0.1,
-                        "top_p": 0.6,
-                    },
-                )
-                text_output = response.choices[0].message.content
-                # ดึง natural_text
-                try:
-                    json_data = json.loads(text_output)
-                    markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
-                except Exception as e:
-                    markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
-                # รวมผลลัพธ์ทุกหน้า
-                combined_text += f"[Page {page_num}]\n{markdown_out}\n\n"
-            # บันทึกผลลัพธ์ทั้งหมดลงไฟล์
-            save_ocr_result(combined_text)
-            return image_pil, combined_text, gr.File.update(value=OUTPUT_FILE)
-        # หากเป็นไฟล์ภาพ
-        else:
-            # ประมวลผลหน้าเดียว
-            messages = prepare_ocr_messages(
-                pdf_or_image_path=orig_filename,
-                task_type=task_type,
-                target_image_dim=1800,
-                target_text_length=8000,
-                page_num=1
-            )
-            # ดึงภาพ
-            image_url = messages[0]["content"][1]["image_url"]["url"]
-            image_base64 = image_url.replace("data:image/png;base64,", "")
-            image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
-            # ส่งไป API
-            response = openai.chat.completions.create(
-                model=os.getenv("TYPHOON_OCR_MODEL"),
-                messages=messages,
-                max_tokens=16384,
-                extra_body={
-                    "repetition_penalty": 1.2,
-                    "temperature": 0.1,
-                    "top_p": 0.6,
-                },
-            )
-            text_output = response.choices[0].message.content
-            # ดึง natural_text
-            try:
-                json_data = json.loads(text_output)
-                markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
-            except Exception as e:
-                markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
-            # บันทึกผลลัพธ์ลงไฟล์
-            save_ocr_result(markdown_out)
-            return image_pil, markdown_out, gr.File.update(value=OUTPUT_FILE)
     except Exception as e:
-        return None, f"Error processing file: {str(e)}", None
-# สร้าง UI
 with gr.Blocks(theme=theme) as demo:
     title = gr.HTML("""
     <h1>Typhoon OCR</h1>
@@ -171,7 +87,7 @@ with gr.Blocks(theme=theme) as demo:
     <br />
     <details>
         <summary><strong>Disclaimer</strong></summary>
-        The responses generated by this AI system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. These AI-generated responses do not represent those of the organizations. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI. The information produced by this AI is not intended to malign any religion, ethnic group, club, organization, company, individual, anyone, or anything. It is not the intent of the organizations to malign any group or individual. The AI operates based on its programming and training data and its responses should not be interpreted as the explicit intent or opinion of the organizations.
     </details>
     <br />
     <details>
@@ -183,14 +99,12 @@ with gr.Blocks(theme=theme) as demo:
         <summary><strong>License</strong></summary>
         This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache license 2.0.
     </details>
-    """)
     with gr.Row():
         with gr.Column(scale=1):
-            # อัปโหลดไฟล์
             pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
-            # เลือก Task
             with gr.Group(elem_classes=["task-background"]):
                 task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
                 gr.HTML("""
@@ -201,6 +115,7 @@ with gr.Blocks(theme=theme) as demo:
                 demo.css = """
                 .task-background {
                     background: var(--block-background-fill) !important;
                 }
                 .task-background > * {
                     background: var(--block-background-fill) !important;
@@ -210,28 +125,19 @@ with gr.Blocks(theme=theme) as demo:
                     font-size: 12px;
                 }
                 """
-            # ปุ่มรัน
             run_button = gr.Button("🚀 Run")
-            # แสดงภาพ
             image_output = gr.Image(label="📸 Preview Image", type="pil")
         with gr.Column(scale=2):
-            # แสดงผลลัพธ์ Markdown
             markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
-            # ปุ่มดาวน์โหลดไฟล์
-            download_button = gr.File(label="📥 ดาวน์โหลดผลลัพธ์ทั้งหมด (Text File)", interactive=False)
-    # เชื่อมต่อ UI กับฟังก์ชัน
     run_button.click(
         fn=process_pdf,
-        inputs=[pdf_input, task_dropdown],
-        outputs=[image_output, markdown_output, download_button]
     )
-# เริ่มต้นใหม่ (ล้างไฟล์ผลลัพธ์เก่า)
-clear_output_file()
-# รันแอป
 demo.launch(share=False)

 from io import BytesIO
 import json
 import os
 from openai import OpenAI
 from dotenv import load_dotenv
 from typhoon_ocr import prepare_ocr_messages
 import gradio as gr
 from PIL import Image
 load_dotenv()
+openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
 theme = gr.themes.Soft(
     primary_hue=gr.themes.Color(
         c50="#f7f7fd",
     neutral_hue="stone",
 )
+def process_pdf(pdf_or_image_file, task_type, page_number):
     if pdf_or_image_file is None:
         return None, "No file uploaded"
     orig_filename = pdf_or_image_file.name
     try:
+        # Build the OCR request messages for the requested page (falls back to page 1 when no page number is given)
+        messages = prepare_ocr_messages(
+            pdf_or_image_path=orig_filename,
+            task_type=task_type,
+            target_image_dim=1800,
+            target_text_length=8000,
+            page_num=page_number if page_number else 1
+        )
+        # Extract the image from the message content for display
+        image_url = messages[0]["content"][1]["image_url"]["url"]
+        image_base64 = image_url.replace("data:image/png;base64,", "")
+        image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
+        # Send the messages to the OpenAI-compatible API
+        response = openai.chat.completions.create(
+            model=os.getenv("TYPHOON_OCR_MODEL"),
+            messages=messages,
+            max_tokens=16384,
+            extra_body={
+                "repetition_penalty": 1.2,
+                "temperature": 0.1,
+                "top_p": 0.6,
+            },
+        )
+        text_output = response.choices[0].message.content
+        # Try to parse the output as a JSON object and read its 'natural_text' field
+        try:
+            json_data = json.loads(text_output)
+            markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
+        except Exception as e:
+            markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
+        return image_pil, markdown_out
     except Exception as e:
+        return None, f"Error processing file: {str(e)}"
+# Build the Gradio UI.
 with gr.Blocks(theme=theme) as demo:
     title = gr.HTML("""
     <h1>Typhoon OCR</h1>
     <br />
     <details>
         <summary><strong>Disclaimer</strong></summary>
+        The responses generated by this Artificial Intelligence (AI) system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. These AI-generated responses do not represent those of the organizations. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI. The information produced by this AI is not intended to malign any religion, ethnic group, club, organization, company, individual, anyone, or anything. It is not the intent of the organizations to malign any group or individual. The AI operates based on its programming and training data and its responses should not be interpreted as the explicit intent or opinion of the organizations.
     </details>
     <br />
     <details>
         <summary><strong>License</strong></summary>
         This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache license 2.0.
     </details>
+""")
     with gr.Row():
         with gr.Column(scale=1):
+            # Accept PDF files as well as common image formats (.png, .jpg, .jpeg).
             pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
             with gr.Group(elem_classes=["task-background"]):
                 task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
                 gr.HTML("""
                 demo.css = """
                 .task-background {
                     background: var(--block-background-fill) !important;
                 }
                 .task-background > * {
                     background: var(--block-background-fill) !important;
                     font-size: 12px;
                 }
                 """
+            page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1)
             run_button = gr.Button("🚀 Run")
             image_output = gr.Image(label="📸 Preview Image", type="pil")
         with gr.Column(scale=2):
             markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
+    # Connect the UI inputs to the processing function.
     run_button.click(
         fn=process_pdf,
+        inputs=[pdf_input, task_dropdown, page_number],
+        outputs=[image_output, markdown_output]
     )
+# Launch the Gradio demo (share=False: no public share link is requested)
 demo.launch(share=False)