"""Gradio app that extracts the rows of a PDF trip report into a JSON file."""

import json
import random

import gradio as gr
import pymupdf

# A text block whose left edge (x0) is at or beyond this value is treated as
# the standalone "Fidelis Agent" cell that terminates a row.
_AGENT_COLUMN_MIN_X0 = 700

# A comments block whose right edge (x1) extends past this value is treated as
# having the agent name merged into it as its last line.
_MERGED_AGENT_MIN_X1 = 719


def _extract_rows(doc):
    """Walk every text block of *doc* and rebuild the report rows.

    The parser is a small state machine driven by the order in which the
    text blocks are returned by ``extractBLOCKS()``: a block containing
    "Fidelis Agent" starts a table, the following block is skipped (state
    ``page_no``), and then the blocks are consumed as the successive cells of
    a row (``row_1`` .. ``row_8``).  The state is reset at the start of each
    page.

    Returns:
        A ``(report_run_date_time, transportation_provider_name, rows)``
        tuple; the first two are ``None`` when no header block was found and
        ``rows`` is a list of dicts, one per completed row.

    Raises:
        ValueError: if a block does not have the expected number of fields,
            a numeric field is not an integer, or the last block of a row is
            not located in the agent column.
        IndexError: if the report header block has fewer than three lines.
    """
    report_run_date_time = None
    transportation_provider_name = None
    rows = []
    row = {}

    for page in doc:
        state = None
        for block in page.get_textpage().extractBLOCKS():
            # Only the horizontal extent and the text are used.
            x0, _, x1, _, text = block[:5]
            text = text.strip()

            # The two marker checks deliberately come before the state
            # dispatch: they apply regardless of the current state.
            if text.startswith("Report Run Date Time"):
                lines = text.split("\n")
                report_run_date_time = lines[1]
                transportation_provider_name = (
                    lines[2].replace("Transportation Provider Name:", "").strip()
                )
            elif "Fidelis Agent" in text:
                state = "page_no"
            elif state == "page_no":
                # Block following the table header is skipped.
                state = "row_1"
            elif state == "row_1":
                lines = text.split("\n")
                if len(lines) == 1:
                    type_, case_no, plan_id, member_id = lines[0].split(" ")
                else:
                    # The type may be on its own line above the ids.
                    type_ = lines[0]
                    case_no, plan_id, member_id = lines[1].split(" ")
                row["type"] = type_
                row["case_no"] = case_no
                row["plan_id"] = plan_id
                row["member_id"] = member_id
                state = "row_2"
            elif state == "row_2":
                row["pickup_info"] = text
                state = "row_3"
            elif state == "row_3":
                row["pickup_time"] = (
                    text.replace("Pickup:", "").strip().replace("\n", " ")
                )
                state = "row_4"
            elif state == "row_4":
                lines = text.split("\n")
                # Explicit check instead of ``assert`` so that the validation
                # is not stripped when Python runs with -O.
                if len(lines) not in (3, 4):
                    raise ValueError(
                        "expected 3 or 4 lines in the trips/vehicle/riders "
                        f"block, got {len(lines)}: {text!r}"
                    )
                num_of_one_way_trips = int(lines[0])
                num_of_riders = int(lines[2])
                row["num_of_one_way_trips"] = num_of_one_way_trips
                row["vehicle_type"] = lines[1]
                row["num_of_riders"] = num_of_riders
                # The authorization number is optional (fourth line).
                row["auth_number"] = lines[3] if len(lines) == 4 else None
                state = "row_5"
            elif state == "row_5":
                row["dest_info"] = text
                state = "row_6"
            elif state == "row_6":
                if x0 >= _AGENT_COLUMN_MIN_X0:
                    # No comments cell: this block is already the agent.
                    row["special_needs_and_comments"] = None
                    row["fidelis_agent"] = text
                    rows.append(row)
                    row = {}
                    state = "row_1"
                elif x1 > _MERGED_AGENT_MIN_X1:
                    # Comments and agent were merged into one block; the
                    # agent is the last line.
                    lines = text.split("\n")
                    row["special_needs_and_comments"] = "\n".join(lines[:-1])
                    row["fidelis_agent"] = lines[-1]
                    rows.append(row)
                    row = {}
                    state = "row_1"
                else:
                    row["special_needs_and_comments"] = text
                    state = "row_7"
            elif state == "row_7":
                if x0 >= _AGENT_COLUMN_MIN_X0:
                    row["fidelis_agent"] = text
                    rows.append(row)
                    row = {}
                    state = "row_1"
                else:
                    # Second comments block: appended with no separator.
                    row["special_needs_and_comments"] += text
                    state = "row_8"
            elif state == "row_8":
                if x0 < _AGENT_COLUMN_MIN_X0:
                    raise ValueError(
                        "expected the agent block at x0 >= "
                        f"{_AGENT_COLUMN_MIN_X0}, got x0={x0}: {text!r}"
                    )
                row["fidelis_agent"] = text
                rows.append(row)
                row = {}
                state = "row_1"

    return report_run_date_time, transportation_provider_name, rows


def parse_pdf(file_path: str):
    """Parse the report PDF at *file_path* and dump its rows to a JSON file.

    The JSON file is written to the current working directory under a
    randomly numbered ``output_XXXXXXXX.json`` name.

    Args:
        file_path: Path of the PDF file to parse.

    Returns:
        A ``(metadata, output_path)`` tuple: a short human-readable summary
        (report date, provider name, number of rows) and the path of the
        JSON file that was written.

    Raises:
        ValueError: if the PDF content does not match the expected layout.
    """
    # The context manager closes the document even when parsing fails.
    with pymupdf.open(file_path) as doc:
        report_run_date_time, transportation_provider_name, rows = _extract_rows(doc)

    metadata = (
        f"Report Run Date Time: {report_run_date_time}\n"
        f"Transportation Provider Name: {transportation_provider_name}\n"
        f"Number of Items: {len(rows)}\n"
    )

    json_data = {
        "report_run_date_time": report_run_date_time,
        "transportation_provider_name": transportation_provider_name,
        # NOTE(review): "itmes" looks like a typo for "items"; it is kept
        # as-is because renaming it would change the output schema.
        "itmes": rows,
    }

    output_path = f"output_{random.randint(0, 1000000):08d}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4)

    return metadata, output_path


def main():
    """Build the Gradio interface around ``parse_pdf`` and launch it."""
    app = gr.Interface(
        fn=parse_pdf,
        inputs=gr.File(label="PDF File"),
        outputs=[
            gr.Textbox(label="Metadata", lines=7),
            gr.DownloadButton(label="Download JSON"),
        ],
        # String form; the boolean form is deprecated and only coerced to
        # "never" with a warning.
        allow_flagging="never",
    )
    app.launch()


if __name__ == "__main__":
    main()