Spaces:
Runtime error
Runtime error
| import torch | |
| import streamlit as st | |
| from PIL import Image | |
| from io import BytesIO | |
| from transformers import VisionEncoderDecoderModel, VisionEncoderDecoderConfig , DonutProcessor | |
def run_prediction(sample):
    """Run Donut inference on one sample and return ``(prediction, target)``.

    The function reads the module-level ``pretrained_model``, ``processor``,
    ``task_prompt`` and ``device``; the caller must bind all four before
    calling.

    Args:
        sample: Either a dict holding ``"pixel_values"`` (and
            ``"target_sequence"`` for the reference), or an image that
            ``processor`` can preprocess.

    Returns:
        A tuple ``(prediction, target)``. ``prediction`` is the result of
        ``processor.token2json`` on the decoded sequence. ``target`` is the
        parsed ``sample["target_sequence"]`` for dict samples and the string
        ``"<not_provided>"`` otherwise.
    """
    global pretrained_model, processor, task_prompt, device

    is_dataset_sample = isinstance(sample, dict)

    # Prepare encoder inputs.
    if is_dataset_sample:
        pixel_values = torch.tensor(sample["pixel_values"]).unsqueeze(0)
    else:
        # Preprocess the argument itself; the previous code read the global
        # ``image`` here and silently ignored ``sample``.
        pixel_values = processor(sample, return_tensors="pt").pixel_values

    # The decoder is primed with the task prompt only.
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # Run inference.
    outputs = pretrained_model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=pretrained_model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # Decode the single generated sequence.
    prediction = processor.batch_decode(outputs.sequences)[0]

    # CORD prompts: strip EOS/PAD markers before parsing. The leading task
    # start token is left in place (an earlier regex strip was disabled).
    if "cord" in task_prompt:
        prediction = prediction.replace(processor.tokenizer.eos_token, "")
        prediction = prediction.replace(processor.tokenizer.pad_token, "")

    # NOTE(review): indentation was lost in the source; token2json is assumed
    # to run for every prompt, not only for CORD - confirm.
    prediction = processor.token2json(prediction)

    # Load the reference target when the sample carries one.
    if is_dataset_sample:
        target = processor.token2json(sample["target_sequence"])
    else:
        target = "<not_provided>"
    return prediction, target
# Default decoder start prompt; it is reassigned further down once a parsing
# mode has been chosen.
task_prompt = "<s>"

# Intro text shown under the logo at the top of the page.
INTRO_MARKDOWN = '''
### Receipt Parser
This is an OCR-free Document Understanding Transformer nicknamed 🍩. It was fine-tuned with 1000 receipt images -> SROIE dataset.
The original 🍩 implementation can be found on [here](https://github.com/clovaai/donut).
At [Unstructured.io](https://github.com/Unstructured-IO/unstructured) we are on a mission to build custom preprocessing pipelines for labeling, training, or production ML-ready pipelines 🤩.
Come and join us in our public repos and contribute! Each of your contributions and feedback holds great value and is very significant to the community 😊.
'''

# Page header: logo first, then the intro text.
logo = Image.open("./img/rsz_unstructured_logo.png")
st.image(logo)
st.markdown(INTRO_MARKDOWN)
# Optional user-supplied images; both stay None when a bundled sample is used.
image_upload = None
photo = None

# NOTE(review): indentation was lost in the source; the extent of the sidebar
# block (everything up to the camera handling) is reconstructed - confirm.
with st.sidebar:
    information = st.radio(
        "What information inside the 🧾s are you interested in extracting?",
        ('Receipt Summary', 'Receipt Menu Details', 'Extract all', 'Unstructured.io Parser'))
    receipt = st.selectbox('Pick one 🧾', ['1', '2', '3', '4', '5', '6'], index=1)

    # File upload: decode the uploaded bytes into a PIL image.
    uploaded_file = st.file_uploader("Upload a 🧾")
    if uploaded_file is not None:
        image_bytes_data = uploaded_file.getvalue()
        image_upload = Image.open(BytesIO(image_bytes_data))

    # st.button is True only for the rerun triggered by the click. Taking a
    # picture triggers another rerun, so the choice is remembered in session
    # state; otherwise the camera widget (and the photo) would disappear.
    if 'use_camera' not in st.session_state:
        st.session_state['use_camera'] = False
    camera_click = st.button('Use my camera')
    if camera_click:
        st.session_state['use_camera'] = True

    img_file_buffer = None
    if st.session_state['use_camera']:
        img_file_buffer = st.camera_input("Take a picture of your receipt!")
    if img_file_buffer is not None:
        # Read the camera buffer as a PIL image.
        photo = Image.open(img_file_buffer)
        st.info("picture taken!")

st.text(f'{information} mode is ON!\nTarget 🧾: {receipt}')
# Left column previews the receipt; the right column is used for the results.
col1, col2 = st.columns(2)

# Source priority: camera photo, then uploaded file, then the bundled sample.
if photo is not None:
    image = photo
    st.info("photo loaded to image")
elif image_upload is not None:
    image = image_upload
else:
    image = Image.open(f"./img/receipt-{receipt}.jpg")

# Uploads and camera frames may be RGBA, palette or greyscale; the Donut
# processor expects 3-channel input, so normalise the mode once here.
image = image.convert("RGB")

with col1:
    st.image(image, caption='Your target receipt')
# Fine-tuned Donut checkpoint and decoder start prompt for each single mode.
DONUT_CHECKPOINTS = {
    'Receipt Summary': ("unstructuredio/donut-base-sroie", "<s>"),
    'Receipt Menu Details': ("naver-clova-ix/donut-base-finetuned-cord-v2", "<s_cord-v2>"),
    'Unstructured.io Parser': ("unstructuredio/donut-base-labelstudio-A1.0", "<s>"),
}
# 'Extract all' runs these two modes back to back.
EXTRACT_ALL_MODES = ('Receipt Summary', 'Receipt Menu Details')


def _load_donut(checkpoint, target_device):
    """Load the processor and model for *checkpoint*; move the model to *target_device*."""
    loaded_processor = DonutProcessor.from_pretrained(checkpoint)
    loaded_model = VisionEncoderDecoderModel.from_pretrained(checkpoint)
    loaded_model.to(target_device)
    return loaded_processor, loaded_model


if st.button('Parse receipt! 🐍'):
    extract_all = information == 'Extract all'
    modes = EXTRACT_ALL_MODES if extract_all else (information,)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    with st.spinner('baking the 🍩s...'):
        loaded = [
            (mode, *_load_donut(DONUT_CHECKPOINTS[mode][0], device))
            for mode in modes
        ]

    # NOTE(review): indentation was lost in the source; the results column is
    # assumed to sit inside the button branch, after the spinner - confirm.
    with col2:
        st.info('parsing 🧾 (extracting all)...' if extract_all else 'parsing 🧾...')

        results = []
        for mode, mode_processor, mode_model in loaded:
            # run_prediction reads these module-level names, so rebind them
            # before every call.
            processor, pretrained_model = mode_processor, mode_model
            task_prompt = DONUT_CHECKPOINTS[mode][1]
            parsed_receipt_info, _ = run_prediction(image)
            results.append((mode, parsed_receipt_info))

        # Render after all predictions are done; the combined mode labels
        # each section with a trailing colon.
        for mode, parsed_receipt_info in results:
            st.text(f'\n{mode}:' if extract_all else f'\n{mode}')
            st.json(parsed_receipt_info)