In [None]:
!pip install Pillow
!pip install pytesseract
!pip install datasets
!sudo apt-get install libtesseract-dev python3-pil tesseract-ocr-eng tesseract-ocr-script-latn


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os

os.chdir('drive/MyDrive/Colab Notebooks')

### References
Apply classes to this model like in here: https://medium.com/@tejpal.abhyuday/information-extraction-part-3-9c2487ec4930

In [5]:
import numpy as np
from transformers import LayoutLMv3Processor, LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification, AdamW, LayoutLMv3ImageProcessor
from datasets import load_dataset, Dataset, Features, Sequence, ClassLabel, Value, Array2D, Array3D
import torch
from PIL import Image, ImageDraw, ImageFont
from tqdm.notebook import tqdm

In [6]:
# Load our dataset
funsd_dataset = load_dataset("nielsr/funsd")
id2label = ["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"]
label2id = {l:i for i, l in enumerate(id2label)}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


funsd.py:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

The repository for nielsr/funsd contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/nielsr/funsd.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

{'O': 0,
 'B-HEADER': 1,
 'I-HEADER': 2,
 'B-QUESTION': 3,
 'I-QUESTION': 4,
 'B-ANSWER': 5,
 'I-ANSWER': 6}

In [7]:
# Processor and model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = LayoutLMv3Processor(
    image_processor=LayoutLMv3ImageProcessor(apply_ocr=True),
    tokenizer=LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base"),
)
model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base", num_labels=len(id2label)
)
model.to(device);

tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/856 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def encode_training_example(examples):
    images = [Image.open(path).convert("RGB") for path in examples['image_path']]

    words = examples['words']
    boxes = examples['bboxes']
    word_labels = examples['ner_tags']

    encoded_inputs = processor(
        images, words, boxes=boxes, word_labels=word_labels, padding="max_length", truncation=True
    )

    print("encoded inputs", encoded_inputs.keys())

    return encoded_inputs

training_features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(ClassLabel(names=id2label)),
})

def training_dataloader_from_dataset(dataset):
    encoded_data = dataset.map(
        encode_training_example, batched=True, remove_columns=funsd_dataset['train'].column_names,
        features=training_features
    )
    encoded_data.set_format(type='torch', device=device)
    dataloader = torch.utils.data.DataLoader(encoded_data, batch_size=4, shuffle=True)
    batch = next(iter(dataloader))

    return dataloader
train_dataloader = training_dataloader_from_dataset(funsd_dataset['train'])
valid_dataloader = training_dataloader_from_dataset(funsd_dataset['test'])

In [None]:
# Training the model
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 200

training_loss_history = []
validation_loss_history = []

saved_models = 0

for epoch in range(num_epochs):
    print("Epoch: ", epoch)
    training_loss = 0.0
    model.train()
    for batch in tqdm(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss

        training_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    # Save the model and optimizer state after each epoch
    if ((training_loss / batch["input_ids"].shape[0]) < 5) and saved_models < 8:
        save_path = f"/kaggle/working/model_checkpoint_epoch_{epoch}.pt"
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'training_loss': training_loss,
        }, save_path)
        print(f"Checkpoint saved: {save_path}")
        saved_models+=1

    training_loss_history.append(training_loss)
    print("Training Loss: ", training_loss / batch["input_ids"].shape[0])
    validation_loss = 0.0
    for batch in tqdm(valid_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        validation_loss += loss.item()

    print("Validation Loss: ", validation_loss)
    validation_loss_history.append(validation_loss)


## Inference phase

In [8]:
import torch
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available()  else "cpu")

model = LayoutLMv3ForTokenClassification.from_pretrained('../Tese/LayoutLMv3_0/')
model.to(device)

LayoutLMv3ForTokenClassification(
  (layoutlmv3): LayoutLMv3Model(
    (embeddings): LayoutLMv3TextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (x_position_embeddings): Embedding(1024, 128)
      (y_position_embeddings): Embedding(1024, 128)
      (h_position_embeddings): Embedding(1024, 128)
      (w_position_embeddings): Embedding(1024, 128)
    )
    (patch_embed): LayoutLMv3PatchEmbeddings(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
    (encoder): LayoutLMv3Encoder

In [None]:

from pprint import pprint
def unnormalize_box(bbox, width, height):
     return [
         width * (bbox[0] / 1000),
         height * (bbox[1] / 1000),
         width * (bbox[2] / 1000),
         height * (bbox[3] / 1000),
     ]

def iob_to_label(label):
    label = label[2:]
    if not label:
        return 'other'
    return label

def inference(apply_ocr=False, save=False):
    for i, image_ocr in enumerate(funsd_dataset['test']):
        image = Image.open(image_ocr['image_path'])
        image = image.convert("RGB")
        width, height = image.size

        if apply_ocr:
            encoded_inputs = processor(
                image,
                padding="max_length", truncation=True, return_tensors="pt", return_token_type_ids=True
            ).to(device)
        else:
            encoded_inputs = processor(
                image, image_ocr['words'], boxes=image_ocr['bboxes'], word_labels=image_ocr['ner_tags'],
                padding="max_length", truncation=True, return_tensors="pt"
            ).to(device)

        # Decode the words so it is easier to debug
        input_ids = encoded_inputs.input_ids
        words = processor.tokenizer.decode(input_ids[0], skip_special_tokens=False).split()

        # Inference
        outputs = model(**encoded_inputs)
        predictions = outputs.logits.argmax(-1).squeeze().tolist()
        token_boxes = encoded_inputs.bbox.squeeze().tolist()

        true_predictions = [id2label[prediction] for prediction in predictions]
        true_boxes = [unnormalize_box(box, width, height) for box in token_boxes]
        draw = ImageDraw.Draw(image)

        font = ImageFont.load_default()
        label2color = {'question':'blue', 'answer':'green', 'header':'orange', 'other':'violet'}

        # Draw the output
        for prediction, box in zip(true_predictions, true_boxes):
            predicted_label = iob_to_label(prediction).lower()
            draw.rectangle(box, outline=label2color[predicted_label])
            draw.text(
                (box[0]+10, box[1]-10), text=predicted_label, fill=label2color[predicted_label], font=font
            )

        # Save prediction in drive
        if save:
            if apply_ocr:
                save_path = f'../Tese/LayoutLMv3_0/TestDataWithOcr/funds_test_{i}.png'
            else:
                save_path = f'../Tese/LayoutLMv3_0/TestDataWithoutOcr/funds_test_{i}.png'

            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            image.save(save_path)



#ocrless_inference()
inference(apply_ocr=True, save=False)