{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Fine-tune Gemma3n on FineVideo\n",
        "\n",
        "In this notebook, we will see how to fine-tune Gemma3n on videos that include audio.\n",
        "Using all three modalities is very costly compute-wise, so keep in mind that this is an educational tutorial to fit the model in 40GB VRAM."
      ],
      "metadata": {
        "id": "0eVo7Mc5GMyL"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "BLv-NJRZzHiA",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "bb4e4b32-5000-42e0-889d-90648e335a41"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.8/10.8 MB\u001b[0m \u001b[31m114.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m376.2/376.2 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m494.8/494.8 kB\u001b[0m \u001b[31m38.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.6/193.6 kB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m126.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m92.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m58.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m42.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m114.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0m"
          ]
        }
      ],
      "source": [
        "!pip install -U -q timm transformers trl peft datasets"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "UxE2vzKsbov0"
      },
      "outputs": [],
      "source": [
        "import io\n",
        "import os\n",
        "import zipfile\n",
        "\n",
        "import torch\n",
        "from datasets import load_dataset\n",
        "from PIL import Image\n",
        "from transformers import AutoProcessor, Gemma3nForConditionalGeneration\n",
        "\n",
        "from trl import (\n",
        "    SFTConfig,\n",
        "    SFTTrainer,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T06yJvcMiqO6"
      },
      "source": [
        "## Download videos and preprocessing\n",
        "\n",
        "FineVideo is quite a large dataset and we don't need a ton of examples, so we stream the dataset, check each video's duration, and download only the videos shorter than 30 seconds."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wBFfYgLxmg7b"
      },
      "outputs": [],
      "source": [
        "from datasets import load_dataset\n",
        "import json\n",
        "import os\n",
        "\n",
        "dataset = load_dataset(\"HuggingFaceFV/finevideo\", split=\"train\", streaming=True)\n",
        "\n",
        "\n",
        "os.makedirs(\"videos\", exist_ok=True)\n",
        "os.makedirs(\"metadata\", exist_ok=True)\n",
        "\n",
        "# Walk the streamed dataset and save every sample whose duration is below 30 seconds.\n",
        "for idx, sample in enumerate(dataset):\n",
        "    data = sample[\"json\"]\n",
        "    # A sample without a \"duration_seconds\" entry counts as 0 seconds and is therefore kept.\n",
        "    duration = data.get(\"duration_seconds\", 0)\n",
        "    if not duration < 30:\n",
        "        continue\n",
        "\n",
        "    # The video and its metadata share the same sample_{idx} base name.\n",
        "    video_filename = f\"videos/sample_{idx}.mp4\"\n",
        "    with open(video_filename, \"wb\") as video_file:\n",
        "        video_file.write(sample[\"mp4\"])\n",
        "\n",
        "    json_filename = f\"metadata/sample_{idx}.json\"\n",
        "    with open(json_filename, \"w\") as json_file:\n",
        "        json.dump(data, json_file)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "K48dmmZTdZ1l",
        "outputId": "31c7c32b-1c40-4df4-eb51-11857d7b4da9"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Number of items in content/videos: 871\n"
          ]
        }
      ],
      "source": [
        "# Sanity check: count the entries currently in the videos directory.\n",
        "print(f\"Number of items in content/videos: {len(os.listdir('videos'))}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "In FineVideo some frames are dark, so we sample 6 frames from each video and remove the videos from which we can't get 6 usable (non-dark) frames."
      ],
      "metadata": {
        "id": "QbkDI03qHMog"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "id": "0UMZi3tHb-BC"
      },
      "outputs": [],
      "source": [
        "import cv2\n",
        "from PIL import Image\n",
        "import numpy as np\n",
        "\n",
        "def is_dark(frame, threshold=10):\n",
        "    \"\"\"Return whether the largest value in `frame` is below `threshold`, i.e. the frame is near-black.\"\"\"\n",
        "    brightest = np.max(frame)\n",
        "    return brightest < threshold\n",
        "\n",
        "def downsample_video(video_path, num_frames=6):\n",
        "    \"\"\"\n",
        "    Sample up to `num_frames` non-dark frames spread across a video.\n",
        "\n",
        "    Args:\n",
        "        video_path: Path of the video file to open with OpenCV.\n",
        "        num_frames: Number of frames to return (default 6).\n",
        "\n",
        "    Returns:\n",
        "        A list of `(PIL.Image, timestamp)` tuples sorted by timestamp, where the timestamp is\n",
        "        the frame index divided by the reported FPS, rounded to 2 decimals. The list holds\n",
        "        fewer than `num_frames` items when not enough non-dark frames could be read.\n",
        "    \"\"\"\n",
        "    vidcap = cv2.VideoCapture(video_path)\n",
        "    try:\n",
        "        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
        "        fps = vidcap.get(cv2.CAP_PROP_FPS)\n",
        "        if not fps > 0:\n",
        "            # No usable FPS reported: fall back to 1.0 so timestamps become frame indices\n",
        "            # instead of raising ZeroDivisionError.\n",
        "            fps = 1.0\n",
        "\n",
        "        def read_frame(frame_idx):\n",
        "            # Seek to frame_idx and return it as an RGB PIL image, or None if unreadable or dark.\n",
        "            vidcap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)\n",
        "            success, image = vidcap.read()\n",
        "            if not success or is_dark(image):\n",
        "                return None\n",
        "            return Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))\n",
        "\n",
        "        frames = []\n",
        "\n",
        "        # Generate num_frames + 2 evenly spaced indices, skip first and last\n",
        "        full_indices = np.linspace(0, total_frames - 1, num_frames + 2, dtype=int)[1:-1]\n",
        "\n",
        "        for i in full_indices:\n",
        "            for offset in [0, -1, 1, -2, 2]:  # Try nearby frames if original is dark\n",
        "                candidate_idx = int(i) + offset\n",
        "                if not 0 <= candidate_idx < total_frames:\n",
        "                    continue\n",
        "                pil_image = read_frame(candidate_idx)\n",
        "                if pil_image is not None:\n",
        "                    frames.append((pil_image, round(candidate_idx / fps, 2)))\n",
        "                    break\n",
        "            else:\n",
        "                print(f\"Warning: Could not find non-dark frame near index {i}\")\n",
        "\n",
        "        # If still fewer than num_frames, top off by scanning the video from the start.\n",
        "        # The capture must still be open here; reads on a released capture always fail.\n",
        "        if len(frames) < num_frames:\n",
        "            print(\"Trying to top off with additional non-dark frames...\")\n",
        "            idx = 0\n",
        "            while len(frames) < num_frames and idx < total_frames:\n",
        "                timestamp = round(idx / fps, 2)\n",
        "                # Avoid adding duplicate timestamps\n",
        "                if not any(ts == timestamp for _, ts in frames):\n",
        "                    pil_image = read_frame(idx)\n",
        "                    if pil_image is not None:\n",
        "                        frames.append((pil_image, timestamp))\n",
        "                idx += 1\n",
        "            # Topped-off frames were appended out of order; restore chronological order.\n",
        "            frames.sort(key=lambda item: item[1])\n",
        "    finally:\n",
        "        # Release the capture even if reading or decoding raised.\n",
        "        vidcap.release()\n",
        "\n",
        "    return frames[:num_frames]  # Never return more than num_frames frames\n",
        "\n",
        "import os\n",
        "import glob\n",
        "\n",
        "def remove_dark_videos(video_dir, metadata_dir, audio_dir):\n",
        "    \"\"\"\n",
        "    Delete every .mp4 in `video_dir` for which `downsample_video` yields fewer than 6 frames,\n",
        "    together with its same-named .json in `metadata_dir` and .wav in `audio_dir` when they exist.\n",
        "    \"\"\"\n",
        "    for video_path in glob.glob(os.path.join(video_dir, \"*.mp4\")):\n",
        "        if len(downsample_video(video_path)) >= 6:\n",
        "            continue\n",
        "\n",
        "        base_name, _ = os.path.splitext(os.path.basename(video_path))\n",
        "\n",
        "        # A failed video deletion is reported but does not stop the clean-up of its sidecar files.\n",
        "        try:\n",
        "            os.remove(video_path)\n",
        "            print(f\"Deleted: {video_path}\")\n",
        "        except Exception as e:\n",
        "            print(f\"Failed to delete {video_path}: {e}\")\n",
        "\n",
        "        # Metadata first, then audio; each is removed only if it is present.\n",
        "        sidecar_paths = (\n",
        "            os.path.join(metadata_dir, f\"{base_name}.json\"),\n",
        "            os.path.join(audio_dir, f\"{base_name}.wav\"),\n",
        "        )\n",
        "        for sidecar_path in sidecar_paths:\n",
        "            if os.path.exists(sidecar_path):\n",
        "                os.remove(sidecar_path)\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "remove_dark_videos(\n",
        "    video_dir=\"videos\",\n",
        "    metadata_dir=\"metadata\",\n",
        "    audio_dir=\"audios\"\n",
        "    )"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "pA6iIR38l66-",
        "outputId": "78f81f41-5e70-4900-e33c-cd918aaed67d"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Warning: Could not find non-dark frame near index 208\n",
            "Trying to top off with additional non-dark frames...\n",
            "Deleted: videos/sample_9650.mp4\n",
            "Warning: Could not find non-dark frame near index 432\n",
            "Trying to top off with additional non-dark frames...\n",
            "Deleted: videos/sample_31965.mp4\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-qa4Tf8PwITC"
      },
      "source": [
        "Gemma-3n accepts video (image frames) and audio separately, so we strip audio from video."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "id": "OR7bhnCawHrF"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "import subprocess\n",
        "\n",
        "video_dir = \"videos\"\n",
        "audio_dir = \"audios\"\n",
        "os.makedirs(audio_dir, exist_ok=True)\n",
        "\n",
        "# Write the audio of every .mp4 in video_dir to a same-numbered .wav in audio_dir.\n",
        "for filename in os.listdir(video_dir):\n",
        "    if not filename.endswith(\".mp4\"):\n",
        "        continue\n",
        "\n",
        "    idx = filename.split(\"_\")[1].split(\".\")[0]\n",
        "    video_path = os.path.join(video_dir, filename)\n",
        "    audio_path = os.path.join(audio_dir, f\"sample_{idx}.wav\")\n",
        "\n",
        "    result = subprocess.run([\n",
        "        \"ffmpeg\", \"-i\", video_path,\n",
        "        \"-q:a\", \"0\", \"-map\", \"a\",\n",
        "        audio_path,\n",
        "        \"-y\"\n",
        "    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n",
        "    # ffmpeg's own output is discarded above, so report a non-zero exit status explicitly;\n",
        "    # otherwise a failed extraction only shows up later as a missing .wav file.\n",
        "    if result.returncode != 0:\n",
        "        print(f\"Warning: ffmpeg failed to extract audio from {video_path}\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "uIlVtxDcwQcy"
      },
      "source": [
        "Construct a new dataset with audio, video, metadata (video categories). This dataset is very cool, it has some questions and answers, captions and more so get creative if you have the GPU VRAM to do so. Here we solve an easier task for educational purposes."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "4eb3613e8efa4fd9adf2cfe27bfbd699",
            "c15cc5cb9d7947a99a01a30e430d0459",
            "1801493cd54742fd99752b2f605af1cb",
            "e5e518d8cf5f4aa5a0ecad6583f0d317",
            "425f9f26bd0647b1989ecb704414aa9f",
            "5eeff3de00c5488db1817328e83bb992",
            "4846c29045294042b8d916cb0fd8f9d6",
            "20b59cdc19684e1c97517e36f5bf8d6a",
            "143d6079d1744eedb41e2e1182bd0f33",
            "c022d8fabedc43ef9db0c8aca82d215e",
            "464ffcc84f48468b8f5d3f08412c6101"
          ]
        },
        "id": "erYr3SdmuS4m",
        "outputId": "0c95ff77-7976-4641-9a51-b7f24f36270d"
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "Generating train split: 0 examples [00:00, ? examples/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "4eb3613e8efa4fd9adf2cfe27bfbd699"
            }
          },
          "metadata": {}
        }
      ],
      "source": [
        "from datasets import Dataset\n",
        "import json\n",
        "\n",
        "def gen():\n",
        "    \"\"\"Yield one record per metadata .json file whose matching video exists under videos/.\"\"\"\n",
        "    meta_dir = \"metadata\"\n",
        "    for filename in os.listdir(meta_dir):\n",
        "        if not filename.endswith(\".json\"):\n",
        "            continue\n",
        "\n",
        "        idx = filename.split(\"_\")[1].split(\".\")[0]\n",
        "        # Metadata without a video on disk is skipped.\n",
        "        if not os.path.exists(f\"videos/sample_{idx}.mp4\"):\n",
        "            continue\n",
        "\n",
        "        with open(os.path.join(meta_dir, filename), \"r\") as f:\n",
        "            metadata = json.load(f)\n",
        "\n",
        "        # Only file names are stored; the directories are prepended where the files are read.\n",
        "        yield {\n",
        "            \"video\": f\"sample_{idx}.mp4\",\n",
        "            \"audio\": f\"sample_{idx}.wav\",\n",
        "            \"content_parent_category\": metadata[\"content_parent_category\"],\n",
        "            \"sample_index\": int(idx),\n",
        "        }\n",
        "\n",
        "dataset = Dataset.from_generator(gen)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CjtgRoSEd9TV"
      },
      "source": [
        "We will speed-up and downsample the audios to save space during training."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "id": "8DDaQ86MD1Y3"
      },
      "outputs": [],
      "source": [
        "import torchaudio\n",
        "from torchaudio.transforms import Resample\n",
        "import os\n",
        "import torch\n",
        "\n",
        "def preprocess_audio(audio_path, target_sample_rate=16000, max_duration_sec=5, speedup_factor=1.25):\n",
        "    \"\"\"\n",
        "    Rewrite the audio file at `audio_path` in place as a shorter, lower-rate clip.\n",
        "\n",
        "    Steps, in order: average the first dimension down to size 1 if it is larger, resample to\n",
        "    `target_sample_rate`, speed up by dropping samples, and truncate to `max_duration_sec` seconds.\n",
        "    Because the file is overwritten, running this twice on the same file applies the speed-up twice.\n",
        "    \"\"\"\n",
        "    waveform, sample_rate = torchaudio.load(audio_path)\n",
        "\n",
        "    # presumably waveform is (channels, samples); averaging dim 0 then yields mono\n",
        "    num_channels = waveform.shape[0]\n",
        "    if num_channels > 1:\n",
        "        waveform = waveform.mean(dim=0, keepdim=True)\n",
        "\n",
        "    if sample_rate != target_sample_rate:\n",
        "        waveform = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)(waveform)\n",
        "        sample_rate = target_sample_rate\n",
        "\n",
        "    # Speed up by keeping samples at positions 0, f, 2f, ... (truncated to integers).\n",
        "    if speedup_factor > 1.0:\n",
        "        num_samples = waveform.shape[1]\n",
        "        keep = torch.arange(0, num_samples, step=speedup_factor).long()\n",
        "        if keep[-1] >= num_samples:\n",
        "            keep = keep[:-1]\n",
        "        waveform = waveform[:, keep]\n",
        "\n",
        "    # Slicing is a no-op when the clip is already short enough.\n",
        "    max_length = int(target_sample_rate * max_duration_sec)\n",
        "    waveform = waveform[:, :max_length]\n",
        "\n",
        "    torchaudio.save(audio_path, waveform, sample_rate)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "id": "IQ7L2_0bI1tP"
      },
      "outputs": [],
      "source": [
        "for file_name in os.listdir(\"audios\"):\n",
        "    if file_name.lower().endswith(\".wav\"):\n",
        "        audio_path = os.path.join(\"audios\", file_name)\n",
        "        preprocess_audio(audio_path)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "id": "pspaO2Lv4SxG"
      },
      "outputs": [],
      "source": [
        "dataset = dataset.train_test_split(test_size=0.10, seed=42)"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Load the model\n",
        "\n",
        "Make sure you have your Hugging Face token in your Colab secrets."
      ],
      "metadata": {
        "id": "hrvYdvQ9Hye4"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 57,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "a33fedc485b346b1b9d4fb8b18e8ac64",
            "94d5d3b00449488caa6d8badc443a74f",
            "a60a111fc7c24bd7b21fed3f3dd64f29",
            "e830732fc2bc4848847ea85c772d0b98",
            "3e25db05674d4d2f8fd839a0ec63e7d8",
            "3262178b8baf4741b06250d7416df1f3",
            "2e9d5cf7a5c6466a9e1de6d4f403cd95",
            "9d2631150d5c4089bcc95f22a6698287",
            "9c0857a4034f4780ab5e7fdd9aa9d09d",
            "073975370eab45d9abc4f69f2b7b3d48",
            "0d1dfc47d0704506bc6e521c07162b4b"
          ]
        },
        "id": "UQaaLBCVzXH-",
        "outputId": "a6244057-777b-4f48-e89e-0d3c945e06e8"
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "a33fedc485b346b1b9d4fb8b18e8ac64"
            }
          },
          "metadata": {}
        }
      ],
      "source": [
        "model = Gemma3nForConditionalGeneration.from_pretrained(\n",
        "    \"google/gemma-3n-E2B-it\", torch_dtype=torch.bfloat16,\n",
        ")\n",
        "processor = AutoProcessor.from_pretrained(\n",
        "    \"google/gemma-3n-E2B-it\",\n",
        ")\n",
        "processor.tokenizer.padding_side = \"right\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "epPCxTFi3XQ2",
        "outputId": "f59ad356-5d7c-463e-9c6c-35eb0f0aa586"
      },
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[2, 1, 3, 0, 262273, 256000, 255999, 262272, 262144, 262145]"
            ]
          },
          "metadata": {},
          "execution_count": 24
        }
      ],
      "source": [
        "processor.tokenizer.all_special_ids"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "i-xR4GHUeQ9l"
      },
      "source": [
        "Write our dataset collator. We will train model to predict category of a video (which can be done easily). You can do much better things, for instance FineVideo has QnA section, you can train this model to do open-ended QnA if you have a big VRAM and a lot of patience. Open-ended tasks are harder to work with, and this notebook carries educational purposes on feeding different modalities.\n",
        "\n",
        "In collator we also downsample videos to 6 frames, we have written the helper above. For better results you need more frames."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 36,
      "metadata": {
        "id": "x_e3IjDCzioP"
      },
      "outputs": [],
      "source": [
        "def collate_fn(examples):\n",
        "  \"\"\"\n",
        "  Build one training batch (text prompt + video frames + audio + target category) with labels.\n",
        "\n",
        "  Only a batch of exactly one example is supported. The sampled frames are written as PNG files\n",
        "  in the working directory so they can be referenced from the chat message.\n",
        "\n",
        "  Returns the processor output moved to `model.device`, with a `labels` entry that copies\n",
        "  `input_ids` and replaces every special-token id with -100.\n",
        "\n",
        "  Raises:\n",
        "    ValueError: if `examples` does not contain exactly one example.\n",
        "  \"\"\"\n",
        "  if len(examples) != 1:\n",
        "    # Anything beyond examples[0] would otherwise be dropped silently.\n",
        "    raise ValueError(f\"collate_fn supports a batch size of 1, got {len(examples)} examples\")\n",
        "\n",
        "  video_path = examples[0][\"video\"]\n",
        "  audio_path = examples[0][\"audio\"]\n",
        "  # Take the index from this example's own file name (\"sample_12.mp4\" -> \"12\") instead of\n",
        "  # relying on a `filename` variable left over from another cell.\n",
        "  sample_idx = video_path.split(\"_\")[1].split(\".\")[0]\n",
        "  frames = downsample_video(f\"videos/{video_path}\")\n",
        "\n",
        "  text = \"Based on the video, predict the category of it.\"\n",
        "  message = [\n",
        "      {\n",
        "          \"role\": \"user\",\n",
        "          \"content\": [\n",
        "              {\"type\": \"text\", \"text\": text}\n",
        "          ],\n",
        "      },\n",
        "  ]\n",
        "  # this is how video inference should be formatted in Gemma3n\n",
        "  for frame in frames:\n",
        "    image, timestamp = frame\n",
        "    message[0][\"content\"].append({\"type\": \"text\", \"text\": f\"Frame {timestamp}:\"})\n",
        "    # Dots in the timestamp are replaced so it can be used inside the file name.\n",
        "    timestamp = str(timestamp).replace(\".\", \"_\")\n",
        "    image.save(f\"image_idx_{sample_idx}_{timestamp}.png\")\n",
        "    message[0][\"content\"].append({\"type\": \"image\", \"url\": f\"image_idx_{sample_idx}_{timestamp}.png\"})\n",
        "\n",
        "  message[0][\"content\"].append({\"type\": \"audio\", \"audio\": f\"audios/{audio_path}\"})\n",
        "  # The assistant turn holds the target: the video's parent category.\n",
        "  message.append({\"role\": \"assistant\", \"content\": [{\"type\": \"text\", \"text\": examples[0][\"content_parent_category\"]}]})\n",
        "  inputs = processor.apply_chat_template(\n",
        "      message,\n",
        "      add_generation_prompt=False,\n",
        "      tokenize=True,\n",
        "      return_dict=True,\n",
        "      return_tensors=\"pt\",\n",
        "      padding=True,\n",
        "  ).to(model.device)\n",
        "\n",
        "  labels = inputs[\"input_ids\"].clone()\n",
        "  special_token_ids = processor.tokenizer.all_special_ids\n",
        "\n",
        "  # Replace special-token positions with -100 (presumably the loss ignore index; confirm for this model).\n",
        "  special_token_ids_tensor = torch.tensor(special_token_ids, device=labels.device)\n",
        "  mask = torch.isin(labels, special_token_ids_tensor)\n",
        "  labels[mask] = -100\n",
        "\n",
        "  inputs[\"labels\"] = labels\n",
        "  if torch.all(inputs[\"pixel_values\"] == 0):\n",
        "    print(\"Frames are dark\")\n",
        "\n",
        "  return inputs"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wM6OxwNTiyZ1"
      },
      "source": [
        "## Training"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "We do LoRA fine-tuning again to save up on space."
      ],
      "metadata": {
        "id": "Wj7yYQTQH7wg"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 58,
      "metadata": {
        "id": "uD3W2OO5-1PC"
      },
      "outputs": [],
      "source": [
        "from peft import LoraConfig\n",
        "peft_config = LoraConfig(\n",
        "    task_type=\"CAUSAL_LM\",\n",
        "    r=16,\n",
        "    target_modules=\"all-linear\",\n",
        "    lora_alpha=32,\n",
        "    lora_dropout=0.05,\n",
        "    bias=\"none\",\n",
        "    use_rslora=False,\n",
        "    use_dora=False,\n",
        "    modules_to_save=None\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 59,
      "metadata": {
        "id": "CT7xlPul8RNJ"
      },
      "outputs": [],
      "source": [
        "model.gradient_checkpointing_disable()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 60,
      "metadata": {
        "id": "3stdS0v15tnY"
      },
      "outputs": [],
      "source": [
        "model.config.use_cache = False"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 61,
      "metadata": {
        "id": "zG53iSes76H-"
      },
      "outputs": [],
      "source": [
        "training_args = SFTConfig(\n",
        "    output_dir=\"/content/gemma-3n-finevideo\",\n",
        "    eval_strategy='epoch',\n",
        "    per_device_train_batch_size=1,\n",
        "    per_device_eval_batch_size=1,\n",
        "    gradient_accumulation_steps=4,\n",
        "    gradient_checkpointing=False,\n",
        "    learning_rate=1e-05,\n",
        "    num_train_epochs=3.0,\n",
        "    logging_steps=10,\n",
        "    save_steps=100,\n",
        "    bf16=True,\n",
        "    report_to=[\"tensorboard\"],\n",
        "    dataset_kwargs={'skip_prepare_dataset': True},\n",
        "    remove_unused_columns=False,\n",
        "    max_seq_length=None,\n",
        "    push_to_hub=True,\n",
        "    dataloader_pin_memory=False,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 62,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hPaplK2u70D9",
        "outputId": "4bd2f1cd-e4d2-4e38-e555-ec2e07528e02"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.\n"
          ]
        }
      ],
      "source": [
        "trainer = SFTTrainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    data_collator=collate_fn,\n",
        "    train_dataset=dataset[\"train\"],\n",
        "    eval_dataset=dataset[\"test\"] if training_args.eval_strategy != \"no\" else None,\n",
        "    processing_class=processor.tokenizer,\n",
        "    peft_config=peft_config,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 63,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 221
        },
        "id": "gsBJcyqe8ET1",
        "outputId": "9aa717c5-e046-42e7-91c7-deae74aa5407"
      },
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='588' max='588' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [588/588 1:28:09, Epoch 3/3]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              " <tr style=\"text-align: left;\">\n",
              "      <th>Epoch</th>\n",
              "      <th>Training Loss</th>\n",
              "      <th>Validation Loss</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>1</td>\n",
              "      <td>1.363500</td>\n",
              "      <td>3.557561</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>2</td>\n",
              "      <td>0.981800</td>\n",
              "      <td>3.502365</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>3</td>\n",
              "      <td>0.844200</td>\n",
              "      <td>3.512452</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "TrainOutput(global_step=588, training_loss=1.369473821451875, metrics={'train_runtime': 5299.3753, 'train_samples_per_second': 0.443, 'train_steps_per_second': 0.111, 'total_flos': 7.490494981503706e+16, 'train_loss': 1.369473821451875})"
            ]
          },
          "metadata": {},
          "execution_count": 63
        }
      ],
      "source": [
        "trainer.train()"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Test the model with a video of snowboarding."
      ],
      "metadata": {
        "id": "qKtWUXVoUyKE"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 67,
      "metadata": {
        "id": "X5fOWf2bRERq",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "5daa499e-56c9-4241-eb04-c8c29864ee9e"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "--2025-07-16 13:18:33--  https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4\n",
            "Resolving huggingface.co (huggingface.co)... 18.160.143.99, 18.160.143.32, 18.160.143.75, ...\n",
            "Connecting to huggingface.co (huggingface.co)|18.160.143.99|:443... connected.\n",
            "HTTP request sent, awaiting response... 302 Found\n",
            "Location: https://cdn-lfs-us-1.hf.co/repos/7b/14/7b14679bb56cefbf7829be71f3f444110ccc308f431bd8596f534e743367ea5c/6331cbb913feb48349e3b7015a7969e04ce3cd594b1bda7278e4e33fe4a3f5f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27IMG_8137.mp4%3B+filename%3D%22IMG_8137.mp4%22%3B&response-content-type=video%2Fmp4&Expires=1752675513&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MjY3NTUxM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzdiLzE0LzdiMTQ2NzliYjU2Y2VmYmY3ODI5YmU3MWYzZjQ0NDExMGNjYzMwOGY0MzFiZDg1OTZmNTM0ZTc0MzM2N2VhNWMvNjMzMWNiYjkxM2ZlYjQ4MzQ5ZTNiNzAxNWE3OTY5ZTA0Y2UzY2Q1OTRiMWJkYTcyNzhlNGUzM2ZlNGEzZjVmMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=dKwm2ee9rdtmzuZ8tVMOOJWndfV85S9dKaTwiZbVQt3N6-1dtWkDKXbIsjuD%7Eyriu1dnXNDSjXSDIn-s7ypd8Ie-U1ABXw5Ou6CZ03Z9U4JIQDWBMwEGGEZ6HFCx0mR3royc3u-AKekcIw7zEOFtfAZ%7Eo0XT7l3BiAAV3IVu94m1ONONU779D1gSgPo1sWfuqWydAefPe2NVmSxY1HvH7DHxVOVRuGTfegXN59hvZKhSfZ0Dk0WqBjhReYVdEVxl5j-5pynjo-G%7EUsvldEcxxQpPdcD1DuOGQvYc0KyWw2Tyv3ibU7vhT%7EwVpvdG6tdIi2QOACJ4rfeaVWn5twIHxw__&Key-Pair-Id=K24J24Z295AEI9 [following]\n",
            "--2025-07-16 13:18:33--  https://cdn-lfs-us-1.hf.co/repos/7b/14/7b14679bb56cefbf7829be71f3f444110ccc308f431bd8596f534e743367ea5c/6331cbb913feb48349e3b7015a7969e04ce3cd594b1bda7278e4e33fe4a3f5f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27IMG_8137.mp4%3B+filename%3D%22IMG_8137.mp4%22%3B&response-content-type=video%2Fmp4&Expires=1752675513&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MjY3NTUxM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzdiLzE0LzdiMTQ2NzliYjU2Y2VmYmY3ODI5YmU3MWYzZjQ0NDExMGNjYzMwOGY0MzFiZDg1OTZmNTM0ZTc0MzM2N2VhNWMvNjMzMWNiYjkxM2ZlYjQ4MzQ5ZTNiNzAxNWE3OTY5ZTA0Y2UzY2Q1OTRiMWJkYTcyNzhlNGUzM2ZlNGEzZjVmMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=dKwm2ee9rdtmzuZ8tVMOOJWndfV85S9dKaTwiZbVQt3N6-1dtWkDKXbIsjuD%7Eyriu1dnXNDSjXSDIn-s7ypd8Ie-U1ABXw5Ou6CZ03Z9U4JIQDWBMwEGGEZ6HFCx0mR3royc3u-AKekcIw7zEOFtfAZ%7Eo0XT7l3BiAAV3IVu94m1ONONU779D1gSgPo1sWfuqWydAefPe2NVmSxY1HvH7DHxVOVRuGTfegXN59hvZKhSfZ0Dk0WqBjhReYVdEVxl5j-5pynjo-G%7EUsvldEcxxQpPdcD1DuOGQvYc0KyWw2Tyv3ibU7vhT%7EwVpvdG6tdIi2QOACJ4rfeaVWn5twIHxw__&Key-Pair-Id=K24J24Z295AEI9\n",
            "Resolving cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)... 3.169.202.18, 3.169.202.35, 3.169.202.26, ...\n",
            "Connecting to cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)|3.169.202.18|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 5340706 (5.1M) [video/mp4]\n",
            "Saving to: ‘IMG_8137.mp4’\n",
            "\n",
            "IMG_8137.mp4        100%[===================>]   5.09M  --.-KB/s    in 0.1s    \n",
            "\n",
            "2025-07-16 13:18:33 (38.9 MB/s) - ‘IMG_8137.mp4’ saved [5340706/5340706]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "!wget https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "model = trainer.model # trainer has the adapter"
      ],
      "metadata": {
        "id": "KBfMiUChc2Ky"
      },
      "execution_count": 89,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Strip audio and downsample video."
      ],
      "metadata": {
        "id": "R14WzyjbZCwI"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "audio_path = \"/content/test_audio.wav\"\n",
        "subprocess.run([\n",
        "        \"ffmpeg\", \"-i\", \"/content/IMG_8137.mp4\",\n",
        "        \"-q:a\", \"0\", \"-map\", \"a\",\n",
        "        f\"{audio_path}\",\n",
        "        \"-y\"\n",
        "    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "RnJZ-QNJaOqp",
        "outputId": "c2f42e28-d427-4da7-cf86-6c3b70e6ee02"
      },
      "execution_count": 97,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "CompletedProcess(args=['ffmpeg', '-i', '/content/IMG_8137.mp4', '-q:a', '0', '-map', 'a', '/content/test_audio.wav', '-y'], returncode=0)"
            ]
          },
          "metadata": {},
          "execution_count": 97
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "frames = downsample_video(\"/content/IMG_8137.mp4\")\n",
        "\n",
        "# repeat the chat template\n",
        "text = \"Based on the video, predict the category of it.\"\n",
        "message = [\n",
        "    {\n",
        "        \"role\": \"user\",\n",
        "        \"content\": [\n",
        "            {\"type\": \"text\", \"text\": text}\n",
        "        ],\n",
        "    },\n",
        "]\n",
        "for frame in frames:\n",
        "  image, timestamp = frame\n",
        "  message[0][\"content\"].append({\"type\": \"text\", \"text\": f\"Frame {timestamp}:\"})\n",
        "  timestamp = str(timestamp).replace(\".\", \"_\")\n",
        "  image.save(f\"test_frame_{timestamp}.png\")\n",
        "  message[0][\"content\"].append({\"type\": \"image\", \"url\": f\"test_frame_{timestamp}.png\"})\n",
        "\n",
        "message[0][\"content\"].append({\"type\": \"audio\", \"audio\": f\"{audio_path}\"})"
      ],
      "metadata": {
        "id": "9drrCnfRYi6O"
      },
      "execution_count": 98,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "message"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7s1Dhxf_Z3xU",
        "outputId": "1eba1e9e-d859-4aa7-ff4e-992ef272df7c"
      },
      "execution_count": 99,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[{'role': 'user',\n",
              "  'content': [{'type': 'text',\n",
              "    'text': 'Based on the video, predict the category of it.'},\n",
              "   {'type': 'text', 'text': 'Frame 0.88:'},\n",
              "   {'type': 'image', 'url': 'test_frame_0_88.png'},\n",
              "   {'type': 'text', 'text': 'Frame 1.79:'},\n",
              "   {'type': 'image', 'url': 'test_frame_1_79.png'},\n",
              "   {'type': 'text', 'text': 'Frame 2.67:'},\n",
              "   {'type': 'image', 'url': 'test_frame_2_67.png'},\n",
              "   {'type': 'text', 'text': 'Frame 3.57:'},\n",
              "   {'type': 'image', 'url': 'test_frame_3_57.png'},\n",
              "   {'type': 'text', 'text': 'Frame 4.45:'},\n",
              "   {'type': 'image', 'url': 'test_frame_4_45.png'},\n",
              "   {'type': 'text', 'text': 'Frame 5.36:'},\n",
              "   {'type': 'image', 'url': 'test_frame_5_36.png'},\n",
              "   {'type': 'audio', 'audio': '/content/test_audio.wav'}]}]"
            ]
          },
          "metadata": {},
          "execution_count": 99
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "inputs = processor.apply_chat_template(\n",
        "    message,\n",
        "    add_generation_prompt=True,\n",
        "    tokenize=True,\n",
        "    return_dict=True,\n",
        "    return_tensors=\"pt\",\n",
        "    padding=True,\n",
        ").to(model.device).to(model.dtype)"
      ],
      "metadata": {
        "id": "xNTQRMzsZyQz"
      },
      "execution_count": 100,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "input_len = inputs[\"input_ids\"].shape[-1]\n",
        "\n",
        "with torch.inference_mode():\n",
        "    generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)\n",
        "    generation = generation[0][input_len:]\n",
        "\n",
        "decoded = processor.decode(generation, skip_special_tokens=True)\n",
        "print(decoded)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WNfnannnZ5-S",
        "outputId": "0afca313-a4f7-4c02-872e-665a853a19df"
      },
      "execution_count": 101,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Snowboarding\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Thanks a lot for reading! Keep training the model further with more data or unfreeze the layers for better performance 💗"
      ],
      "metadata": {
        "id": "LOUBj5dgeddG"
      }
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "4KnNR6lneuKm"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "A100",
      "machine_shape": "hm",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "4eb3613e8efa4fd9adf2cfe27bfbd699": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_c15cc5cb9d7947a99a01a30e430d0459",
              "IPY_MODEL_1801493cd54742fd99752b2f605af1cb",
              "IPY_MODEL_e5e518d8cf5f4aa5a0ecad6583f0d317"
            ],
            "layout": "IPY_MODEL_425f9f26bd0647b1989ecb704414aa9f"
          }
        },
        "c15cc5cb9d7947a99a01a30e430d0459": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_5eeff3de00c5488db1817328e83bb992",
            "placeholder": "​",
            "style": "IPY_MODEL_4846c29045294042b8d916cb0fd8f9d6",
            "value": "Generating train split: "
          }
        },
        "1801493cd54742fd99752b2f605af1cb": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_20b59cdc19684e1c97517e36f5bf8d6a",
            "max": 1,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_143d6079d1744eedb41e2e1182bd0f33",
            "value": 1
          }
        },
        "e5e518d8cf5f4aa5a0ecad6583f0d317": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_c022d8fabedc43ef9db0c8aca82d215e",
            "placeholder": "​",
            "style": "IPY_MODEL_464ffcc84f48468b8f5d3f08412c6101",
            "value": " 869/0 [00:00&lt;00:00, 8490.20 examples/s]"
          }
        },
        "425f9f26bd0647b1989ecb704414aa9f": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "5eeff3de00c5488db1817328e83bb992": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "4846c29045294042b8d916cb0fd8f9d6": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "20b59cdc19684e1c97517e36f5bf8d6a": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": "20px"
          }
        },
        "143d6079d1744eedb41e2e1182bd0f33": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "c022d8fabedc43ef9db0c8aca82d215e": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "464ffcc84f48468b8f5d3f08412c6101": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "a33fedc485b346b1b9d4fb8b18e8ac64": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_94d5d3b00449488caa6d8badc443a74f",
              "IPY_MODEL_a60a111fc7c24bd7b21fed3f3dd64f29",
              "IPY_MODEL_e830732fc2bc4848847ea85c772d0b98"
            ],
            "layout": "IPY_MODEL_3e25db05674d4d2f8fd839a0ec63e7d8"
          }
        },
        "94d5d3b00449488caa6d8badc443a74f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_3262178b8baf4741b06250d7416df1f3",
            "placeholder": "​",
            "style": "IPY_MODEL_2e9d5cf7a5c6466a9e1de6d4f403cd95",
            "value": "Loading checkpoint shards: 100%"
          }
        },
        "a60a111fc7c24bd7b21fed3f3dd64f29": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_9d2631150d5c4089bcc95f22a6698287",
            "max": 3,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_9c0857a4034f4780ab5e7fdd9aa9d09d",
            "value": 3
          }
        },
        "e830732fc2bc4848847ea85c772d0b98": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_073975370eab45d9abc4f69f2b7b3d48",
            "placeholder": "​",
            "style": "IPY_MODEL_0d1dfc47d0704506bc6e521c07162b4b",
            "value": " 3/3 [00:00&lt;00:00,  3.91it/s]"
          }
        },
        "3e25db05674d4d2f8fd839a0ec63e7d8": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "3262178b8baf4741b06250d7416df1f3": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "2e9d5cf7a5c6466a9e1de6d4f403cd95": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "9d2631150d5c4089bcc95f22a6698287": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "9c0857a4034f4780ab5e7fdd9aa9d09d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "073975370eab45d9abc4f69f2b7b3d48": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "0d1dfc47d0704506bc6e521c07162b4b": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        }
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}