merve (HF Staff) committed · Commit 6e994a0 · verified · 1 Parent(s): 26e8458

Upload Gemma3n_Fine_tuning_on_All_Modalities.ipynb

Gemma3n_Fine_tuning_on_All_Modalities.ipynb ADDED
@@ -0,0 +1,1766 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# Fine-tune Gemma3n on FineVideo\n",
7
+ "\n",
8
+ "In this notebook, we will see how to fine-tune Gemma3n an videos with audios inside.\n",
9
+ "Using all three modalities is very costly compute-wise, so keep in mind that this is an educational tutorial to fit the model in 40GB VRAM."
10
+ ],
11
+ "metadata": {
12
+ "id": "0eVo7Mc5GMyL"
13
+ }
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 1,
18
+ "metadata": {
19
+ "id": "BLv-NJRZzHiA",
20
+ "colab": {
21
+ "base_uri": "https://localhost:8080/"
22
+ },
23
+ "outputId": "bb4e4b32-5000-42e0-889d-90648e335a41"
24
+ },
25
+ "outputs": [
26
+ {
27
+ "output_type": "stream",
28
+ "name": "stdout",
29
+ "text": [
30
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
31
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.8/10.8 MB\u001b[0m \u001b[31m114.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
32
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m376.2/376.2 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
33
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m494.8/494.8 kB\u001b[0m \u001b[31m38.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
34
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.6/193.6 kB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
35
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
36
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m126.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
37
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m92.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
38
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m58.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
39
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
40
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
41
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m42.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
42
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
43
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
44
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m114.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
45
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
46
+ "gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\u001b[0m\u001b[31m\n",
47
+ "\u001b[0m"
48
+ ]
49
+ }
50
+ ],
51
+ "source": [
52
+ "!pip install -U -q timm transformers trl peft datasets"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 2,
58
+ "metadata": {
59
+ "id": "UxE2vzKsbov0"
60
+ },
61
+ "outputs": [],
62
+ "source": [
63
+ "import io\n",
64
+ "import os\n",
65
+ "import zipfile\n",
66
+ "\n",
67
+ "import torch\n",
68
+ "from datasets import load_dataset\n",
69
+ "from PIL import Image\n",
70
+ "from transformers import AutoProcessor, Gemma3nForConditionalGeneration\n",
71
+ "\n",
72
+ "from trl import (\n",
73
+ " SFTConfig,\n",
74
+ " SFTTrainer,\n",
75
+ ")"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "markdown",
80
+ "metadata": {
81
+ "id": "T06yJvcMiqO6"
82
+ },
83
+ "source": [
84
+ "## Download videos and preprocessing\n",
85
+ "\n",
86
+ "FineVideo is a quite large dataset, we don't need a ton of examples, so we stream the dataset, check the duration and download the videos shorter than 30 secs."
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {
93
+ "id": "wBFfYgLxmg7b"
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "from datasets import load_dataset\n",
98
+ "import json\n",
99
+ "import os\n",
100
+ "\n",
101
+ "dataset = load_dataset(\"HuggingFaceFV/finevideo\", split=\"train\", streaming=True)\n",
102
+ "\n",
103
+ "\n",
104
+ "os.makedirs(\"videos\", exist_ok=True)\n",
105
+ "os.makedirs(\"metadata\", exist_ok=True)\n",
106
+ "\n",
107
+ "for idx, sample in enumerate(dataset):\n",
108
+ " data = sample[\"json\"]\n",
109
+ " duration = data.get(\"duration_seconds\", 0)\n",
110
+ " if duration < 30:\n",
111
+ " video_filename = f\"videos/sample_{idx}.mp4\"\n",
112
+ " with open(video_filename, 'wb') as video_file:\n",
113
+ " video_file.write(sample['mp4'])\n",
114
+ "\n",
115
+ " json_filename = f\"metadata/sample_{idx}.json\"\n",
116
+ " with open(json_filename, 'w') as json_file:\n",
117
+ " json.dump(sample['json'], json_file)\n"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 7,
123
+ "metadata": {
124
+ "colab": {
125
+ "base_uri": "https://localhost:8080/"
126
+ },
127
+ "id": "K48dmmZTdZ1l",
128
+ "outputId": "31c7c32b-1c40-4df4-eb51-11857d7b4da9"
129
+ },
130
+ "outputs": [
131
+ {
132
+ "output_type": "stream",
133
+ "name": "stdout",
134
+ "text": [
135
+ "Number of items in content/videos: 871\n"
136
+ ]
137
+ }
138
+ ],
139
+ "source": [
140
+ " print(f\"Number of items in content/videos: {len(os.listdir('videos'))}\")"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "markdown",
145
+ "source": [
146
+ "In FineVideo some frames are dark so we downsample 6 frames and if we can't get meaningful videos we remove them."
147
+ ],
148
+ "metadata": {
149
+ "id": "QbkDI03qHMog"
150
+ }
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 10,
155
+ "metadata": {
156
+ "id": "0UMZi3tHb-BC"
157
+ },
158
+ "outputs": [],
159
+ "source": [
160
+ "import cv2\n",
161
+ "from PIL import Image\n",
162
+ "import numpy as np\n",
163
+ "\n",
164
+ "def is_dark(frame, threshold=10):\n",
165
+ " return np.max(frame) < threshold # all pixels are very close to 0\n",
166
+ "\n",
167
+ "def downsample_video(video_path):\n",
168
+ " vidcap = cv2.VideoCapture(video_path)\n",
169
+ " total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
170
+ " fps = vidcap.get(cv2.CAP_PROP_FPS)\n",
171
+ "\n",
172
+ " frames = []\n",
173
+ "\n",
174
+ " # Generate 8 evenly spaced indices, skip first and last\n",
175
+ " full_indices = np.linspace(0, total_frames - 1, 8, dtype=int)[1:-1]\n",
176
+ "\n",
177
+ " for i in full_indices:\n",
178
+ " found_valid = False\n",
179
+ " for offset in [0, -1, 1, -2, 2]: # Try nearby frames if original is dark\n",
180
+ " candidate_idx = i + offset\n",
181
+ " if 0 <= candidate_idx < total_frames:\n",
182
+ " vidcap.set(cv2.CAP_PROP_POS_FRAMES, candidate_idx)\n",
183
+ " success, image = vidcap.read()\n",
184
+ " if success:\n",
185
+ " if not is_dark(image):\n",
186
+ " image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n",
187
+ " pil_image = Image.fromarray(image)\n",
188
+ " timestamp = round(candidate_idx / fps, 2)\n",
189
+ " frames.append((pil_image, timestamp))\n",
190
+ " found_valid = True\n",
191
+ " break\n",
192
+ " if not found_valid:\n",
193
+ " print(f\"Warning: Could not find non-dark frame near index {i}\")\n",
194
+ "\n",
195
+ " vidcap.release()\n",
196
+ "\n",
197
+ " # If still fewer than 8, try to top off by scanning more frames\n",
198
+ " if len(frames) < 6:\n",
199
+ " print(\"Trying to top off with additional non-dark frames...\")\n",
200
+ " idx = 0\n",
201
+ " while len(frames) < 8 and idx < total_frames:\n",
202
+ " vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)\n",
203
+ " success, image = vidcap.read()\n",
204
+ " if success and not is_dark(image):\n",
205
+ " image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n",
206
+ " pil_image = Image.fromarray(image)\n",
207
+ " timestamp = round(idx / fps, 2)\n",
208
+ " # Avoid adding duplicate timestamps\n",
209
+ " if not any(ts == timestamp for _, ts in frames):\n",
210
+ " frames.append((pil_image, timestamp))\n",
211
+ " idx += 1\n",
212
+ "\n",
213
+ " return frames[:8] # Ensure exactly 8 frames\n",
214
+ "\n",
215
+ "import os\n",
216
+ "import glob\n",
217
+ "\n",
218
+ "def remove_dark_videos(video_dir, metadata_dir, audio_dir):\n",
219
+ " \"\"\"\n",
220
+ " Remove videos (and their metadata/audio files) if all frames are dark.\n",
221
+ " \"\"\"\n",
222
+ " video_paths = glob.glob(os.path.join(video_dir, \"*.mp4\"))\n",
223
+ "\n",
224
+ " for video_path in video_paths:\n",
225
+ " filename = os.path.basename(video_path)\n",
226
+ " base_name = os.path.splitext(filename)[0]\n",
227
+ "\n",
228
+ " frames = downsample_video(video_path)\n",
229
+ " if len(frames) < 6:\n",
230
+ " try:\n",
231
+ " os.remove(video_path)\n",
232
+ " print(f\"Deleted: {video_path}\")\n",
233
+ " except Exception as e:\n",
234
+ " print(f\"Failed to delete {video_path}: {e}\")\n",
235
+ "\n",
236
+ " metadata_path = os.path.join(metadata_dir, f\"{base_name}.json\")\n",
237
+ " if os.path.exists(metadata_path):\n",
238
+ " os.remove(metadata_path)\n",
239
+ "\n",
240
+ " # Remove audio\n",
241
+ " audio_path = os.path.join(audio_dir, f\"{base_name}.wav\")\n",
242
+ " if os.path.exists(audio_path):\n",
243
+ " os.remove(audio_path)\n",
244
+ "\n"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "source": [
250
+ "remove_dark_videos(\n",
251
+ " video_dir=\"videos\",\n",
252
+ " metadata_dir=\"metadata\",\n",
253
+ " audio_dir=\"audios\"\n",
254
+ " )"
255
+ ],
256
+ "metadata": {
257
+ "colab": {
258
+ "base_uri": "https://localhost:8080/"
259
+ },
260
+ "id": "pA6iIR38l66-",
261
+ "outputId": "78f81f41-5e70-4900-e33c-cd918aaed67d"
262
+ },
263
+ "execution_count": 12,
264
+ "outputs": [
265
+ {
266
+ "output_type": "stream",
267
+ "name": "stdout",
268
+ "text": [
269
+ "Warning: Could not find non-dark frame near index 208\n",
270
+ "Trying to top off with additional non-dark frames...\n",
271
+ "Deleted: videos/sample_9650.mp4\n",
272
+ "Warning: Could not find non-dark frame near index 432\n",
273
+ "Trying to top off with additional non-dark frames...\n",
274
+ "Deleted: videos/sample_31965.mp4\n"
275
+ ]
276
+ }
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "markdown",
281
+ "metadata": {
282
+ "id": "-qa4Tf8PwITC"
283
+ },
284
+ "source": [
285
+ "Gemma-3n accepts video (image frames) and audio separately, so we strip audio from video."
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": 8,
291
+ "metadata": {
292
+ "id": "OR7bhnCawHrF"
293
+ },
294
+ "outputs": [],
295
+ "source": [
296
+ "import os\n",
297
+ "import subprocess\n",
298
+ "\n",
299
+ "video_dir = \"videos\"\n",
300
+ "audio_dir = \"audios\"\n",
301
+ "os.makedirs(audio_dir, exist_ok=True)\n",
302
+ "\n",
303
+ "for filename in os.listdir(video_dir):\n",
304
+ " if not filename.endswith(\".mp4\"):\n",
305
+ " continue\n",
306
+ "\n",
307
+ " idx = filename.split(\"_\")[1].split(\".\")[0]\n",
308
+ " video_path = os.path.join(video_dir, filename)\n",
309
+ " audio_path = os.path.join(audio_dir, f\"sample_{idx}.wav\")\n",
310
+ "\n",
311
+ " subprocess.run([\n",
312
+ " \"ffmpeg\", \"-i\", video_path,\n",
313
+ " \"-q:a\", \"0\", \"-map\", \"a\",\n",
314
+ " audio_path,\n",
315
+ " \"-y\"\n",
316
+ " ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "markdown",
321
+ "metadata": {
322
+ "id": "uIlVtxDcwQcy"
323
+ },
324
+ "source": [
325
+ "Construct a new dataset with audio, video, metadata (video categories). This dataset is very cool, it has some questions and answers, captions and more so get creative if you have the GPU VRAM to do so. Here we solve an easier task for educational purposes."
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 13,
331
+ "metadata": {
332
+ "colab": {
333
+ "base_uri": "https://localhost:8080/",
334
+ "height": 49,
335
+ "referenced_widgets": [
336
+ "4eb3613e8efa4fd9adf2cfe27bfbd699",
337
+ "c15cc5cb9d7947a99a01a30e430d0459",
338
+ "1801493cd54742fd99752b2f605af1cb",
339
+ "e5e518d8cf5f4aa5a0ecad6583f0d317",
340
+ "425f9f26bd0647b1989ecb704414aa9f",
341
+ "5eeff3de00c5488db1817328e83bb992",
342
+ "4846c29045294042b8d916cb0fd8f9d6",
343
+ "20b59cdc19684e1c97517e36f5bf8d6a",
344
+ "143d6079d1744eedb41e2e1182bd0f33",
345
+ "c022d8fabedc43ef9db0c8aca82d215e",
346
+ "464ffcc84f48468b8f5d3f08412c6101"
347
+ ]
348
+ },
349
+ "id": "erYr3SdmuS4m",
350
+ "outputId": "0c95ff77-7976-4641-9a51-b7f24f36270d"
351
+ },
352
+ "outputs": [
353
+ {
354
+ "output_type": "display_data",
355
+ "data": {
356
+ "text/plain": [
357
+ "Generating train split: 0 examples [00:00, ? examples/s]"
358
+ ],
359
+ "application/vnd.jupyter.widget-view+json": {
360
+ "version_major": 2,
361
+ "version_minor": 0,
362
+ "model_id": "4eb3613e8efa4fd9adf2cfe27bfbd699"
363
+ }
364
+ },
365
+ "metadata": {}
366
+ }
367
+ ],
368
+ "source": [
369
+ "from datasets import Dataset\n",
370
+ "import json\n",
371
+ "\n",
372
+ "def gen():\n",
373
+ " meta_dir = \"metadata\"\n",
374
+ " for filename in os.listdir(meta_dir):\n",
375
+ " if not filename.endswith(\".json\"):\n",
376
+ " continue\n",
377
+ "\n",
378
+ " idx = filename.split(\"_\")[1].split(\".\")[0]\n",
379
+ " if os.path.exists(f\"videos/sample_{idx}.mp4\"):\n",
380
+ " video_filename = f\"sample_{idx}.mp4\"\n",
381
+ " audio_filename = f\"sample_{idx}.wav\"\n",
382
+ " json_path = os.path.join(meta_dir, filename)\n",
383
+ "\n",
384
+ " with open(json_path, \"r\") as f:\n",
385
+ " metadata = json.load(f)\n",
386
+ "\n",
387
+ "\n",
388
+ " yield {\n",
389
+ " \"video\": video_filename,\n",
390
+ " \"audio\": audio_filename,\n",
391
+ " \"content_parent_category\": metadata[\"content_parent_category\"],\n",
392
+ " \"sample_index\": int(idx)\n",
393
+ " }\n",
394
+ " else:\n",
395
+ " pass\n",
396
+ "\n",
397
+ "dataset = Dataset.from_generator(gen)\n"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "markdown",
402
+ "metadata": {
403
+ "id": "CjtgRoSEd9TV"
404
+ },
405
+ "source": [
406
+ "We will speed-up and downsample the audios to save space during training."
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 14,
412
+ "metadata": {
413
+ "id": "8DDaQ86MD1Y3"
414
+ },
415
+ "outputs": [],
416
+ "source": [
417
+ "import torchaudio\n",
418
+ "from torchaudio.transforms import Resample\n",
419
+ "import os\n",
420
+ "import torch\n",
421
+ "\n",
422
+ "def preprocess_audio(audio_path, target_sample_rate=16000, max_duration_sec=5, speedup_factor=1.25):\n",
423
+ " waveform, sample_rate = torchaudio.load(audio_path)\n",
424
+ "\n",
425
+ " if waveform.shape[0] > 1:\n",
426
+ " waveform = waveform.mean(dim=0, keepdim=True)\n",
427
+ "\n",
428
+ " if sample_rate != target_sample_rate:\n",
429
+ " resampler = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)\n",
430
+ " waveform = resampler(waveform)\n",
431
+ " sample_rate = target_sample_rate\n",
432
+ "\n",
433
+ " if speedup_factor > 1.0:\n",
434
+ " indices = torch.arange(0, waveform.shape[1], step=speedup_factor).long()\n",
435
+ " if indices[-1] >= waveform.shape[1]:\n",
436
+ " indices = indices[:-1]\n",
437
+ " waveform = waveform[:, indices]\n",
438
+ "\n",
439
+ " max_length = int(target_sample_rate * max_duration_sec)\n",
440
+ " if waveform.shape[1] > max_length:\n",
441
+ " waveform = waveform[:, :max_length]\n",
442
+ "\n",
443
+ " torchaudio.save(audio_path, waveform, sample_rate)\n"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 15,
449
+ "metadata": {
450
+ "id": "IQ7L2_0bI1tP"
451
+ },
452
+ "outputs": [],
453
+ "source": [
454
+ "for file_name in os.listdir(\"audios\"):\n",
455
+ " if file_name.lower().endswith(\".wav\"):\n",
456
+ " audio_path = os.path.join(\"audios\", file_name)\n",
457
+ " preprocess_audio(audio_path)"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": 16,
463
+ "metadata": {
464
+ "id": "pspaO2Lv4SxG"
465
+ },
466
+ "outputs": [],
467
+ "source": [
468
+ "dataset = dataset.train_test_split(test_size=0.10, seed=42)"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "markdown",
473
+ "source": [
474
+ "### Load the model\n",
475
+ "\n",
476
+ "Make sure you have your Hugging Face token in your Colab secrets."
477
+ ],
478
+ "metadata": {
479
+ "id": "hrvYdvQ9Hye4"
480
+ }
481
+ },
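+ {
+ "cell_type": "markdown",
+ "source": [
+ "A minimal login sketch, assuming the token is stored in a Colab secret named `HF_TOKEN` (adjust the name to whatever you used):"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the token from Colab secrets and authenticate with the Hub.\n",
+ "# HF_TOKEN is an assumed secret name, not something set earlier in this notebook.\n",
+ "from google.colab import userdata\n",
+ "from huggingface_hub import login\n",
+ "\n",
+ "login(token=userdata.get(\"HF_TOKEN\"))"
+ ]
+ },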
482
+ {
483
+ "cell_type": "code",
484
+ "execution_count": 57,
485
+ "metadata": {
486
+ "colab": {
487
+ "base_uri": "https://localhost:8080/",
488
+ "height": 49,
489
+ "referenced_widgets": [
490
+ "a33fedc485b346b1b9d4fb8b18e8ac64",
491
+ "94d5d3b00449488caa6d8badc443a74f",
492
+ "a60a111fc7c24bd7b21fed3f3dd64f29",
493
+ "e830732fc2bc4848847ea85c772d0b98",
494
+ "3e25db05674d4d2f8fd839a0ec63e7d8",
495
+ "3262178b8baf4741b06250d7416df1f3",
496
+ "2e9d5cf7a5c6466a9e1de6d4f403cd95",
497
+ "9d2631150d5c4089bcc95f22a6698287",
498
+ "9c0857a4034f4780ab5e7fdd9aa9d09d",
499
+ "073975370eab45d9abc4f69f2b7b3d48",
500
+ "0d1dfc47d0704506bc6e521c07162b4b"
501
+ ]
502
+ },
503
+ "id": "UQaaLBCVzXH-",
504
+ "outputId": "a6244057-777b-4f48-e89e-0d3c945e06e8"
505
+ },
506
+ "outputs": [
507
+ {
508
+ "output_type": "display_data",
509
+ "data": {
510
+ "text/plain": [
511
+ "Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
512
+ ],
513
+ "application/vnd.jupyter.widget-view+json": {
514
+ "version_major": 2,
515
+ "version_minor": 0,
516
+ "model_id": "a33fedc485b346b1b9d4fb8b18e8ac64"
517
+ }
518
+ },
519
+ "metadata": {}
520
+ }
521
+ ],
522
+ "source": [
523
+ "model = Gemma3nForConditionalGeneration.from_pretrained(\n",
524
+ " \"google/gemma-3n-E2B-it\", torch_dtype=torch.bfloat16,\n",
525
+ ")\n",
526
+ "processor = AutoProcessor.from_pretrained(\n",
527
+ " \"google/gemma-3n-E2B-it\",\n",
528
+ ")\n",
529
+ "processor.tokenizer.padding_side = \"right\""
530
+ ]
531
+ },
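+ {
+ "cell_type": "markdown",
+ "source": [
+ "As a quick sanity check against the 40 GB VRAM budget, you can print the model's memory footprint (a sketch; this counts parameters and buffers only, training activations come on top):"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rough size check: parameter and buffer memory only, so a lower bound on VRAM use.\n",
+ "print(f\"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB\")"
+ ]
+ },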
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": null,
535
+ "metadata": {
536
+ "colab": {
537
+ "base_uri": "https://localhost:8080/"
538
+ },
539
+ "id": "epPCxTFi3XQ2",
540
+ "outputId": "f59ad356-5d7c-463e-9c6c-35eb0f0aa586"
541
+ },
542
+ "outputs": [
543
+ {
544
+ "output_type": "execute_result",
545
+ "data": {
546
+ "text/plain": [
547
+ "[2, 1, 3, 0, 262273, 256000, 255999, 262272, 262144, 262145]"
548
+ ]
549
+ },
550
+ "metadata": {},
551
+ "execution_count": 24
552
+ }
553
+ ],
554
+ "source": [
555
+ "processor.tokenizer.all_special_ids"
556
+ ]
557
+ },
558
+ {
559
+ "cell_type": "markdown",
560
+ "metadata": {
561
+ "id": "i-xR4GHUeQ9l"
562
+ },
563
+ "source": [
564
+ "Write our dataset collator. We will train model to predict category of a video (which can be done easily). You can do much better things, for instance FineVideo has QnA section, you can train this model to do open-ended QnA if you have a big VRAM and a lot of patience. Open-ended tasks are harder to work with, and this notebook carries educational purposes on feeding different modalities.\n",
565
+ "\n",
566
+ "In collator we also downsample videos to 6 frames, we have written the helper above. For better results you need more frames."
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "code",
571
+ "execution_count": 36,
572
+ "metadata": {
573
+ "id": "x_e3IjDCzioP"
574
+ },
575
+ "outputs": [],
576
+ "source": [
577
+ "def collate_fn(examples):\n",
578
+ " video_path = examples[0][\"video\"]\n",
579
+ " audio_path = examples[0][\"audio\"]\n",
580
+ " sample_idx = filename.split(\"_\")[1].split(\".\")[0]\n",
581
+ " frames = downsample_video(f\"videos/{video_path}\")\n",
582
+ "\n",
583
+ " text = \"Based on the video, predict the category of it.\"\n",
584
+ " message = [\n",
585
+ " {\n",
586
+ " \"role\": \"user\",\n",
587
+ " \"content\": [\n",
588
+ " {\"type\": \"text\", \"text\": text}\n",
589
+ " ],\n",
590
+ " },\n",
591
+ " ]\n",
592
+ " # this is how video inference should be formatted in Gemma3n\n",
593
+ " for frame in frames:\n",
594
+ " image, timestamp = frame\n",
595
+ " message[0][\"content\"].append({\"type\": \"text\", \"text\": f\"Frame {timestamp}:\"})\n",
596
+ " timestamp = str(timestamp).replace(\".\", \"_\")\n",
597
+ " image.save(f\"image_idx_{sample_idx}_{timestamp}.png\")\n",
598
+ " message[0][\"content\"].append({\"type\": \"image\", \"url\": f\"image_idx_{sample_idx}_{timestamp}.png\"})\n",
599
+ "\n",
600
+ " message[0][\"content\"].append({\"type\": \"audio\", \"audio\": f\"audios/{audio_path}\"})\n",
601
+ " message.append({\"role\": \"assistant\", \"content\": [{\"type\": \"text\", \"text\": examples[0][\"content_parent_category\"]}]})\n",
602
+ " inputs = processor.apply_chat_template(\n",
603
+ " message,\n",
604
+ " add_generation_prompt=False,\n",
605
+ " tokenize=True,\n",
606
+ " return_dict=True,\n",
607
+ " return_tensors=\"pt\",\n",
608
+ " padding=True,\n",
609
+ " ).to(model.device)\n",
610
+ "\n",
611
+ " labels = inputs[\"input_ids\"].clone()\n",
612
+ " special_token_ids = processor.tokenizer.all_special_ids\n",
613
+ "\n",
614
+ " special_token_ids_tensor = torch.tensor(special_token_ids, device=labels.device)\n",
615
+ " mask = torch.isin(labels, special_token_ids_tensor)\n",
616
+ " labels[mask] = -100\n",
617
+ "\n",
618
+ " inputs[\"labels\"] = labels\n",
619
+ " if torch.all(inputs[\"pixel_values\"] == 0):\n",
620
+ " print(\"Frames are dark\")\n",
621
+ "\n",
622
+ " return inputs"
623
+ ]
624
+ },
625
+ {
626
+ "cell_type": "markdown",
627
+ "metadata": {
628
+ "id": "wM6OxwNTiyZ1"
629
+ },
630
+ "source": [
631
+ "## Training"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "markdown",
636
+ "source": [
637
+ "We do LoRA fine-tuning again to save up on space."
638
+ ],
639
+ "metadata": {
640
+ "id": "Wj7yYQTQH7wg"
641
+ }
642
+ },
643
+ {
644
+ "cell_type": "code",
645
+ "execution_count": 58,
646
+ "metadata": {
647
+ "id": "uD3W2OO5-1PC"
648
+ },
649
+ "outputs": [],
650
+ "source": [
651
+ "from peft import LoraConfig\n",
652
+ "peft_config = LoraConfig(\n",
653
+ " task_type=\"CAUSAL_LM\",\n",
654
+ " r=16,\n",
655
+ " target_modules=\"all-linear\",\n",
656
+ " lora_alpha=32,\n",
657
+ " lora_dropout=0.05,\n",
658
+ " bias=\"none\",\n",
659
+ " use_rslora=False,\n",
660
+ " use_dora=False,\n",
661
+ " modules_to_save=None\n",
662
+ ")"
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "execution_count": 59,
668
+ "metadata": {
669
+ "id": "CT7xlPul8RNJ"
670
+ },
671
+ "outputs": [],
672
+ "source": [
673
+ "model.gradient_checkpointing_disable()"
674
+ ]
675
+ },
676
+ {
677
+ "cell_type": "code",
678
+ "execution_count": 60,
679
+ "metadata": {
680
+ "id": "3stdS0v15tnY"
681
+ },
682
+ "outputs": [],
683
+ "source": [
684
+ "model.config.use_cache = False"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 61,
690
+ "metadata": {
691
+ "id": "zG53iSes76H-"
692
+ },
693
+ "outputs": [],
694
+ "source": [
695
+ "training_args = SFTConfig(\n",
696
+ " output_dir=\"/content/gemma-3n-finevideo\",\n",
697
+ " eval_strategy='epoch',\n",
698
+ " per_device_train_batch_size=1,\n",
699
+ " per_device_eval_batch_size=1,\n",
700
+ " gradient_accumulation_steps=4,\n",
701
+ " gradient_checkpointing=False,\n",
702
+ " learning_rate=1e-05,\n",
703
+ " num_train_epochs=3.0,\n",
704
+ " logging_steps=10,\n",
705
+ " save_steps=100,\n",
706
+ " bf16=True,\n",
707
+ " report_to=[\"tensorboard\"],\n",
708
+ " dataset_kwargs={'skip_prepare_dataset': True},\n",
709
+ " remove_unused_columns=False,\n",
710
+ " max_seq_length=None,\n",
711
+ " push_to_hub=True,\n",
712
+ " dataloader_pin_memory=False,\n",
713
+ ")"
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "code",
718
+ "execution_count": 62,
719
+ "metadata": {
720
+ "colab": {
721
+ "base_uri": "https://localhost:8080/"
722
+ },
723
+ "id": "hPaplK2u70D9",
724
+ "outputId": "4bd2f1cd-e4d2-4e38-e555-ec2e07528e02"
725
+ },
726
+ "outputs": [
727
+ {
728
+ "output_type": "stream",
729
+ "name": "stderr",
730
+ "text": [
731
+ "No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.\n"
732
+ ]
733
+ }
734
+ ],
735
+ "source": [
736
+ "trainer = SFTTrainer(\n",
737
+ " model=model,\n",
738
+ " args=training_args,\n",
739
+ " data_collator=collate_fn,\n",
740
+ " train_dataset=dataset[\"train\"],\n",
741
+ " eval_dataset=dataset[\"test\"] if training_args.eval_strategy != \"no\" else None,\n",
742
+ " processing_class=processor.tokenizer,\n",
743
+ " peft_config=peft_config,\n",
744
+ ")"
745
+ ]
746
+ },
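+ {
+ "cell_type": "markdown",
+ "source": [
+ "Optionally, confirm how few parameters LoRA actually trains (a quick sketch; `print_trainable_parameters` is provided by PEFT):"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The trainer wrapped the model in a PeftModel, so we can report\n",
+ "# trainable vs. total parameters to confirm only the adapter is trained.\n",
+ "trainer.model.print_trainable_parameters()"
+ ]
+ },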
747
+ {
748
+ "cell_type": "code",
749
+ "execution_count": 63,
750
+ "metadata": {
751
+ "colab": {
752
+ "base_uri": "https://localhost:8080/",
753
+ "height": 221
754
+ },
755
+ "id": "gsBJcyqe8ET1",
756
+ "outputId": "9aa717c5-e046-42e7-91c7-deae74aa5407"
757
+ },
758
+ "outputs": [
759
+ {
760
+ "output_type": "display_data",
761
+ "data": {
762
+ "text/plain": [
763
+ "<IPython.core.display.HTML object>"
764
+ ],
765
+ "text/html": [
766
+ "\n",
767
+ " <div>\n",
768
+ " \n",
769
+ " <progress value='588' max='588' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
770
+ " [588/588 1:28:09, Epoch 3/3]\n",
771
+ " </div>\n",
772
+ " <table border=\"1\" class=\"dataframe\">\n",
773
+ " <thead>\n",
774
+ " <tr style=\"text-align: left;\">\n",
775
+ " <th>Epoch</th>\n",
776
+ " <th>Training Loss</th>\n",
777
+ " <th>Validation Loss</th>\n",
778
+ " </tr>\n",
779
+ " </thead>\n",
780
+ " <tbody>\n",
781
+ " <tr>\n",
782
+ " <td>1</td>\n",
783
+ " <td>1.363500</td>\n",
784
+ " <td>3.557561</td>\n",
785
+ " </tr>\n",
786
+ " <tr>\n",
787
+ " <td>2</td>\n",
788
+ " <td>0.981800</td>\n",
789
+ " <td>3.502365</td>\n",
790
+ " </tr>\n",
791
+ " <tr>\n",
792
+ " <td>3</td>\n",
793
+ " <td>0.844200</td>\n",
794
+ " <td>3.512452</td>\n",
795
+ " </tr>\n",
796
+ " </tbody>\n",
797
+ "</table><p>"
798
+ ]
799
+ },
800
+ "metadata": {}
801
+ },
802
+ {
803
+ "output_type": "execute_result",
804
+ "data": {
805
+ "text/plain": [
806
+ "TrainOutput(global_step=588, training_loss=1.369473821451875, metrics={'train_runtime': 5299.3753, 'train_samples_per_second': 0.443, 'train_steps_per_second': 0.111, 'total_flos': 7.490494981503706e+16, 'train_loss': 1.369473821451875})"
807
+ ]
808
+ },
809
+ "metadata": {},
810
+ "execution_count": 63
811
+ }
812
+ ],
813
+ "source": [
814
+ "trainer.train()"
815
+ ]
816
+ },
817
+ {
818
+ "cell_type": "markdown",
819
+ "source": [
820
+ "Test the model with a video of snowboarding."
821
+ ],
822
+ "metadata": {
823
+ "id": "qKtWUXVoUyKE"
824
+ }
825
+ },
826
+ {
827
+ "cell_type": "code",
828
+ "execution_count": 67,
829
+ "metadata": {
830
+ "id": "X5fOWf2bRERq",
831
+ "colab": {
832
+ "base_uri": "https://localhost:8080/"
833
+ },
834
+ "outputId": "5daa499e-56c9-4241-eb04-c8c29864ee9e"
835
+ },
836
+ "outputs": [
837
+ {
838
+ "output_type": "stream",
839
+ "name": "stdout",
840
+ "text": [
841
+ "--2025-07-16 13:18:33-- https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4\n",
842
+ "Resolving huggingface.co (huggingface.co)... 18.160.143.99, 18.160.143.32, 18.160.143.75, ...\n",
843
+ "Connecting to huggingface.co (huggingface.co)|18.160.143.99|:443... connected.\n",
844
+ "HTTP request sent, awaiting response... 302 Found\n",
845
+ "Location: https://cdn-lfs-us-1.hf.co/repos/7b/14/7b14679bb56cefbf7829be71f3f444110ccc308f431bd8596f534e743367ea5c/6331cbb913feb48349e3b7015a7969e04ce3cd594b1bda7278e4e33fe4a3f5f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27IMG_8137.mp4%3B+filename%3D%22IMG_8137.mp4%22%3B&response-content-type=video%2Fmp4&Expires=1752675513&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MjY3NTUxM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzdiLzE0LzdiMTQ2NzliYjU2Y2VmYmY3ODI5YmU3MWYzZjQ0NDExMGNjYzMwOGY0MzFiZDg1OTZmNTM0ZTc0MzM2N2VhNWMvNjMzMWNiYjkxM2ZlYjQ4MzQ5ZTNiNzAxNWE3OTY5ZTA0Y2UzY2Q1OTRiMWJkYTcyNzhlNGUzM2ZlNGEzZjVmMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=dKwm2ee9rdtmzuZ8tVMOOJWndfV85S9dKaTwiZbVQt3N6-1dtWkDKXbIsjuD%7Eyriu1dnXNDSjXSDIn-s7ypd8Ie-U1ABXw5Ou6CZ03Z9U4JIQDWBMwEGGEZ6HFCx0mR3royc3u-AKekcIw7zEOFtfAZ%7Eo0XT7l3BiAAV3IVu94m1ONONU779D1gSgPo1sWfuqWydAefPe2NVmSxY1HvH7DHxVOVRuGTfegXN59hvZKhSfZ0Dk0WqBjhReYVdEVxl5j-5pynjo-G%7EUsvldEcxxQpPdcD1DuOGQvYc0KyWw2Tyv3ibU7vhT%7EwVpvdG6tdIi2QOACJ4rfeaVWn5twIHxw__&Key-Pair-Id=K24J24Z295AEI9 [following]\n",
846
+ "--2025-07-16 13:18:33-- https://cdn-lfs-us-1.hf.co/repos/7b/14/7b14679bb56cefbf7829be71f3f444110ccc308f431bd8596f534e743367ea5c/6331cbb913feb48349e3b7015a7969e04ce3cd594b1bda7278e4e33fe4a3f5f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27IMG_8137.mp4%3B+filename%3D%22IMG_8137.mp4%22%3B&response-content-type=video%2Fmp4&Expires=1752675513&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MjY3NTUxM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzdiLzE0LzdiMTQ2NzliYjU2Y2VmYmY3ODI5YmU3MWYzZjQ0NDExMGNjYzMwOGY0MzFiZDg1OTZmNTM0ZTc0MzM2N2VhNWMvNjMzMWNiYjkxM2ZlYjQ4MzQ5ZTNiNzAxNWE3OTY5ZTA0Y2UzY2Q1OTRiMWJkYTcyNzhlNGUzM2ZlNGEzZjVmMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=dKwm2ee9rdtmzuZ8tVMOOJWndfV85S9dKaTwiZbVQt3N6-1dtWkDKXbIsjuD%7Eyriu1dnXNDSjXSDIn-s7ypd8Ie-U1ABXw5Ou6CZ03Z9U4JIQDWBMwEGGEZ6HFCx0mR3royc3u-AKekcIw7zEOFtfAZ%7Eo0XT7l3BiAAV3IVu94m1ONONU779D1gSgPo1sWfuqWydAefPe2NVmSxY1HvH7DHxVOVRuGTfegXN59hvZKhSfZ0Dk0WqBjhReYVdEVxl5j-5pynjo-G%7EUsvldEcxxQpPdcD1DuOGQvYc0KyWw2Tyv3ibU7vhT%7EwVpvdG6tdIi2QOACJ4rfeaVWn5twIHxw__&Key-Pair-Id=K24J24Z295AEI9\n",
847
+ "Resolving cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)... 3.169.202.18, 3.169.202.35, 3.169.202.26, ...\n",
848
+ "Connecting to cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)|3.169.202.18|:443... connected.\n",
849
+ "HTTP request sent, awaiting response... 200 OK\n",
850
+ "Length: 5340706 (5.1M) [video/mp4]\n",
851
+ "Saving to: ‘IMG_8137.mp4’\n",
852
+ "\n",
853
+ "IMG_8137.mp4 100%[===================>] 5.09M --.-KB/s in 0.1s \n",
854
+ "\n",
855
+ "2025-07-16 13:18:33 (38.9 MB/s) - ‘IMG_8137.mp4’ saved [5340706/5340706]\n",
856
+ "\n"
857
+ ]
858
+ }
859
+ ],
860
+ "source": [
861
+ "!wget https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4"
862
+ ]
863
+ },
864
+ {
865
+ "cell_type": "code",
866
+ "source": [
867
+ "model = trainer.model # trainer has the adapter"
868
+ ],
869
+ "metadata": {
870
+ "id": "KBfMiUChc2Ky"
871
+ },
872
+ "execution_count": 89,
873
+ "outputs": []
874
+ },
875
+ {
876
+ "cell_type": "markdown",
877
+ "source": [
878
+ "Strip audio and downsample video."
879
+ ],
880
+ "metadata": {
881
+ "id": "R14WzyjbZCwI"
882
+ }
883
+ },
884
+ {
885
+ "cell_type": "code",
886
+ "source": [
887
+ "audio_path = \"/content/test_audio.wav\"\n",
888
+ "subprocess.run([\n",
889
+ " \"ffmpeg\", \"-i\", \"/content/IMG_8137.mp4\",\n",
890
+ " \"-q:a\", \"0\", \"-map\", \"a\",\n",
891
+ " f\"{audio_path}\",\n",
892
+ " \"-y\"\n",
893
+ " ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)"
894
+ ],
895
+ "metadata": {
896
+ "colab": {
897
+ "base_uri": "https://localhost:8080/"
898
+ },
899
+ "id": "RnJZ-QNJaOqp",
900
+ "outputId": "c2f42e28-d427-4da7-cf86-6c3b70e6ee02"
901
+ },
902
+ "execution_count": 97,
903
+ "outputs": [
904
+ {
905
+ "output_type": "execute_result",
906
+ "data": {
907
+ "text/plain": [
908
+ "CompletedProcess(args=['ffmpeg', '-i', '/content/IMG_8137.mp4', '-q:a', '0', '-map', 'a', '/content/test_audio.wav', '-y'], returncode=0)"
909
+ ]
910
+ },
911
+ "metadata": {},
912
+ "execution_count": 97
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "cell_type": "code",
918
+ "source": [
919
+ "frames = downsample_video(\"/content/IMG_8137.mp4\")\n",
920
+ "\n",
921
+ "# repeat the chat template\n",
922
+ "text = \"Based on the video, predict the category of it.\"\n",
923
+ "message = [\n",
924
+ " {\n",
925
+ " \"role\": \"user\",\n",
926
+ " \"content\": [\n",
927
+ " {\"type\": \"text\", \"text\": text}\n",
928
+ " ],\n",
929
+ " },\n",
930
+ "]\n",
931
+ "for frame in frames:\n",
932
+ " image, timestamp = frame\n",
933
+ " message[0][\"content\"].append({\"type\": \"text\", \"text\": f\"Frame {timestamp}:\"})\n",
934
+ " timestamp = str(timestamp).replace(\".\", \"_\")\n",
935
+ " image.save(f\"test_frame_{timestamp}.png\")\n",
936
+ " message[0][\"content\"].append({\"type\": \"image\", \"url\": f\"test_frame_{timestamp}.png\"})\n",
937
+ "\n",
938
+ "message[0][\"content\"].append({\"type\": \"audio\", \"audio\": f\"{audio_path}\"})"
939
+ ],
940
+ "metadata": {
941
+ "id": "9drrCnfRYi6O"
942
+ },
943
+ "execution_count": 98,
944
+ "outputs": []
945
+ },
946
+ {
947
+ "cell_type": "code",
948
+ "source": [
949
+ "message"
950
+ ],
951
+ "metadata": {
952
+ "colab": {
953
+ "base_uri": "https://localhost:8080/"
954
+ },
955
+ "id": "7s1Dhxf_Z3xU",
956
+ "outputId": "1eba1e9e-d859-4aa7-ff4e-992ef272df7c"
957
+ },
958
+ "execution_count": 99,
959
+ "outputs": [
960
+ {
961
+ "output_type": "execute_result",
962
+ "data": {
963
+ "text/plain": [
964
+ "[{'role': 'user',\n",
965
+ " 'content': [{'type': 'text',\n",
966
+ " 'text': 'Based on the video, predict the category of it.'},\n",
967
+ " {'type': 'text', 'text': 'Frame 0.88:'},\n",
968
+ " {'type': 'image', 'url': 'test_frame_0_88.png'},\n",
969
+ " {'type': 'text', 'text': 'Frame 1.79:'},\n",
970
+ " {'type': 'image', 'url': 'test_frame_1_79.png'},\n",
971
+ " {'type': 'text', 'text': 'Frame 2.67:'},\n",
972
+ " {'type': 'image', 'url': 'test_frame_2_67.png'},\n",
973
+ " {'type': 'text', 'text': 'Frame 3.57:'},\n",
974
+ " {'type': 'image', 'url': 'test_frame_3_57.png'},\n",
975
+ " {'type': 'text', 'text': 'Frame 4.45:'},\n",
976
+ " {'type': 'image', 'url': 'test_frame_4_45.png'},\n",
977
+ " {'type': 'text', 'text': 'Frame 5.36:'},\n",
978
+ " {'type': 'image', 'url': 'test_frame_5_36.png'},\n",
979
+ " {'type': 'audio', 'audio': '/content/test_audio.wav'}]}]"
980
+ ]
981
+ },
982
+ "metadata": {},
983
+ "execution_count": 99
984
+ }
985
+ ]
986
+ },
987
+ {
988
+ "cell_type": "code",
989
+ "source": [
990
+ "inputs = processor.apply_chat_template(\n",
991
+ " message,\n",
992
+ " add_generation_prompt=True,\n",
993
+ " tokenize=True,\n",
994
+ " return_dict=True,\n",
995
+ " return_tensors=\"pt\",\n",
996
+ " padding=True,\n",
997
+ ").to(model.device).to(model.dtype)"
998
+ ],
999
+ "metadata": {
1000
+ "id": "xNTQRMzsZyQz"
1001
+ },
1002
+ "execution_count": 100,
1003
+ "outputs": []
1004
+ },
1005
+ {
1006
+ "cell_type": "code",
1007
+ "source": [
1008
+ "input_len = inputs[\"input_ids\"].shape[-1]\n",
1009
+ "\n",
1010
+ "with torch.inference_mode():\n",
1011
+ " generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)\n",
1012
+ " generation = generation[0][input_len:]\n",
1013
+ "\n",
1014
+ "decoded = processor.decode(generation, skip_special_tokens=True)\n",
1015
+ "print(decoded)"
1016
+ ],
1017
+ "metadata": {
1018
+ "colab": {
1019
+ "base_uri": "https://localhost:8080/"
1020
+ },
1021
+ "id": "WNfnannnZ5-S",
1022
+ "outputId": "0afca313-a4f7-4c02-872e-665a853a19df"
1023
+ },
1024
+ "execution_count": 101,
1025
+ "outputs": [
1026
+ {
1027
+ "output_type": "stream",
1028
+ "name": "stderr",
1029
+ "text": [
1030
+ "The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
1031
+ ]
1032
+ },
1033
+ {
1034
+ "output_type": "stream",
1035
+ "name": "stdout",
1036
+ "text": [
1037
+ "Snowboarding\n"
1038
+ ]
1039
+ }
1040
+ ]
1041
+ },
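+ {
+ "cell_type": "markdown",
+ "source": [
+ "If you want a standalone checkpoint instead of base weights plus an adapter, you can merge the LoRA weights before saving (a sketch; `merge_and_unload` comes from PEFT and the output directory name here is just an example):"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fold the LoRA adapter back into the base weights and save everything,\n",
+ "# including the processor, so the model can be reloaded on its own.\n",
+ "merged_model = trainer.model.merge_and_unload()\n",
+ "merged_model.save_pretrained(\"gemma-3n-finevideo-merged\")\n",
+ "processor.save_pretrained(\"gemma-3n-finevideo-merged\")"
+ ]
+ },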
1042
+ {
1043
+ "cell_type": "markdown",
1044
+ "source": [
1045
+ "Thanks a lot for reading! Keep training the model further with more data or unfreeze the layers for better performance 💗"
1046
+ ],
1047
+ "metadata": {
1048
+ "id": "LOUBj5dgeddG"
1049
+ }
1050
+ },
1051
+ {
1052
+ "cell_type": "code",
1053
+ "source": [],
1054
+ "metadata": {
1055
+ "id": "4KnNR6lneuKm"
1056
+ },
1057
+ "execution_count": null,
1058
+ "outputs": []
1059
+ }
1060
+ ],
1061
+ "metadata": {
1062
+ "accelerator": "GPU",
1063
+ "colab": {
1064
+ "gpuType": "A100",
1065
+ "machine_shape": "hm",
1066
+ "provenance": []
1067
+ },
1068
+ "kernelspec": {
1069
+ "display_name": "Python 3",
1070
+ "name": "python3"
1071
+ },
1072
+ "language_info": {
1073
+ "name": "python"
1074
+ },
1075
+ "widgets": {
1076
+ "application/vnd.jupyter.widget-state+json": {
1077
+ "4eb3613e8efa4fd9adf2cfe27bfbd699": {
1078
+ "model_module": "@jupyter-widgets/controls",
1079
+ "model_name": "HBoxModel",
1080
+ "model_module_version": "1.5.0",
1081
+ "state": {
1082
+ "_dom_classes": [],
1083
+ "_model_module": "@jupyter-widgets/controls",
1084
+ "_model_module_version": "1.5.0",
1085
+ "_model_name": "HBoxModel",
1086
+ "_view_count": null,
1087
+ "_view_module": "@jupyter-widgets/controls",
1088
+ "_view_module_version": "1.5.0",
1089
+ "_view_name": "HBoxView",
1090
+ "box_style": "",
1091
+ "children": [
1092
+ "IPY_MODEL_c15cc5cb9d7947a99a01a30e430d0459",
1093
+ "IPY_MODEL_1801493cd54742fd99752b2f605af1cb",
1094
+ "IPY_MODEL_e5e518d8cf5f4aa5a0ecad6583f0d317"
1095
+ ],
1096
+ "layout": "IPY_MODEL_425f9f26bd0647b1989ecb704414aa9f"
1097
+ }
1098
+ },
1099
+ "c15cc5cb9d7947a99a01a30e430d0459": {
1100
+ "model_module": "@jupyter-widgets/controls",
1101
+ "model_name": "HTMLModel",
1102
+ "model_module_version": "1.5.0",
1103
+ "state": {
1104
+ "_dom_classes": [],
1105
+ "_model_module": "@jupyter-widgets/controls",
1106
+ "_model_module_version": "1.5.0",
1107
+ "_model_name": "HTMLModel",
1108
+ "_view_count": null,
1109
+ "_view_module": "@jupyter-widgets/controls",
1110
+ "_view_module_version": "1.5.0",
1111
+ "_view_name": "HTMLView",
1112
+ "description": "",
1113
+ "description_tooltip": null,
1114
+ "layout": "IPY_MODEL_5eeff3de00c5488db1817328e83bb992",
1115
+ "placeholder": "​",
1116
+ "style": "IPY_MODEL_4846c29045294042b8d916cb0fd8f9d6",
1117
+ "value": "Generating train split: "
1118
+ }
1119
+ },
1120
+ "1801493cd54742fd99752b2f605af1cb": {
1121
+ "model_module": "@jupyter-widgets/controls",
1122
+ "model_name": "FloatProgressModel",
1123
+ "model_module_version": "1.5.0",
1124
+ "state": {
1125
+ "_dom_classes": [],
1126
+ "_model_module": "@jupyter-widgets/controls",
1127
+ "_model_module_version": "1.5.0",
1128
+ "_model_name": "FloatProgressModel",
1129
+ "_view_count": null,
1130
+ "_view_module": "@jupyter-widgets/controls",
1131
+ "_view_module_version": "1.5.0",
1132
+ "_view_name": "ProgressView",
1133
+ "bar_style": "success",
1134
+ "description": "",
1135
+ "description_tooltip": null,
1136
+ "layout": "IPY_MODEL_20b59cdc19684e1c97517e36f5bf8d6a",
1137
+ "max": 1,
1138
+ "min": 0,
1139
+ "orientation": "horizontal",
1140
+ "style": "IPY_MODEL_143d6079d1744eedb41e2e1182bd0f33",
1141
+ "value": 1
1142
+ }
1143
+ },
1144
+ "e5e518d8cf5f4aa5a0ecad6583f0d317": {
1145
+ "model_module": "@jupyter-widgets/controls",
1146
+ "model_name": "HTMLModel",
1147
+ "model_module_version": "1.5.0",
1148
+ "state": {
1149
+ "_dom_classes": [],
1150
+ "_model_module": "@jupyter-widgets/controls",
1151
+ "_model_module_version": "1.5.0",
1152
+ "_model_name": "HTMLModel",
1153
+ "_view_count": null,
1154
+ "_view_module": "@jupyter-widgets/controls",
1155
+ "_view_module_version": "1.5.0",
1156
+ "_view_name": "HTMLView",
1157
+ "description": "",
1158
+ "description_tooltip": null,
1159
+ "layout": "IPY_MODEL_c022d8fabedc43ef9db0c8aca82d215e",
1160
+ "placeholder": "​",
1161
+ "style": "IPY_MODEL_464ffcc84f48468b8f5d3f08412c6101",
1162
+ "value": " 869/0 [00:00&lt;00:00, 8490.20 examples/s]"
1163
+ }
1164
+ },
1165
+ "425f9f26bd0647b1989ecb704414aa9f": {
1166
+ "model_module": "@jupyter-widgets/base",
1167
+ "model_name": "LayoutModel",
1168
+ "model_module_version": "1.2.0",
1169
+ "state": {
1170
+ "_model_module": "@jupyter-widgets/base",
1171
+ "_model_module_version": "1.2.0",
1172
+ "_model_name": "LayoutModel",
1173
+ "_view_count": null,
1174
+ "_view_module": "@jupyter-widgets/base",
1175
+ "_view_module_version": "1.2.0",
1176
+ "_view_name": "LayoutView",
1177
+ "align_content": null,
1178
+ "align_items": null,
1179
+ "align_self": null,
1180
+ "border": null,
1181
+ "bottom": null,
1182
+ "display": null,
1183
+ "flex": null,
1184
+ "flex_flow": null,
1185
+ "grid_area": null,
1186
+ "grid_auto_columns": null,
1187
+ "grid_auto_flow": null,
1188
+ "grid_auto_rows": null,
1189
+ "grid_column": null,
1190
+ "grid_gap": null,
1191
+ "grid_row": null,
1192
+ "grid_template_areas": null,
1193
+ "grid_template_columns": null,
1194
+ "grid_template_rows": null,
1195
+ "height": null,
1196
+ "justify_content": null,
1197
+ "justify_items": null,
1198
+ "left": null,
1199
+ "margin": null,
1200
+ "max_height": null,
1201
+ "max_width": null,
1202
+ "min_height": null,
1203
+ "min_width": null,
1204
+ "object_fit": null,
1205
+ "object_position": null,
1206
+ "order": null,
1207
+ "overflow": null,
1208
+ "overflow_x": null,
1209
+ "overflow_y": null,
1210
+ "padding": null,
1211
+ "right": null,
1212
+ "top": null,
1213
+ "visibility": null,
1214
+ "width": null
1215
+ }
1216
+ },
1217
+ "5eeff3de00c5488db1817328e83bb992": {
1218
+ "model_module": "@jupyter-widgets/base",
1219
+ "model_name": "LayoutModel",
1220
+ "model_module_version": "1.2.0",
1221
+ "state": {
1222
+ "_model_module": "@jupyter-widgets/base",
1223
+ "_model_module_version": "1.2.0",
1224
+ "_model_name": "LayoutModel",
1225
+ "_view_count": null,
1226
+ "_view_module": "@jupyter-widgets/base",
1227
+ "_view_module_version": "1.2.0",
1228
+ "_view_name": "LayoutView",
1229
+ "align_content": null,
1230
+ "align_items": null,
1231
+ "align_self": null,
1232
+ "border": null,
1233
+ "bottom": null,
1234
+ "display": null,
1235
+ "flex": null,
1236
+ "flex_flow": null,
1237
+ "grid_area": null,
1238
+ "grid_auto_columns": null,
1239
+ "grid_auto_flow": null,
1240
+ "grid_auto_rows": null,
1241
+ "grid_column": null,
1242
+ "grid_gap": null,
1243
+ "grid_row": null,
1244
+ "grid_template_areas": null,
1245
+ "grid_template_columns": null,
1246
+ "grid_template_rows": null,
1247
+ "height": null,
1248
+ "justify_content": null,
1249
+ "justify_items": null,
1250
+ "left": null,
1251
+ "margin": null,
1252
+ "max_height": null,
1253
+ "max_width": null,
1254
+ "min_height": null,
1255
+ "min_width": null,
1256
+ "object_fit": null,
1257
+ "object_position": null,
1258
+ "order": null,
1259
+ "overflow": null,
1260
+ "overflow_x": null,
1261
+ "overflow_y": null,
1262
+ "padding": null,
1263
+ "right": null,
1264
+ "top": null,
1265
+ "visibility": null,
1266
+ "width": null
1267
+ }
1268
+ },
1269
+ "4846c29045294042b8d916cb0fd8f9d6": {
1270
+ "model_module": "@jupyter-widgets/controls",
1271
+ "model_name": "DescriptionStyleModel",
1272
+ "model_module_version": "1.5.0",
1273
+ "state": {
1274
+ "_model_module": "@jupyter-widgets/controls",
1275
+ "_model_module_version": "1.5.0",
1276
+ "_model_name": "DescriptionStyleModel",
1277
+ "_view_count": null,
1278
+ "_view_module": "@jupyter-widgets/base",
1279
+ "_view_module_version": "1.2.0",
1280
+ "_view_name": "StyleView",
1281
+ "description_width": ""
1282
+ }
1283
+ },
1284
+ "20b59cdc19684e1c97517e36f5bf8d6a": {
1285
+ "model_module": "@jupyter-widgets/base",
1286
+ "model_name": "LayoutModel",
1287
+ "model_module_version": "1.2.0",
1288
+ "state": {
1289
+ "_model_module": "@jupyter-widgets/base",
1290
+ "_model_module_version": "1.2.0",
1291
+ "_model_name": "LayoutModel",
1292
+ "_view_count": null,
1293
+ "_view_module": "@jupyter-widgets/base",
1294
+ "_view_module_version": "1.2.0",
1295
+ "_view_name": "LayoutView",
1296
+ "align_content": null,
1297
+ "align_items": null,
1298
+ "align_self": null,
1299
+ "border": null,
1300
+ "bottom": null,
1301
+ "display": null,
1302
+ "flex": null,
1303
+ "flex_flow": null,
1304
+ "grid_area": null,
1305
+ "grid_auto_columns": null,
1306
+ "grid_auto_flow": null,
1307
+ "grid_auto_rows": null,
1308
+ "grid_column": null,
1309
+ "grid_gap": null,
1310
+ "grid_row": null,
1311
+ "grid_template_areas": null,
1312
+ "grid_template_columns": null,
1313
+ "grid_template_rows": null,
1314
+ "height": null,
1315
+ "justify_content": null,
1316
+ "justify_items": null,
1317
+ "left": null,
1318
+ "margin": null,
1319
+ "max_height": null,
1320
+ "max_width": null,
1321
+ "min_height": null,
1322
+ "min_width": null,
1323
+ "object_fit": null,
1324
+ "object_position": null,
1325
+ "order": null,
1326
+ "overflow": null,
1327
+ "overflow_x": null,
1328
+ "overflow_y": null,
1329
+ "padding": null,
1330
+ "right": null,
1331
+ "top": null,
1332
+ "visibility": null,
1333
+ "width": "20px"
1334
+ }
1335
+ },
1336
+ "143d6079d1744eedb41e2e1182bd0f33": {
1337
+ "model_module": "@jupyter-widgets/controls",
1338
+ "model_name": "ProgressStyleModel",
1339
+ "model_module_version": "1.5.0",
1340
+ "state": {
1341
+ "_model_module": "@jupyter-widgets/controls",
1342
+ "_model_module_version": "1.5.0",
1343
+ "_model_name": "ProgressStyleModel",
1344
+ "_view_count": null,
1345
+ "_view_module": "@jupyter-widgets/base",
1346
+ "_view_module_version": "1.2.0",
1347
+ "_view_name": "StyleView",
1348
+ "bar_color": null,
1349
+ "description_width": ""
1350
+ }
1351
+ },
1352
+ "c022d8fabedc43ef9db0c8aca82d215e": {
1353
+ "model_module": "@jupyter-widgets/base",
1354
+ "model_name": "LayoutModel",
1355
+ "model_module_version": "1.2.0",
1356
+ "state": {
1357
+ "_model_module": "@jupyter-widgets/base",
1358
+ "_model_module_version": "1.2.0",
1359
+ "_model_name": "LayoutModel",
1360
+ "_view_count": null,
1361
+ "_view_module": "@jupyter-widgets/base",
1362
+ "_view_module_version": "1.2.0",
1363
+ "_view_name": "LayoutView",
1364
+ "align_content": null,
1365
+ "align_items": null,
1366
+ "align_self": null,
1367
+ "border": null,
1368
+ "bottom": null,
1369
+ "display": null,
1370
+ "flex": null,
1371
+ "flex_flow": null,
1372
+ "grid_area": null,
1373
+ "grid_auto_columns": null,
1374
+ "grid_auto_flow": null,
1375
+ "grid_auto_rows": null,
1376
+ "grid_column": null,
1377
+ "grid_gap": null,
1378
+ "grid_row": null,
1379
+ "grid_template_areas": null,
1380
+ "grid_template_columns": null,
1381
+ "grid_template_rows": null,
1382
+ "height": null,
1383
+ "justify_content": null,
1384
+ "justify_items": null,
1385
+ "left": null,
1386
+ "margin": null,
1387
+ "max_height": null,
1388
+ "max_width": null,
1389
+ "min_height": null,
1390
+ "min_width": null,
1391
+ "object_fit": null,
1392
+ "object_position": null,
1393
+ "order": null,
1394
+ "overflow": null,
1395
+ "overflow_x": null,
1396
+ "overflow_y": null,
1397
+ "padding": null,
1398
+ "right": null,
1399
+ "top": null,
1400
+ "visibility": null,
1401
+ "width": null
1402
+ }
1403
+ },
1404
+ "464ffcc84f48468b8f5d3f08412c6101": {
1405
+ "model_module": "@jupyter-widgets/controls",
1406
+ "model_name": "DescriptionStyleModel",
1407
+ "model_module_version": "1.5.0",
1408
+ "state": {
1409
+ "_model_module": "@jupyter-widgets/controls",
1410
+ "_model_module_version": "1.5.0",
1411
+ "_model_name": "DescriptionStyleModel",
1412
+ "_view_count": null,
1413
+ "_view_module": "@jupyter-widgets/base",
1414
+ "_view_module_version": "1.2.0",
1415
+ "_view_name": "StyleView",
1416
+ "description_width": ""
1417
+ }
1418
+ },
1419
+ "a33fedc485b346b1b9d4fb8b18e8ac64": {
1420
+ "model_module": "@jupyter-widgets/controls",
1421
+ "model_name": "HBoxModel",
1422
+ "model_module_version": "1.5.0",
1423
+ "state": {
1424
+ "_dom_classes": [],
1425
+ "_model_module": "@jupyter-widgets/controls",
1426
+ "_model_module_version": "1.5.0",
1427
+ "_model_name": "HBoxModel",
1428
+ "_view_count": null,
1429
+ "_view_module": "@jupyter-widgets/controls",
1430
+ "_view_module_version": "1.5.0",
1431
+ "_view_name": "HBoxView",
1432
+ "box_style": "",
1433
+ "children": [
1434
+ "IPY_MODEL_94d5d3b00449488caa6d8badc443a74f",
1435
+ "IPY_MODEL_a60a111fc7c24bd7b21fed3f3dd64f29",
1436
+ "IPY_MODEL_e830732fc2bc4848847ea85c772d0b98"
1437
+ ],
1438
+ "layout": "IPY_MODEL_3e25db05674d4d2f8fd839a0ec63e7d8"
1439
+ }
1440
+ },
1441
+ "94d5d3b00449488caa6d8badc443a74f": {
1442
+ "model_module": "@jupyter-widgets/controls",
1443
+ "model_name": "HTMLModel",
1444
+ "model_module_version": "1.5.0",
1445
+ "state": {
1446
+ "_dom_classes": [],
1447
+ "_model_module": "@jupyter-widgets/controls",
1448
+ "_model_module_version": "1.5.0",
1449
+ "_model_name": "HTMLModel",
1450
+ "_view_count": null,
1451
+ "_view_module": "@jupyter-widgets/controls",
1452
+ "_view_module_version": "1.5.0",
1453
+ "_view_name": "HTMLView",
1454
+ "description": "",
1455
+ "description_tooltip": null,
1456
+ "layout": "IPY_MODEL_3262178b8baf4741b06250d7416df1f3",
1457
+ "placeholder": "​",
1458
+ "style": "IPY_MODEL_2e9d5cf7a5c6466a9e1de6d4f403cd95",
1459
+ "value": "Loading checkpoint shards: 100%"
1460
+ }
1461
+ },
1462
+ "a60a111fc7c24bd7b21fed3f3dd64f29": {
1463
+ "model_module": "@jupyter-widgets/controls",
1464
+ "model_name": "FloatProgressModel",
1465
+ "model_module_version": "1.5.0",
1466
+ "state": {
1467
+ "_dom_classes": [],
1468
+ "_model_module": "@jupyter-widgets/controls",
1469
+ "_model_module_version": "1.5.0",
1470
+ "_model_name": "FloatProgressModel",
1471
+ "_view_count": null,
1472
+ "_view_module": "@jupyter-widgets/controls",
1473
+ "_view_module_version": "1.5.0",
1474
+ "_view_name": "ProgressView",
1475
+ "bar_style": "success",
1476
+ "description": "",
1477
+ "description_tooltip": null,
1478
+ "layout": "IPY_MODEL_9d2631150d5c4089bcc95f22a6698287",
1479
+ "max": 3,
1480
+ "min": 0,
1481
+ "orientation": "horizontal",
1482
+ "style": "IPY_MODEL_9c0857a4034f4780ab5e7fdd9aa9d09d",
1483
+ "value": 3
1484
+ }
1485
+ },
1486
+ "e830732fc2bc4848847ea85c772d0b98": {
1487
+ "model_module": "@jupyter-widgets/controls",
1488
+ "model_name": "HTMLModel",
1489
+ "model_module_version": "1.5.0",
1490
+ "state": {
1491
+ "_dom_classes": [],
1492
+ "_model_module": "@jupyter-widgets/controls",
1493
+ "_model_module_version": "1.5.0",
1494
+ "_model_name": "HTMLModel",
1495
+ "_view_count": null,
1496
+ "_view_module": "@jupyter-widgets/controls",
1497
+ "_view_module_version": "1.5.0",
1498
+ "_view_name": "HTMLView",
1499
+ "description": "",
1500
+ "description_tooltip": null,
1501
+ "layout": "IPY_MODEL_073975370eab45d9abc4f69f2b7b3d48",
1502
+ "placeholder": "​",
1503
+ "style": "IPY_MODEL_0d1dfc47d0704506bc6e521c07162b4b",
1504
+ "value": " 3/3 [00:00&lt;00:00,  3.91it/s]"
1505
+ }
1506
+ },
1507
+ "3e25db05674d4d2f8fd839a0ec63e7d8": {
1508
+ "model_module": "@jupyter-widgets/base",
1509
+ "model_name": "LayoutModel",
1510
+ "model_module_version": "1.2.0",
1511
+ "state": {
1512
+ "_model_module": "@jupyter-widgets/base",
1513
+ "_model_module_version": "1.2.0",
1514
+ "_model_name": "LayoutModel",
1515
+ "_view_count": null,
1516
+ "_view_module": "@jupyter-widgets/base",
1517
+ "_view_module_version": "1.2.0",
1518
+ "_view_name": "LayoutView",
1519
+ "align_content": null,
1520
+ "align_items": null,
1521
+ "align_self": null,
1522
+ "border": null,
1523
+ "bottom": null,
1524
+ "display": null,
1525
+ "flex": null,
1526
+ "flex_flow": null,
1527
+ "grid_area": null,
1528
+ "grid_auto_columns": null,
1529
+ "grid_auto_flow": null,
1530
+ "grid_auto_rows": null,
1531
+ "grid_column": null,
1532
+ "grid_gap": null,
1533
+ "grid_row": null,
1534
+ "grid_template_areas": null,
1535
+ "grid_template_columns": null,
1536
+ "grid_template_rows": null,
1537
+ "height": null,
1538
+ "justify_content": null,
1539
+ "justify_items": null,
1540
+ "left": null,
1541
+ "margin": null,
1542
+ "max_height": null,
1543
+ "max_width": null,
1544
+ "min_height": null,
1545
+ "min_width": null,
1546
+ "object_fit": null,
1547
+ "object_position": null,
1548
+ "order": null,
1549
+ "overflow": null,
1550
+ "overflow_x": null,
1551
+ "overflow_y": null,
1552
+ "padding": null,
1553
+ "right": null,
1554
+ "top": null,
1555
+ "visibility": null,
1556
+ "width": null
1557
+ }
1558
+ },
1559
+ "3262178b8baf4741b06250d7416df1f3": {
1560
+ "model_module": "@jupyter-widgets/base",
1561
+ "model_name": "LayoutModel",
1562
+ "model_module_version": "1.2.0",
1563
+ "state": {
1564
+ "_model_module": "@jupyter-widgets/base",
1565
+ "_model_module_version": "1.2.0",
1566
+ "_model_name": "LayoutModel",
1567
+ "_view_count": null,
1568
+ "_view_module": "@jupyter-widgets/base",
1569
+ "_view_module_version": "1.2.0",
1570
+ "_view_name": "LayoutView",
1571
+ "align_content": null,
1572
+ "align_items": null,
1573
+ "align_self": null,
1574
+ "border": null,
1575
+ "bottom": null,
1576
+ "display": null,
1577
+ "flex": null,
1578
+ "flex_flow": null,
1579
+ "grid_area": null,
1580
+ "grid_auto_columns": null,
1581
+ "grid_auto_flow": null,
1582
+ "grid_auto_rows": null,
1583
+ "grid_column": null,
1584
+ "grid_gap": null,
1585
+ "grid_row": null,
1586
+ "grid_template_areas": null,
1587
+ "grid_template_columns": null,
1588
+ "grid_template_rows": null,
1589
+ "height": null,
1590
+ "justify_content": null,
1591
+ "justify_items": null,
1592
+ "left": null,
1593
+ "margin": null,
1594
+ "max_height": null,
1595
+ "max_width": null,
1596
+ "min_height": null,
1597
+ "min_width": null,
1598
+ "object_fit": null,
1599
+ "object_position": null,
1600
+ "order": null,
1601
+ "overflow": null,
1602
+ "overflow_x": null,
1603
+ "overflow_y": null,
1604
+ "padding": null,
1605
+ "right": null,
1606
+ "top": null,
1607
+ "visibility": null,
1608
+ "width": null
1609
+ }
1610
+ },
1611
+ "2e9d5cf7a5c6466a9e1de6d4f403cd95": {
1612
+ "model_module": "@jupyter-widgets/controls",
1613
+ "model_name": "DescriptionStyleModel",
1614
+ "model_module_version": "1.5.0",
1615
+ "state": {
1616
+ "_model_module": "@jupyter-widgets/controls",
1617
+ "_model_module_version": "1.5.0",
1618
+ "_model_name": "DescriptionStyleModel",
1619
+ "_view_count": null,
1620
+ "_view_module": "@jupyter-widgets/base",
1621
+ "_view_module_version": "1.2.0",
1622
+ "_view_name": "StyleView",
1623
+ "description_width": ""
1624
+ }
1625
+ },
1626
+ "9d2631150d5c4089bcc95f22a6698287": {
1627
+ "model_module": "@jupyter-widgets/base",
1628
+ "model_name": "LayoutModel",
1629
+ "model_module_version": "1.2.0",
1630
+ "state": {
1631
+ "_model_module": "@jupyter-widgets/base",
1632
+ "_model_module_version": "1.2.0",
1633
+ "_model_name": "LayoutModel",
1634
+ "_view_count": null,
1635
+ "_view_module": "@jupyter-widgets/base",
1636
+ "_view_module_version": "1.2.0",
1637
+ "_view_name": "LayoutView",
1638
+ "align_content": null,
1639
+ "align_items": null,
1640
+ "align_self": null,
1641
+ "border": null,
1642
+ "bottom": null,
1643
+ "display": null,
1644
+ "flex": null,
1645
+ "flex_flow": null,
1646
+ "grid_area": null,
1647
+ "grid_auto_columns": null,
1648
+ "grid_auto_flow": null,
1649
+ "grid_auto_rows": null,
1650
+ "grid_column": null,
1651
+ "grid_gap": null,
1652
+ "grid_row": null,
1653
+ "grid_template_areas": null,
1654
+ "grid_template_columns": null,
1655
+ "grid_template_rows": null,
1656
+ "height": null,
1657
+ "justify_content": null,
1658
+ "justify_items": null,
1659
+ "left": null,
1660
+ "margin": null,
1661
+ "max_height": null,
1662
+ "max_width": null,
1663
+ "min_height": null,
1664
+ "min_width": null,
1665
+ "object_fit": null,
1666
+ "object_position": null,
1667
+ "order": null,
1668
+ "overflow": null,
1669
+ "overflow_x": null,
1670
+ "overflow_y": null,
1671
+ "padding": null,
1672
+ "right": null,
1673
+ "top": null,
1674
+ "visibility": null,
1675
+ "width": null
1676
+ }
1677
+ },
1678
+ "9c0857a4034f4780ab5e7fdd9aa9d09d": {
1679
+ "model_module": "@jupyter-widgets/controls",
1680
+ "model_name": "ProgressStyleModel",
1681
+ "model_module_version": "1.5.0",
1682
+ "state": {
1683
+ "_model_module": "@jupyter-widgets/controls",
1684
+ "_model_module_version": "1.5.0",
1685
+ "_model_name": "ProgressStyleModel",
1686
+ "_view_count": null,
1687
+ "_view_module": "@jupyter-widgets/base",
1688
+ "_view_module_version": "1.2.0",
1689
+ "_view_name": "StyleView",
1690
+ "bar_color": null,
1691
+ "description_width": ""
1692
+ }
1693
+ },
1694
+ "073975370eab45d9abc4f69f2b7b3d48": {
1695
+ "model_module": "@jupyter-widgets/base",
1696
+ "model_name": "LayoutModel",
1697
+ "model_module_version": "1.2.0",
1698
+ "state": {
1699
+ "_model_module": "@jupyter-widgets/base",
1700
+ "_model_module_version": "1.2.0",
1701
+ "_model_name": "LayoutModel",
1702
+ "_view_count": null,
1703
+ "_view_module": "@jupyter-widgets/base",
1704
+ "_view_module_version": "1.2.0",
1705
+ "_view_name": "LayoutView",
1706
+ "align_content": null,
1707
+ "align_items": null,
1708
+ "align_self": null,
1709
+ "border": null,
1710
+ "bottom": null,
1711
+ "display": null,
1712
+ "flex": null,
1713
+ "flex_flow": null,
1714
+ "grid_area": null,
1715
+ "grid_auto_columns": null,
1716
+ "grid_auto_flow": null,
1717
+ "grid_auto_rows": null,
1718
+ "grid_column": null,
1719
+ "grid_gap": null,
1720
+ "grid_row": null,
1721
+ "grid_template_areas": null,
1722
+ "grid_template_columns": null,
1723
+ "grid_template_rows": null,
1724
+ "height": null,
1725
+ "justify_content": null,
1726
+ "justify_items": null,
1727
+ "left": null,
1728
+ "margin": null,
1729
+ "max_height": null,
1730
+ "max_width": null,
1731
+ "min_height": null,
1732
+ "min_width": null,
1733
+ "object_fit": null,
1734
+ "object_position": null,
1735
+ "order": null,
1736
+ "overflow": null,
1737
+ "overflow_x": null,
1738
+ "overflow_y": null,
1739
+ "padding": null,
1740
+ "right": null,
1741
+ "top": null,
1742
+ "visibility": null,
1743
+ "width": null
1744
+ }
1745
+ },
1746
+ "0d1dfc47d0704506bc6e521c07162b4b": {
1747
+ "model_module": "@jupyter-widgets/controls",
1748
+ "model_name": "DescriptionStyleModel",
1749
+ "model_module_version": "1.5.0",
1750
+ "state": {
1751
+ "_model_module": "@jupyter-widgets/controls",
1752
+ "_model_module_version": "1.5.0",
1753
+ "_model_name": "DescriptionStyleModel",
1754
+ "_view_count": null,
1755
+ "_view_module": "@jupyter-widgets/base",
1756
+ "_view_module_version": "1.2.0",
1757
+ "_view_name": "StyleView",
1758
+ "description_width": ""
1759
+ }
1760
+ }
1761
+ }
1762
+ }
1763
+ },
1764
+ "nbformat": 4,
1765
+ "nbformat_minor": 0
1766
+ }