merve (HF Staff) committed · Commit 6e994a0 · verified · 1 Parent(s): 26e8458

Upload Gemma3n_Fine_tuning_on_All_Modalities.ipynb

Gemma3n_Fine_tuning_on_All_Modalities.ipynb ADDED
@@ -0,0 +1,1766 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# Fine-tune Gemma3n on FineVideo\n",
7
+ "\n",
8
+ "In this notebook, we will see how to fine-tune Gemma3n an videos with audios inside.\n",
9
+ "Using all three modalities is very costly compute-wise, so keep in mind that this is an educational tutorial to fit the model in 40GB VRAM."
10
+ ],
11
+ "metadata": {
12
+ "id": "0eVo7Mc5GMyL"
13
+ }
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 1,
18
+ "metadata": {
19
+ "id": "BLv-NJRZzHiA",
20
+ "colab": {
21
+ "base_uri": "https://localhost:8080/"
22
+ },
23
+ "outputId": "bb4e4b32-5000-42e0-889d-90648e335a41"
24
+ },
25
+ "outputs": [
26
+ {
27
+ "output_type": "stream",
28
+ "name": "stdout",
29
+ "text": [
30
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
31
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.8/10.8 MB\u001b[0m \u001b[31m114.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
32
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m376.2/376.2 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
33
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m494.8/494.8 kB\u001b[0m \u001b[31m38.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
34
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.6/193.6 kB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
35
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
36
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m126.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
37
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m92.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
38
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m58.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
39
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
40
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
41
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m42.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
42
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
43
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
44
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m114.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
45
+ "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
46
+ "gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\u001b[0m\u001b[31m\n",
47
+ "\u001b[0m"
48
+ ]
49
+ }
50
+ ],
51
+ "source": [
52
+ "!pip install -U -q timm transformers trl peft datasets"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": 2,
58
+ "metadata": {
59
+ "id": "UxE2vzKsbov0"
60
+ },
61
+ "outputs": [],
62
+ "source": [
63
+ "import io\n",
64
+ "import os\n",
65
+ "import zipfile\n",
66
+ "\n",
67
+ "import torch\n",
68
+ "from datasets import load_dataset\n",
69
+ "from PIL import Image\n",
70
+ "from transformers import AutoProcessor, Gemma3nForConditionalGeneration\n",
71
+ "\n",
72
+ "from trl import (\n",
73
+ " SFTConfig,\n",
74
+ " SFTTrainer,\n",
75
+ ")"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "markdown",
80
+ "metadata": {
81
+ "id": "T06yJvcMiqO6"
82
+ },
83
+ "source": [
84
+ "## Download videos and preprocessing\n",
85
+ "\n",
86
+ "FineVideo is a quite large dataset, we don't need a ton of examples, so we stream the dataset, check the duration and download the videos shorter than 30 secs."
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {
93
+ "id": "wBFfYgLxmg7b"
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "from datasets import load_dataset\n",
98
+ "import json\n",
99
+ "import os\n",
100
+ "\n",
101
+ "dataset = load_dataset(\"HuggingFaceFV/finevideo\", split=\"train\", streaming=True)\n",
102
+ "\n",
103
+ "\n",
104
+ "os.makedirs(\"videos\", exist_ok=True)\n",
105
+ "os.makedirs(\"metadata\", exist_ok=True)\n",
106
+ "\n",
107
+ "for idx, sample in enumerate(dataset):\n",
108
+ " data = sample[\"json\"]\n",
109
+ " duration = data.get(\"duration_seconds\", 0)\n",
110
+ " if duration < 30:\n",
111
+ " video_filename = f\"videos/sample_{idx}.mp4\"\n",
112
+ " with open(video_filename, 'wb') as video_file:\n",
113
+ " video_file.write(sample['mp4'])\n",
114
+ "\n",
115
+ " json_filename = f\"metadata/sample_{idx}.json\"\n",
116
+ " with open(json_filename, 'w') as json_file:\n",
117
+ " json.dump(sample['json'], json_file)\n"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 7,
123
+ "metadata": {
124
+ "colab": {
125
+ "base_uri": "https://localhost:8080/"
126
+ },
127
+ "id": "K48dmmZTdZ1l",
128
+ "outputId": "31c7c32b-1c40-4df4-eb51-11857d7b4da9"
129
+ },
130
+ "outputs": [
131
+ {
132
+ "output_type": "stream",
133
+ "name": "stdout",
134
+ "text": [
135
+ "Number of items in content/videos: 871\n"
136
+ ]
137
+ }
138
+ ],
139
+ "source": [
140
+ " print(f\"Number of items in content/videos: {len(os.listdir('videos'))}\")"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "markdown",
145
+ "source": [
146
+ "In FineVideo some frames are dark so we downsample 6 frames and if we can't get meaningful videos we remove them."
147
+ ],
148
+ "metadata": {
149
+ "id": "QbkDI03qHMog"
150
+ }
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": 10,
155
+ "metadata": {
156
+ "id": "0UMZi3tHb-BC"
157
+ },
158
+ "outputs": [],
159
+ "source": [
160
+ "import cv2\n",
161
+ "from PIL import Image\n",
162
+ "import numpy as np\n",
163
+ "\n",
164
+ "def is_dark(frame, threshold=10):\n",
165
+ " return np.max(frame) < threshold # all pixels are very close to 0\n",
166
+ "\n",
167
+ "def downsample_video(video_path):\n",
168
+ " vidcap = cv2.VideoCapture(video_path)\n",
169
+ " total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
170
+ " fps = vidcap.get(cv2.CAP_PROP_FPS)\n",
171
+ "\n",
172
+ " frames = []\n",
173
+ "\n",
174
+ " # Generate 8 evenly spaced indices, skip first and last\n",
175
+ " full_indices = np.linspace(0, total_frames - 1, 8, dtype=int)[1:-1]\n",
176
+ "\n",
177
+ " for i in full_indices:\n",
178
+ " found_valid = False\n",
179
+ " for offset in [0, -1, 1, -2, 2]: # Try nearby frames if original is dark\n",
180
+ " candidate_idx = i + offset\n",
181
+ " if 0 <= candidate_idx < total_frames:\n",
182
+ " vidcap.set(cv2.CAP_PROP_POS_FRAMES, candidate_idx)\n",
183
+ " success, image = vidcap.read()\n",
184
+ " if success:\n",
185
+ " if not is_dark(image):\n",
186
+ " image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n",
187
+ " pil_image = Image.fromarray(image)\n",
188
+ " timestamp = round(candidate_idx / fps, 2)\n",
189
+ " frames.append((pil_image, timestamp))\n",
190
+ " found_valid = True\n",
191
+ " break\n",
192
+ " if not found_valid:\n",
193
+ " print(f\"Warning: Could not find non-dark frame near index {i}\")\n",
194
+ "\n",
195
+ " vidcap.release()\n",
196
+ "\n",
197
+ " # If still fewer than 8, try to top off by scanning more frames\n",
198
+ " if len(frames) < 6:\n",
199
+ " print(\"Trying to top off with additional non-dark frames...\")\n",
200
+ " idx = 0\n",
201
+ " while len(frames) < 8 and idx < total_frames:\n",
202
+ " vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)\n",
203
+ " success, image = vidcap.read()\n",
204
+ " if success and not is_dark(image):\n",
205
+ " image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n",
206
+ " pil_image = Image.fromarray(image)\n",
207
+ " timestamp = round(idx / fps, 2)\n",
208
+ " # Avoid adding duplicate timestamps\n",
209
+ " if not any(ts == timestamp for _, ts in frames):\n",
210
+ " frames.append((pil_image, timestamp))\n",
211
+ " idx += 1\n",
212
+ "\n",
213
+ " return frames[:8] # Ensure exactly 8 frames\n",
214
+ "\n",
215
+ "import os\n",
216
+ "import glob\n",
217
+ "\n",
218
+ "def remove_dark_videos(video_dir, metadata_dir, audio_dir):\n",
219
+ " \"\"\"\n",
220
+ " Remove videos (and their metadata/audio files) if all frames are dark.\n",
221
+ " \"\"\"\n",
222
+ " video_paths = glob.glob(os.path.join(video_dir, \"*.mp4\"))\n",
223
+ "\n",
224
+ " for video_path in video_paths:\n",
225
+ " filename = os.path.basename(video_path)\n",
226
+ " base_name = os.path.splitext(filename)[0]\n",
227
+ "\n",
228
+ " frames = downsample_video(video_path)\n",
229
+ " if len(frames) < 6:\n",
230
+ " try:\n",
231
+ " os.remove(video_path)\n",
232
+ " print(f\"Deleted: {video_path}\")\n",
233
+ " except Exception as e:\n",
234
+ " print(f\"Failed to delete {video_path}: {e}\")\n",
235
+ "\n",
236
+ " metadata_path = os.path.join(metadata_dir, f\"{base_name}.json\")\n",
237
+ " if os.path.exists(metadata_path):\n",
238
+ " os.remove(metadata_path)\n",
239
+ "\n",
240
+ " # Remove audio\n",
241
+ " audio_path = os.path.join(audio_dir, f\"{base_name}.wav\")\n",
242
+ " if os.path.exists(audio_path):\n",
243
+ " os.remove(audio_path)\n",
244
+ "\n"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "source": [
250
+ "remove_dark_videos(\n",
251
+ " video_dir=\"videos\",\n",
252
+ " metadata_dir=\"metadata\",\n",
253
+ " audio_dir=\"audios\"\n",
254
+ " )"
255
+ ],
256
+ "metadata": {
257
+ "colab": {
258
+ "base_uri": "https://localhost:8080/"
259
+ },
260
+ "id": "pA6iIR38l66-",
261
+ "outputId": "78f81f41-5e70-4900-e33c-cd918aaed67d"
262
+ },
263
+ "execution_count": 12,
264
+ "outputs": [
265
+ {
266
+ "output_type": "stream",
267
+ "name": "stdout",
268
+ "text": [
269
+ "Warning: Could not find non-dark frame near index 208\n",
270
+ "Trying to top off with additional non-dark frames...\n",
271
+ "Deleted: videos/sample_9650.mp4\n",
272
+ "Warning: Could not find non-dark frame near index 432\n",
273
+ "Trying to top off with additional non-dark frames...\n",
274
+ "Deleted: videos/sample_31965.mp4\n"
275
+ ]
276
+ }
277
+ ]
278
+ },
279
+ {
280
+ "cell_type": "markdown",
281
+ "metadata": {
282
+ "id": "-qa4Tf8PwITC"
283
+ },
284
+ "source": [
285
+ "Gemma-3n accepts video (image frames) and audio separately, so we strip audio from video."
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": 8,
291
+ "metadata": {
292
+ "id": "OR7bhnCawHrF"
293
+ },
294
+ "outputs": [],
295
+ "source": [
296
+ "import os\n",
297
+ "import subprocess\n",
298
+ "\n",
299
+ "video_dir = \"videos\"\n",
300
+ "audio_dir = \"audios\"\n",
301
+ "os.makedirs(audio_dir, exist_ok=True)\n",
302
+ "\n",
303
+ "for filename in os.listdir(video_dir):\n",
304
+ " if not filename.endswith(\".mp4\"):\n",
305
+ " continue\n",
306
+ "\n",
307
+ " idx = filename.split(\"_\")[1].split(\".\")[0]\n",
308
+ " video_path = os.path.join(video_dir, filename)\n",
309
+ " audio_path = os.path.join(audio_dir, f\"sample_{idx}.wav\")\n",
310
+ "\n",
311
+ " subprocess.run([\n",
312
+ " \"ffmpeg\", \"-i\", video_path,\n",
313
+ " \"-q:a\", \"0\", \"-map\", \"a\",\n",
314
+ " audio_path,\n",
315
+ " \"-y\"\n",
316
+ " ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)\n"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "markdown",
321
+ "metadata": {
322
+ "id": "uIlVtxDcwQcy"
323
+ },
324
+ "source": [
325
+ "Construct a new dataset with audio, video, metadata (video categories). This dataset is very cool, it has some questions and answers, captions and more so get creative if you have the GPU VRAM to do so. Here we solve an easier task for educational purposes."
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 13,
331
+ "metadata": {
332
+ "colab": {
333
+ "base_uri": "https://localhost:8080/",
334
+ "height": 49,
335
+ "referenced_widgets": [
336
+ "4eb3613e8efa4fd9adf2cfe27bfbd699",
337
+ "c15cc5cb9d7947a99a01a30e430d0459",
338
+ "1801493cd54742fd99752b2f605af1cb",
339
+ "e5e518d8cf5f4aa5a0ecad6583f0d317",
340
+ "425f9f26bd0647b1989ecb704414aa9f",
341
+ "5eeff3de00c5488db1817328e83bb992",
342
+ "4846c29045294042b8d916cb0fd8f9d6",
343
+ "20b59cdc19684e1c97517e36f5bf8d6a",
344
+ "143d6079d1744eedb41e2e1182bd0f33",
345
+ "c022d8fabedc43ef9db0c8aca82d215e",
346
+ "464ffcc84f48468b8f5d3f08412c6101"
347
+ ]
348
+ },
349
+ "id": "erYr3SdmuS4m",
350
+ "outputId": "0c95ff77-7976-4641-9a51-b7f24f36270d"
351
+ },
352
+ "outputs": [
353
+ {
354
+ "output_type": "display_data",
355
+ "data": {
356
+ "text/plain": [
357
+ "Generating train split: 0 examples [00:00, ? examples/s]"
358
+ ],
359
+ "application/vnd.jupyter.widget-view+json": {
360
+ "version_major": 2,
361
+ "version_minor": 0,
362
+ "model_id": "4eb3613e8efa4fd9adf2cfe27bfbd699"
363
+ }
364
+ },
365
+ "metadata": {}
366
+ }
367
+ ],
368
+ "source": [
369
+ "from datasets import Dataset\n",
370
+ "import json\n",
371
+ "\n",
372
+ "def gen():\n",
373
+ " meta_dir = \"metadata\"\n",
374
+ " for filename in os.listdir(meta_dir):\n",
375
+ " if not filename.endswith(\".json\"):\n",
376
+ " continue\n",
377
+ "\n",
378
+ " idx = filename.split(\"_\")[1].split(\".\")[0]\n",
379
+ " if os.path.exists(f\"videos/sample_{idx}.mp4\"):\n",
380
+ " video_filename = f\"sample_{idx}.mp4\"\n",
381
+ " audio_filename = f\"sample_{idx}.wav\"\n",
382
+ " json_path = os.path.join(meta_dir, filename)\n",
383
+ "\n",
384
+ " with open(json_path, \"r\") as f:\n",
385
+ " metadata = json.load(f)\n",
386
+ "\n",
387
+ "\n",
388
+ " yield {\n",
389
+ " \"video\": video_filename,\n",
390
+ " \"audio\": audio_filename,\n",
391
+ " \"content_parent_category\": metadata[\"content_parent_category\"],\n",
392
+ " \"sample_index\": int(idx)\n",
393
+ " }\n",
394
+ " else:\n",
395
+ " pass\n",
396
+ "\n",
397
+ "dataset = Dataset.from_generator(gen)\n"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "markdown",
402
+ "metadata": {
403
+ "id": "CjtgRoSEd9TV"
404
+ },
405
+ "source": [
406
+ "We will speed-up and downsample the audios to save space during training."
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 14,
412
+ "metadata": {
413
+ "id": "8DDaQ86MD1Y3"
414
+ },
415
+ "outputs": [],
416
+ "source": [
417
+ "import torchaudio\n",
418
+ "from torchaudio.transforms import Resample\n",
419
+ "import os\n",
420
+ "import torch\n",
421
+ "\n",
422
+ "def preprocess_audio(audio_path, target_sample_rate=16000, max_duration_sec=5, speedup_factor=1.25):\n",
423
+ " waveform, sample_rate = torchaudio.load(audio_path)\n",
424
+ "\n",
425
+ " if waveform.shape[0] > 1:\n",
426
+ " waveform = waveform.mean(dim=0, keepdim=True)\n",
427
+ "\n",
428
+ " if sample_rate != target_sample_rate:\n",
429
+ " resampler = Resample(orig_freq=sample_rate, new_freq=target_sample_rate)\n",
430
+ " waveform = resampler(waveform)\n",
431
+ " sample_rate = target_sample_rate\n",
432
+ "\n",
433
+ " if speedup_factor > 1.0:\n",
434
+ " indices = torch.arange(0, waveform.shape[1], step=speedup_factor).long()\n",
435
+ " if indices[-1] >= waveform.shape[1]:\n",
436
+ " indices = indices[:-1]\n",
437
+ " waveform = waveform[:, indices]\n",
438
+ "\n",
439
+ " max_length = int(target_sample_rate * max_duration_sec)\n",
440
+ " if waveform.shape[1] > max_length:\n",
441
+ " waveform = waveform[:, :max_length]\n",
442
+ "\n",
443
+ " torchaudio.save(audio_path, waveform, sample_rate)\n"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": 15,
449
+ "metadata": {
450
+ "id": "IQ7L2_0bI1tP"
451
+ },
452
+ "outputs": [],
453
+ "source": [
454
+ "for file_name in os.listdir(\"audios\"):\n",
455
+ " if file_name.lower().endswith(\".wav\"):\n",
456
+ " audio_path = os.path.join(\"audios\", file_name)\n",
457
+ " preprocess_audio(audio_path)"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": 16,
463
+ "metadata": {
464
+ "id": "pspaO2Lv4SxG"
465
+ },
466
+ "outputs": [],
467
+ "source": [
468
+ "dataset = dataset.train_test_split(test_size=0.10, seed=42)"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "markdown",
473
+ "source": [
474
+ "### Load the model\n",
475
+ "\n",
476
+ "Make sure you have your Hugging Face token in your Colab secrets."
477
+ ],
478
+ "metadata": {
479
+ "id": "hrvYdvQ9Hye4"
480
+ }
481
+ },
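+ {
+ "cell_type": "markdown",
+ "source": [
+ "A minimal login sketch, assuming the token is stored in a Colab secret named `HF_TOKEN` (adjust the name to whatever you used):"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the token from Colab secrets and authenticate with the Hub.\n",
+ "# HF_TOKEN is an assumed secret name, not something set earlier in this notebook.\n",
+ "from google.colab import userdata\n",
+ "from huggingface_hub import login\n",
+ "\n",
+ "login(token=userdata.get(\"HF_TOKEN\"))"
+ ]
+ },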
482
+ {
483
+ "cell_type": "code",
484
+ "execution_count": 57,
485
+ "metadata": {
486
+ "colab": {
487
+ "base_uri": "https://localhost:8080/",
488
+ "height": 49,
489
+ "referenced_widgets": [
490
+ "a33fedc485b346b1b9d4fb8b18e8ac64",
491
+ "94d5d3b00449488caa6d8badc443a74f",
492
+ "a60a111fc7c24bd7b21fed3f3dd64f29",
493
+ "e830732fc2bc4848847ea85c772d0b98",
494
+ "3e25db05674d4d2f8fd839a0ec63e7d8",
495
+ "3262178b8baf4741b06250d7416df1f3",
496
+ "2e9d5cf7a5c6466a9e1de6d4f403cd95",
497
+ "9d2631150d5c4089bcc95f22a6698287",
498
+ "9c0857a4034f4780ab5e7fdd9aa9d09d",
499
+ "073975370eab45d9abc4f69f2b7b3d48",
500
+ "0d1dfc47d0704506bc6e521c07162b4b"
501
+ ]
502
+ },
503
+ "id": "UQaaLBCVzXH-",
504
+ "outputId": "a6244057-777b-4f48-e89e-0d3c945e06e8"
505
+ },
506
+ "outputs": [
507
+ {
508
+ "output_type": "display_data",
509
+ "data": {
510
+ "text/plain": [
511
+ "Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
512
+ ],
513
+ "application/vnd.jupyter.widget-view+json": {
514
+ "version_major": 2,
515
+ "version_minor": 0,
516
+ "model_id": "a33fedc485b346b1b9d4fb8b18e8ac64"
517
+ }
518
+ },
519
+ "metadata": {}
520
+ }
521
+ ],
522
+ "source": [
523
+ "model = Gemma3nForConditionalGeneration.from_pretrained(\n",
524
+ " \"google/gemma-3n-E2B-it\", torch_dtype=torch.bfloat16,\n",
525
+ ")\n",
526
+ "processor = AutoProcessor.from_pretrained(\n",
527
+ " \"google/gemma-3n-E2B-it\",\n",
528
+ ")\n",
529
+ "processor.tokenizer.padding_side = \"right\""
530
+ ]
531
+ },
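+ {
+ "cell_type": "markdown",
+ "source": [
+ "As a quick sanity check against the 40 GB VRAM budget, you can print the model's memory footprint (a sketch; this counts parameters and buffers only, training activations come on top):"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rough size check: parameter and buffer memory only, so a lower bound on VRAM use.\n",
+ "print(f\"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB\")"
+ ]
+ },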
532
+ {
533
+ "cell_type": "code",
534
+ "execution_count": null,
535
+ "metadata": {
536
+ "colab": {
537
+ "base_uri": "https://localhost:8080/"
538
+ },
539
+ "id": "epPCxTFi3XQ2",
540
+ "outputId": "f59ad356-5d7c-463e-9c6c-35eb0f0aa586"
541
+ },
542
+ "outputs": [
543
+ {
544
+ "output_type": "execute_result",
545
+ "data": {
546
+ "text/plain": [
547
+ "[2, 1, 3, 0, 262273, 256000, 255999, 262272, 262144, 262145]"
548
+ ]
549
+ },
550
+ "metadata": {},
551
+ "execution_count": 24
552
+ }
553
+ ],
554
+ "source": [
555
+ "processor.tokenizer.all_special_ids"
556
+ ]
557
+ },
558
+ {
559
+ "cell_type": "markdown",
560
+ "metadata": {
561
+ "id": "i-xR4GHUeQ9l"
562
+ },
563
+ "source": [
564
+ "Write our dataset collator. We will train model to predict category of a video (which can be done easily). You can do much better things, for instance FineVideo has QnA section, you can train this model to do open-ended QnA if you have a big VRAM and a lot of patience. Open-ended tasks are harder to work with, and this notebook carries educational purposes on feeding different modalities.\n",
565
+ "\n",
566
+ "In collator we also downsample videos to 6 frames, we have written the helper above. For better results you need more frames."
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "code",
571
+ "execution_count": 36,
572
+ "metadata": {
573
+ "id": "x_e3IjDCzioP"
574
+ },
575
+ "outputs": [],
576
+ "source": [
577
+ "def collate_fn(examples):\n",
578
+ " video_path = examples[0][\"video\"]\n",
579
+ " audio_path = examples[0][\"audio\"]\n",
580
+ " sample_idx = filename.split(\"_\")[1].split(\".\")[0]\n",
581
+ " frames = downsample_video(f\"videos/{video_path}\")\n",
582
+ "\n",
583
+ " text = \"Based on the video, predict the category of it.\"\n",
584
+ " message = [\n",
585
+ " {\n",
586
+ " \"role\": \"user\",\n",
587
+ " \"content\": [\n",
588
+ " {\"type\": \"text\", \"text\": text}\n",
589
+ " ],\n",
590
+ " },\n",
591
+ " ]\n",
592
+ " # this is how video inference should be formatted in Gemma3n\n",
593
+ " for frame in frames:\n",
594
+ " image, timestamp = frame\n",
595
+ " message[0][\"content\"].append({\"type\": \"text\", \"text\": f\"Frame {timestamp}:\"})\n",
596
+ " timestamp = str(timestamp).replace(\".\", \"_\")\n",
597
+ " image.save(f\"image_idx_{sample_idx}_{timestamp}.png\")\n",
598
+ " message[0][\"content\"].append({\"type\": \"image\", \"url\": f\"image_idx_{sample_idx}_{timestamp}.png\"})\n",
599
+ "\n",
600
+ " message[0][\"content\"].append({\"type\": \"audio\", \"audio\": f\"audios/{audio_path}\"})\n",
601
+ " message.append({\"role\": \"assistant\", \"content\": [{\"type\": \"text\", \"text\": examples[0][\"content_parent_category\"]}]})\n",
602
+ " inputs = processor.apply_chat_template(\n",
603
+ " message,\n",
604
+ " add_generation_prompt=False,\n",
605
+ " tokenize=True,\n",
606
+ " return_dict=True,\n",
607
+ " return_tensors=\"pt\",\n",
608
+ " padding=True,\n",
609
+ " ).to(model.device)\n",
610
+ "\n",
611
+ " labels = inputs[\"input_ids\"].clone()\n",
612
+ " special_token_ids = processor.tokenizer.all_special_ids\n",
613
+ "\n",
614
+ " special_token_ids_tensor = torch.tensor(special_token_ids, device=labels.device)\n",
615
+ " mask = torch.isin(labels, special_token_ids_tensor)\n",
616
+ " labels[mask] = -100\n",
617
+ "\n",
618
+ " inputs[\"labels\"] = labels\n",
619
+ " if torch.all(inputs[\"pixel_values\"] == 0):\n",
620
+ " print(\"Frames are dark\")\n",
621
+ "\n",
622
+ " return inputs"
623
+ ]
624
+ },
625
+ {
626
+ "cell_type": "markdown",
627
+ "metadata": {
628
+ "id": "wM6OxwNTiyZ1"
629
+ },
630
+ "source": [
631
+ "## Training"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "markdown",
636
+ "source": [
637
+ "We do LoRA fine-tuning again to save up on space."
638
+ ],
639
+ "metadata": {
640
+ "id": "Wj7yYQTQH7wg"
641
+ }
642
+ },
643
+ {
644
+ "cell_type": "code",
645
+ "execution_count": 58,
646
+ "metadata": {
647
+ "id": "uD3W2OO5-1PC"
648
+ },
649
+ "outputs": [],
650
+ "source": [
651
+ "from peft import LoraConfig\n",
652
+ "peft_config = LoraConfig(\n",
653
+ " task_type=\"CAUSAL_LM\",\n",
654
+ " r=16,\n",
655
+ " target_modules=\"all-linear\",\n",
656
+ " lora_alpha=32,\n",
657
+ " lora_dropout=0.05,\n",
658
+ " bias=\"none\",\n",
659
+ " use_rslora=False,\n",
660
+ " use_dora=False,\n",
661
+ " modules_to_save=None\n",
662
+ ")"
663
+ ]
664
+ },
665
+ {
666
+ "cell_type": "code",
667
+ "execution_count": 59,
668
+ "metadata": {
669
+ "id": "CT7xlPul8RNJ"
670
+ },
671
+ "outputs": [],
672
+ "source": [
673
+ "model.gradient_checkpointing_disable()"
674
+ ]
675
+ },
676
+ {
677
+ "cell_type": "code",
678
+ "execution_count": 60,
679
+ "metadata": {
680
+ "id": "3stdS0v15tnY"
681
+ },
682
+ "outputs": [],
683
+ "source": [
684
+ "model.config.use_cache = False"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 61,
690
+ "metadata": {
691
+ "id": "zG53iSes76H-"
692
+ },
693
+ "outputs": [],
694
+ "source": [
695
+ "training_args = SFTConfig(\n",
696
+ " output_dir=\"/content/gemma-3n-finevideo\",\n",
697
+ " eval_strategy='epoch',\n",
698
+ " per_device_train_batch_size=1,\n",
699
+ " per_device_eval_batch_size=1,\n",
700
+ " gradient_accumulation_steps=4,\n",
701
+ " gradient_checkpointing=False,\n",
702
+ " learning_rate=1e-05,\n",
703
+ " num_train_epochs=3.0,\n",
704
+ " logging_steps=10,\n",
705
+ " save_steps=100,\n",
706
+ " bf16=True,\n",
707
+ " report_to=[\"tensorboard\"],\n",
708
+ " dataset_kwargs={'skip_prepare_dataset': True},\n",
709
+ " remove_unused_columns=False,\n",
710
+ " max_seq_length=None,\n",
711
+ " push_to_hub=True,\n",
712
+ " dataloader_pin_memory=False,\n",
713
+ ")"
714
+ ]
715
+ },
716
+ {
717
+ "cell_type": "code",
718
+ "execution_count": 62,
719
+ "metadata": {
720
+ "colab": {
721
+ "base_uri": "https://localhost:8080/"
722
+ },
723
+ "id": "hPaplK2u70D9",
724
+ "outputId": "4bd2f1cd-e4d2-4e38-e555-ec2e07528e02"
725
+ },
726
+ "outputs": [
727
+ {
728
+ "output_type": "stream",
729
+ "name": "stderr",
730
+ "text": [
731
+ "No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.\n"
732
+ ]
733
+ }
734
+ ],
735
+ "source": [
736
+ "trainer = SFTTrainer(\n",
737
+ " model=model,\n",
738
+ " args=training_args,\n",
739
+ " data_collator=collate_fn,\n",
740
+ " train_dataset=dataset[\"train\"],\n",
741
+ " eval_dataset=dataset[\"test\"] if training_args.eval_strategy != \"no\" else None,\n",
742
+ " processing_class=processor.tokenizer,\n",
743
+ " peft_config=peft_config,\n",
744
+ ")"
745
+ ]
746
+ },
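+ {
+ "cell_type": "markdown",
+ "source": [
+ "Optionally, confirm how few parameters LoRA actually trains (a quick sketch; `print_trainable_parameters` is provided by PEFT):"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The trainer wrapped the model in a PeftModel, so we can report\n",
+ "# trainable vs. total parameters to confirm only the adapter is trained.\n",
+ "trainer.model.print_trainable_parameters()"
+ ]
+ },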
747
+ {
748
+ "cell_type": "code",
749
+ "execution_count": 63,
750
+ "metadata": {
751
+ "colab": {
752
+ "base_uri": "https://localhost:8080/",
753
+ "height": 221
754
+ },
755
+ "id": "gsBJcyqe8ET1",
756
+ "outputId": "9aa717c5-e046-42e7-91c7-deae74aa5407"
757
+ },
758
+ "outputs": [
759
+ {
760
+ "output_type": "display_data",
761
+ "data": {
762
+ "text/plain": [
763
+ "<IPython.core.display.HTML object>"
764
+ ],
765
+ "text/html": [
766
+ "\n",
767
+ " <div>\n",
768
+ " \n",
769
+ " <progress value='588' max='588' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
770
+ " [588/588 1:28:09, Epoch 3/3]\n",
771
+ " </div>\n",
772
+ " <table border=\"1\" class=\"dataframe\">\n",
773
+ " <thead>\n",
774
+ " <tr style=\"text-align: left;\">\n",
775
+ " <th>Epoch</th>\n",
776
+ " <th>Training Loss</th>\n",
777
+ " <th>Validation Loss</th>\n",
778
+ " </tr>\n",
779
+ " </thead>\n",
780
+ " <tbody>\n",
781
+ " <tr>\n",
782
+ " <td>1</td>\n",
783
+ " <td>1.363500</td>\n",
784
+ " <td>3.557561</td>\n",
785
+ " </tr>\n",
786
+ " <tr>\n",
787
+ " <td>2</td>\n",
788
+ " <td>0.981800</td>\n",
789
+ " <td>3.502365</td>\n",
790
+ " </tr>\n",
791
+ " <tr>\n",
792
+ " <td>3</td>\n",
793
+ " <td>0.844200</td>\n",
794
+ " <td>3.512452</td>\n",
795
+ " </tr>\n",
796
+ " </tbody>\n",
797
+ "</table><p>"
798
+ ]
799
+ },
800
+ "metadata": {}
801
+ },
802
+ {
803
+ "output_type": "execute_result",
804
+ "data": {
805
+ "text/plain": [
806
+ "TrainOutput(global_step=588, training_loss=1.369473821451875, metrics={'train_runtime': 5299.3753, 'train_samples_per_second': 0.443, 'train_steps_per_second': 0.111, 'total_flos': 7.490494981503706e+16, 'train_loss': 1.369473821451875})"
807
+ ]
808
+ },
809
+ "metadata": {},
810
+ "execution_count": 63
811
+ }
812
+ ],
813
+ "source": [
814
+ "trainer.train()"
815
+ ]
816
+ },
817
+ {
818
+ "cell_type": "markdown",
819
+ "source": [
820
+ "Test the model with a video of snowboarding."
821
+ ],
822
+ "metadata": {
823
+ "id": "qKtWUXVoUyKE"
824
+ }
825
+ },
826
+ {
827
+ "cell_type": "code",
828
+ "execution_count": 67,
829
+ "metadata": {
830
+ "id": "X5fOWf2bRERq",
831
+ "colab": {
832
+ "base_uri": "https://localhost:8080/"
833
+ },
834
+ "outputId": "5daa499e-56c9-4241-eb04-c8c29864ee9e"
835
+ },
836
+ "outputs": [
837
+ {
838
+ "output_type": "stream",
839
+ "name": "stdout",
840
+ "text": [
841
+ "--2025-07-16 13:18:33-- https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4\n",
842
+ "Resolving huggingface.co (huggingface.co)... 18.160.143.99, 18.160.143.32, 18.160.143.75, ...\n",
843
+ "Connecting to huggingface.co (huggingface.co)|18.160.143.99|:443... connected.\n",
844
+ "HTTP request sent, awaiting response... 302 Found\n",
845
+ "Location: https://cdn-lfs-us-1.hf.co/repos/7b/14/7b14679bb56cefbf7829be71f3f444110ccc308f431bd8596f534e743367ea5c/6331cbb913feb48349e3b7015a7969e04ce3cd594b1bda7278e4e33fe4a3f5f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27IMG_8137.mp4%3B+filename%3D%22IMG_8137.mp4%22%3B&response-content-type=video%2Fmp4&Expires=1752675513&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MjY3NTUxM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzdiLzE0LzdiMTQ2NzliYjU2Y2VmYmY3ODI5YmU3MWYzZjQ0NDExMGNjYzMwOGY0MzFiZDg1OTZmNTM0ZTc0MzM2N2VhNWMvNjMzMWNiYjkxM2ZlYjQ4MzQ5ZTNiNzAxNWE3OTY5ZTA0Y2UzY2Q1OTRiMWJkYTcyNzhlNGUzM2ZlNGEzZjVmMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=dKwm2ee9rdtmzuZ8tVMOOJWndfV85S9dKaTwiZbVQt3N6-1dtWkDKXbIsjuD%7Eyriu1dnXNDSjXSDIn-s7ypd8Ie-U1ABXw5Ou6CZ03Z9U4JIQDWBMwEGGEZ6HFCx0mR3royc3u-AKekcIw7zEOFtfAZ%7Eo0XT7l3BiAAV3IVu94m1ONONU779D1gSgPo1sWfuqWydAefPe2NVmSxY1HvH7DHxVOVRuGTfegXN59hvZKhSfZ0Dk0WqBjhReYVdEVxl5j-5pynjo-G%7EUsvldEcxxQpPdcD1DuOGQvYc0KyWw2Tyv3ibU7vhT%7EwVpvdG6tdIi2QOACJ4rfeaVWn5twIHxw__&Key-Pair-Id=K24J24Z295AEI9 [following]\n",
846
+ "--2025-07-16 13:18:33-- https://cdn-lfs-us-1.hf.co/repos/7b/14/7b14679bb56cefbf7829be71f3f444110ccc308f431bd8596f534e743367ea5c/6331cbb913feb48349e3b7015a7969e04ce3cd594b1bda7278e4e33fe4a3f5f3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27IMG_8137.mp4%3B+filename%3D%22IMG_8137.mp4%22%3B&response-content-type=video%2Fmp4&Expires=1752675513&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1MjY3NTUxM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzdiLzE0LzdiMTQ2NzliYjU2Y2VmYmY3ODI5YmU3MWYzZjQ0NDExMGNjYzMwOGY0MzFiZDg1OTZmNTM0ZTc0MzM2N2VhNWMvNjMzMWNiYjkxM2ZlYjQ4MzQ5ZTNiNzAxNWE3OTY5ZTA0Y2UzY2Q1OTRiMWJkYTcyNzhlNGUzM2ZlNGEzZjVmMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=dKwm2ee9rdtmzuZ8tVMOOJWndfV85S9dKaTwiZbVQt3N6-1dtWkDKXbIsjuD%7Eyriu1dnXNDSjXSDIn-s7ypd8Ie-U1ABXw5Ou6CZ03Z9U4JIQDWBMwEGGEZ6HFCx0mR3royc3u-AKekcIw7zEOFtfAZ%7Eo0XT7l3BiAAV3IVu94m1ONONU779D1gSgPo1sWfuqWydAefPe2NVmSxY1HvH7DHxVOVRuGTfegXN59hvZKhSfZ0Dk0WqBjhReYVdEVxl5j-5pynjo-G%7EUsvldEcxxQpPdcD1DuOGQvYc0KyWw2Tyv3ibU7vhT%7EwVpvdG6tdIi2QOACJ4rfeaVWn5twIHxw__&Key-Pair-Id=K24J24Z295AEI9\n",
847
+ "Resolving cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)... 3.169.202.18, 3.169.202.35, 3.169.202.26, ...\n",
848
+ "Connecting to cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)|3.169.202.18|:443... connected.\n",
849
+ "HTTP request sent, awaiting response... 200 OK\n",
850
+ "Length: 5340706 (5.1M) [video/mp4]\n",
851
+ "Saving to: ‘IMG_8137.mp4’\n",
852
+ "\n",
853
+ "IMG_8137.mp4 100%[===================>] 5.09M --.-KB/s in 0.1s \n",
854
+ "\n",
855
+ "2025-07-16 13:18:33 (38.9 MB/s) - ‘IMG_8137.mp4’ saved [5340706/5340706]\n",
856
+ "\n"
857
+ ]
858
+ }
859
+ ],
860
+ "source": [
861
+ "!wget https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4"
862
+ ]
863
+ },
864
+ {
865
+ "cell_type": "code",
866
+ "source": [
867
+ "model = trainer.model # trainer has the adapter"
868
+ ],
869
+ "metadata": {
870
+ "id": "KBfMiUChc2Ky"
871
+ },
872
+ "execution_count": 89,
873
+ "outputs": []
874
+ },
875
+ {
876
+ "cell_type": "markdown",
877
+ "source": [
878
+ "Strip audio and downsample video."
879
+ ],
880
+ "metadata": {
881
+ "id": "R14WzyjbZCwI"
882
+ }
883
+ },
884
+ {
885
+ "cell_type": "code",
886
+ "source": [
887
+ "audio_path = \"/content/test_audio.wav\"\n",
888
+ "subprocess.run([\n",
889
+ " \"ffmpeg\", \"-i\", \"/content/IMG_8137.mp4\",\n",
890
+ " \"-q:a\", \"0\", \"-map\", \"a\",\n",
891
+ " f\"{audio_path}\",\n",
892
+ " \"-y\"\n",
893
+ " ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)"
894
+ ],
895
+ "metadata": {
896
+ "colab": {
897
+ "base_uri": "https://localhost:8080/"
898
+ },
899
+ "id": "RnJZ-QNJaOqp",
900
+ "outputId": "c2f42e28-d427-4da7-cf86-6c3b70e6ee02"
901
+ },
902
+ "execution_count": 97,
903
+ "outputs": [
904
+ {
905
+ "output_type": "execute_result",
906
+ "data": {
907
+ "text/plain": [
908
+ "CompletedProcess(args=['ffmpeg', '-i', '/content/IMG_8137.mp4', '-q:a', '0', '-map', 'a', '/content/test_audio.wav', '-y'], returncode=0)"
909
+ ]
910
+ },
911
+ "metadata": {},
912
+ "execution_count": 97
913
+ }
914
+ ]
915
+ },
916
+ {
917
+ "cell_type": "code",
918
+ "source": [
919
+ "frames = downsample_video(\"/content/IMG_8137.mp4\")\n",
920
+ "\n",
921
+ "# repeat the chat template\n",
922
+ "text = \"Based on the video, predict the category of it.\"\n",
923
+ "message = [\n",
924
+ " {\n",
925
+ " \"role\": \"user\",\n",
926
+ " \"content\": [\n",
927
+ " {\"type\": \"text\", \"text\": text}\n",
928
+ " ],\n",
929
+ " },\n",
930
+ "]\n",
931
+ "for frame in frames:\n",
932
+ " image, timestamp = frame\n",
933
+ " message[0][\"content\"].append({\"type\": \"text\", \"text\": f\"Frame {timestamp}:\"})\n",
934
+ " timestamp = str(timestamp).replace(\".\", \"_\")\n",
935
+ " image.save(f\"test_frame_{timestamp}.png\")\n",
936
+ " message[0][\"content\"].append({\"type\": \"image\", \"url\": f\"test_frame_{timestamp}.png\"})\n",
937
+ "\n",
938
+ "message[0][\"content\"].append({\"type\": \"audio\", \"audio\": f\"{audio_path}\"})"
939
+ ],
940
+ "metadata": {
941
+ "id": "9drrCnfRYi6O"
942
+ },
943
+ "execution_count": 98,
944
+ "outputs": []
945
+ },
946
+ {
947
+ "cell_type": "code",
948
+ "source": [
949
+ "message"
950
+ ],
951
+ "metadata": {
952
+ "colab": {
953
+ "base_uri": "https://localhost:8080/"
954
+ },
955
+ "id": "7s1Dhxf_Z3xU",
956
+ "outputId": "1eba1e9e-d859-4aa7-ff4e-992ef272df7c"
957
+ },
958
+ "execution_count": 99,
959
+ "outputs": [
960
+ {
961
+ "output_type": "execute_result",
962
+ "data": {
963
+ "text/plain": [
964
+ "[{'role': 'user',\n",
965
+ " 'content': [{'type': 'text',\n",
966
+ " 'text': 'Based on the video, predict the category of it.'},\n",
967
+ " {'type': 'text', 'text': 'Frame 0.88:'},\n",
968
+ " {'type': 'image', 'url': 'test_frame_0_88.png'},\n",
969
+ " {'type': 'text', 'text': 'Frame 1.79:'},\n",
970
+ " {'type': 'image', 'url': 'test_frame_1_79.png'},\n",
971
+ " {'type': 'text', 'text': 'Frame 2.67:'},\n",
972
+ " {'type': 'image', 'url': 'test_frame_2_67.png'},\n",
973
+ " {'type': 'text', 'text': 'Frame 3.57:'},\n",
974
+ " {'type': 'image', 'url': 'test_frame_3_57.png'},\n",
975
+ " {'type': 'text', 'text': 'Frame 4.45:'},\n",
976
+ " {'type': 'image', 'url': 'test_frame_4_45.png'},\n",
977
+ " {'type': 'text', 'text': 'Frame 5.36:'},\n",
978
+ " {'type': 'image', 'url': 'test_frame_5_36.png'},\n",
979
+ " {'type': 'audio', 'audio': '/content/test_audio.wav'}]}]"
980
+ ]
981
+ },
982
+ "metadata": {},
983
+ "execution_count": 99
984
+ }
985
+ ]
986
+ },
987
+ {
988
+ "cell_type": "code",
989
+ "source": [
990
+ "inputs = processor.apply_chat_template(\n",
991
+ " message,\n",
992
+ " add_generation_prompt=True,\n",
993
+ " tokenize=True,\n",
994
+ " return_dict=True,\n",
995
+ " return_tensors=\"pt\",\n",
996
+ " padding=True,\n",
997
+ ").to(model.device).to(model.dtype)"
998
+ ],
999
+ "metadata": {
1000
+ "id": "xNTQRMzsZyQz"
1001
+ },
1002
+ "execution_count": 100,
1003
+ "outputs": []
1004
+ },
1005
+ {
1006
+ "cell_type": "code",
1007
+ "source": [
1008
+ "input_len = inputs[\"input_ids\"].shape[-1]\n",
1009
+ "\n",
1010
+ "with torch.inference_mode():\n",
1011
+ " generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)\n",
1012
+ " generation = generation[0][input_len:]\n",
1013
+ "\n",
1014
+ "decoded = processor.decode(generation, skip_special_tokens=True)\n",
1015
+ "print(decoded)"
1016
+ ],
1017
+ "metadata": {
1018
+ "colab": {
1019
+ "base_uri": "https://localhost:8080/"
1020
+ },
1021
+ "id": "WNfnannnZ5-S",
1022
+ "outputId": "0afca313-a4f7-4c02-872e-665a853a19df"
1023
+ },
1024
+ "execution_count": 101,
1025
+ "outputs": [
1026
+ {
1027
+ "output_type": "stream",
1028
+ "name": "stderr",
1029
+ "text": [
1030
+ "The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n"
1031
+ ]
1032
+ },
1033
+ {
1034
+ "output_type": "stream",
1035
+ "name": "stdout",
1036
+ "text": [
1037
+ "Snowboarding\n"
1038
+ ]
1039
+ }
1040
+ ]
1041
+ },
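+ {
+ "cell_type": "markdown",
+ "source": [
+ "If you want a standalone checkpoint instead of base weights plus an adapter, you can merge the LoRA weights before saving (a sketch; `merge_and_unload` comes from PEFT and the output directory name here is just an example):"
+ ],
+ "metadata": {}
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Fold the LoRA adapter back into the base weights and save everything,\n",
+ "# including the processor, so the model can be reloaded on its own.\n",
+ "merged_model = trainer.model.merge_and_unload()\n",
+ "merged_model.save_pretrained(\"gemma-3n-finevideo-merged\")\n",
+ "processor.save_pretrained(\"gemma-3n-finevideo-merged\")"
+ ]
+ },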
1042
+ {
1043
+ "cell_type": "markdown",
1044
+ "source": [
1045
+ "Thanks a lot for reading! Keep training the model further with more data or unfreeze the layers for better performance 💗"
1046
+ ],
1047
+ "metadata": {
1048
+ "id": "LOUBj5dgeddG"
1049
+ }
1050
+ },
1051
+ {
1052
+ "cell_type": "code",
1053
+ "source": [],
1054
+ "metadata": {
1055
+ "id": "4KnNR6lneuKm"
1056
+ },
1057
+ "execution_count": null,
1058
+ "outputs": []
1059
+ }
1060
+ ],
1061
+ "metadata": {
1062
+ "accelerator": "GPU",
1063
+ "colab": {
1064
+ "gpuType": "A100",
1065
+ "machine_shape": "hm",
1066
+ "provenance": []
1067
+ },
1068
+ "kernelspec": {
1069
+ "display_name": "Python 3",
1070
+ "name": "python3"
1071
+ },
1072
+ "language_info": {
1073
+ "name": "python"
1074
+ },
1075
+ "widgets": {
1076
+ "application/vnd.jupyter.widget-state+json": {
1077
+ "4eb3613e8efa4fd9adf2cfe27bfbd699": {
1078
+ "model_module": "@jupyter-widgets/controls",
1079
+ "model_name": "HBoxModel",
1080
+ "model_module_version": "1.5.0",
1081
+ "state": {
1082
+ "_dom_classes": [],
1083
+ "_model_module": "@jupyter-widgets/controls",
1084
+ "_model_module_version": "1.5.0",
1085
+ "_model_name": "HBoxModel",
1086
+ "_view_count": null,
1087
+ "_view_module": "@jupyter-widgets/controls",
1088
+ "_view_module_version": "1.5.0",
1089
+ "_view_name": "HBoxView",
1090
+ "box_style": "",
1091
+ "children": [
1092
+ "IPY_MODEL_c15cc5cb9d7947a99a01a30e430d0459",
1093
+ "IPY_MODEL_1801493cd54742fd99752b2f605af1cb",
1094
+ "IPY_MODEL_e5e518d8cf5f4aa5a0ecad6583f0d317"
1095
+ ],
1096
+ "layout": "IPY_MODEL_425f9f26bd0647b1989ecb704414aa9f"
1097
+ }
1098
+ },
1099
+ "c15cc5cb9d7947a99a01a30e430d0459": {
1100
+ "model_module": "@jupyter-widgets/controls",
1101
+ "model_name": "HTMLModel",
1102
+ "model_module_version": "1.5.0",
1103
+ "state": {
1104
+ "_dom_classes": [],
1105
+ "_model_module": "@jupyter-widgets/controls",
1106
+ "_model_module_version": "1.5.0",
1107
+ "_model_name": "HTMLModel",
1108
+ "_view_count": null,
1109
+ "_view_module": "@jupyter-widgets/controls",
1110
+ "_view_module_version": "1.5.0",
1111
+ "_view_name": "HTMLView",
1112
+ "description": "",
1113
+ "description_tooltip": null,
1114
+ "layout": "IPY_MODEL_5eeff3de00c5488db1817328e83bb992",
1115
+ "placeholder": "​",
1116
+ "style": "IPY_MODEL_4846c29045294042b8d916cb0fd8f9d6",
1117
+ "value": "Generating train split: "
1118
+ }
1119
+ },
1120
+ "1801493cd54742fd99752b2f605af1cb": {
1121
+ "model_module": "@jupyter-widgets/controls",
1122
+ "model_name": "FloatProgressModel",
1123
+ "model_module_version": "1.5.0",
1124
+ "state": {
1125
+ "_dom_classes": [],
1126
+ "_model_module": "@jupyter-widgets/controls",
1127
+ "_model_module_version": "1.5.0",
1128
+ "_model_name": "FloatProgressModel",
1129
+ "_view_count": null,
1130
+ "_view_module": "@jupyter-widgets/controls",
1131
+ "_view_module_version": "1.5.0",
1132
+ "_view_name": "ProgressView",
1133
+ "bar_style": "success",
1134
+ "description": "",
1135
+ "description_tooltip": null,
1136
+ "layout": "IPY_MODEL_20b59cdc19684e1c97517e36f5bf8d6a",
1137
+ "max": 1,
1138
+ "min": 0,
1139
+ "orientation": "horizontal",
1140
+ "style": "IPY_MODEL_143d6079d1744eedb41e2e1182bd0f33",
1141
+ "value": 1
1142
+ }
1143
+ },
1144
+ "e5e518d8cf5f4aa5a0ecad6583f0d317": {
1145
+ "model_module": "@jupyter-widgets/controls",
1146
+ "model_name": "HTMLModel",
1147
+ "model_module_version": "1.5.0",
1148
+ "state": {
1149
+ "_dom_classes": [],
1150
+ "_model_module": "@jupyter-widgets/controls",
1151
+ "_model_module_version": "1.5.0",
1152
+ "_model_name": "HTMLModel",
1153
+ "_view_count": null,
1154
+ "_view_module": "@jupyter-widgets/controls",
1155
+ "_view_module_version": "1.5.0",
1156
+ "_view_name": "HTMLView",
1157
+ "description": "",
1158
+ "description_tooltip": null,
1159
+ "layout": "IPY_MODEL_c022d8fabedc43ef9db0c8aca82d215e",
1160
+ "placeholder": "​",
1161
+ "style": "IPY_MODEL_464ffcc84f48468b8f5d3f08412c6101",
1162
+ "value": " 869/0 [00:00&lt;00:00, 8490.20 examples/s]"
1163
+ }
1164
+ },
1165
+ "425f9f26bd0647b1989ecb704414aa9f": {
1166
+ "model_module": "@jupyter-widgets/base",
1167
+ "model_name": "LayoutModel",
1168
+ "model_module_version": "1.2.0",
1169
+ "state": {
1170
+ "_model_module": "@jupyter-widgets/base",
1171
+ "_model_module_version": "1.2.0",
1172
+ "_model_name": "LayoutModel",
1173
+ "_view_count": null,
1174
+ "_view_module": "@jupyter-widgets/base",
1175
+ "_view_module_version": "1.2.0",
1176
+ "_view_name": "LayoutView",
1177
+ "align_content": null,
1178
+ "align_items": null,
1179
+ "align_self": null,
1180
+ "border": null,
1181
+ "bottom": null,
1182
+ "display": null,
1183
+ "flex": null,
1184
+ "flex_flow": null,
1185
+ "grid_area": null,
1186
+ "grid_auto_columns": null,
1187
+ "grid_auto_flow": null,
1188
+ "grid_auto_rows": null,
1189
+ "grid_column": null,
1190
+ "grid_gap": null,
1191
+ "grid_row": null,
1192
+ "grid_template_areas": null,
1193
+ "grid_template_columns": null,
1194
+ "grid_template_rows": null,
1195
+ "height": null,
1196
+ "justify_content": null,
1197
+ "justify_items": null,
1198
+ "left": null,
1199
+ "margin": null,
1200
+ "max_height": null,
1201
+ "max_width": null,
1202
+ "min_height": null,
1203
+ "min_width": null,
1204
+ "object_fit": null,
1205
+ "object_position": null,
1206
+ "order": null,
1207
+ "overflow": null,
1208
+ "overflow_x": null,
1209
+ "overflow_y": null,
1210
+ "padding": null,
1211
+ "right": null,
1212
+ "top": null,
1213
+ "visibility": null,
1214
+ "width": null
1215
+ }
1216
+ },
1217
+ "5eeff3de00c5488db1817328e83bb992": {
1218
+ "model_module": "@jupyter-widgets/base",
1219
+ "model_name": "LayoutModel",
1220
+ "model_module_version": "1.2.0",
1221
+ "state": {
1222
+ "_model_module": "@jupyter-widgets/base",
1223
+ "_model_module_version": "1.2.0",
1224
+ "_model_name": "LayoutModel",
1225
+ "_view_count": null,
1226
+ "_view_module": "@jupyter-widgets/base",
1227
+ "_view_module_version": "1.2.0",
1228
+ "_view_name": "LayoutView",
1229
+ "align_content": null,
1230
+ "align_items": null,
1231
+ "align_self": null,
1232
+ "border": null,
1233
+ "bottom": null,
1234
+ "display": null,
1235
+ "flex": null,
1236
+ "flex_flow": null,
1237
+ "grid_area": null,
1238
+ "grid_auto_columns": null,
1239
+ "grid_auto_flow": null,
1240
+ "grid_auto_rows": null,
1241
+ "grid_column": null,
1242
+ "grid_gap": null,
1243
+ "grid_row": null,
1244
+ "grid_template_areas": null,
1245
+ "grid_template_columns": null,
1246
+ "grid_template_rows": null,
1247
+ "height": null,
1248
+ "justify_content": null,
1249
+ "justify_items": null,
1250
+ "left": null,
1251
+ "margin": null,
1252
+ "max_height": null,
1253
+ "max_width": null,
1254
+ "min_height": null,
1255
+ "min_width": null,
1256
+ "object_fit": null,
1257
+ "object_position": null,
1258
+ "order": null,
1259
+ "overflow": null,
1260
+ "overflow_x": null,
1261
+ "overflow_y": null,
1262
+ "padding": null,
1263
+ "right": null,
1264
+ "top": null,
1265
+ "visibility": null,
1266
+ "width": null
1267
+ }
1268
+ },
1269
+ "4846c29045294042b8d916cb0fd8f9d6": {
1270
+ "model_module": "@jupyter-widgets/controls",
1271
+ "model_name": "DescriptionStyleModel",
1272
+ "model_module_version": "1.5.0",
1273
+ "state": {
1274
+ "_model_module": "@jupyter-widgets/controls",
1275
+ "_model_module_version": "1.5.0",
1276
+ "_model_name": "DescriptionStyleModel",
1277
+ "_view_count": null,
1278
+ "_view_module": "@jupyter-widgets/base",
1279
+ "_view_module_version": "1.2.0",
1280
+ "_view_name": "StyleView",
1281
+ "description_width": ""
1282
+ }
1283
+ },
1284
+ "20b59cdc19684e1c97517e36f5bf8d6a": {
1285
+ "model_module": "@jupyter-widgets/base",
1286
+ "model_name": "LayoutModel",
1287
+ "model_module_version": "1.2.0",
1288
+ "state": {
1289
+ "_model_module": "@jupyter-widgets/base",
1290
+ "_model_module_version": "1.2.0",
1291
+ "_model_name": "LayoutModel",
1292
+ "_view_count": null,
1293
+ "_view_module": "@jupyter-widgets/base",
1294
+ "_view_module_version": "1.2.0",
1295
+ "_view_name": "LayoutView",
1296
+ "align_content": null,
1297
+ "align_items": null,
1298
+ "align_self": null,
1299
+ "border": null,
1300
+ "bottom": null,
1301
+ "display": null,
1302
+ "flex": null,
1303
+ "flex_flow": null,
1304
+ "grid_area": null,
1305
+ "grid_auto_columns": null,
1306
+ "grid_auto_flow": null,
1307
+ "grid_auto_rows": null,
1308
+ "grid_column": null,
1309
+ "grid_gap": null,
1310
+ "grid_row": null,
1311
+ "grid_template_areas": null,
1312
+ "grid_template_columns": null,
1313
+ "grid_template_rows": null,
1314
+ "height": null,
1315
+ "justify_content": null,
1316
+ "justify_items": null,
1317
+ "left": null,
1318
+ "margin": null,
1319
+ "max_height": null,
1320
+ "max_width": null,
1321
+ "min_height": null,
1322
+ "min_width": null,
1323
+ "object_fit": null,
1324
+ "object_position": null,
1325
+ "order": null,
1326
+ "overflow": null,
1327
+ "overflow_x": null,
1328
+ "overflow_y": null,
1329
+ "padding": null,
1330
+ "right": null,
1331
+ "top": null,
1332
+ "visibility": null,
1333
+ "width": "20px"
1334
+ }
1335
+ },
1336
+ "143d6079d1744eedb41e2e1182bd0f33": {
1337
+ "model_module": "@jupyter-widgets/controls",
1338
+ "model_name": "ProgressStyleModel",
1339
+ "model_module_version": "1.5.0",
1340
+ "state": {
1341
+ "_model_module": "@jupyter-widgets/controls",
1342
+ "_model_module_version": "1.5.0",
1343
+ "_model_name": "ProgressStyleModel",
1344
+ "_view_count": null,
1345
+ "_view_module": "@jupyter-widgets/base",
1346
+ "_view_module_version": "1.2.0",
1347
+ "_view_name": "StyleView",
1348
+ "bar_color": null,
1349
+ "description_width": ""
1350
+ }
1351
+ },
1352
+ "c022d8fabedc43ef9db0c8aca82d215e": {
1353
+ "model_module": "@jupyter-widgets/base",
1354
+ "model_name": "LayoutModel",
1355
+ "model_module_version": "1.2.0",
1356
+ "state": {
1357
+ "_model_module": "@jupyter-widgets/base",
1358
+ "_model_module_version": "1.2.0",
1359
+ "_model_name": "LayoutModel",
1360
+ "_view_count": null,
1361
+ "_view_module": "@jupyter-widgets/base",
1362
+ "_view_module_version": "1.2.0",
1363
+ "_view_name": "LayoutView",
1364
+ "align_content": null,
1365
+ "align_items": null,
1366
+ "align_self": null,
1367
+ "border": null,
1368
+ "bottom": null,
1369
+ "display": null,
1370
+ "flex": null,
1371
+ "flex_flow": null,
1372
+ "grid_area": null,
1373
+ "grid_auto_columns": null,
1374
+ "grid_auto_flow": null,
1375
+ "grid_auto_rows": null,
1376
+ "grid_column": null,
1377
+ "grid_gap": null,
1378
+ "grid_row": null,
1379
+ "grid_template_areas": null,
1380
+ "grid_template_columns": null,
1381
+ "grid_template_rows": null,
1382
+ "height": null,
1383
+ "justify_content": null,
1384
+ "justify_items": null,
1385
+ "left": null,
1386
+ "margin": null,
1387
+ "max_height": null,
1388
+ "max_width": null,
1389
+ "min_height": null,
1390
+ "min_width": null,
1391
+ "object_fit": null,
1392
+ "object_position": null,
1393
+ "order": null,
1394
+ "overflow": null,
1395
+ "overflow_x": null,
1396
+ "overflow_y": null,
1397
+ "padding": null,
1398
+ "right": null,
1399
+ "top": null,
1400
+ "visibility": null,
1401
+ "width": null
1402
+ }
1403
+ },
1404
+ "464ffcc84f48468b8f5d3f08412c6101": {
1405
+ "model_module": "@jupyter-widgets/controls",
1406
+ "model_name": "DescriptionStyleModel",
1407
+ "model_module_version": "1.5.0",
1408
+ "state": {
1409
+ "_model_module": "@jupyter-widgets/controls",
1410
+ "_model_module_version": "1.5.0",
1411
+ "_model_name": "DescriptionStyleModel",
1412
+ "_view_count": null,
1413
+ "_view_module": "@jupyter-widgets/base",
1414
+ "_view_module_version": "1.2.0",
1415
+ "_view_name": "StyleView",
1416
+ "description_width": ""
1417
+ }
1418
+ },
1419
+ "a33fedc485b346b1b9d4fb8b18e8ac64": {
1420
+ "model_module": "@jupyter-widgets/controls",
1421
+ "model_name": "HBoxModel",
1422
+ "model_module_version": "1.5.0",
1423
+ "state": {
1424
+ "_dom_classes": [],
1425
+ "_model_module": "@jupyter-widgets/controls",
1426
+ "_model_module_version": "1.5.0",
1427
+ "_model_name": "HBoxModel",
1428
+ "_view_count": null,
1429
+ "_view_module": "@jupyter-widgets/controls",
1430
+ "_view_module_version": "1.5.0",
1431
+ "_view_name": "HBoxView",
1432
+ "box_style": "",
1433
+ "children": [
1434
+ "IPY_MODEL_94d5d3b00449488caa6d8badc443a74f",
1435
+ "IPY_MODEL_a60a111fc7c24bd7b21fed3f3dd64f29",
1436
+ "IPY_MODEL_e830732fc2bc4848847ea85c772d0b98"
1437
+ ],
1438
+ "layout": "IPY_MODEL_3e25db05674d4d2f8fd839a0ec63e7d8"
1439
+ }
1440
+ },
1441
+ "94d5d3b00449488caa6d8badc443a74f": {
1442
+ "model_module": "@jupyter-widgets/controls",
1443
+ "model_name": "HTMLModel",
1444
+ "model_module_version": "1.5.0",
1445
+ "state": {
1446
+ "_dom_classes": [],
1447
+ "_model_module": "@jupyter-widgets/controls",
1448
+ "_model_module_version": "1.5.0",
1449
+ "_model_name": "HTMLModel",
1450
+ "_view_count": null,
1451
+ "_view_module": "@jupyter-widgets/controls",
1452
+ "_view_module_version": "1.5.0",
1453
+ "_view_name": "HTMLView",
1454
+ "description": "",
1455
+ "description_tooltip": null,
1456
+ "layout": "IPY_MODEL_3262178b8baf4741b06250d7416df1f3",
1457
+ "placeholder": "​",
1458
+ "style": "IPY_MODEL_2e9d5cf7a5c6466a9e1de6d4f403cd95",
1459
+ "value": "Loading checkpoint shards: 100%"
1460
+ }
1461
+ },
1462
+ "a60a111fc7c24bd7b21fed3f3dd64f29": {
1463
+ "model_module": "@jupyter-widgets/controls",
1464
+ "model_name": "FloatProgressModel",
1465
+ "model_module_version": "1.5.0",
1466
+ "state": {
1467
+ "_dom_classes": [],
1468
+ "_model_module": "@jupyter-widgets/controls",
1469
+ "_model_module_version": "1.5.0",
1470
+ "_model_name": "FloatProgressModel",
1471
+ "_view_count": null,
1472
+ "_view_module": "@jupyter-widgets/controls",
1473
+ "_view_module_version": "1.5.0",
1474
+ "_view_name": "ProgressView",
1475
+ "bar_style": "success",
1476
+ "description": "",
1477
+ "description_tooltip": null,
1478
+ "layout": "IPY_MODEL_9d2631150d5c4089bcc95f22a6698287",
1479
+ "max": 3,
1480
+ "min": 0,
1481
+ "orientation": "horizontal",
1482
+ "style": "IPY_MODEL_9c0857a4034f4780ab5e7fdd9aa9d09d",
1483
+ "value": 3
1484
+ }
1485
+ },
1486
+ "e830732fc2bc4848847ea85c772d0b98": {
1487
+ "model_module": "@jupyter-widgets/controls",
1488
+ "model_name": "HTMLModel",
1489
+ "model_module_version": "1.5.0",
1490
+ "state": {
1491
+ "_dom_classes": [],
1492
+ "_model_module": "@jupyter-widgets/controls",
1493
+ "_model_module_version": "1.5.0",
1494
+ "_model_name": "HTMLModel",
1495
+ "_view_count": null,
1496
+ "_view_module": "@jupyter-widgets/controls",
1497
+ "_view_module_version": "1.5.0",
1498
+ "_view_name": "HTMLView",
1499
+ "description": "",
1500
+ "description_tooltip": null,
1501
+ "layout": "IPY_MODEL_073975370eab45d9abc4f69f2b7b3d48",
1502
+ "placeholder": "​",
1503
+ "style": "IPY_MODEL_0d1dfc47d0704506bc6e521c07162b4b",
1504
+ "value": " 3/3 [00:00&lt;00:00,  3.91it/s]"
1505
+ }
1506
+ },
1507
+ "3e25db05674d4d2f8fd839a0ec63e7d8": {
1508
+ "model_module": "@jupyter-widgets/base",
1509
+ "model_name": "LayoutModel",
1510
+ "model_module_version": "1.2.0",
1511
+ "state": {
1512
+ "_model_module": "@jupyter-widgets/base",
1513
+ "_model_module_version": "1.2.0",
1514
+ "_model_name": "LayoutModel",
1515
+ "_view_count": null,
1516
+ "_view_module": "@jupyter-widgets/base",
1517
+ "_view_module_version": "1.2.0",
1518
+ "_view_name": "LayoutView",
1519
+ "align_content": null,
1520
+ "align_items": null,
1521
+ "align_self": null,
1522
+ "border": null,
1523
+ "bottom": null,
1524
+ "display": null,
1525
+ "flex": null,
1526
+ "flex_flow": null,
1527
+ "grid_area": null,
1528
+ "grid_auto_columns": null,
1529
+ "grid_auto_flow": null,
1530
+ "grid_auto_rows": null,
1531
+ "grid_column": null,
1532
+ "grid_gap": null,
1533
+ "grid_row": null,
1534
+ "grid_template_areas": null,
1535
+ "grid_template_columns": null,
1536
+ "grid_template_rows": null,
1537
+ "height": null,
1538
+ "justify_content": null,
1539
+ "justify_items": null,
1540
+ "left": null,
1541
+ "margin": null,
1542
+ "max_height": null,
1543
+ "max_width": null,
1544
+ "min_height": null,
1545
+ "min_width": null,
1546
+ "object_fit": null,
1547
+ "object_position": null,
1548
+ "order": null,
1549
+ "overflow": null,
1550
+ "overflow_x": null,
1551
+ "overflow_y": null,
1552
+ "padding": null,
1553
+ "right": null,
1554
+ "top": null,
1555
+ "visibility": null,
1556
+ "width": null
1557
+ }
1558
+ },
1559
+ "3262178b8baf4741b06250d7416df1f3": {
1560
+ "model_module": "@jupyter-widgets/base",
1561
+ "model_name": "LayoutModel",
1562
+ "model_module_version": "1.2.0",
1563
+ "state": {
1564
+ "_model_module": "@jupyter-widgets/base",
1565
+ "_model_module_version": "1.2.0",
1566
+ "_model_name": "LayoutModel",
1567
+ "_view_count": null,
1568
+ "_view_module": "@jupyter-widgets/base",
1569
+ "_view_module_version": "1.2.0",
1570
+ "_view_name": "LayoutView",
1571
+ "align_content": null,
1572
+ "align_items": null,
1573
+ "align_self": null,
1574
+ "border": null,
1575
+ "bottom": null,
1576
+ "display": null,
1577
+ "flex": null,
1578
+ "flex_flow": null,
1579
+ "grid_area": null,
1580
+ "grid_auto_columns": null,
1581
+ "grid_auto_flow": null,
1582
+ "grid_auto_rows": null,
1583
+ "grid_column": null,
1584
+ "grid_gap": null,
1585
+ "grid_row": null,
1586
+ "grid_template_areas": null,
1587
+ "grid_template_columns": null,
1588
+ "grid_template_rows": null,
1589
+ "height": null,
1590
+ "justify_content": null,
1591
+ "justify_items": null,
1592
+ "left": null,
1593
+ "margin": null,
1594
+ "max_height": null,
1595
+ "max_width": null,
1596
+ "min_height": null,
1597
+ "min_width": null,
1598
+ "object_fit": null,
1599
+ "object_position": null,
1600
+ "order": null,
1601
+ "overflow": null,
1602
+ "overflow_x": null,
1603
+ "overflow_y": null,
1604
+ "padding": null,
1605
+ "right": null,
1606
+ "top": null,
1607
+ "visibility": null,
1608
+ "width": null
1609
+ }
1610
+ },
1611
+ "2e9d5cf7a5c6466a9e1de6d4f403cd95": {
1612
+ "model_module": "@jupyter-widgets/controls",
1613
+ "model_name": "DescriptionStyleModel",
1614
+ "model_module_version": "1.5.0",
1615
+ "state": {
1616
+ "_model_module": "@jupyter-widgets/controls",
1617
+ "_model_module_version": "1.5.0",
1618
+ "_model_name": "DescriptionStyleModel",
1619
+ "_view_count": null,
1620
+ "_view_module": "@jupyter-widgets/base",
1621
+ "_view_module_version": "1.2.0",
1622
+ "_view_name": "StyleView",
1623
+ "description_width": ""
1624
+ }
1625
+ },
1626
+ "9d2631150d5c4089bcc95f22a6698287": {
1627
+ "model_module": "@jupyter-widgets/base",
1628
+ "model_name": "LayoutModel",
1629
+ "model_module_version": "1.2.0",
1630
+ "state": {
1631
+ "_model_module": "@jupyter-widgets/base",
1632
+ "_model_module_version": "1.2.0",
1633
+ "_model_name": "LayoutModel",
1634
+ "_view_count": null,
1635
+ "_view_module": "@jupyter-widgets/base",
1636
+ "_view_module_version": "1.2.0",
1637
+ "_view_name": "LayoutView",
1638
+ "align_content": null,
1639
+ "align_items": null,
1640
+ "align_self": null,
1641
+ "border": null,
1642
+ "bottom": null,
1643
+ "display": null,
1644
+ "flex": null,
1645
+ "flex_flow": null,
1646
+ "grid_area": null,
1647
+ "grid_auto_columns": null,
1648
+ "grid_auto_flow": null,
1649
+ "grid_auto_rows": null,
1650
+ "grid_column": null,
1651
+ "grid_gap": null,
1652
+ "grid_row": null,
1653
+ "grid_template_areas": null,
1654
+ "grid_template_columns": null,
1655
+ "grid_template_rows": null,
1656
+ "height": null,
1657
+ "justify_content": null,
1658
+ "justify_items": null,
1659
+ "left": null,
1660
+ "margin": null,
1661
+ "max_height": null,
1662
+ "max_width": null,
1663
+ "min_height": null,
1664
+ "min_width": null,
1665
+ "object_fit": null,
1666
+ "object_position": null,
1667
+ "order": null,
1668
+ "overflow": null,
1669
+ "overflow_x": null,
1670
+ "overflow_y": null,
1671
+ "padding": null,
1672
+ "right": null,
1673
+ "top": null,
1674
+ "visibility": null,
1675
+ "width": null
1676
+ }
1677
+ },
1678
+ "9c0857a4034f4780ab5e7fdd9aa9d09d": {
1679
+ "model_module": "@jupyter-widgets/controls",
1680
+ "model_name": "ProgressStyleModel",
1681
+ "model_module_version": "1.5.0",
1682
+ "state": {
1683
+ "_model_module": "@jupyter-widgets/controls",
1684
+ "_model_module_version": "1.5.0",
1685
+ "_model_name": "ProgressStyleModel",
1686
+ "_view_count": null,
1687
+ "_view_module": "@jupyter-widgets/base",
1688
+ "_view_module_version": "1.2.0",
1689
+ "_view_name": "StyleView",
1690
+ "bar_color": null,
1691
+ "description_width": ""
1692
+ }
1693
+ },
1694
+ "073975370eab45d9abc4f69f2b7b3d48": {
1695
+ "model_module": "@jupyter-widgets/base",
1696
+ "model_name": "LayoutModel",
1697
+ "model_module_version": "1.2.0",
1698
+ "state": {
1699
+ "_model_module": "@jupyter-widgets/base",
1700
+ "_model_module_version": "1.2.0",
1701
+ "_model_name": "LayoutModel",
1702
+ "_view_count": null,
1703
+ "_view_module": "@jupyter-widgets/base",
1704
+ "_view_module_version": "1.2.0",
1705
+ "_view_name": "LayoutView",
1706
+ "align_content": null,
1707
+ "align_items": null,
1708
+ "align_self": null,
1709
+ "border": null,
1710
+ "bottom": null,
1711
+ "display": null,
1712
+ "flex": null,
1713
+ "flex_flow": null,
1714
+ "grid_area": null,
1715
+ "grid_auto_columns": null,
1716
+ "grid_auto_flow": null,
1717
+ "grid_auto_rows": null,
1718
+ "grid_column": null,
1719
+ "grid_gap": null,
1720
+ "grid_row": null,
1721
+ "grid_template_areas": null,
1722
+ "grid_template_columns": null,
1723
+ "grid_template_rows": null,
1724
+ "height": null,
1725
+ "justify_content": null,
1726
+ "justify_items": null,
1727
+ "left": null,
1728
+ "margin": null,
1729
+ "max_height": null,
1730
+ "max_width": null,
1731
+ "min_height": null,
1732
+ "min_width": null,
1733
+ "object_fit": null,
1734
+ "object_position": null,
1735
+ "order": null,
1736
+ "overflow": null,
1737
+ "overflow_x": null,
1738
+ "overflow_y": null,
1739
+ "padding": null,
1740
+ "right": null,
1741
+ "top": null,
1742
+ "visibility": null,
1743
+ "width": null
1744
+ }
1745
+ },
1746
+ "0d1dfc47d0704506bc6e521c07162b4b": {
1747
+ "model_module": "@jupyter-widgets/controls",
1748
+ "model_name": "DescriptionStyleModel",
1749
+ "model_module_version": "1.5.0",
1750
+ "state": {
1751
+ "_model_module": "@jupyter-widgets/controls",
1752
+ "_model_module_version": "1.5.0",
1753
+ "_model_name": "DescriptionStyleModel",
1754
+ "_view_count": null,
1755
+ "_view_module": "@jupyter-widgets/base",
1756
+ "_view_module_version": "1.2.0",
1757
+ "_view_name": "StyleView",
1758
+ "description_width": ""
1759
+ }
1760
+ }
1761
+ }
1762
+ }
1763
+ },
1764
+ "nbformat": 4,
1765
+ "nbformat_minor": 0
1766
+ }