diff --git "a/Fine_tune_SmolVLM2_on_Video.ipynb" "b/Fine_tune_SmolVLM2_on_Video.ipynb"
new file mode 100644--- /dev/null
+++ "b/Fine_tune_SmolVLM2_on_Video.ipynb"
@@ -0,0 +1,2909 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nc0g2NLpUSGr"
+ },
+ "source": [
+ "# Fine-tune SmolVLM2 on Video Captioning\n",
+ "In this notebook we will fine-tune SmolVLM2-500M-Video-Instruct on the VideoFeedback dataset. It was run on a Colab A100 for full fine-tuning, but you can squeeze it onto an L4 with QLoRA."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "WIhA1lQ7j0kw",
+ "outputId": "928f2f4e-6cd8-452b-d621-605550fdd33c"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.5/163.5 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install -q accelerate datasets peft bitsandbytes tensorboard pyav num2words"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install -q git+https://github.com/huggingface/transformers.git"
+ ],
+ "metadata": {
+ "id": "FCYgmJtDRElR"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "XyJaqZZ3uYYl"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install -q flash-attn --no-build-isolation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "wAeMA0heVBjT"
+ },
+ "source": [
+ "We will push our model to the Hub, so we need to authenticate ourselves."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 17,
+ "referenced_widgets": [
+ "112da28d935543069e7a1a2abc22f9f4",
+ "0d22c009aa584ca1a71e32336a7985e0",
+ "ad17e30049cb4b5aa4046d94690f87d3",
+ "e77d3520a2d64f9a840652669c9a0ba1",
+ "1852745b0de44f4281cea0cbb3508459",
+ "166c19ec6d9f4455a56a0f146d1c0abc",
+ "f6362bc7b5b24dd592d35a76a1fbf26b",
+ "e99fbdfc8a22408a8c728a36c8744b24",
+ "0fee30c9bf2b4bdfad7a37261f92db64",
+ "4cd8babc92cc4aeba74d2147f28dee7d",
+ "a4fbf37fe0fe44cfbf72ca1e82af3467",
+ "be50e04c5629463eb18d029d045f25b3",
+ "5490c69c251144c4979e346c66ac1e53",
+ "44d0e1db5f664b3fb7c146c216566776",
+ "7af918a10ec745d7a3f4a883dbdc8b6a",
+ "4156b6897089446984196606ef0d3461",
+ "cf4b5a9cefe84fd9a4d120ab1da6f3f4",
+ "484155e67e36453c9d1ebd2ea1768eca",
+ "48bb89c434284b639f45b5929cf8d1a9",
+ "0ead4ab9bb7648c69352094bfbcb8800"
+ ]
+ },
+ "id": "yKd5xtSGj7cm",
+ "outputId": "a6e841d8-f2d6-44a8-d44d-c0c244d95f9b"
+ },
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "VBox(children=(HTML(value='
Step | \n", + "Training Loss | \n", + "
---|---|
25 | \n", + "3.345600 | \n", + "
50 | \n", + "0.709500 | \n", + "
75 | \n", + "0.341000 | \n", + "
100 | \n", + "0.272200 | \n", + "
125 | \n", + "0.250600 | \n", + "
150 | \n", + "0.290400 | \n", + "
175 | \n", + "0.261100 | \n", + "
200 | \n", + "0.258000 | \n", + "
225 | \n", + "0.276500 | \n", + "
250 | \n", + "0.265900 | \n", + "
275 | \n", + "0.301500 | \n", + "
300 | \n", + "0.277900 | \n", + "
325 | \n", + "0.282800 | \n", + "
350 | \n", + "0.264100 | \n", + "
375 | \n", + "0.235500 | \n", + "
400 | \n", + "0.251400 | \n", + "
425 | \n", + "0.242500 | \n", + "
450 | \n", + "0.281100 | \n", + "
475 | \n", + "0.261000 | \n", + "
500 | \n", + "0.231800 | \n", + "
525 | \n", + "0.232200 | \n", + "
550 | \n", + "0.268100 | \n", + "
575 | \n", + "0.222400 | \n", + "
600 | \n", + "0.246600 | \n", + "
625 | \n", + "0.251700 | \n", + "
650 | \n", + "0.257800 | \n", + "
675 | \n", + "0.241000 | \n", + "
700 | \n", + "0.229000 | \n", + "
725 | \n", + "0.236600 | \n", + "
750 | \n", + "0.220900 | \n", + "
775 | \n", + "0.271400 | \n", + "
800 | \n", + "0.259900 | \n", + "
825 | \n", + "0.243900 | \n", + "
850 | \n", + "0.236400 | \n", + "
875 | \n", + "0.227200 | \n", + "
900 | \n", + "0.227900 | \n", + "
925 | \n", + "0.263300 | \n", + "
950 | \n", + "0.255200 | \n", + "
975 | \n", + "0.250000 | \n", + "
1000 | \n", + "0.244400 | \n", + "
" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=1000, training_loss=0.3446595501899719, metrics={'train_runtime': 1194.5916, 'train_samples_per_second': 1.674, 'train_steps_per_second': 0.837, 'total_flos': 1550232912784896.0, 'train_loss': 0.3446595501899719, 'epoch': 1.0})" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0hN0QD9_uYYo", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 214, + "referenced_widgets": [ + "50f23a17be4f46b687b1b1df1e70238a", + "f1a0aa50142044f5817f8676103bff58", + "02cb2b35986f4ea294fbf6b2490972d5", + "ecc5cc7a30fa48f2afc708f8a40f50eb", + "6e80b0bf0aa2433fa97d6f43dd21e2cf", + "d7d628d4ef7c4b888fd2f70e472dac10", + "c42bbb0b19a544f58a5737acb1d97a85", + "7cd065e777f54efb857efb92415997b2", + "970c3d13cbbf47da986503c6ad99f506", + "96ba1124ba7642fe9d616b348499d0ef", + "484d11b997454f88902fe507c1156698", + "03e863ab55424bdabbc87bb965562b9b", + "62c1056ac5f14b1caa235010d33f241a", + "a415d0d029ec4225864ed59db18c20b3", + "56cda03822db434bb17b7cadbbaeb81b", + "4e085c242353430b8e32afa3bb260aa9", + "c55a0063346d4c429cbc000bbd612287", + "ba8348a627bc41149e888f0deae68a51", + "bef10393380046c3a842dc979ed8c01f", + "052ce17694bd4d3291ff5a10d2702b4b", + "11775cc8d35442c3a31452d66f6104e7", + "af6707547c5243eb9227efc0eb76134e", + "3d6e35795ba24eed96f2fa842b265e5b", + "84376aa81cae42a18fc49bdded395187", + "805784ce9e65411dbf35373db3680920", + "308ab776682a4e93ac05e06aa98a77f1", + "13ece6fbb1d84f03ae434119de486f07", + "88ef32028fd640de85d75c197eca36eb", + "d926557788ba45c0bef88d9e8a4b56aa", + "a5faec577a9844ea921c2bce1d472b23", + "f1e2134eb4624735842db7c112b515a0", + "0bd50b2853324f5c832821b7174c5ce2", + "077f3bcf99044d168250d1e6c4abbcae", + "bc2065597db04146a6df7ed10de7b93c", + "24cf286de1cf40f299bf0797f74c85eb", + 
"c349943637234fbc96a2a9f325d3c9f1", + "2d4d2f5ffae5451ebf1583362da3e9a9", + "3cd8e1e9fc234219b8fbd4161799640f", + "9e604469cb34439c944e84238b1ec055", + "ba94df0961c44b6d974ef882297731d8", + "419572dbd59a4583831961b7d8ecfa4a", + "88cf901fb47a4925930b7deffe98a9ce", + "9a6a8a5bf8f1479eb478a8c81b58aa69", + "3a51fdcde7984422b7a0925057f6cc37" + ] + }, + "outputId": "20daaa82-d090-4a7b-c655-60eccf851f47" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Upload 3 LFS files: 0%| | 0/3 [00:00, ?it/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "50f23a17be4f46b687b1b1df1e70238a" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "events.out.tfevents.1740055910.82ea94387a47.41010.0: 0%| | 0.00/17.1k [00:00, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "03e863ab55424bdabbc87bb965562b9b" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "model.safetensors: 0%| | 0.00/1.02G [00:00, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "3d6e35795ba24eed96f2fa842b265e5b" + } + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "training_args.bin: 0%| | 0.00/5.43k [00:00, ?B/s]" + ], + "application/vnd.jupyter.widget-view+json": { + "version_major": 2, + "version_minor": 0, + "model_id": "bc2065597db04146a6df7ed10de7b93c" + } + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "CommitInfo(commit_url='https://huggingface.co/merve/SmolVLM2-500M-Video-Instruct-video-feedback/commit/2f33b0685d991475ac091593e224f3e5e7b7cac7', commit_message='End of training', commit_description='', oid='2f33b0685d991475ac091593e224f3e5e7b7cac7', pr_url=None, 
repo_url=RepoUrl('https://huggingface.co/merve/SmolVLM2-500M-Video-Instruct-video-feedback', endpoint='https://huggingface.co', repo_type='model', repo_id='merve/SmolVLM2-500M-Video-Instruct-video-feedback'), pr_revision=None, pr_num=None)" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 10 + } + ], + "source": [ + "trainer.push_to_hub()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "The test example is a video of a woman walking by, you can download and check from [here](https://huggingface.co/datasets/hexuan21/VideoFeedback-videos-mp4/blob/main/p/p000304.mp4)." + ], + "metadata": { + "id": "4dewIZzjfpNx" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2dkZlDPtPsV7", + "outputId": "d37f856a-5873-4b7c-e807-e0f2a706be94" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "User: Caption the video.You are provided the following series of three frames from a 0:00:03 [H:MM:SS] video.\n", + "\n", + "Frame from 00:00:\n", + "Frame from 00:01:\n", + "Frame from 00:02:\n", + "\n", + "\n", + "Assistant: woman in white shirt walks by\n" + ] + } + ], + "source": [ + "messages = [{\"role\": \"user\",\n", + " \"content\": [{\"type\": \"text\", \"text\": \"Caption the video.\"},\n", + " {\"type\": \"video\", \"path\": \"https://huggingface.co/datasets/hexuan21/VideoFeedback-videos-mp4/resolve/main/p/p000304.mp4\"}]}]\n", + "\n", + "\n", + "inputs = processor.apply_chat_template(messages, add_generation_prompt=True,\n", + " tokenize=True, return_dict=True, return_tensors=\"pt\").to(\"cuda\").to(model.dtype)\n", + "\n", + "generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)\n", + "generated_texts = processor.batch_decode(\n", + " generated_ids,\n", + " skip_special_tokens=True,\n", + ")\n", + "\n", + "print(generated_texts[0])" + ] + } 
+ ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "112da28d935543069e7a1a2abc22f9f4": { + "model_module": "@jupyter-widgets/controls", + "model_name": "VBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [], + "layout": "IPY_MODEL_f6362bc7b5b24dd592d35a76a1fbf26b" + } + }, + "0d22c009aa584ca1a71e32336a7985e0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e99fbdfc8a22408a8c728a36c8744b24", + "placeholder": "", + "style": "IPY_MODEL_0fee30c9bf2b4bdfad7a37261f92db64", + "value": "