notebook.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Use V-JEPA 2"
      ],
      "metadata": {
        "id": "02ruu54h4yLc"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "V-JEPA 2 is a new open 1.2B-parameter video embedding model by Meta, which aims to model the physical world through video ⏯️\n",
        "\n",
        "The model can be used for various video tasks: fine-tuning for downstream tasks like video classification, or any task involving embeddings (similarity, retrieval, and more!).\n",
        "\n",
        "You can check all V-JEPA 2 checkpoints and the datasets that come with this release [in this collection](https://huggingface.co/collections/facebook/v-jepa-2-6841bad8413014e185b497a6)."
      ],
      "metadata": {
        "id": "ol0IGYCd4hg4"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "We need to install the release-specific branch of transformers."
      ],
      "metadata": {
        "id": "kIIBxYOA41Ga"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -q git+https://github.com/huggingface/transformers@v4.52.4-VJEPA-2-preview"
      ],
      "metadata": {
        "id": "4D4D1hC940yX"
      },
      "execution_count": null,
      "outputs": []
    },
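    {
      "cell_type": "markdown",
      "source": [
        "As a quick sanity check (a small addition for illustration, not part of the original walkthrough), we can confirm the preview build installed correctly by printing the library version."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Sanity check: confirm the preview branch of transformers is installed\n",
        "import transformers\n",
        "\n",
        "print(\"Transformers:\", transformers.__version__)"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },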
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import login  # authenticate with the Hugging Face Hub (e.g. to push a model later)\n",
        "\n",
        "login()"
      ],
      "metadata": {
        "id": "Ne2rU68Ep1On"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "As of now, Colab supports torchcodec==0.2.1, which in turn requires torch==2.6.0, so we pin both versions before importing."
      ],
      "metadata": {
        "id": "dJWXmFu53Ap6"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -q torch==2.6.0 torchvision==0.21.0\n",
        "!pip install -q torchcodec==0.2.1\n",
        "\n",
        "import torch\n",
        "print(\"Torch:\", torch.__version__)\n",
        "from torchcodec.decoders import VideoDecoder  # verify"
      ],
      "metadata": {
        "id": "JIoq84ze2_Ls"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Initialize the model and the processor"
      ],
      "metadata": {
        "id": "-7OATf5S20U_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import AutoVideoProcessor, AutoModel\n",
        "\n",
        "hf_repo = \"facebook/vjepa2-vitl-fpc64-256\"\n",
        "\n",
        "model = AutoModel.from_pretrained(hf_repo).to(\"cuda\")\n",
        "processor = AutoVideoProcessor.from_pretrained(hf_repo)"
      ],
      "metadata": {
        "id": "K8oSsy7Y2zQK"
      },
      "execution_count": null,
      "outputs": []
    },
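    {
      "cell_type": "markdown",
      "source": [
        "Before running inference, it can help to sanity-check what we loaded. The cell below is an illustrative sketch added here: the parameter count is read directly from the model, while the config attribute names (`frames_per_clip`, `crop_size`) are assumptions read defensively with `getattr`."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Inspect the loaded checkpoint (illustrative sketch)\n",
        "n_params = sum(p.numel() for p in model.parameters())\n",
        "print(f\"Parameters: {n_params / 1e6:.0f}M\")\n",
        "\n",
        "# \"fpc64\" and \"256\" in the repo name suggest 64 frames per clip at 256px;\n",
        "# the exact config attribute names are assumptions, hence the defaults.\n",
        "print(\"Frames per clip:\", getattr(model.config, \"frames_per_clip\", \"n/a\"))\n",
        "print(\"Crop size:\", getattr(model.config, \"crop_size\", \"n/a\"))"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },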
    {
      "cell_type": "markdown",
      "source": [
        "## Extract video embeddings from the model"
      ],
      "metadata": {
        "id": "ZJ_DUR9f22Uc"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "from torchcodec.decoders import VideoDecoder\n",
        "import numpy as np\n",
        "\n",
        "video_url = \"https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/archery/-Qz25rXdMjE_000014_000024.mp4\"\n",
        "vr = VideoDecoder(video_url)\n",
        "frame_idx = np.arange(0, 64)  # take the first 64 frames; you can define a more complex sampling strategy here\n",
        "video = vr.get_frames_at(indices=frame_idx).data  # T x C x H x W\n",
        "video = processor(video, return_tensors=\"pt\").to(model.device)\n",
        "with torch.no_grad():\n",
        "    video_embeddings = model.get_vision_features(**video)\n",
        "\n",
        "print(video_embeddings.shape)"
      ],
      "metadata": {
        "id": "kAgWZJHt24px"
      },
      "execution_count": null,
      "outputs": []
    },
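    {
      "cell_type": "markdown",
      "source": [
        "The cell above simply takes the first 64 frames. As one illustrative alternative (a sketch added here, not from the original notebook), we can instead sample 64 frames spread uniformly across the whole video using the decoder's metadata."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Uniform temporal sampling across the full video (illustrative sketch)\n",
        "num_frames = vr.metadata.num_frames  # total frame count reported by torchcodec\n",
        "frame_idx = np.linspace(0, num_frames - 1, 64).round().astype(int)\n",
        "video_uniform = vr.get_frames_at(indices=frame_idx.tolist()).data  # T x C x H x W\n",
        "inputs = processor(video_uniform, return_tensors=\"pt\").to(model.device)\n",
        "with torch.no_grad():\n",
        "    uniform_embeddings = model.get_vision_features(**inputs)\n",
        "\n",
        "print(uniform_embeddings.shape)"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Since the release highlights embedding tasks such as similarity and retrieval, here is a minimal sketch of one way to compare two clips: mean-pool the patch tokens into a single vector per clip, then compute cosine similarity. Mean pooling is our own choice for illustration, not a method prescribed by the release."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "import torch.nn.functional as F\n",
        "\n",
        "def pool_clip(frames):\n",
        "    \"\"\"Embed a T x C x H x W clip and mean-pool patch tokens into one vector.\"\"\"\n",
        "    inputs = processor(frames, return_tensors=\"pt\").to(model.device)\n",
        "    with torch.no_grad():\n",
        "        feats = model.get_vision_features(**inputs)  # (1, num_tokens, hidden)\n",
        "    return feats.mean(dim=1)  # (1, hidden)\n",
        "\n",
        "# Compare the first and last 64 frames of the same video; clips from the\n",
        "# same scene should score noticeably higher than unrelated videos.\n",
        "clip_a = vr.get_frames_at(indices=list(range(64))).data\n",
        "clip_b = vr.get_frames_at(indices=list(range(num_frames - 64, num_frames))).data\n",
        "similarity = F.cosine_similarity(pool_clip(clip_a), pool_clip(clip_b))\n",
        "print(f\"Cosine similarity: {similarity.item():.3f}\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    }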
  ]
}