notebook.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Use V-JEPA 2"
      ],
      "metadata": {
        "id": "02ruu54h4yLc"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "V-JEPA 2 is a new open 1.2B-parameter video embedding model by Meta that attempts to model the physical world through video ⏯️\n",
        "\n",
        "The model can be used for various video tasks: fine-tuning for downstream tasks like video classification, or any task involving embeddings (similarity, retrieval, and more!).\n",
        "\n",
        "You can check all V-JEPA 2 checkpoints and the datasets that come with this release [in this collection](https://huggingface.co/collections/facebook/v-jepa-2-6841bad8413014e185b497a6)."
      ],
      "metadata": {
        "id": "ol0IGYCd4hg4"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "We need to install the release-specific branch of transformers."
      ],
      "metadata": {
        "id": "kIIBxYOA41Ga"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -q git+https://github.com/huggingface/transformers@v4.52.4-VJEPA-2-preview"
      ],
      "metadata": {
        "id": "4D4D1hC940yX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import login  # to later push the model\n",
        "\n",
        "login()"
      ],
      "metadata": {
        "id": "Ne2rU68Ep1On"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "As of now, Colab supports torchcodec==0.2.1, which works with torch==2.6.0, so we pin both versions."
      ],
      "metadata": {
        "id": "dJWXmFu53Ap6"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -q torch==2.6.0 torchvision==0.21.0\n",
        "!pip install -q torchcodec==0.2.1\n",
        "\n",
        "import torch\n",
        "print(\"Torch:\", torch.__version__)\n",
        "from torchcodec.decoders import VideoDecoder  # verify the install works"
      ],
      "metadata": {
        "id": "JIoq84ze2_Ls"
      },
      "execution_count": null,
      "outputs": []
    },
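    {
      "cell_type": "markdown",
      "source": [
        "As a quick sanity check, we can open the sample clip used later in this notebook and print its stream metadata. This is a minimal sketch that assumes torchcodec 0.2's `VideoDecoder.metadata` attribute (a `VideoStreamMetadata` object with fields like `num_frames` and `average_fps`)."
      ],
      "metadata": {
        "id": "metadata-check-md"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from torchcodec.decoders import VideoDecoder\n",
        "\n",
        "# Same sample clip as in the embedding example below\n",
        "sample_url = \"https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/archery/-Qz25rXdMjE_000014_000024.mp4\"\n",
        "decoder = VideoDecoder(sample_url)\n",
        "\n",
        "# Stream-level metadata (torchcodec 0.2 API, assumed here)\n",
        "meta = decoder.metadata\n",
        "print(\"frames:\", meta.num_frames)\n",
        "print(\"fps:\", meta.average_fps)\n",
        "print(\"duration (s):\", meta.duration_seconds)"
      ],
      "metadata": {
        "id": "metadata-check-code"
      },
      "execution_count": null,
      "outputs": []
    },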
    {
      "cell_type": "markdown",
      "source": [
        "## Initialize the model and the processor"
      ],
      "metadata": {
        "id": "-7OATf5S20U_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import AutoVideoProcessor, AutoModel\n",
        "\n",
        "hf_repo = \"facebook/vjepa2-vith-fpc64-256\"\n",
        "\n",
        "model = AutoModel.from_pretrained(hf_repo).to(\"cuda\")\n",
        "processor = AutoVideoProcessor.from_pretrained(hf_repo)"
      ],
      "metadata": {
        "id": "K8oSsy7Y2zQK"
      },
      "execution_count": null,
      "outputs": []
    },
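    {
      "cell_type": "markdown",
      "source": [
        "If you are tight on GPU memory (e.g. on a T4), you can optionally load the 1.2B model in half precision instead. This is a minimal sketch using the standard `torch_dtype` argument of `from_pretrained`, reusing `hf_repo` from the cell above; note that processed inputs must then be cast to the same dtype before the forward pass."
      ],
      "metadata": {
        "id": "fp16-option-md"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "from transformers import AutoModel\n",
        "\n",
        "# Optional: load the weights in float16 to roughly halve GPU memory use\n",
        "model_fp16 = AutoModel.from_pretrained(hf_repo, torch_dtype=torch.float16).to(\"cuda\")\n",
        "\n",
        "# Inputs must match the model dtype, e.g.:\n",
        "# video = processor(frames, return_tensors=\"pt\").to(\"cuda\", dtype=torch.float16)"
      ],
      "metadata": {
        "id": "fp16-option-code"
      },
      "execution_count": null,
      "outputs": []
    },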
    {
      "cell_type": "markdown",
      "source": [
        "## Extract video embeddings from the model"
      ],
      "metadata": {
        "id": "ZJ_DUR9f22Uc"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "from torchcodec.decoders import VideoDecoder\n",
        "import numpy as np\n",
        "\n",
        "video_url = \"https://huggingface.co/datasets/nateraw/kinetics-mini/resolve/main/val/archery/-Qz25rXdMjE_000014_000024.mp4\"\n",
        "vr = VideoDecoder(video_url)\n",
        "frame_idx = np.arange(0, 64)  # take the first 64 frames; you can plug in a more complex sampling strategy here\n",
        "video = vr.get_frames_at(indices=frame_idx).data  # T x C x H x W\n",
        "video = processor(video, return_tensors=\"pt\").to(model.device)\n",
        "with torch.no_grad():\n",
        "    video_embeddings = model.get_vision_features(**video)\n",
        "\n",
        "print(video_embeddings.shape)"
      ],
      "metadata": {
        "id": "kAgWZJHt24px"
      },
      "execution_count": null,
      "outputs": []
    },
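    {
      "cell_type": "markdown",
      "source": [
        "The model returns a sequence of patch-level embeddings. As a sketch of how you might use these for similarity or retrieval, the cell below wraps the steps above in a small helper of ours (`embed_video`, not part of the V-JEPA 2 API) that samples 64 frames and mean-pools the patch embeddings into a single vector per video. In a real retrieval setup you would embed many different clips; here we compare two different samplings of the same clip so the cell runs as-is."
      ],
      "metadata": {
        "id": "similarity-md"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import torch\n",
        "import torch.nn.functional as F\n",
        "from torchcodec.decoders import VideoDecoder\n",
        "\n",
        "def embed_video(url, num_frames=64, uniform=True):\n",
        "    \"\"\"Embed a video as one vector: frame sampling + mean pooling (our helper, not part of the API).\"\"\"\n",
        "    vr = VideoDecoder(url)\n",
        "    if uniform:\n",
        "        total = vr.metadata.num_frames\n",
        "        idx = np.linspace(0, total - 1, num_frames).astype(int).tolist()  # spread over the whole clip\n",
        "    else:\n",
        "        idx = list(range(num_frames))  # first num_frames frames, as in the cell above\n",
        "    frames = vr.get_frames_at(indices=idx).data  # T x C x H x W\n",
        "    inputs = processor(frames, return_tensors=\"pt\").to(model.device)\n",
        "    with torch.no_grad():\n",
        "        feats = model.get_vision_features(**inputs)  # (batch, num_patches, hidden)\n",
        "    return feats.mean(dim=1)  # mean-pool over patches -> (batch, hidden)\n",
        "\n",
        "emb_a = embed_video(video_url, uniform=True)\n",
        "emb_b = embed_video(video_url, uniform=False)  # same clip, different sampling, as a runnable stand-in\n",
        "print(\"cosine similarity:\", F.cosine_similarity(emb_a, emb_b).item())  # should be high for the same clip"
      ],
      "metadata": {
        "id": "similarity-code"
      },
      "execution_count": null,
      "outputs": []
    }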
  ]
}