{
  "type": "smolvla",
  "n_obs_steps": 1,
  "input_features": {
    "observation.state": {
      "type": "STATE",
      "shape": [
        6
      ]
    },
    "observation.images.camera1": {
      "type": "VISUAL",
      "shape": [
        3,
        256,
        256
      ]
    },
    "observation.images.camera2": {
      "type": "VISUAL",
      "shape": [
        3,
        256,
        256
      ]
    },
    "observation.images.camera3": {
      "type": "VISUAL",
      "shape": [
        3,
        256,
        256
      ]
    }
  },
  "output_features": {
    "action": {
      "type": "ACTION",
      "shape": [
        6
      ]
    }
  },
  "device": "cuda",
  "use_amp": false,
  "push_to_hub": true,
  "repo_id": null,
  "private": null,
  "tags": null,
  "license": null,
  "chunk_size": 50,
  "n_action_steps": 50,
  "normalization_mapping": {
    "VISUAL": "IDENTITY",
    "STATE": "MEAN_STD",
    "ACTION": "MEAN_STD"
  },
  "max_state_dim": 32,
  "max_action_dim": 32,
  "resize_imgs_with_padding": [
    512,
    512
  ],
  "empty_cameras": 0,
  "adapt_to_pi_aloha": false,
  "use_delta_joint_actions_aloha": false,
  "tokenizer_max_length": 48,
  "num_steps": 10,
  "use_cache": true,
  "freeze_vision_encoder": true,
  "train_expert_only": true,
  "train_state_proj": true,
  "optimizer_lr": 0.0001,
  "optimizer_betas": [
    0.9,
    0.95
  ],
  "optimizer_eps": 1e-08,
  "optimizer_weight_decay": 1e-10,
  "optimizer_grad_clip_norm": 10,
  "scheduler_warmup_steps": 1000,
  "scheduler_decay_steps": 30000,
  "scheduler_decay_lr": 2.5e-06,
  "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Video-Instruct",
  "load_vlm_weights": true,
  "add_image_special_tokens": false,
  "attention_mode": "cross_attn",
  "prefix_length": 0,
  "pad_language_to": "max_length",
  "num_expert_layers": 0,
  "num_vlm_layers": 16,
  "self_attn_every_n_layers": 2,
  "expert_width_multiplier": 0.75,
  "min_period": 0.004,
  "max_period": 4.0
}