config.json
2.1 KB · 88 lines · json Raw
1 {
2 "type": "smolvla",
3 "n_obs_steps": 1,
4 "input_features": {
5 "observation.images.image": {
6 "type": "VISUAL",
7 "shape": [
8 3,
9 256,
10 256
11 ]
12 },
13 "observation.images.image2": {
14 "type": "VISUAL",
15 "shape": [
16 3,
17 256,
18 256
19 ]
20 },
21 "observation.state": {
22 "type": "STATE",
23 "shape": [
24 8
25 ]
26 }
27 },
28 "output_features": {
29 "action": {
30 "type": "ACTION",
31 "shape": [
32 7
33 ]
34 }
35 },
36 "device": "cuda",
37 "use_amp": false,
38 "push_to_hub": true,
39 "repo_id": null,
40 "private": null,
41 "tags": null,
42 "license": null,
43 "chunk_size": 50,
44 "n_action_steps": 1,
45 "normalization_mapping": {
46 "VISUAL": "IDENTITY",
47 "STATE": "MEAN_STD",
48 "ACTION": "MEAN_STD"
49 },
50 "max_state_dim": 32,
51 "max_action_dim": 32,
52 "resize_imgs_with_padding": [
53 512,
54 512
55 ],
56 "empty_cameras": 0,
57 "adapt_to_pi_aloha": false,
58 "use_delta_joint_actions_aloha": false,
59 "tokenizer_max_length": 48,
60 "num_steps": 10,
61 "use_cache": true,
62 "freeze_vision_encoder": true,
63 "train_expert_only": true,
64 "train_state_proj": true,
65 "optimizer_lr": 0.0001,
66 "optimizer_betas": [
67 0.9,
68 0.95
69 ],
70 "optimizer_eps": 1e-08,
71 "optimizer_weight_decay": 1e-10,
72 "optimizer_grad_clip_norm": 10,
73 "scheduler_warmup_steps": 1000,
74 "scheduler_decay_steps": 30000,
75 "scheduler_decay_lr": 2.5e-06,
76 "vlm_model_name": "HuggingFaceTB/SmolVLM2-500M-Instruct",
77 "load_vlm_weights": true,
78 "add_image_special_tokens": false,
79 "attention_mode": "cross_attn",
80 "prefix_length": 0,
81 "pad_language_to": "longest",
82 "num_expert_layers": -1,
83 "num_vlm_layers": 0,
84 "self_attn_every_n_layers": 2,
85 "expert_width_multiplier": 0.5,
86 "min_period": 0.004,
87 "max_period": 4.0
88 }