experiment_cfg/final_model_config.json
1.5 KB · 57 lines · json Raw
1 {
2 "model_type": "Gr00tN1d7",
3 "model_dtype": "bfloat16",
4 "model_name": "nvidia/Cosmos-Reason2-2B",
5 "backbone_model_type": "qwen",
6 "model_revision": null,
7 "tune_top_llm_layers": 4,
8 "backbone_embedding_dim": 2048,
9 "tune_llm": true,
10 "tune_visual": true,
11 "select_layer": 16,
12 "reproject_vision": false,
13 "use_flash_attention": true,
14 "load_bf16": true,
15 "collator_overwrite_image_inputs": false,
16 "eagle_collator": false,
17 "backbone_trainable_params_fp32": true,
18 "gemma_collator": false,
19 "apply_sincos_state_encoding": true,
20 "use_percentiles": false,
21 "use_relative_action": true,
22 "max_state_dim": 128,
23 "max_action_dim": 128,
24 "action_horizon": 50,
25 "hidden_size": 1024,
26 "input_embedding_dim": 1536,
27 "state_history_length": 1,
28 "add_pos_embed": true,
29 "attn_dropout": 0.2,
30 "use_vlln": true,
31 "max_seq_len": 1024,
32 "use_alternate_vl_dit": true,
33 "attend_text_every_n_blocks": 2,
34 "diffusion_model_cfg": {
35 "positional_embeddings": null,
36 "num_layers": 32,
37 "num_attention_heads": 32,
38 "attention_head_dim": 48,
39 "norm_type": "ada_norm",
40 "dropout": 0.2,
41 "final_dropout": true,
42 "output_dim": 1024,
43 "interleave_self_attention": true
44 },
45 "num_inference_timesteps": 4,
46 "noise_beta_alpha": 1.5,
47 "noise_beta_beta": 1.0,
48 "noise_s": 0.999,
49 "num_timestep_buckets": 1000,
50 "tune_projector": true,
51 "tune_diffusion_model": true,
52 "tune_vlln": true,
53 "state_dropout_prob": 0.0,
54 "state_additive_noise_scale": 0.0,
55 "max_num_embodiments": 32
56 }
57