# experiment_cfg/conf.yaml
# 30.4 KB · 1325 lines · yaml
# Path of the experiment config this dump refers to (viewer line number stripped).
load_config_path: groot/vla/omni/configs/experiments/r1_pro/sharpa/n17_pretrain/n17_pretrain_human_robot_cross_embodiment_fix_yam_absolute_hand_2step.yaml
# Model configuration.
# NOTE(review): the source had viewer line numbers fused into every line and no
# indentation; nesting below is reconstructed — confirm against the original file.
model:
  # Generic transformer-config fields.
  # Presumably inherited Hugging Face PretrainedConfig defaults — confirm.
  return_dict: true
  output_hidden_states: false
  torchscript: false
  dtype: null
  pruned_heads: {}
  tie_word_embeddings: true
  chunk_size_feed_forward: 0
  is_encoder_decoder: false
  is_decoder: false
  cross_attention_hidden_size: null
  add_cross_attention: false
  tie_encoder_decoder: false
  architectures: null
  finetuning_task: null
  id2label:
    0: LABEL_0
    1: LABEL_1
  label2id:
    LABEL_0: 0
    LABEL_1: 1
  task_specific_params: null
  problem_type: null
  tokenizer_class: null
  prefix: null
  bos_token_id: null
  pad_token_id: null
  eos_token_id: null
  sep_token_id: null
  decoder_start_token_id: null

  # Text-generation fields.
  max_length: 20
  min_length: 0
  do_sample: false
  early_stopping: false
  num_beams: 1
  temperature: 1.0
  top_k: 50
  top_p: 1.0
  typical_p: 1.0
  repetition_penalty: 1.0
  length_penalty: 1.0
  no_repeat_ngram_size: 0
  encoder_no_repeat_ngram_size: 0
  bad_words_ids: null
  num_return_sequences: 1
  output_scores: false
  return_dict_in_generate: false
  forced_bos_token_id: null
  forced_eos_token_id: null
  remove_invalid_values: false
  exponential_decay_length_penalty: null
  suppress_tokens: null
  begin_suppress_tokens: null
  num_beam_groups: 1
  diversity_penalty: 0.0
  transformers_version: null

  # Model identity and VLM backbone selection.
  model_type: GrootN1d5Qwen
  model_dtype: bfloat16
  vlm_backend: qwen3
  vlm_model_path: nvidia/Cosmos-Reason2-2B
  backbone_embedding_dim: 2048
  tune_llm: false
  tune_top_llm_layers: 0
  tune_visual: false
  tune_linear: true
  select_layer: 16
  reproject_vision: false
  use_flash_attention: true
  load_bf16: true
  exclude_state: false

  # Image preprocessing and augmentation.
  # NOTE(review): the data section carries its own image_crop_size /
  # image_target_size with different values — confirm which pair is used.
  image_crop_size: [230, 230]
  image_target_size: [256, 256]
  random_rotation_angle: 0
  color_jitter_params:
    brightness: 0.3
    contrast: 0.4
    saturation: 0.5
    hue: 0.08
  formalize_language: true
  action_space_prompt: false
  apply_sincos_state_encoding: false
  letter_box_transform: false
  use_percentiles: true
  use_mean_std: false
  use_albumentations: true
  shortest_image_edge: 256
  crop_fraction: 0.95
  random_history_crop: true
  state_gaussian_noise_std: 0.0
  do_human_interpolation: false
  interpolation_steps: 20
  human_embodiment_tags: null

  # State/action dimensions and action-head settings.
  max_state_dim: 132
  max_action_dim: 132
  action_horizon: 40
  hidden_size: 1024
  dit_latent_dim: 1536
  state_dropout_prob: 0.2
  language_dropout_prob: 0.0
  add_pos_embed: true
  attn_dropout: 0.2
  use_vlln: true
  use_vl_self_attention: true
  max_seq_len: 1024
  use_future_tokens: false
  use_alternate_vl_dit: true
  # 32 heads x 64 dims = 2048, which equals backbone_embedding_dim.
  vl_self_attention_cfg:
    positional_embeddings: null
    num_layers: 4
    num_attention_heads: 32
    attention_head_dim: 64
    dropout: 0.2
    final_dropout: true
  # 32 heads x 48 dims = 1536, which equals dit_latent_dim.
  # NOTE(review): the last child of this mapping is assumed to be
  # cross_attention_dim — confirm against the original indentation.
  diffusion_model_cfg:
    positional_embeddings: null
    num_layers: 32
    num_attention_heads: 32
    attention_head_dim: 48
    norm_type: ada_norm
    dropout: 0.2
    final_dropout: true
    output_dim: 1024
    interleave_self_attention: true
    cross_attention_dim: 2048
  num_inference_timesteps: 4
  noise_beta_alpha: 1.5
  noise_beta_beta: 1.0
  noise_s: 0.999
  num_timestep_buckets: 1000
  tune_projector: true
  tune_diffusion_model: true
  tune_vlln: true
  max_num_embodiments: 32
  rtc_ramp_rate: 6.0
  tf_legacy_loss: false
  use_bfloat16: false
# Data configuration: dataset mixture, per-embodiment modality layouts, and
# loader settings.
# NOTE(review): the source had viewer line numbers fused into every line and no
# indentation; nesting below is reconstructed — confirm against the original file.
data:
  # Dataset mixture. The mix_ratio values of the eight entries sum to 1.0.
  # Every embodiment_tag used here has a matching entry under modality_configs.
  datasets:
    - dataset_paths:
        - /mnt/aws-lfs-02/shared/datasets/xdof.yam_v7_all_merged_global_task_exclude_bad_subtasks
      embodiment_tag: xdof_relative_eef_relative_joint
      mix_ratio: 0.1
      dataset_type: physical_embodiment
    - dataset_paths:
        - /mnt/aws-lfs-02/shared/datasets/xdof.yam_v7_subtask_only_merged_global_task
      embodiment_tag: xdof_relative_eef_relative_joint_subtask
      mix_ratio: 0.2
      dataset_type: physical_embodiment
    - dataset_paths:
        - /mnt/aws-lfs-02/shared/datasets/droid_101_success_idlefiltered_n17
        - /mnt/aws-lfs-02/shared/datasets/droid_101_success_idlefiltered_n17_swapped
      embodiment_tag: oxe_droid_relative_eef_relative_joint
      mix_ratio: 0.1
      dataset_type: physical_embodiment
    - dataset_paths:
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_g1.g1-in-the-wild-merged
      embodiment_tag: real_g1_relative_eef_relative_joints
      mix_ratio: 0.05
      dataset_type: physical_embodiment
    - dataset_paths:
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.inlab_play_real_robot_batch_1
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.inlab_play_real_robot_batch_2
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.miscellaneous_1k_trajectories
      embodiment_tag: real_r1_pro_sharpa_relative_eef
      mix_ratio: 0.05
      dataset_type: physical_embodiment
    - dataset_paths:
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch1-2025-12-10-merged
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch3_2026-01-04-merged_backup
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch4_2026-01-05-merged_backup
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch5_2026-01-05-merged_backup
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch6_2026-01-05-merged_backup
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch10_2026-01-10-merged_backup
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch11_2026-01-10-merged_backup
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch12_2026-01-10-merged_backup
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch8_2026-01-10-merged_backup
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/mecka_lerobot/real_r1_pro_sharpa.mecka_batch9_2026-01-10-merged_backup
      embodiment_tag: real_r1_pro_sharpa_relative_eef_mecka
      mix_ratio: 0.25
      dataset_type: physical_embodiment
    - dataset_paths:
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/maxinsights_lerobot_updated/1530hrs/real_r1_pro_sharpa.maxinsights_1530hrs_updated_train_set_merged
      embodiment_tag: real_r1_pro_sharpa_relative_eef_maxinsights
      mix_ratio: 0.2
      dataset_type: physical_embodiment
    - dataset_paths:
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.inlab_play_human_batch1
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.inlab_play_human_batch2
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.shirt_rolling_task24_2000_human_video_filter_n6_keep1619_demo_stats
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.shirt_rolling_task15_2000_human_video_filter_n6_keep572_demo_stats
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.sort_cards_human_filter_n6_keep523_demo_stats_overwrite_left_side_stats
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.tong_task38_2000_human_video_overwrite_left_side_stats
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.syringe_task30i_2000_human_video_filtered
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.unscrew_bottle_task43_2000_human_video_fixed-duration
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.unscrew_Jim_bottle_task47_600_human_video
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.fold_shirt_task30b_500_human_video_halfdone
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.fold_towel_task30c_500_human_video_halfdone
        - /mnt/aws-lfs-02/shared/datasets/galaxea_sharpa/real_r1_pro_sharpa.sort_cards_task32e_1000_human_video
      embodiment_tag: real_r1_pro_sharpa_relative_eef_human
      mix_ratio: 0.05
      dataset_type: physical_embodiment

  # Per-embodiment modality layout: video / state / action / language.
  # Every action block lists delta_indices 0..39 (40 entries, equal to
  # model.action_horizon).
  # NOTE(review): action_representation / action_type / action_format hold one
  # empty mapping ({}) per action modality key; this looks like per-key values
  # lost during serialization — confirm against the source config.
  modality_configs:
    real_g1_relative_eef_relative_joints:
      video:
        delta_indices: [-20, 0]
        modality_keys:
          - ego_view
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      state:
        delta_indices: [0]
        modality_keys:
          - left_wrist_eef_9d
          - right_wrist_eef_9d
          - left_hand
          - right_hand
          - left_arm
          - right_arm
          - waist
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      action:
        delta_indices:
          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
           10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
           30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
        modality_keys:
          - left_wrist_eef_9d
          - right_wrist_eef_9d
          - left_hand
          - right_hand
          - left_arm
          - right_arm
          - waist
          - base_height_command
          - navigate_command
        normalization_mode: null
        action_representation: [{}, {}, {}, {}, {}, {}, {}, {}, {}]
        exclude_state: false
        action_type: [{}, {}, {}, {}, {}, {}, {}, {}, {}]
        action_format: [{}, {}, {}, {}, {}, {}, {}, {}, {}]
        normalize_rotation: true
        wrist_keys:
          - left_wrist_eef_9d
          - right_wrist_eef_9d
        hand_keys:
          - left_hand
          - right_hand
        extra_keys:
          - left_arm
          - right_arm
          - waist
          - base_height_command
          - navigate_command
        loss_weights: null
      language:
        delta_indices: [0]
        modality_keys:
          - annotation.human.task_description
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null

    real_r1_pro_sharpa_relative_eef_mecka:
      video:
        delta_indices: [-30, 0]
        modality_keys:
          - ego_view_cropratio_res320x240_freq30
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      state:
        delta_indices: [0]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_hand_joints
          - right_hand_joints
        normalization_mode: null
        action_representation: null
        # State is flagged as excluded for this embodiment.
        exclude_state: true
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      action:
        delta_indices:
          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
           10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
           30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_hand_joints
          - right_hand_joints
        normalization_mode: null
        action_representation: [{}, {}, {}, {}]
        exclude_state: false
        action_type: [{}, {}, {}, {}]
        action_format: [{}, {}, {}, {}]
        normalize_rotation: true
        wrist_keys:
          - left_wrist_eef
          - right_wrist_eef
        hand_keys:
          - left_hand_joints
          - right_hand_joints
        extra_keys: []
        loss_weights: null
      language:
        delta_indices: [0]
        modality_keys:
          - annotation.human.coarse_action
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null

    oxe_droid_relative_eef_relative_joint:
      video:
        delta_indices: [-15, 0]
        modality_keys:
          - exterior_image_1_left
          - wrist_image_left
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      state:
        delta_indices: [0]
        modality_keys:
          - eef_9d
          - gripper_position
          - joint_position
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      action:
        delta_indices:
          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
           10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
           30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
        modality_keys:
          - eef_9d
          - gripper_position
          - joint_position
        normalization_mode: null
        action_representation: [{}, {}, {}]
        exclude_state: false
        action_type: [{}, {}, {}]
        action_format: [{}, {}, {}]
        normalize_rotation: true
        wrist_keys:
          - eef_9d
        hand_keys:
          - gripper_position
        extra_keys:
          - joint_position
        loss_weights: null
      language:
        delta_indices: [0]
        modality_keys:
          - annotation.language.language_instruction
          - annotation.language.language_instruction_2
          - annotation.language.language_instruction_3
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null

    real_r1_pro_sharpa_relative_eef_human:
      video:
        delta_indices: [-20, 0]
        modality_keys:
          - ego_view_res320x240_freq20
          - left_wrist_view_res320x240_freq20
          - right_wrist_view_res320x240_freq20
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      state:
        delta_indices: [0]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_hand_joints
          - right_hand_joints
        normalization_mode: null
        action_representation: null
        # State is flagged as excluded for this embodiment.
        exclude_state: true
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      action:
        delta_indices:
          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
           10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
           30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_hand_joints
          - right_hand_joints
        normalization_mode: null
        action_representation: [{}, {}, {}, {}]
        exclude_state: false
        action_type: [{}, {}, {}, {}]
        action_format: [{}, {}, {}, {}]
        normalize_rotation: true
        wrist_keys:
          - left_wrist_eef
          - right_wrist_eef
        hand_keys:
          - left_hand_joints
          - right_hand_joints
        extra_keys: []
        loss_weights: null
      language:
        delta_indices: [0]
        modality_keys:
          - annotation.human.coarse_action
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null

    xdof_relative_eef_relative_joint:
      video:
        delta_indices: [-30, 0]
        modality_keys:
          - top_camera-images-rgb_320_240
          - left_camera-images-rgb_320_240
          - right_camera-images-rgb_320_240
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      state:
        delta_indices: [0]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_gripper_pos
          - right_gripper_pos
          - left_joint_pos
          - right_joint_pos
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      action:
        delta_indices:
          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
           10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
           30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_gripper_pos
          - right_gripper_pos
          - left_joint_pos
          - right_joint_pos
        normalization_mode: null
        action_representation: [{}, {}, {}, {}, {}, {}]
        exclude_state: false
        action_type: [{}, {}, {}, {}, {}, {}]
        action_format: [{}, {}, {}, {}, {}, {}]
        normalize_rotation: true
        wrist_keys:
          - left_wrist_eef
          - right_wrist_eef
        hand_keys:
          - left_gripper_pos
          - right_gripper_pos
        extra_keys:
          - left_joint_pos
          - right_joint_pos
        loss_weights: null
      language:
        delta_indices: [0]
        modality_keys:
          - annotation.task
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null

    # Same layout as xdof_relative_eef_relative_joint except for the language key
    # (annotation.sub_task instead of annotation.task).
    xdof_relative_eef_relative_joint_subtask:
      video:
        delta_indices: [-30, 0]
        modality_keys:
          - top_camera-images-rgb_320_240
          - left_camera-images-rgb_320_240
          - right_camera-images-rgb_320_240
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      state:
        delta_indices: [0]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_gripper_pos
          - right_gripper_pos
          - left_joint_pos
          - right_joint_pos
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      action:
        delta_indices:
          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
           10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
           30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_gripper_pos
          - right_gripper_pos
          - left_joint_pos
          - right_joint_pos
        normalization_mode: null
        action_representation: [{}, {}, {}, {}, {}, {}]
        exclude_state: false
        action_type: [{}, {}, {}, {}, {}, {}]
        action_format: [{}, {}, {}, {}, {}, {}]
        normalize_rotation: true
        wrist_keys:
          - left_wrist_eef
          - right_wrist_eef
        hand_keys:
          - left_gripper_pos
          - right_gripper_pos
        extra_keys:
          - left_joint_pos
          - right_joint_pos
        loss_weights: null
      language:
        delta_indices: [0]
        modality_keys:
          - annotation.sub_task
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null

    real_r1_pro_sharpa_relative_eef:
      video:
        delta_indices: [-20, 0]
        modality_keys:
          - ego_view_res320x240_freq20
          - left_wrist_view_res320x240_freq20
          - right_wrist_view_res320x240_freq20
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      state:
        delta_indices: [0]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_hand_joints
          - right_hand_joints
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      action:
        delta_indices:
          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
           10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
           30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_hand_joints
          - right_hand_joints
        normalization_mode: null
        action_representation: [{}, {}, {}, {}]
        exclude_state: false
        action_type: [{}, {}, {}, {}]
        action_format: [{}, {}, {}, {}]
        normalize_rotation: true
        wrist_keys:
          - left_wrist_eef
          - right_wrist_eef
        hand_keys:
          - left_hand_joints
          - right_hand_joints
        extra_keys: []
        loss_weights: null
      language:
        delta_indices: [0]
        modality_keys:
          - annotation.human.coarse_action
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null

    real_r1_pro_sharpa_relative_eef_maxinsights:
      video:
        delta_indices: [-30, 0]
        modality_keys:
          - ego_view_cropratio_res320x240_freq30
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      state:
        delta_indices: [0]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_hand_joints
          - right_hand_joints
        normalization_mode: null
        action_representation: null
        # State is flagged as excluded for this embodiment.
        exclude_state: true
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null
      action:
        delta_indices:
          [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
           10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
           20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
           30, 31, 32, 33, 34, 35, 36, 37, 38, 39]
        modality_keys:
          - left_wrist_eef
          - right_wrist_eef
          - left_hand_joints
          - right_hand_joints
        normalization_mode: null
        action_representation: [{}, {}, {}, {}]
        exclude_state: false
        action_type: [{}, {}, {}, {}]
        action_format: [{}, {}, {}, {}]
        normalize_rotation: true
        wrist_keys:
          - left_wrist_eef
          - right_wrist_eef
        hand_keys:
          - left_hand_joints
          - right_hand_joints
        extra_keys: []
        loss_weights: null
      language:
        delta_indices: [0]
        modality_keys:
          - annotation.human.coarse_action
        normalization_mode: null
        action_representation: null
        exclude_state: false
        action_type: null
        action_format: null
        normalize_rotation: true
        wrist_keys: null
        hand_keys: null
        extra_keys: null
        loss_weights: null

  # Loader / sampling settings.
  download_cache: false
  shard_size: 1024
  episode_sampling_rate: 0.1
  num_shards_per_epoch: 100000
  override_pretraining_statistics: false
  mode: single_turn
  random_chop: 0.0
  mock_dataset_mode: false
  num_prompt_trajectories: 2
  variable_num_demos: false
  max_prompt_trajectories: 5
  shuffle: true
  seed: 24
  subsample_ratio: 1.0
  # NOTE(review): the crop size here (244) is larger than the target size (224),
  # and both differ from model.image_crop_size (230) / model.image_target_size
  # (256) — confirm which pair takes effect and whether 244/224 is intended.
  image_crop_size: [244, 244]
  image_target_size: [224, 224]
  video_backend: torchcodec
# Training / optimizer / checkpointing configuration.
# NOTE(review): the source had viewer line numbers fused into every line and no
# indentation; nesting below is reconstructed — confirm against the original file.
training:
  # NOTE(review): output_dir is identical to model.vlm_model_path
  # (nvidia/Cosmos-Reason2-2B); this looks like a placeholder or an override
  # mix-up — confirm the intended checkpoint directory.
  output_dir: nvidia/Cosmos-Reason2-2B
  experiment_name: null
  max_steps: 200000

  # Batch sizing.
  # NOTE(review): batch_size (32) x num_gpus (256) x gradient_accumulation_steps
  # (1) = 8192, which does not equal global_batch_size (1024) — confirm which
  # value is authoritative.
  global_batch_size: 1024
  batch_size: 32
  gradient_accumulation_steps: 1

  # Optimizer and learning-rate schedule.
  use_muon: false
  muon_lr: 0.005
  use_legacy_wd_application: false
  learning_rate: 5.0e-05
  lr_scheduler_type: cosine
  weight_decay: 1.0e-05
  warmup_ratio: 0.05
  warmup_steps: 0
  max_grad_norm: 1.0
  wsd_stable_ratio: 0.8
  wsd_decay_type: cosine
  optim: adamw_torch_fused
  start_from_checkpoint: null

  # Numeric precision.
  tf32: true
  fp16: false
  bf16: true
  eval_bf16: true

  # Logging, checkpointing, and upload.
  logging_steps: 10
  save_steps: 1000
  save_total_limit: 5
  save_vl_model: false
  upload_checkpoints: true
  upload_every: 1000
  upload_last_n_checkpoints: 5
  max_concurrent_uploads: 2

  # Evaluation. The strategy is the string 'no'; the quotes keep it from being
  # read as a boolean.
  eval_strategy: 'no'
  eval_steps: 500
  eval_set_split_ratio: 0.1
  eval_batch_size: 2
  save_best_eval_metric_name: ''
  save_best_eval_metric_greater_is_better: true

  # Distributed setup and data loading.
  deepspeed_stage: 2
  gradient_checkpointing: false
  use_ddp: false
  num_gpus: 256
  dataloader_num_workers: 4
  remove_unused_columns: false

  # Experiment tracking and profiling.
  use_wandb: true
  wandb_project: human_pretraining_n15_galaxea_sharpa
  enable_profiling: false

  # Retry and loss-spike handling.
  max_retries: 3
  skip_spike: true
  skip_spike_threshold: 5.0
  skip_spike_ema_alpha: 0.99
  skip_spike_max_consecutive: 10
  assert_loss_less_than: null
# Top-level copies of two training settings; the values match
# training.max_steps and training.save_steps.
# NOTE(review): placed at top level because nesting them under `training` would
# duplicate keys already present there — confirm against the original file.
max_steps: 200000
save_steps: 1000