transformer/config.json

1.1 KB · 45 lines · json Raw

1	`{`
2	`"_class_name": "LTX2VideoTransformer3DModel",`
3	`"_diffusers_version": "0.37.0.dev0",`
4	`"activation_fn": "gelu-approximate",`
5	`"attention_bias": true,`
6	`"attention_head_dim": 128,`
7	`"attention_out_bias": true,`
8	`"audio_attention_head_dim": 64,`
9	`"audio_cross_attention_dim": 2048,`
10	`"audio_hop_length": 160,`
11	`"audio_in_channels": 128,`
12	`"audio_num_attention_heads": 32,`
13	`"audio_out_channels": 128,`
14	`"audio_patch_size": 1,`
15	`"audio_patch_size_t": 1,`
16	`"audio_pos_embed_max_pos": 20,`
17	`"audio_sampling_rate": 16000,`
18	`"audio_scale_factor": 4,`
19	`"base_height": 2048,`
20	`"base_width": 2048,`
21	`"caption_channels": 3840,`
22	`"causal_offset": 1,`
23	`"cross_attention_dim": 4096,`
24	`"cross_attn_timestep_scale_multiplier": 1000,`
25	`"in_channels": 128,`
26	`"norm_elementwise_affine": false,`
27	`"norm_eps": 1e-06,`
28	`"num_attention_heads": 32,`
29	`"num_layers": 48,`
30	`"out_channels": 128,`
31	`"patch_size": 1,`
32	`"patch_size_t": 1,`
33	`"pos_embed_max_pos": 20,`
34	`"qk_norm": "rms_norm_across_heads",`
35	`"rope_double_precision": true,`
36	`"rope_theta": 10000.0,`
37	`"rope_type": "split",`
38	`"timestep_scale_multiplier": 1000,`
39	`"vae_scale_factors": [`
40	`8,`
41	`32,`
42	`32`
43	`]`
44	`}`
45