transformer/config.json
1.1 KB · 45 lines · json Raw
1 {
2 "_class_name": "LTX2VideoTransformer3DModel",
3 "_diffusers_version": "0.37.0.dev0",
4 "activation_fn": "gelu-approximate",
5 "attention_bias": true,
6 "attention_head_dim": 128,
7 "attention_out_bias": true,
8 "audio_attention_head_dim": 64,
9 "audio_cross_attention_dim": 2048,
10 "audio_hop_length": 160,
11 "audio_in_channels": 128,
12 "audio_num_attention_heads": 32,
13 "audio_out_channels": 128,
14 "audio_patch_size": 1,
15 "audio_patch_size_t": 1,
16 "audio_pos_embed_max_pos": 20,
17 "audio_sampling_rate": 16000,
18 "audio_scale_factor": 4,
19 "base_height": 2048,
20 "base_width": 2048,
21 "caption_channels": 3840,
22 "causal_offset": 1,
23 "cross_attention_dim": 4096,
24 "cross_attn_timestep_scale_multiplier": 1000,
25 "in_channels": 128,
26 "norm_elementwise_affine": false,
27 "norm_eps": 1e-06,
28 "num_attention_heads": 32,
29 "num_layers": 48,
30 "out_channels": 128,
31 "patch_size": 1,
32 "patch_size_t": 1,
33 "pos_embed_max_pos": 20,
34 "qk_norm": "rms_norm_across_heads",
35 "rope_double_precision": true,
36 "rope_theta": 10000.0,
37 "rope_type": "split",
38 "timestep_scale_multiplier": 1000,
39 "vae_scale_factors": [
40 8,
41 32,
42 32
43 ]
44 }
45