config.json
2.7 KB · 116 lines · json Raw
1 {
2 "acoustic_vae_dim": 64,
3 "acoustic_tokenizer_config": {
4 "causal": true,
5 "channels": 1,
6 "conv_bias": true,
7 "conv_norm": "none",
8 "corpus_normalize": 0.0,
9 "decoder_depths": null,
10 "decoder_n_filters": 32,
11 "decoder_ratios": [
12 8,
13 5,
14 5,
15 4,
16 2,
17 2
18 ],
19 "disable_last_norm": true,
20 "encoder_depths": "3-3-3-3-3-3-8",
21 "encoder_n_filters": 32,
22 "encoder_ratios": [
23 8,
24 5,
25 5,
26 4,
27 2,
28 2
29 ],
30 "fix_std": 0.5,
31 "layer_scale_init_value": 1e-06,
32 "layernorm": "RMSNorm",
33 "layernorm_elementwise_affine": true,
34 "layernorm_eps": 1e-05,
35 "mixer_layer": "depthwise_conv",
36 "model_type": "vibevoice_acoustic_tokenizer",
37 "pad_mode": "constant",
38 "std_dist_type": "gaussian",
39 "vae_dim": 64,
40 "weight_init_value": 0.01
41 },
42 "architectures": [
43 "VibeVoiceForConditionalGeneration"
44 ],
45 "decoder_config": {
46 "attention_dropout": 0.0,
47 "hidden_act": "silu",
48 "hidden_size": 1536,
49 "initializer_range": 0.02,
50 "intermediate_size": 8960,
51 "max_position_embeddings": 65536,
52 "max_window_layers": 28,
53 "model_type": "qwen2",
54 "num_attention_heads": 12,
55 "num_hidden_layers": 28,
56 "num_key_value_heads": 2,
57 "rms_norm_eps": 1e-06,
58 "rope_scaling": null,
59 "rope_theta": 1000000.0,
60 "sliding_window": null,
61 "tie_word_embeddings": true,
62 "torch_dtype": "bfloat16",
63 "use_cache": true,
64 "use_sliding_window": false,
65 "vocab_size": 151936
66 },
67 "diffusion_head_config": {
68 "ddpm_batch_mul": 4,
69 "ddpm_beta_schedule": "cosine",
70 "ddpm_num_inference_steps": 20,
71 "ddpm_num_steps": 1000,
72 "diffusion_type": "ddpm",
73 "head_ffn_ratio": 3.0,
74 "head_layers": 4,
75 "hidden_size": 1536,
76 "latent_size": 64,
77 "model_type": "vibevoice_diffusion_head",
78 "prediction_type": "v_prediction",
79 "rms_norm_eps": 1e-05,
80 "speech_vae_dim": 64
81 },
82 "model_type": "vibevoice",
83 "semantic_tokenizer_config": {
84 "causal": true,
85 "channels": 1,
86 "conv_bias": true,
87 "conv_norm": "none",
88 "corpus_normalize": 0.0,
89 "disable_last_norm": true,
90 "encoder_depths": "3-3-3-3-3-3-8",
91 "encoder_n_filters": 32,
92 "encoder_ratios": [
93 8,
94 5,
95 5,
96 4,
97 2,
98 2
99 ],
100 "fix_std": 0,
101 "layer_scale_init_value": 1e-06,
102 "layernorm": "RMSNorm",
103 "layernorm_elementwise_affine": true,
104 "layernorm_eps": 1e-05,
105 "mixer_layer": "depthwise_conv",
106 "model_type": "vibevoice_semantic_tokenizer",
107 "pad_mode": "constant",
108 "std_dist_type": "none",
109 "vae_dim": 128,
110 "weight_init_value": 0.01
111 },
112 "semantic_vae_dim": 128,
113 "torch_dtype": "bfloat16",
114 "transformers_version": "4.51.3"
115 }
116