config.json
| 1 | { |
| 2 | "acoustic_vae_dim": 64, |
| 3 | "acoustic_tokenizer_config": { |
| 4 | "causal": true, |
| 5 | "channels": 1, |
| 6 | "conv_bias": true, |
| 7 | "conv_norm": "none", |
| 8 | "corpus_normalize": 0.0, |
| 9 | "decoder_depths": null, |
| 10 | "decoder_n_filters": 32, |
| 11 | "decoder_ratios": [ |
| 12 | 8, |
| 13 | 5, |
| 14 | 5, |
| 15 | 4, |
| 16 | 2, |
| 17 | 2 |
| 18 | ], |
| 19 | "disable_last_norm": true, |
| 20 | "encoder_depths": "3-3-3-3-3-3-8", |
| 21 | "encoder_n_filters": 32, |
| 22 | "encoder_ratios": [ |
| 23 | 8, |
| 24 | 5, |
| 25 | 5, |
| 26 | 4, |
| 27 | 2, |
| 28 | 2 |
| 29 | ], |
| 30 | "fix_std": 0.5, |
| 31 | "layer_scale_init_value": 1e-06, |
| 32 | "layernorm": "RMSNorm", |
| 33 | "layernorm_elementwise_affine": true, |
| 34 | "layernorm_eps": 1e-05, |
| 35 | "mixer_layer": "depthwise_conv", |
| 36 | "model_type": "vibevoice_acoustic_tokenizer", |
| 37 | "pad_mode": "constant", |
| 38 | "std_dist_type": "gaussian", |
| 39 | "vae_dim": 64, |
| 40 | "weight_init_value": 0.01 |
| 41 | }, |
| 42 | "architectures": [ |
| 43 | "VibeVoiceForConditionalGeneration" |
| 44 | ], |
| 45 | "decoder_config": { |
| 46 | "attention_dropout": 0.0, |
| 47 | "hidden_act": "silu", |
| 48 | "hidden_size": 1536, |
| 49 | "initializer_range": 0.02, |
| 50 | "intermediate_size": 8960, |
| 51 | "max_position_embeddings": 65536, |
| 52 | "max_window_layers": 28, |
| 53 | "model_type": "qwen2", |
| 54 | "num_attention_heads": 12, |
| 55 | "num_hidden_layers": 28, |
| 56 | "num_key_value_heads": 2, |
| 57 | "rms_norm_eps": 1e-06, |
| 58 | "rope_scaling": null, |
| 59 | "rope_theta": 1000000.0, |
| 60 | "sliding_window": null, |
| 61 | "tie_word_embeddings": true, |
| 62 | "torch_dtype": "bfloat16", |
| 63 | "use_cache": true, |
| 64 | "use_sliding_window": false, |
| 65 | "vocab_size": 151936 |
| 66 | }, |
| 67 | "diffusion_head_config": { |
| 68 | "ddpm_batch_mul": 4, |
| 69 | "ddpm_beta_schedule": "cosine", |
| 70 | "ddpm_num_inference_steps": 20, |
| 71 | "ddpm_num_steps": 1000, |
| 72 | "diffusion_type": "ddpm", |
| 73 | "head_ffn_ratio": 3.0, |
| 74 | "head_layers": 4, |
| 75 | "hidden_size": 1536, |
| 76 | "latent_size": 64, |
| 77 | "model_type": "vibevoice_diffusion_head", |
| 78 | "prediction_type": "v_prediction", |
| 79 | "rms_norm_eps": 1e-05, |
| 80 | "speech_vae_dim": 64 |
| 81 | }, |
| 82 | "model_type": "vibevoice", |
| 83 | "semantic_tokenizer_config": { |
| 84 | "causal": true, |
| 85 | "channels": 1, |
| 86 | "conv_bias": true, |
| 87 | "conv_norm": "none", |
| 88 | "corpus_normalize": 0.0, |
| 89 | "disable_last_norm": true, |
| 90 | "encoder_depths": "3-3-3-3-3-3-8", |
| 91 | "encoder_n_filters": 32, |
| 92 | "encoder_ratios": [ |
| 93 | 8, |
| 94 | 5, |
| 95 | 5, |
| 96 | 4, |
| 97 | 2, |
| 98 | 2 |
| 99 | ], |
| 100 | "fix_std": 0, |
| 101 | "layer_scale_init_value": 1e-06, |
| 102 | "layernorm": "RMSNorm", |
| 103 | "layernorm_elementwise_affine": true, |
| 104 | "layernorm_eps": 1e-05, |
| 105 | "mixer_layer": "depthwise_conv", |
| 106 | "model_type": "vibevoice_semantic_tokenizer", |
| 107 | "pad_mode": "constant", |
| 108 | "std_dist_type": "none", |
| 109 | "vae_dim": 128, |
| 110 | "weight_init_value": 0.01 |
| 111 | }, |
| 112 | "semantic_vae_dim": 128, |
| 113 | "torch_dtype": "bfloat16", |
| 114 | "transformers_version": "4.51.3" |
| 115 | } |
| 116 | |