{
  "activation_dropout": 0.1,
  "architectures": [
    "VitsModel"
  ],
  "attention_dropout": 0.1,
  "depth_separable_channels": 2,
  "depth_separable_num_layers": 3,
  "duration_predictor_dropout": 0.5,
  "duration_predictor_filter_channels": 256,
  "duration_predictor_flow_bins": 10,
  "duration_predictor_kernel_size": 3,
  "duration_predictor_num_flows": 4,
  "duration_predictor_tail_bound": 5.0,
  "ffn_dim": 768,
  "ffn_kernel_size": 3,
  "flow_size": 192,
  "hidden_act": "relu",
  "hidden_dropout": 0.1,
  "hidden_size": 192,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "leaky_relu_slope": 0.1,
  "model_type": "vits",
  "noise_scale": 0.667,
  "noise_scale_duration": 0.8,
  "num_attention_heads": 2,
  "num_hidden_layers": 6,
  "num_speakers": 1,
  "posterior_encoder_num_wavenet_layers": 16,
  "prior_encoder_num_flows": 4,
  "prior_encoder_num_wavenet_layers": 4,
  "resblock_dilation_sizes": [
    [
      1,
      3,
      5
    ],
    [
      1,
      3,
      5
    ],
    [
      1,
      3,
      5
    ]
  ],
  "resblock_kernel_sizes": [
    3,
    7,
    11
  ],
  "sampling_rate": 16000,
  "speaker_embedding_size": 0,
  "speaking_rate": 1.0,
  "spectrogram_bins": 513,
  "torch_dtype": "float32",
  "transformers_version": "4.33.0.dev0",
  "upsample_initial_channel": 512,
  "upsample_kernel_sizes": [
    16,
    16,
    4,
    4
  ],
  "upsample_rates": [
    8,
    8,
    2,
    2
  ],
  "use_bias": true,
  "use_stochastic_duration_prediction": true,
  "vocab_size": 32,
  "wavenet_dilation_rate": 1,
  "wavenet_dropout": 0.0,
  "wavenet_kernel_size": 5,
  "window_size": 4
}