{
  "activation_dropout": 0.1,
  "apply_spec_augment": true,
  "architectures": [
    "SpeechT5ForTextToSpeech"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.1,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.1,
  "encoder_layers": 12,
  "encoder_max_relative_position": 160,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.0,
  "guided_attention_loss_num_heads": 2,
  "guided_attention_loss_scale": 10.0,
  "guided_attention_loss_sigma": 0.4,
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "is_encoder_decoder": true,
  "layer_norm_eps": 1e-05,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "max_length": 1876,
  "max_speech_positions": 1876,
  "max_text_positions": 600,
  "model_type": "speecht5",
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_mel_bins": 80,
  "pad_token_id": 1,
  "positional_dropout": 0.1,
  "reduction_factor": 2,
  "scale_embedding": false,
  "speaker_embedding_dim": 512,
  "speech_decoder_postnet_dropout": 0.5,
  "speech_decoder_postnet_kernel": 5,
  "speech_decoder_postnet_layers": 5,
  "speech_decoder_postnet_units": 256,
  "speech_decoder_prenet_dropout": 0.5,
  "speech_decoder_prenet_layers": 2,
  "speech_decoder_prenet_units": 256,
  "torch_dtype": "float32",
  "transformers_version": "4.28.0.dev0",
  "use_cache": true,
  "use_guided_attention_loss": true,
  "vocab_size": 81
}