config.json
| 1 | { |
| 2 | "istftnet": { |
| 3 | "upsample_kernel_sizes": [20, 12], |
| 4 | "upsample_rates": [10, 6], |
| 5 | "gen_istft_hop_size": 5, |
| 6 | "gen_istft_n_fft": 20, |
| 7 | "resblock_dilation_sizes": [ |
| 8 | [1, 3, 5], |
| 9 | [1, 3, 5], |
| 10 | [1, 3, 5] |
| 11 | ], |
| 12 | "resblock_kernel_sizes": [3, 7, 11], |
| 13 | "upsample_initial_channel": 512 |
| 14 | }, |
| 15 | "dim_in": 64, |
| 16 | "dropout": 0.2, |
| 17 | "hidden_dim": 512, |
| 18 | "max_conv_dim": 512, |
| 19 | "max_dur": 50, |
| 20 | "multispeaker": true, |
| 21 | "n_layer": 3, |
| 22 | "n_mels": 80, |
| 23 | "n_token": 178, |
| 24 | "style_dim": 128, |
| 25 | "text_encoder_kernel_size": 5, |
| 26 | "plbert": { |
| 27 | "hidden_size": 768, |
| 28 | "num_attention_heads": 12, |
| 29 | "intermediate_size": 2048, |
| 30 | "max_position_embeddings": 512, |
| 31 | "num_hidden_layers": 12, |
| 32 | "dropout": 0.1 |
| 33 | }, |
| 34 | "vocab": { |
| 35 | ";": 1, |
| 36 | ":": 2, |
| 37 | ",": 3, |
| 38 | ".": 4, |
| 39 | "!": 5, |
| 40 | "?": 6, |
| 41 | "—": 9, |
| 42 | "…": 10, |
| 43 | "\"": 11, |
| 44 | "(": 12, |
| 45 | ")": 13, |
| 46 | "“": 14, |
| 47 | "”": 15, |
| 48 | " ": 16, |
| 49 | "\u0303": 17, |
| 50 | "ʣ": 18, |
| 51 | "ʥ": 19, |
| 52 | "ʦ": 20, |
| 53 | "ʨ": 21, |
| 54 | "ᵝ": 22, |
| 55 | "\uAB67": 23, |
| 56 | "A": 24, |
| 57 | "I": 25, |
| 58 | "O": 31, |
| 59 | "Q": 33, |
| 60 | "S": 35, |
| 61 | "T": 36, |
| 62 | "W": 39, |
| 63 | "Y": 41, |
| 64 | "ᵊ": 42, |
| 65 | "a": 43, |
| 66 | "b": 44, |
| 67 | "c": 45, |
| 68 | "d": 46, |
| 69 | "e": 47, |
| 70 | "f": 48, |
| 71 | "h": 50, |
| 72 | "i": 51, |
| 73 | "j": 52, |
| 74 | "k": 53, |
| 75 | "l": 54, |
| 76 | "m": 55, |
| 77 | "n": 56, |
| 78 | "o": 57, |
| 79 | "p": 58, |
| 80 | "q": 59, |
| 81 | "r": 60, |
| 82 | "s": 61, |
| 83 | "t": 62, |
| 84 | "u": 63, |
| 85 | "v": 64, |
| 86 | "w": 65, |
| 87 | "x": 66, |
| 88 | "y": 67, |
| 89 | "z": 68, |
| 90 | "ɑ": 69, |
| 91 | "ɐ": 70, |
| 92 | "ɒ": 71, |
| 93 | "æ": 72, |
| 94 | "β": 75, |
| 95 | "ɔ": 76, |
| 96 | "ɕ": 77, |
| 97 | "ç": 78, |
| 98 | "ɖ": 80, |
| 99 | "ð": 81, |
| 100 | "ʤ": 82, |
| 101 | "ə": 83, |
| 102 | "ɚ": 85, |
| 103 | "ɛ": 86, |
| 104 | "ɜ": 87, |
| 105 | "ɟ": 90, |
| 106 | "ɡ": 92, |
| 107 | "ɥ": 99, |
| 108 | "ɨ": 101, |
| 109 | "ɪ": 102, |
| 110 | "ʝ": 103, |
| 111 | "ɯ": 110, |
| 112 | "ɰ": 111, |
| 113 | "ŋ": 112, |
| 114 | "ɳ": 113, |
| 115 | "ɲ": 114, |
| 116 | "ɴ": 115, |
| 117 | "ø": 116, |
| 118 | "ɸ": 118, |
| 119 | "θ": 119, |
| 120 | "œ": 120, |
| 121 | "ɹ": 123, |
| 122 | "ɾ": 125, |
| 123 | "ɻ": 126, |
| 124 | "ʁ": 128, |
| 125 | "ɽ": 129, |
| 126 | "ʂ": 130, |
| 127 | "ʃ": 131, |
| 128 | "ʈ": 132, |
| 129 | "ʧ": 133, |
| 130 | "ʊ": 135, |
| 131 | "ʋ": 136, |
| 132 | "ʌ": 138, |
| 133 | "ɣ": 139, |
| 134 | "ɤ": 140, |
| 135 | "χ": 142, |
| 136 | "ʎ": 143, |
| 137 | "ʒ": 147, |
| 138 | "ʔ": 148, |
| 139 | "ˈ": 156, |
| 140 | "ˌ": 157, |
| 141 | "ː": 158, |
| 142 | "ʰ": 162, |
| 143 | "ʲ": 164, |
| 144 | "↓": 169, |
| 145 | "→": 171, |
| 146 | "↗": 172, |
| 147 | "↘": 173, |
| 148 | "ᵻ": 177 |
| 149 | } |
| 150 | } |