config.json
4.8 KB · 184 lines · json Raw
1 {
2 "architectures": [
3 "Qwen3TTSForConditionalGeneration"
4 ],
5 "assistant_token_id": 77091,
6 "im_end_token_id": 151645,
7 "im_start_token_id": 151644,
8 "tts_bos_token_id": 151672,
9 "tts_eos_token_id": 151673,
10 "tts_pad_token_id": 151671,
11 "model_type": "qwen3_tts",
12 "tokenizer_type": "qwen3_tts_tokenizer_12hz",
13 "tts_model_size": "1b7",
14 "tts_model_type": "custom_voice",
15 "talker_config": {
16 "attention_bias": false,
17 "attention_dropout": 0,
18 "code_predictor_config": {
19 "_name_or_path": "",
20 "add_cross_attention": false,
21 "architectures": null,
22 "attention_bias": false,
23 "attention_dropout": 0,
24 "bad_words_ids": null,
25 "begin_suppress_tokens": null,
26 "bos_token_id": null,
27 "chunk_size_feed_forward": 0,
28 "cross_attention_hidden_size": null,
29 "decoder_start_token_id": null,
30 "diversity_penalty": 0.0,
31 "do_sample": false,
32 "early_stopping": false,
33 "encoder_no_repeat_ngram_size": 0,
34 "eos_token_id": null,
35 "exponential_decay_length_penalty": null,
36 "finetuning_task": null,
37 "forced_bos_token_id": null,
38 "forced_eos_token_id": null,
39 "head_dim": 128,
40 "hidden_act": "silu",
41 "hidden_size": 1024,
42 "id2label": {
43 "0": "LABEL_0",
44 "1": "LABEL_1"
45 },
46 "initializer_range": 0.02,
47 "intermediate_size": 3072,
48 "is_decoder": false,
49 "is_encoder_decoder": false,
50 "label2id": {
51 "LABEL_0": 0,
52 "LABEL_1": 1
53 },
54 "layer_types": [
55 "full_attention",
56 "full_attention",
57 "full_attention",
58 "full_attention",
59 "full_attention"
60 ],
61 "length_penalty": 1.0,
62 "max_length": 20,
63 "max_position_embeddings": 65536,
64 "max_window_layers": 28,
65 "min_length": 0,
66 "model_type": "qwen3_tts_talker_code_predictor",
67 "no_repeat_ngram_size": 0,
68 "num_attention_heads": 16,
69 "num_beam_groups": 1,
70 "num_beams": 1,
71 "num_code_groups": 16,
72 "num_hidden_layers": 5,
73 "num_key_value_heads": 8,
74 "num_return_sequences": 1,
75 "output_attentions": false,
76 "output_hidden_states": false,
77 "output_scores": false,
78 "pad_token_id": null,
79 "prefix": null,
80 "problem_type": null,
81 "pruned_heads": {},
82 "remove_invalid_values": false,
83 "repetition_penalty": 1.0,
84 "return_dict": true,
85 "return_dict_in_generate": false,
86 "rms_norm_eps": 1e-06,
87 "rope_scaling": null,
88 "rope_theta": 1000000,
89 "sep_token_id": null,
90 "sliding_window": null,
91 "suppress_tokens": null,
92 "task_specific_params": null,
93 "temperature": 1.0,
94 "tf_legacy_loss": false,
95 "tie_encoder_decoder": false,
96 "tie_word_embeddings": false,
97 "tokenizer_class": null,
98 "top_k": 50,
99 "top_p": 1.0,
100 "dtype": null,
101 "torchscript": false,
102 "typical_p": 1.0,
103 "use_bfloat16": false,
104 "use_cache": true,
105 "use_sliding_window": false,
106 "vocab_size": 2048
107 },
108 "codec_bos_id": 2149,
109 "codec_eos_token_id": 2150,
110 "codec_think_id": 2154,
111 "codec_language_id": {
112 "chinese": 2055,
113 "english": 2050,
114 "german": 2053,
115 "italian": 2070,
116 "portuguese": 2071,
117 "spanish": 2054,
118 "japanese": 2058,
119 "korean": 2064,
120 "french": 2061,
121 "russian": 2069,
122 "beijing_dialect": 2074,
123 "sichuan_dialect": 2062
124 },
125 "codec_nothink_id": 2155,
126 "codec_pad_id": 2148,
127 "codec_think_bos_id": 2156,
128 "codec_think_eos_id": 2157,
129 "spk_id": {
130 "serena": 3066,
131 "vivian": 3065,
132 "uncle_fu": 3010,
133 "ryan": 3061,
134 "aiden": 2861,
135 "ono_anna": 2873,
136 "sohee": 2864,
137 "eric": 2875,
138 "dylan": 2878
139 },
140 "spk_is_dialect": {
141 "serena": false,
142 "vivian": false,
143 "uncle_fu": false,
144 "ryan": false,
145 "aiden": false,
146 "ono_anna": false,
147 "sohee": false,
148 "eric": "sichuan_dialect",
149 "dylan": "beijing_dialect"
150 },
151 "head_dim": 128,
152 "hidden_act": "silu",
153 "hidden_size": 2048,
154 "initializer_range": 0.02,
155 "intermediate_size": 6144,
156 "max_position_embeddings": 32768,
157 "model_type": "qwen3_tts_talker",
158 "num_attention_heads": 16,
159 "num_code_groups": 16,
160 "num_hidden_layers": 28,
161 "num_key_value_heads": 8,
162 "position_id_per_seconds": 13,
163 "rms_norm_eps": 1e-06,
164 "rope_scaling": {
165 "interleaved": true,
166 "mrope_section": [
167 24,
168 20,
169 20
170 ],
171 "rope_type": "default",
172 "type": "default"
173 },
174 "rope_theta": 1000000,
175 "sliding_window": null,
176 "text_hidden_size": 2048,
177 "text_vocab_size": 151936,
178 "use_cache": true,
179 "use_sliding_window": false,
180 "vocab_size": 3072
181 },
182 "transformers_version": "4.57.3"
183 }
184