{
  "architectures": [
    "Qwen3ASRForConditionalGeneration"
  ],
  "model_type": "qwen3_asr",
  "support_languages": [
    "Chinese",
    "English",
    "Cantonese",
    "Arabic",
    "German",
    "French",
    "Spanish",
    "Portuguese",
    "Indonesian",
    "Italian",
    "Korean",
    "Russian",
    "Thai",
    "Vietnamese",
    "Japanese",
    "Turkish",
    "Hindi",
    "Malay",
    "Dutch",
    "Swedish",
    "Danish",
    "Finnish",
    "Polish",
    "Czech",
    "Filipino",
    "Persian",
    "Greek",
    "Romanian",
    "Hungarian",
    "Macedonian"
  ],
  "thinker_config": {
    "model_type": "qwen3_asr",
    "architectures": [
      "Qwen3ASRForConditionalGeneration"
    ],
    "audio_config": {
      "_name_or_path": "",
      "activation_dropout": 0,
      "activation_function": "gelu",
      "add_cross_attention": false,
      "architectures": null,
      "attention_dropout": 0,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "conv_chunksize": 500,
      "cross_attention_hidden_size": null,
      "d_model": 1024,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "downsample_hidden_size": 480,
      "dropout": 0,
      "dtype": null,
      "early_stopping": false,
      "encoder_attention_heads": 16,
      "encoder_ffn_dim": 4096,
      "encoder_layers": 24,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "id2label": {
        "0": "LABEL_0",
        "1": "LABEL_1"
      },
      "initializer_range": 0.02,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {
        "LABEL_0": 0,
        "LABEL_1": 1
      },
      "length_penalty": 1.0,
      "max_length": 20,
      "max_source_positions": 1500,
      "min_length": 0,
      "model_type": "qwen3_asr_audio_encoder",
      "n_window": 50,
      "n_window_infer": 800,
      "no_repeat_ngram_size": 0,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_hidden_layers": 24,
      "num_mel_bins": 128,
      "num_return_sequences": 1,
      "output_attentions": false,
      "output_dim": 2048,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "prefix": null,
      "problem_type": null,
      "pruned_heads": {},
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "return_dict": true,
      "return_dict_in_generate": false,
      "scale_embedding": false,
      "sep_token_id": null,
      "suppress_tokens": null,
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torchscript": false,
      "typical_p": 1.0,
      "use_bfloat16": false
    },
    "audio_end_token_id": 151670,
    "audio_start_token_id": 151669,
    "audio_token_id": 151676,
    "dtype": "bfloat16",
    "initializer_range": 0.02,
    "text_config": {
      "_name_or_path": "",
      "add_cross_attention": false,
      "architectures": null,
      "attention_bias": false,
      "attention_dropout": 0.0,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "dtype": null,
      "early_stopping": false,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "head_dim": 128,
      "hidden_act": "silu",
      "hidden_size": 2048,
      "id2label": {
        "0": "LABEL_0",
        "1": "LABEL_1"
      },
      "initializer_range": 0.02,
      "intermediate_size": 6144,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {
        "LABEL_0": 0,
        "LABEL_1": 1
      },
      "length_penalty": 1.0,
      "max_length": 20,
      "max_position_embeddings": 65536,
      "min_length": 0,
      "model_type": "qwen3",
      "no_repeat_ngram_size": 0,
      "num_attention_heads": 16,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_hidden_layers": 28,
      "num_key_value_heads": 8,
      "num_return_sequences": 1,
      "output_attentions": false,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "prefix": null,
      "problem_type": null,
      "pruned_heads": {},
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "return_dict": true,
      "return_dict_in_generate": false,
      "rms_norm_eps": 1e-06,
      "rope_scaling": {
        "interleaved": true,
        "mrope_interleaved": true,
        "mrope_section": [
          24,
          20,
          20
        ],
        "rope_type": "default",
        "type": "default"
      },
      "rope_theta": 1000000,
      "sep_token_id": null,
      "suppress_tokens": null,
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torchscript": false,
      "typical_p": 1.0,
      "use_bfloat16": false,
      "use_cache": true,
      "vocab_size": 151936
    }
  },
  "transformers_version": "4.57.6"
}