cosyvoice3.yaml · Fun-CosyVoice3-0.5B-2512

cosyvoice3.yaml

6.8 KB · 224 lines · yaml Raw

1	`# set random seed, so that you may reproduce your result.`
2	`__set_seed1: !apply:random.seed [1986]`
3	`__set_seed2: !apply:numpy.random.seed [1986]`
4	`__set_seed3: !apply:torch.manual_seed [1986]`
5	`__set_seed4: !apply:torch.cuda.manual_seed_all [1986]`
6
7	`# fixed params`
8	`sample_rate: 24000`
9	`llm_input_size: 896`
10	`llm_output_size: 896`
11	`spk_embed_dim: 192`
12	`qwen_pretrain_path: ''`
13	`token_frame_rate: 25`
14	`token_mel_ratio: 2`
15
16	`# stream related params`
17	`chunk_size: 25 # streaming inference chunk size, in tokens`
18	`num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks`
19
20	`# model params`
21	`# for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml.`
22	`# for system/third_party class/function, we do not require this.`
23	`llm: !new:cosyvoice.llm.llm.CosyVoice3LM`
24	`llm_input_size: !ref <llm_input_size>`
25	`llm_output_size: !ref <llm_output_size>`
26	`speech_token_size: 6561`
27	`length_normalized_loss: True`
28	`lsm_weight: 0`
29	`mix_ratio: [5, 15]`
30	`llm: !new:cosyvoice.llm.llm.Qwen2Encoder`
31	`pretrain_path: !ref <qwen_pretrain_path>`
32	`sampling: !name:cosyvoice.utils.common.ras_sampling`
33	`top_p: 0.8`
34	`top_k: 25`
35	`win_size: 10`
36	`tau_r: 0.1`
37
38	`flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT`
39	`input_size: 80`
40	`output_size: 80`
41	`spk_embed_dim: !ref <spk_embed_dim>`
42	`output_type: 'mel'`
43	`vocab_size: 6561`
44	`input_frame_rate: !ref <token_frame_rate>`
45	`only_mask_loss: True`
46	`token_mel_ratio: !ref <token_mel_ratio>`
47	`pre_lookahead_len: 3`
48	`pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer`
49	`in_channels: 80`
50	`channels: 1024`
51	`pre_lookahead_len: 3`
52	`decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM`
53	`in_channels: 240`
54	`n_spks: 1`
55	`spk_emb_dim: 80`
56	`cfm_params: !new:omegaconf.DictConfig`
57	`content:`
58	`sigma_min: 1e-06`
59	`solver: 'euler'`
60	`t_scheduler: 'cosine'`
61	`training_cfg_rate: 0.2`
62	`inference_cfg_rate: 0.7`
63	`reg_loss_type: 'l1'`
64	`estimator: !new:cosyvoice.flow.DiT.dit.DiT`
65	`dim: 1024`
66	`depth: 22`
67	`heads: 16`
68	`dim_head: 64`
69	`ff_mult: 2`
70	`mel_dim: 80`
71	`mu_dim: 80`
72	`spk_dim: 80`
73	`out_channels: 80`
74	`static_chunk_size: !ref <chunk_size> * <token_mel_ratio>`
75	`num_decoding_left_chunks: !ref <num_decoding_left_chunks>`
76
77	`hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator`
78	`in_channels: 80`
79	`base_channels: 512`
80	`nb_harmonics: 8`
81	`sampling_rate: !ref <sample_rate>`
82	`nsf_alpha: 0.1`
83	`nsf_sigma: 0.003`
84	`nsf_voiced_threshold: 10`
85	`upsample_rates: [8, 5, 3]`
86	`upsample_kernel_sizes: [16, 11, 7]`
87	`istft_params:`
88	`n_fft: 16`
89	`hop_len: 4`
90	`resblock_kernel_sizes: [3, 7, 11]`
91	`resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]`
92	`source_resblock_kernel_sizes: [7, 7, 11]`
93	`source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]`
94	`lrelu_slope: 0.1`
95	`audio_limit: 0.99`
96	`conv_pre_look_right: 4`
97	`f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor`
98	`num_class: 1`
99	`in_channels: 80`
100	`cond_channels: 512`
101
102	`# gan related module`
103	`mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram`
104	`n_fft: 1920`
105	`num_mels: 80`
106	`sampling_rate: !ref <sample_rate>`
107	`hop_size: 480`
108	`win_size: 1920`
109	`fmin: 0`
110	`fmax: null`
111	`center: False`
112	`hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan`
113	`generator: !ref <hift>`
114	`discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator`
115	`mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator`
116	`mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator`
117	`mel_spec_transform: [`
118	`!ref <mel_spec_transform1>`
119	`]`
120
121	`# processor functions`
122	`parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener`
123	`get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer`
124	`token_path: !ref <qwen_pretrain_path>`
125	`skip_special_tokens: True`
126	`version: cosyvoice3`
127	`allowed_special: 'all'`
128	`tokenize: !name:cosyvoice.dataset.processor.tokenize`
129	`get_tokenizer: !ref <get_tokenizer>`
130	`allowed_special: !ref <allowed_special>`
131	`filter: !name:cosyvoice.dataset.processor.filter`
132	`max_length: 40960`
133	`min_length: 100`
134	`token_max_length: 200`
135	`token_min_length: 1`
136	`resample: !name:cosyvoice.dataset.processor.resample`
137	`resample_rate: !ref <sample_rate>`
138	`truncate: !name:cosyvoice.dataset.processor.truncate`
139	`truncate_length: 24480 # must be a multiple of hop_size`
140	`feat_extractor: !name:matcha.utils.audio.mel_spectrogram`
141	`n_fft: 1920`
142	`num_mels: 80`
143	`sampling_rate: !ref <sample_rate>`
144	`hop_size: 480`
145	`win_size: 1920`
146	`fmin: 0`
147	`fmax: null`
148	`center: False`
149	`compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank`
150	`feat_extractor: !ref <feat_extractor>`
151	`compute_f0: !name:cosyvoice.dataset.processor.compute_f0`
152	`sample_rate: !ref <sample_rate>`
153	`hop_size: 480`
154	`parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding`
155	`normalize: True`
156	`shuffle: !name:cosyvoice.dataset.processor.shuffle`
157	`shuffle_size: 1000`
158	`sort: !name:cosyvoice.dataset.processor.sort`
159	`sort_size: 500 # sort_size should be less than shuffle_size`
160	`batch: !name:cosyvoice.dataset.processor.batch`
161	`batch_type: 'dynamic'`
162	`max_frames_in_batch: 2000`
163	`padding: !name:cosyvoice.dataset.processor.padding`
164	`use_spk_embedding: False # change to True during sft`
165
166
167	`# dataset processor pipeline`
168	`data_pipeline: [`
169	`!ref <parquet_opener>,`
170	`!ref <tokenize>,`
171	`!ref <filter>,`
172	`!ref <resample>,`
173	`!ref <compute_fbank>,`
174	`!ref <parse_embedding>,`
175	`!ref <shuffle>,`
176	`!ref <sort>,`
177	`!ref <batch>,`
178	`!ref <padding>,`
179	`]`
180	`data_pipeline_gan: [`
181	`!ref <parquet_opener>,`
182	`!ref <tokenize>,`
183	`!ref <filter>,`
184	`!ref <resample>,`
185	`!ref <truncate>,`
186	`!ref <compute_fbank>,`
187	`!ref <compute_f0>,`
188	`!ref <parse_embedding>,`
189	`!ref <shuffle>,`
190	`!ref <sort>,`
191	`!ref <batch>,`
192	`!ref <padding>,`
193	`]`
194
195	`# llm flow train conf`
196	`train_conf:`
197	`optim: adam`
198	`optim_conf:`
199	`lr: 1e-5 # value used during sft (already set to 1e-5)`
200	`scheduler: constantlr # value used during sft (already set to constantlr)`
201	`scheduler_conf:`
202	`warmup_steps: 2500`
203	`max_epoch: 200`
204	`grad_clip: 5`
205	`accum_grad: 2`
206	`log_interval: 100`
207	`save_per_step: -1`
208
209	`# gan train conf`
210	`train_conf_gan:`
211	`optim: adam`
212	`optim_conf:`
213	`lr: 0.0002 # use small lr for gan training`
214	`scheduler: constantlr`
215	`optim_d: adam`
216	`optim_conf_d:`
217	`lr: 0.0002 # use small lr for gan training`
218	`scheduler_d: constantlr`
219	`max_epoch: 200`
220	`grad_clip: 5`
221	`accum_grad: 1 # in gan training, accum_grad must be 1`
222	`log_interval: 100`
223	`save_per_step: -1`
224