# cosyvoice3.yaml — CosyVoice3 training/inference configuration (6.8 KB, 224 lines)
# set random seed, so that you may reproduce your result.
# NOTE: the __set_seedN keys exist only for their side effect — the !apply tag
# invokes each seeding function when this config is loaded; the stored values
# are never referenced again.
__set_seed1: !apply:random.seed [1986]
__set_seed2: !apply:numpy.random.seed [1986]
__set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]
6
# fixed params
sample_rate: 24000 # audio sample rate in Hz (referenced by hift, mel transforms, resample)
llm_input_size: 896 # referenced by the llm module below
llm_output_size: 896 # referenced by the llm module below
spk_embed_dim: 192 # speaker embedding dimension (referenced by flow)
qwen_pretrain_path: '' # path to pretrained Qwen weights — presumably filled in before loading; TODO confirm
token_frame_rate: 25 # speech tokens per second (used as flow input_frame_rate)
token_mel_ratio: 2 # mel frames per speech token (used by flow and to size static_chunk_size)
15
# stream related params
# (25 tokens at token_frame_rate 25 = 1 s of speech per chunk; the mel-level
# chunk is chunk_size * token_mel_ratio — see static_chunk_size in flow.)
chunk_size: 25 # streaming inference chunk size, in token
num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
19
# model params
# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that user may find all corresponding class/function according to one single yaml.
# for system/third_party class/function, we do not require this.

# Text-to-speech-token language model: a Qwen2 backbone wrapped by CosyVoice3LM.
llm: !new:cosyvoice.llm.llm.CosyVoice3LM
  llm_input_size: !ref <llm_input_size>
  llm_output_size: !ref <llm_output_size>
  speech_token_size: 6561 # discrete speech-token vocabulary size (matches flow vocab_size)
  length_normalized_loss: True
  lsm_weight: 0 # label-smoothing weight (disabled)
  mix_ratio: [5, 15] # NOTE(review): text/speech mixing ratio — semantics defined in CosyVoice3LM; confirm
  llm: !new:cosyvoice.llm.llm.Qwen2Encoder
    pretrain_path: !ref <qwen_pretrain_path>
  sampling: !name:cosyvoice.utils.common.ras_sampling # repetition-aware sampling
    top_p: 0.8
    top_k: 25
    win_size: 10
    tau_r: 0.1
37
# Causal flow-matching model: speech tokens -> mel spectrogram, with a DiT estimator.
flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT
  input_size: 80
  output_size: 80
  spk_embed_dim: !ref <spk_embed_dim>
  output_type: 'mel'
  vocab_size: 6561 # must match llm speech_token_size
  input_frame_rate: !ref <token_frame_rate>
  only_mask_loss: True
  token_mel_ratio: !ref <token_mel_ratio>
  pre_lookahead_len: 3
  pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer
    in_channels: 80
    channels: 1024
    pre_lookahead_len: 3
  decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
    in_channels: 240
    n_spks: 1
    spk_emb_dim: 80
    cfm_params: !new:omegaconf.DictConfig
      content:
        # 1.0e-06 (not 1e-06) so strict YAML 1.1 resolvers also parse a float
        sigma_min: 1.0e-06
        solver: 'euler'
        t_scheduler: 'cosine'
        training_cfg_rate: 0.2 # classifier-free-guidance dropout rate during training
        inference_cfg_rate: 0.7 # CFG strength at inference
        reg_loss_type: 'l1'
    estimator: !new:cosyvoice.flow.DiT.dit.DiT
      dim: 1024
      depth: 22
      heads: 16
      dim_head: 64
      ff_mult: 2
      mel_dim: 80
      mu_dim: 80
      spk_dim: 80
      out_channels: 80
      # mel-level streaming chunk: chunk_size tokens * token_mel_ratio mel frames/token
      static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
      num_decoding_left_chunks: !ref <num_decoding_left_chunks>
76
# Causal HiFT vocoder: mel spectrogram -> waveform, NSF harmonic source + iSTFT head.
hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator
  in_channels: 80
  base_channels: 512
  nb_harmonics: 8
  sampling_rate: !ref <sample_rate>
  nsf_alpha: 0.1
  nsf_sigma: 0.003
  nsf_voiced_threshold: 10
  # total upsampling 8*5*3 = 120; with istft hop_len 4 this gives 480 samples
  # per mel frame, matching the mel transforms' hop_size of 480
  upsample_rates: [8, 5, 3]
  upsample_kernel_sizes: [16, 11, 7]
  istft_params:
    n_fft: 16
    hop_len: 4
  resblock_kernel_sizes: [3, 7, 11]
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  source_resblock_kernel_sizes: [7, 7, 11]
  source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  lrelu_slope: 0.1
  audio_limit: 0.99 # clamp output amplitude to avoid clipping
  conv_pre_look_right: 4
  f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor
    num_class: 1
    in_channels: 80
    cond_channels: 512
101
# gan related module
# Mel-spectrogram transform used for the GAN mel reconstruction loss.
mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
  n_fft: 1920
  num_mels: 80
  sampling_rate: !ref <sample_rate>
  hop_size: 480
  win_size: 1920
  fmin: 0
  fmax: null
  center: False
# Generator/discriminator bundle for vocoder GAN training.
hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
  generator: !ref <hift>
  discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
    mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
    mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
  mel_spec_transform: [
    !ref <mel_spec_transform1>
  ]
120
# processor functions
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
  token_path: !ref <qwen_pretrain_path>
  skip_special_tokens: True
  version: cosyvoice3
# NOTE(review): kept at top level so the `!ref <allowed_special>` below can
# resolve it — confirm against the upstream config.
allowed_special: 'all'
tokenize: !name:cosyvoice.dataset.processor.tokenize
  get_tokenizer: !ref <get_tokenizer>
  allowed_special: !ref <allowed_special>
filter: !name:cosyvoice.dataset.processor.filter
  max_length: 40960
  min_length: 100
  token_max_length: 200
  token_min_length: 1
resample: !name:cosyvoice.dataset.processor.resample
  resample_rate: !ref <sample_rate>
truncate: !name:cosyvoice.dataset.processor.truncate
  truncate_length: 24480 # must be a multiple of hop_size (24480 = 51 * 480)
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
  n_fft: 1920
  num_mels: 80
  sampling_rate: !ref <sample_rate>
  hop_size: 480
  win_size: 1920
  fmin: 0
  fmax: null
  center: False
compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
  feat_extractor: !ref <feat_extractor>
compute_f0: !name:cosyvoice.dataset.processor.compute_f0
  sample_rate: !ref <sample_rate>
  hop_size: 480
parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
  normalize: True
shuffle: !name:cosyvoice.dataset.processor.shuffle
  shuffle_size: 1000
sort: !name:cosyvoice.dataset.processor.sort
  sort_size: 500 # sort_size should be less than shuffle_size
batch: !name:cosyvoice.dataset.processor.batch
  batch_type: 'dynamic'
  max_frames_in_batch: 2000
padding: !name:cosyvoice.dataset.processor.padding
  use_spk_embedding: False # change to True during sft
165
166
# dataset processor pipeline
# Order matters: each stage consumes the output of the previous one.
data_pipeline: [
  !ref <parquet_opener>,
  !ref <tokenize>,
  !ref <filter>,
  !ref <resample>,
  !ref <compute_fbank>,
  !ref <parse_embedding>,
  !ref <shuffle>,
  !ref <sort>,
  !ref <batch>,
  !ref <padding>,
]
# GAN pipeline additionally truncates audio and extracts F0 before fbank.
data_pipeline_gan: [
  !ref <parquet_opener>,
  !ref <tokenize>,
  !ref <filter>,
  !ref <resample>,
  !ref <truncate>,
  !ref <compute_fbank>,
  !ref <compute_f0>,
  !ref <parse_embedding>,
  !ref <shuffle>,
  !ref <sort>,
  !ref <batch>,
  !ref <padding>,
]
194
# llm flow train conf
train_conf:
  optim: adam
  optim_conf:
    # sft value already applied (upstream note said "change to 1e-5 during sft");
    # written as 1.0e-5 so strict YAML 1.1 loaders also parse it as a float
    lr: 1.0e-5
  scheduler: constantlr # sft value already applied (upstream: "change to constantlr during sft")
  scheduler_conf:
    warmup_steps: 2500 # presumably ignored by constantlr — verify against the scheduler implementation
  max_epoch: 200
  grad_clip: 5
  accum_grad: 2
  log_interval: 100
  save_per_step: -1 # <0 presumably disables per-step checkpointing — TODO confirm
208
# gan train conf
train_conf_gan:
  optim: adam # generator optimizer
  optim_conf:
    lr: 0.0002 # use small lr for gan training
  scheduler: constantlr
  optim_d: adam # discriminator optimizer
  optim_conf_d:
    lr: 0.0002 # use small lr for gan training
  scheduler_d: constantlr
  max_epoch: 200
  grad_clip: 5
  accum_grad: 1 # in gan training, accum_grad must be 1
  log_interval: 100
  save_per_step: -1