cosyvoice3.yaml
| 1 | # set random seed, so that you may reproduce your result. |
| 2 | __set_seed1: !apply:random.seed [1986] |
| 3 | __set_seed2: !apply:numpy.random.seed [1986] |
| 4 | __set_seed3: !apply:torch.manual_seed [1986] |
| 5 | __set_seed4: !apply:torch.cuda.manual_seed_all [1986] |
| 6 | |
| 7 | # fixed params |
| 8 | sample_rate: 24000 |
| 9 | llm_input_size: 896 |
| 10 | llm_output_size: 896 |
| 11 | spk_embed_dim: 192 |
| 12 | qwen_pretrain_path: '' |
| 13 | token_frame_rate: 25 |
| 14 | token_mel_ratio: 2 |
| 15 | |
| 16 | # stream related params |
| 17 | chunk_size: 25 # streaming inference chunk size, in tokens |
| 18 | num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks |
| 19 | |
| 20 | # model params |
| 21 | # for all classes/functions included in this repo, we use !<name> or !<new> for initialization, so that users can find every corresponding class/function from this single yaml. |
| 22 | # for system/third_party class/function, we do not require this. |
| 23 | llm: !new:cosyvoice.llm.llm.CosyVoice3LM |
| 24 | llm_input_size: !ref <llm_input_size> |
| 25 | llm_output_size: !ref <llm_output_size> |
| 26 | speech_token_size: 6561 |
| 27 | length_normalized_loss: True |
| 28 | lsm_weight: 0 |
| 29 | mix_ratio: [5, 15] |
| 30 | llm: !new:cosyvoice.llm.llm.Qwen2Encoder |
| 31 | pretrain_path: !ref <qwen_pretrain_path> |
| 32 | sampling: !name:cosyvoice.utils.common.ras_sampling |
| 33 | top_p: 0.8 |
| 34 | top_k: 25 |
| 35 | win_size: 10 |
| 36 | tau_r: 0.1 |
| 37 | |
| 38 | flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithDiT |
| 39 | input_size: 80 |
| 40 | output_size: 80 |
| 41 | spk_embed_dim: !ref <spk_embed_dim> |
| 42 | output_type: 'mel' |
| 43 | vocab_size: 6561 |
| 44 | input_frame_rate: !ref <token_frame_rate> |
| 45 | only_mask_loss: True |
| 46 | token_mel_ratio: !ref <token_mel_ratio> |
| 47 | pre_lookahead_len: 3 |
| 48 | pre_lookahead_layer: !new:cosyvoice.transformer.upsample_encoder.PreLookaheadLayer |
| 49 | in_channels: 80 |
| 50 | channels: 1024 |
| 51 | pre_lookahead_len: 3 |
| 52 | decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM |
| 53 | in_channels: 240 |
| 54 | n_spks: 1 |
| 55 | spk_emb_dim: 80 |
| 56 | cfm_params: !new:omegaconf.DictConfig |
| 57 | content: |
| 58 | sigma_min: 1e-06 |
| 59 | solver: 'euler' |
| 60 | t_scheduler: 'cosine' |
| 61 | training_cfg_rate: 0.2 |
| 62 | inference_cfg_rate: 0.7 |
| 63 | reg_loss_type: 'l1' |
| 64 | estimator: !new:cosyvoice.flow.DiT.dit.DiT |
| 65 | dim: 1024 |
| 66 | depth: 22 |
| 67 | heads: 16 |
| 68 | dim_head: 64 |
| 69 | ff_mult: 2 |
| 70 | mel_dim: 80 |
| 71 | mu_dim: 80 |
| 72 | spk_dim: 80 |
| 73 | out_channels: 80 |
| 74 | static_chunk_size: !ref <chunk_size> * <token_mel_ratio> |
| 75 | num_decoding_left_chunks: !ref <num_decoding_left_chunks> |
| 76 | |
| 77 | hift: !new:cosyvoice.hifigan.generator.CausalHiFTGenerator |
| 78 | in_channels: 80 |
| 79 | base_channels: 512 |
| 80 | nb_harmonics: 8 |
| 81 | sampling_rate: !ref <sample_rate> |
| 82 | nsf_alpha: 0.1 |
| 83 | nsf_sigma: 0.003 |
| 84 | nsf_voiced_threshold: 10 |
| 85 | upsample_rates: [8, 5, 3] |
| 86 | upsample_kernel_sizes: [16, 11, 7] |
| 87 | istft_params: |
| 88 | n_fft: 16 |
| 89 | hop_len: 4 |
| 90 | resblock_kernel_sizes: [3, 7, 11] |
| 91 | resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] |
| 92 | source_resblock_kernel_sizes: [7, 7, 11] |
| 93 | source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] |
| 94 | lrelu_slope: 0.1 |
| 95 | audio_limit: 0.99 |
| 96 | conv_pre_look_right: 4 |
| 97 | f0_predictor: !new:cosyvoice.hifigan.f0_predictor.CausalConvRNNF0Predictor |
| 98 | num_class: 1 |
| 99 | in_channels: 80 |
| 100 | cond_channels: 512 |
| 101 | |
| 102 | # gan related module |
| 103 | mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram |
| 104 | n_fft: 1920 |
| 105 | num_mels: 80 |
| 106 | sampling_rate: !ref <sample_rate> |
| 107 | hop_size: 480 |
| 108 | win_size: 1920 |
| 109 | fmin: 0 |
| 110 | fmax: null |
| 111 | center: False |
| 112 | hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan |
| 113 | generator: !ref <hift> |
| 114 | discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator |
| 115 | mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator |
| 116 | mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator |
| 117 | mel_spec_transform: [ |
| 118 | !ref <mel_spec_transform1> |
| 119 | ] |
| 120 | |
| 121 | # processor functions |
| 122 | parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener |
| 123 | get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer |
| 124 | token_path: !ref <qwen_pretrain_path> |
| 125 | skip_special_tokens: True |
| 126 | version: cosyvoice3 |
| 127 | allowed_special: 'all' |
| 128 | tokenize: !name:cosyvoice.dataset.processor.tokenize |
| 129 | get_tokenizer: !ref <get_tokenizer> |
| 130 | allowed_special: !ref <allowed_special> |
| 131 | filter: !name:cosyvoice.dataset.processor.filter |
| 132 | max_length: 40960 |
| 133 | min_length: 100 |
| 134 | token_max_length: 200 |
| 135 | token_min_length: 1 |
| 136 | resample: !name:cosyvoice.dataset.processor.resample |
| 137 | resample_rate: !ref <sample_rate> |
| 138 | truncate: !name:cosyvoice.dataset.processor.truncate |
| 139 | truncate_length: 24480 # must be a multiple of hop_size |
| 140 | feat_extractor: !name:matcha.utils.audio.mel_spectrogram |
| 141 | n_fft: 1920 |
| 142 | num_mels: 80 |
| 143 | sampling_rate: !ref <sample_rate> |
| 144 | hop_size: 480 |
| 145 | win_size: 1920 |
| 146 | fmin: 0 |
| 147 | fmax: null |
| 148 | center: False |
| 149 | compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank |
| 150 | feat_extractor: !ref <feat_extractor> |
| 151 | compute_f0: !name:cosyvoice.dataset.processor.compute_f0 |
| 152 | sample_rate: !ref <sample_rate> |
| 153 | hop_size: 480 |
| 154 | parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding |
| 155 | normalize: True |
| 156 | shuffle: !name:cosyvoice.dataset.processor.shuffle |
| 157 | shuffle_size: 1000 |
| 158 | sort: !name:cosyvoice.dataset.processor.sort |
| 159 | sort_size: 500 # sort_size should be less than shuffle_size |
| 160 | batch: !name:cosyvoice.dataset.processor.batch |
| 161 | batch_type: 'dynamic' |
| 162 | max_frames_in_batch: 2000 |
| 163 | padding: !name:cosyvoice.dataset.processor.padding |
| 164 | use_spk_embedding: False # change to True during sft |
| 165 | |
| 166 | |
| 167 | # dataset processor pipeline |
| 168 | data_pipeline: [ |
| 169 | !ref <parquet_opener>, |
| 170 | !ref <tokenize>, |
| 171 | !ref <filter>, |
| 172 | !ref <resample>, |
| 173 | !ref <compute_fbank>, |
| 174 | !ref <parse_embedding>, |
| 175 | !ref <shuffle>, |
| 176 | !ref <sort>, |
| 177 | !ref <batch>, |
| 178 | !ref <padding>, |
| 179 | ] |
| 180 | data_pipeline_gan: [ |
| 181 | !ref <parquet_opener>, |
| 182 | !ref <tokenize>, |
| 183 | !ref <filter>, |
| 184 | !ref <resample>, |
| 185 | !ref <truncate>, |
| 186 | !ref <compute_fbank>, |
| 187 | !ref <compute_f0>, |
| 188 | !ref <parse_embedding>, |
| 189 | !ref <shuffle>, |
| 190 | !ref <sort>, |
| 191 | !ref <batch>, |
| 192 | !ref <padding>, |
| 193 | ] |
| 194 | |
| 195 | # llm flow train conf |
| 196 | train_conf: |
| 197 | optim: adam |
| 198 | optim_conf: |
| 199 | lr: 1e-5 # use 1e-5 during sft (already set here) |
| 200 | scheduler: constantlr # use constantlr during sft (already set here) |
| 201 | scheduler_conf: |
| 202 | warmup_steps: 2500 |
| 203 | max_epoch: 200 |
| 204 | grad_clip: 5 |
| 205 | accum_grad: 2 |
| 206 | log_interval: 100 |
| 207 | save_per_step: -1 |
| 208 | |
| 209 | # gan train conf |
| 210 | train_conf_gan: |
| 211 | optim: adam |
| 212 | optim_conf: |
| 213 | lr: 0.0002 # use small lr for gan training |
| 214 | scheduler: constantlr |
| 215 | optim_d: adam |
| 216 | optim_conf_d: |
| 217 | lr: 0.0002 # use small lr for gan training |
| 218 | scheduler_d: constantlr |
| 219 | max_epoch: 200 |
| 220 | grad_clip: 5 |
| 221 | accum_grad: 1 # in gan training, accum_grad must be 1 |
| 222 | log_interval: 100 |
| 223 | save_per_step: -1 |
| 224 | |