audio_tokenizer/config.json
2.5 KB · 130 lines · json Raw
1 {
2 "acoustic_model_config": {
3 "codebook_dim": 8,
4 "codebook_loss_weight": 1.0,
5 "codebook_size": 1024,
6 "commitment_loss_weight": 0.25,
7 "decoder_hidden_size": 1024,
8 "downsampling_ratios": [
9 8,
10 5,
11 4,
12 2,
13 3
14 ],
15 "encoder_hidden_size": 64,
16 "hidden_size": 256,
17 "hop_length": 960,
18 "model_type": "dac",
19 "n_codebooks": 9,
20 "quantizer_dropout": 0,
21 "sampling_rate": 16000,
22 "upsampling_ratios": [
23 8,
24 5,
25 4,
26 2,
27 3
28 ]
29 },
30 "architectures": [
31 "HiggsAudioV2TokenizerModel"
32 ],
33 "block_dilations": [
34 1,
35 1
36 ],
37 "channel_ratios": [
38 1,
39 1
40 ],
41 "codebook_dim": 64,
42 "codebook_size": 1024,
43 "downsample_factor": 320,
44 "dtype": "float32",
45 "initializer_range": 0.02,
46 "kernel_size": 3,
47 "model_type": "higgs_audio_v2_tokenizer",
48 "sample_rate": 24000,
49 "semantic_model_config": {
50 "activation_dropout": 0.1,
51 "apply_spec_augment": true,
52 "attention_dropout": 0.1,
53 "bos_token_id": 1,
54 "classifier_proj_size": 256,
55 "conv_bias": false,
56 "conv_dim": [
57 512,
58 512,
59 512,
60 512,
61 512,
62 512,
63 512
64 ],
65 "conv_kernel": [
66 10,
67 3,
68 3,
69 3,
70 3,
71 2,
72 2
73 ],
74 "conv_pos_batch_norm": false,
75 "conv_stride": [
76 5,
77 2,
78 2,
79 2,
80 2,
81 2,
82 2
83 ],
84 "ctc_loss_reduction": "sum",
85 "ctc_zero_infinity": false,
86 "do_stable_layer_norm": false,
87 "eos_token_id": 2,
88 "feat_extract_activation": "gelu",
89 "feat_extract_norm": "group",
90 "feat_proj_dropout": 0.0,
91 "feat_proj_layer_norm": true,
92 "final_dropout": 0.1,
93 "hidden_act": "gelu",
94 "hidden_dropout": 0.1,
95 "hidden_size": 768,
96 "initializer_range": 0.02,
97 "intermediate_size": 3072,
98 "layer_norm_eps": 1e-05,
99 "layerdrop": 0.1,
100 "mask_feature_length": 10,
101 "mask_feature_min_masks": 0,
102 "mask_feature_prob": 0.0,
103 "mask_time_length": 10,
104 "mask_time_min_masks": 2,
105 "mask_time_prob": 0.0,
106 "model_type": "hubert",
107 "num_attention_heads": 12,
108 "num_conv_pos_embedding_groups": 16,
109 "num_conv_pos_embeddings": 128,
110 "num_feat_extract_layers": 7,
111 "num_hidden_layers": 12,
112 "pad_token_id": 0,
113 "use_weighted_layer_sum": false,
114 "vocab_size": 32
115 },
116 "semantic_sample_rate": 16000,
117 "strides": [
118 1,
119 1
120 ],
121 "target_bandwidths": [
122 0.5,
123 1,
124 1.5,
125 2
126 ],
127 "transformers_version": "5.3.0.dev0",
128 "unit_kernel_size": 3
129 }
130