{
  "acoustic_model_config": {
    "codebook_dim": 8,
    "codebook_loss_weight": 1.0,
    "codebook_size": 1024,
    "commitment_loss_weight": 0.25,
    "decoder_hidden_size": 1024,
    "downsampling_ratios": [
      8,
      5,
      4,
      2,
      3
    ],
    "encoder_hidden_size": 64,
    "hidden_size": 256,
    "hop_length": 960,
    "model_type": "dac",
    "n_codebooks": 9,
    "quantizer_dropout": 0,
    "sampling_rate": 16000,
    "upsampling_ratios": [
      8,
      5,
      4,
      2,
      3
    ]
  },
  "architectures": [
    "HiggsAudioV2TokenizerModel"
  ],
  "block_dilations": [
    1,
    1
  ],
  "channel_ratios": [
    1,
    1
  ],
  "codebook_dim": 64,
  "codebook_size": 1024,
  "downsample_factor": 320,
  "dtype": "float32",
  "initializer_range": 0.02,
  "kernel_size": 3,
  "model_type": "higgs_audio_v2_tokenizer",
  "sample_rate": 24000,
  "semantic_model_config": {
    "activation_dropout": 0.1,
    "apply_spec_augment": true,
    "attention_dropout": 0.1,
    "bos_token_id": 1,
    "classifier_proj_size": 256,
    "conv_bias": false,
    "conv_dim": [
      512,
      512,
      512,
      512,
      512,
      512,
      512
    ],
    "conv_kernel": [
      10,
      3,
      3,
      3,
      3,
      2,
      2
    ],
    "conv_pos_batch_norm": false,
    "conv_stride": [
      5,
      2,
      2,
      2,
      2,
      2,
      2
    ],
    "ctc_loss_reduction": "sum",
    "ctc_zero_infinity": false,
    "do_stable_layer_norm": false,
    "eos_token_id": 2,
    "feat_extract_activation": "gelu",
    "feat_extract_norm": "group",
    "feat_proj_dropout": 0.0,
    "feat_proj_layer_norm": true,
    "final_dropout": 0.1,
    "hidden_act": "gelu",
    "hidden_dropout": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "layerdrop": 0.1,
    "mask_feature_length": 10,
    "mask_feature_min_masks": 0,
    "mask_feature_prob": 0.0,
    "mask_time_length": 10,
    "mask_time_min_masks": 2,
    "mask_time_prob": 0.0,
    "model_type": "hubert",
    "num_attention_heads": 12,
    "num_conv_pos_embedding_groups": 16,
    "num_conv_pos_embeddings": 128,
    "num_feat_extract_layers": 7,
    "num_hidden_layers": 12,
    "pad_token_id": 0,
    "use_weighted_layer_sum": false,
    "vocab_size": 32
  },
  "semantic_sample_rate": 16000,
  "strides": [
    1,
    1
  ],
  "target_bandwidths": [
    0.5,
    1,
    1.5,
    2
  ],
  "transformers_version": "5.3.0.dev0",
  "unit_kernel_size": 3
}