config.json
2.7 KB · 118 lines · json Raw
1 {
2 "activation_dropout": 0.0,
3 "activation_function": "relu",
4 "adaptor_dropout": 0.1,
5 "adaptor_kernel_size": 8,
6 "adaptor_stride": 8,
7 "add_adapter": true,
8 "architectures": [
9 "SeamlessM4Tv2Model"
10 ],
11 "attention_dropout": 0.1,
12 "bos_token_id": 2,
13 "char_vocab_size": 10943,
14 "conv_depthwise_kernel_size": 31,
15 "decoder_attention_heads": 16,
16 "decoder_ffn_dim": 8192,
17 "decoder_layerdrop": 0.05,
18 "decoder_layers": 24,
19 "decoder_start_token_id": 3,
20 "dropout": 0.1,
21 "encoder_attention_heads": 16,
22 "encoder_ffn_dim": 8192,
23 "encoder_layerdrop": 0.05,
24 "encoder_layers": 24,
25 "eos_token_id": 3,
26 "feature_projection_input_dim": 160,
27 "hidden_size": 1024,
28 "initializer_range": 0.02,
29 "is_encoder_decoder": true,
30 "lang_embed_dim": 256,
31 "layer_norm_eps": 1e-05,
32 "leaky_relu_slope": 0.1,
33 "left_max_position_embeddings": 64,
34 "max_new_tokens": 256,
35 "max_position_embeddings": 4096,
36 "model_type": "seamless_m4t_v2",
37 "num_adapter_layers": 1,
38 "num_attention_heads": 16,
39 "num_hidden_layers": 24,
40 "pad_token_id": 0,
41 "position_embeddings_type": "relative_key",
42 "resblock_dilation_sizes": [
43 [
44 1,
45 3,
46 5
47 ],
48 [
49 1,
50 3,
51 5
52 ],
53 [
54 1,
55 3,
56 5
57 ]
58 ],
59 "resblock_kernel_sizes": [
60 3,
61 7,
62 11
63 ],
64 "right_max_position_embeddings": 8,
65 "sampling_rate": 16000,
66 "scale_embedding": true,
67 "speech_encoder_attention_heads": 16,
68 "speech_encoder_chunk_size": 20000,
69 "speech_encoder_dropout": 0.0,
70 "speech_encoder_hidden_act": "swish",
71 "speech_encoder_intermediate_size": 4096,
72 "speech_encoder_layerdrop": 0.1,
73 "speech_encoder_layers": 24,
74 "speech_encoder_left_chunk_num": 128,
75 "spkr_embed_dim": 256,
76 "t2u_bos_token_id": 0,
77 "t2u_decoder_attention_heads": 16,
78 "t2u_decoder_ffn_dim": 8192,
79 "t2u_decoder_layers": 6,
80 "t2u_encoder_attention_heads": 16,
81 "t2u_encoder_ffn_dim": 8192,
82 "t2u_encoder_layers": 6,
83 "t2u_eos_token_id": 2,
84 "t2u_max_position_embeddings": 4096,
85 "t2u_pad_token_id": 1,
86 "t2u_variance_pred_dropout": 0.5,
87 "t2u_variance_predictor_embed_dim": 1024,
88 "t2u_variance_predictor_hidden_dim": 256,
89 "t2u_variance_predictor_kernel_size": 3,
90 "t2u_vocab_size": 10082,
91 "torch_dtype": "float32",
92 "transformers_version": "4.36.0.dev0",
93 "unit_embed_dim": 1280,
94 "unit_hifi_gan_vocab_size": 10000,
95 "upsample_initial_channel": 512,
96 "upsample_kernel_sizes": [
97 11,
98 8,
99 8,
100 4,
101 4
102 ],
103 "upsample_rates": [
104 5,
105 4,
106 4,
107 2,
108 2
109 ],
110 "use_cache": true,
111 "var_pred_dropout": 0.5,
112 "variance_predictor_kernel_size": 3,
113 "vocab_size": 256102,
114 "vocoder_num_langs": 36,
115 "vocoder_num_spkrs": 200,
116 "vocoder_offset": 4
117 }
118