{
  "dim": 3072,
  "n_layers": 26,
  "head_dim": 128,
  "hidden_dim": 9216,
  "n_heads": 32,
  "n_kv_heads": 8,
  "use_biases": false,
  "causal": true,
  "rope_theta": 1000000.0,
  "norm_eps": 1e-05,
  "vocab_size": 131072,
  "model_parallel": 1,
  "tied_embeddings": true,
  "sliding_window": 8192,
  "model_max_length": 131072,
  "multimodal": {
    "whisper_model_args": {
      "encoder_args": {
        "audio_encoding_args": {
          "sampling_rate": 16000,
          "frame_rate": 12.5,
          "num_mel_bins": 128,
          "hop_length": 160,
          "window_size": 400,
          "chunk_length_s": null,
          "global_log_mel_max": 1.5,
          "transcription_format": "streaming"
        },
        "dim": 1280,
        "n_layers": 32,
        "head_dim": 64,
        "hidden_dim": 5120,
        "n_heads": 32,
        "vocab_size": 131072,
        "n_kv_heads": 32,
        "use_biases": true,
        "use_cache": false,
        "rope_theta": 1000000.0,
        "causal": true,
        "norm_eps": 1e-05,
        "pos_embed": "rope",
        "max_source_positions": null,
        "ffn_type": "swiglu",
        "norm_type": "rms_norm",
        "sliding_window": 750
      },
      "downsample_args": {
        "downsample_factor": 4
      }
    }
  },
  "ada_rms_norm_t_cond": true,
  "ada_rms_norm_t_cond_dim": 32
}