{
  "activation_fn_name": "swish",
  "architectures": [
    "OpenELMForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_openelm.OpenELMConfig",
    "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "ffn_dim_divisor": 256,
  "ffn_multipliers": [
    0.5,
    0.63,
    0.76,
    0.89,
    1.02,
    1.15,
    1.28,
    1.41,
    1.54,
    1.67,
    1.8,
    1.93,
    2.06,
    2.19,
    2.31,
    2.44,
    2.57,
    2.7,
    2.83,
    2.96,
    3.09,
    3.22,
    3.35,
    3.48,
    3.61,
    3.74,
    3.87,
    4.0
  ],
  "ffn_with_glu": true,
  "head_dim": 64,
  "initializer_range": 0.02,
  "max_context_length": 2048,
  "model_dim": 2048,
  "model_type": "openelm",
  "normalization_layer_name": "rms_norm",
  "normalize_qk_projections": true,
  "num_gqa_groups": 4,
  "num_kv_heads": [
    4,
    4,
    4,
    5,
    5,
    5,
    5,
    5,
    5,
    5,
    6,
    6,
    6,
    6,
    6,
    6,
    6,
    6,
    7,
    7,
    7,
    7,
    7,
    7,
    8,
    8,
    8,
    8
  ],
  "num_query_heads": [
    16,
    16,
    16,
    20,
    20,
    20,
    20,
    20,
    20,
    20,
    24,
    24,
    24,
    24,
    24,
    24,
    24,
    24,
    28,
    28,
    28,
    28,
    28,
    28,
    32,
    32,
    32,
    32
  ],
  "num_transformer_layers": 28,
  "qkv_multipliers": [
    0.5,
    1.0
  ],
  "rope_freq_constant": 10000,
  "rope_max_length": 4096,
  "share_input_output_layers": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32000
}