{
  "architectures": [
    "KimiK25ForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_kimi_k25.KimiK25Config",
    "AutoModel": "modeling_kimi_k25.KimiK25ForConditionalGeneration",
    "AutoModelForCausalLM": "modeling_kimi_k25.KimiK25ForConditionalGeneration"
  },
  "bos_token_id": 163584,
  "dtype": "bfloat16",
  "eos_token_id": 163585,
  "ignore_index": -100,
  "media_placeholder_token_id": 163605,
  "model_type": "kimi_k25",
  "pad_token_id": 163839,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": [
      "DeepseekV3ForCausalLM"
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
    "auto_map": {
      "AutoConfig": "configuration_deepseek.DeepseekV3Config",
      "AutoModel": "modeling_deepseek.DeepseekV3Model",
      "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
    },
    "aux_loss_alpha": 0.001,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 163584,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dtype": "bfloat16",
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 163585,
    "ep_size": 1,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "first_k_dense_replace": 1,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "silu",
    "hidden_size": 7168,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_range": 0.02,
    "intermediate_size": 18432,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "kv_lora_rank": 512,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 262144,
    "min_length": 0,
    "model_type": "kimi_k2",
    "moe_intermediate_size": 2048,
    "moe_layer_freq": 1,
    "n_group": 1,
    "n_routed_experts": 384,
    "n_shared_experts": 1,
    "no_repeat_ngram_size": 0,
    "norm_topk_prob": true,
    "num_attention_heads": 64,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_experts_per_tok": 8,
    "num_hidden_layers": 61,
    "num_key_value_heads": 64,
    "num_nextn_predict_layers": 0,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 163839,
    "prefix": null,
    "pretraining_tp": 1,
    "problem_type": null,
    "pruned_heads": {},
    "q_lora_rank": 1536,
    "qk_nope_head_dim": 128,
    "qk_rope_head_dim": 64,
    "quantization_config": {
      "config_groups": {
        "group_0": {
          "input_activations": null,
          "output_activations": null,
          "targets": [
            "Linear"
          ],
          "weights": {
            "actorder": null,
            "block_structure": null,
            "dynamic": false,
            "group_size": 32,
            "num_bits": 4,
            "observer": "minmax",
            "observer_kwargs": {},
            "strategy": "group",
            "symmetric": true,
            "type": "int"
          }
        }
      },
      "format": "pack-quantized",
      "ignore": [
        "re:.*self_attn.*",
        "re:.*shared_experts.*",
        "re:.*mlp\\.(gate|up|gate_up|down)_proj.*",
        "re:.*lm_head.*",
        "re:vision_tower.*",
        "re:mm_projector.*"
      ],
      "kv_cache_scheme": null,
      "quant_method": "compressed-tensors",
      "quantization_status": "compressed"
    },
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "rms_norm_eps": 1e-05,
    "rope_scaling": {
      "beta_fast": 32.0,
      "beta_slow": 1.0,
      "factor": 64.0,
      "mscale": 1.0,
      "mscale_all_dim": 1.0,
      "original_max_position_embeddings": 4096,
      "type": "yarn"
    },
    "rope_theta": 50000.0,
    "routed_scaling_factor": 2.827,
    "scoring_func": "sigmoid",
    "sep_token_id": null,
    "seq_aux": true,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": false,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "topk_group": 1,
    "topk_method": "noaux_tc",
    "torchscript": false,
    "transformers_version": "4.56.2",
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": true,
    "v_head_dim": 128,
    "vocab_size": 163840
  },
  "tie_word_embeddings": false,
  "use_unified_vision_chunk": true,
  "video_placeholder": "<|kimi_k25_video_placeholder|>",
  "vision_config": {
    "_attn_implementation": "flash_attention_2",
    "init_pos_emb_height": 64,
    "init_pos_emb_time": 4,
    "init_pos_emb_width": 64,
    "merge_kernel_size": [
      2,
      2
    ],
    "merge_type": "sd2_tpool",
    "mm_hidden_size": 1152,
    "mm_projector_type": "patchmerger",
    "patch_size": 14,
    "pos_emb_type": "divided_fixed",
    "projector_hidden_act": "gelu",
    "projector_ln_eps": 1e-05,
    "text_hidden_size": 7168,
    "video_attn_type": "spatial_temporal",
    "vt_hidden_size": 1152,
    "vt_intermediate_size": 4304,
    "vt_num_attention_heads": 16,
    "vt_num_hidden_layers": 27
  }
}