config.json · Qwen2.5-Coder-0.5B-Instruct-Gensyn-Swarm-tall_tame_panther | QuantaMrkt

config.json

1.2 KB · 55 lines · json Raw

1	`{`
2	`"architectures": [`
3	`"Qwen2ForCausalLM"`
4	`],`
5	`"attention_dropout": 0.0,`
6	`"bos_token_id": 151643,`
7	`"dtype": "bfloat16",`
8	`"eos_token_id": 151645,`
9	`"hidden_act": "silu",`
10	`"hidden_size": 896,`
11	`"initializer_range": 0.02,`
12	`"intermediate_size": 4864,`
13	`"layer_types": [`
14	`"full_attention",`
15	`"full_attention",`
16	`"full_attention",`
17	`"full_attention",`
18	`"full_attention",`
19	`"full_attention",`
20	`"full_attention",`
21	`"full_attention",`
22	`"full_attention",`
23	`"full_attention",`
24	`"full_attention",`
25	`"full_attention",`
26	`"full_attention",`
27	`"full_attention",`
28	`"full_attention",`
29	`"full_attention",`
30	`"full_attention",`
31	`"full_attention",`
32	`"full_attention",`
33	`"full_attention",`
34	`"full_attention",`
35	`"full_attention",`
36	`"full_attention",`
37	`"full_attention"`
38	`],`
39	`"max_position_embeddings": 32768,`
40	`"max_window_layers": 24,`
41	`"model_type": "qwen2",`
42	`"num_attention_heads": 14,`
43	`"num_hidden_layers": 24,`
44	`"num_key_value_heads": 2,`
45	`"rms_norm_eps": 1e-06,`
46	`"rope_scaling": null,`
47	`"rope_theta": 1000000.0,`
48	`"sliding_window": null,`
49	`"tie_word_embeddings": true,`
50	`"transformers_version": "4.57.1",`
51	`"use_cache": true,`
52	`"use_sliding_window": false,`
53	`"vocab_size": 151936`
54	`}`
55