config.json
{
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 8192,
  "initializer_range": 0.02,
  "intermediate_size": 28672,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 64,
  "num_hidden_layers": 80,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "input_activations": {
          "actorder": null,
          "block_structure": null,
          "dynamic": true,
          "group_size": null,
          "num_bits": 8,
          "observer": null,
          "observer_kwargs": {},
          "strategy": "token",
          "symmetric": true,
          "type": "float"
        },
        "output_activations": null,
        "targets": [
          "Linear"
        ],
        "weights": {
          "actorder": null,
          "block_structure": null,
          "dynamic": false,
          "group_size": null,
          "num_bits": 8,
          "observer": "minmax",
          "observer_kwargs": {},
          "strategy": "channel",
          "symmetric": true,
          "type": "float"
        }
      }
    },
    "format": "float-quantized",
    "global_compression_ratio": 1.463543865167781,
    "ignore": [
      "lm_head"
    ],
    "kv_cache_scheme": null,
    "quant_method": "compressed-tensors",
    "quantization_status": "compressed",
    "sparsity_config": {}
  },
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.47.1",
  "use_cache": true,
  "vocab_size": 128256
}
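For orientation, a minimal sketch of reading this file and summarizing the quantization scheme it declares (FP8 per-channel weight quantization, dynamic per-token FP8 activation quantization, with lm_head left unquantized). Only the Python standard library is used; the commented-out transformers load at the end is a hypothetical usage note, and <repo-id> is a placeholder rather than a name taken from this file.

import json

# Inspect the compressed-tensors quantization scheme declared in config.json.
with open("config.json") as f:
    cfg = json.load(f)

qcfg = cfg["quantization_config"]
group = qcfg["config_groups"]["group_0"]

print(qcfg["quant_method"])            # "compressed-tensors"
print(group["weights"]["num_bits"],    # 8-bit float weights, per-channel, static scales
      group["weights"]["strategy"],
      group["weights"]["dynamic"])
print(group["input_activations"]["num_bits"],   # 8-bit float activations, per-token, dynamic scales
      group["input_activations"]["strategy"],
      group["input_activations"]["dynamic"])
print(qcfg["ignore"])                  # layers kept in higher precision, e.g. lm_head

# Hypothetical load path (assumes a transformers build with compressed-tensors
# support and enough accelerator memory for this 80-layer, 8192-hidden model):
# from transformers import AutoModelForCausalLM, AutoTokenizer
# model = AutoModelForCausalLM.from_pretrained("<repo-id>", device_map="auto")
# tokenizer = AutoTokenizer.from_pretrained("<repo-id>")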