config.json · Llama-3.1-Nemotron-70B-Instruct

config.json

930 B · 41 lines · json Raw

```json
{
  "_name_or_path": "meta-llama/Llama-3.1-70B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 8192,
  "initializer_range": 0.02,
  "intermediate_size": 28672,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 64,
  "num_hidden_layers": 80,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.0",
  "use_cache": true,
  "vocab_size": 128256
}
```