{
  "activation_function": "swiglu",
  "architectures": [
    "NomicBertModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attn_pdrop": 0.0,
  "auto_map": {
    "AutoConfig": "nomic-ai/nomic-bert-2048--configuration_hf_nomic_bert.NomicBertConfig",
    "AutoModel": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertModel",
    "AutoModelForMaskedLM": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForPreTraining",
    "AutoModelForSequenceClassification": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForSequenceClassification",
    "AutoModelForMultipleChoice": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForMultipleChoice",
    "AutoModelForQuestionAnswering": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForQuestionAnswering",
    "AutoModelForTokenClassification": "nomic-ai/nomic-bert-2048--modeling_hf_nomic_bert.NomicBertForTokenClassification"
  },
  "bos_token_id": null,
  "causal": false,
  "classifier_dropout": null,
  "dense_seq_output": true,
  "embd_pdrop": 0.0,
  "eos_token_id": null,
  "fused_bias_fc": true,
  "fused_dropout_add_ln": true,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_epsilon": 1e-12,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 2048,
  "max_trained_positions": 2048,
  "mlp_fc1_bias": false,
  "mlp_fc2_bias": false,
  "model_type": "nomic_bert",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": 3072,
  "n_layer": 12,
  "n_positions": 8192,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pad_vocab_size_multiple": 64,
  "parallel_block": false,
  "parallel_block_tied_norm": false,
  "prenorm": false,
  "qkv_proj_bias": false,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.0,
  "rope_parameters": {
    "rope_theta": 1000.0,
    "rope_type": "default"
  },
  "rotary_emb_base": 1000,
  "rotary_emb_fraction": 1.0,
  "rotary_emb_interleaved": false,
  "rotary_emb_scale_base": null,
  "rotary_scaling_factor": null,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.0,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "5.3.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "use_flash_attn": true,
  "use_rms_norm": false,
  "use_xentropy": true,
  "vocab_size": 30528
}