config.json
{
  "architectures": [
    "GlmOcrForConditionalGeneration"
  ],
  "model_type": "glm_ocr",
  "text_config": {
    "model_type": "glm_ocr_text",
    "pad_token_id": 59246,
    "vocab_size": 59392,
    "eos_token_id": [
      59246,
      59253
    ],
    "attention_bias": false,
    "attention_dropout": 0.0,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 1536,
    "initializer_range": 0.02,
    "intermediate_size": 4608,
    "max_position_embeddings": 131072,
    "num_attention_heads": 16,
    "num_hidden_layers": 16,
    "num_nextn_predict_layers": 1,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-05,
    "dtype": "bfloat16",
    "rope_parameters": {
      "rope_type": "default",
      "mrope_section": [
        16,
        24,
        24
      ],
      "partial_rotary_factor": 1.0,
      "rope_theta": 10000
    },
    "tie_word_embeddings": false,
    "use_cache": true
  },
  "vision_config": {
    "model_type": "glm_ocr_vision",
    "hidden_size": 1024,
    "depth": 24,
    "num_heads": 16,
    "attention_bias": true,
    "intermediate_size": 4096,
    "hidden_act": "silu",
    "hidden_dropout_prob": 0.0,
    "initializer_range": 0.02,
    "image_size": 336,
    "patch_size": 14,
    "out_hidden_size": 1536,
    "rms_norm_eps": 1e-05,
    "spatial_merge_size": 2,
    "temporal_patch_size": 2
  },
  "image_start_token_id": 59256,
  "image_end_token_id": 59257,
  "video_start_token_id": 59258,
  "video_end_token_id": 59259,
  "image_token_id": 59280,
  "video_token_id": 59281,
  "transformers_version": "5.0.1dev0"
}
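A minimal sketch of loading and sanity-checking this config with only the standard library (the transformers_version field points at a dev build, so the glm_ocr model type may not be registered in a released transformers; a plain json.load avoids that dependency). Reading from a local "config.json" path is an assumption, and the mRoPE check assumes the Qwen2-VL-style convention where the per-axis sections cover half the rotary dimension.

import json

# Load the config shown above (assumes it is saved locally as config.json).
with open("config.json") as f:
    cfg = json.load(f)

text = cfg["text_config"]
vision = cfg["vision_config"]

# Grouped-query attention: 16 query heads share 8 KV heads (2 queries per KV head).
assert text["num_attention_heads"] % text["num_key_value_heads"] == 0
groups = text["num_attention_heads"] // text["num_key_value_heads"]

# mRoPE check (assumed Qwen2-VL-style convention): the per-axis sections
# (temporal, height, width) sum to head_dim * partial_rotary_factor / 2.
rope = text["rope_parameters"]
rotary_half = int(text["head_dim"] * rope["partial_rotary_factor"]) // 2
assert sum(rope["mrope_section"]) == rotary_half  # 16 + 24 + 24 == 64

# The vision tower projects into the text model's hidden size, so its outputs
# can replace the image/video placeholder tokens in the embedding stream.
assert vision["out_hidden_size"] == text["hidden_size"]

# A 336x336 image gives (336/14)^2 = 576 patches; 2x2 spatial merging
# reduces that to 144 tokens before projection.
patches_per_side = vision["image_size"] // vision["patch_size"]
image_tokens = (patches_per_side // vision["spatial_merge_size"]) ** 2

print(f"GQA groups: {groups}, rotary half-dim: {rotary_half}, image tokens: {image_tokens}")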