{
  "architectures": [
    "Qwen3VLForConditionalGeneration"
  ],
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "hidden_size": 4096,
  "image_token_id": 151655,
  "model_type": "qwen3_vl",
  "pad_token_id": 151643,
  "text_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "dtype": "float32",
    "eos_token_id": 151645,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 12288,
    "max_position_embeddings": 262144,
    "model_type": "qwen3_vl_text",
    "num_attention_heads": 32,
    "num_hidden_layers": 36,
    "num_key_value_heads": 8,
    "rms_norm_eps": 1e-06,
    "rope_scaling": {
      "mrope_interleaved": true,
      "mrope_section": [
        24,
        20,
        20
      ],
      "rope_type": "default"
    },
    "rope_theta": 5000000,
    "use_cache": false,
    "vocab_size": 151936
  },
  "tie_word_embeddings": false,
  "transformers_version": "4.57.0",
  "use_cache": false,
  "video_token_id": 151656,
  "vision_config": {
    "deepstack_visual_indexes": [
      8,
      16,
      24
    ],
    "depth": 27,
    "dtype": "float32",
    "hidden_act": "gelu_pytorch_tanh",
    "hidden_size": 1152,
    "in_channels": 3,
    "initializer_range": 0.02,
    "intermediate_size": 4304,
    "model_type": "qwen3_vl",
    "num_heads": 16,
    "num_position_embeddings": 2304,
    "out_hidden_size": 4096,
    "patch_size": 16,
    "spatial_merge_size": 2,
    "temporal_patch_size": 2
  },
  "vision_end_token_id": 151653,
  "vision_start_token_id": 151652
}