configuration_locateanything.py
| 1 | # -------------------------------------------------------- |
| 2 | # InternVL |
| 3 | # Copyright (c) 2023 OpenGVLab |
| 4 | # Licensed under The MIT License [see LICENSE for details] |
| 5 | # -------------------------------------------------------- |
| 6 | |
| 7 | import copy |
| 8 | |
| 9 | from transformers.models.qwen2.configuration_qwen2 import Qwen2Config |
| 10 | from transformers.models.qwen3.configuration_qwen3 import Qwen3Config |
| 11 | from transformers.configuration_utils import PretrainedConfig |
| 12 | from transformers.utils import logging |
| 13 | logger = logging.get_logger(__name__) |
| 14 | |
class MoonViTConfig(PretrainedConfig):
    """Configuration container for the MoonViT vision tower.

    Every argument is stored unchanged as an attribute of the same name;
    remaining keyword arguments are forwarded to ``PretrainedConfig``.

    Args:
        patch_size: Patch size setting (presumably the side length of one
            square image patch — confirm against the vision model).
        init_pos_emb_height: Initial height of the positional-embedding grid.
        init_pos_emb_width: Initial width of the positional-embedding grid.
        num_attention_heads: Number of attention heads in the transformer.
        num_hidden_layers: Number of transformer layers.
        hidden_size: Transformer hidden size.
        intermediate_size: Transformer intermediate (MLP) size.
        merge_kernel_size: Kernel size used by the patch merger.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "moonvit"

    def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Patch embedding
        self.patch_size = patch_size

        # Positional embedding grid
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width

        # Transformer stack
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size

        # Patch merger
        self.merge_kernel_size = merge_kernel_size
| 42 | |
| 43 | |
class LocateAnythingConfig(PretrainedConfig):
    """Composite configuration: a MoonViT vision config plus a Qwen2/Qwen3 text config.

    Args:
        vision_config: ``None`` (use MoonViT defaults), a dict of
            ``MoonViTConfig`` keyword arguments, or a ``MoonViTConfig``
            instance. A dict without ``'model_type'`` is treated as MoonViT.
        text_config: ``None`` (use Qwen2 defaults), a dict of keyword
            arguments for ``Qwen2Config``/``Qwen3Config``, or an instance of
            one of those classes. The class is chosen from
            ``architectures[0]``; when that is missing or empty, the dict's
            ``'model_type'`` is used, defaulting to Qwen2.
        use_backbone_lora, use_llm_lora, downsample_ratio, template,
        loss_version, mlp_checkpoint: Stored unchanged on the config; they
            are consumed by code outside this file.
        image_token_index, box_start_token_id, box_end_token_id,
        coord_start_token_id, coord_end_token_id, ref_start_token_id,
        ref_end_token_id, none_token_id: Special-token ids stored unchanged
            on the config; their meaning is defined by the tokenizer/model
            code, not here.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.

    Raises:
        ValueError: If the vision ``model_type`` is not ``'moonvit'`` or the
            text architecture is neither Qwen2 nor Qwen3.
    """

    model_type = 'locateanything'
    is_composition = True
    sub_configs = {"vision_config": MoonViTConfig, "text_config": Qwen2Config}

    def __init__(
            self,
            vision_config=None,
            text_config=None,
            use_backbone_lora=0,
            use_llm_lora=0,
            downsample_ratio=0.5,
            template=None,
            loss_version='v1',
            mlp_checkpoint=False,
            image_token_index=151667,
            box_start_token_id=151668,
            box_end_token_id=151669,
            coord_start_token_id=151677,
            coord_end_token_id=152677,
            ref_start_token_id=151672,
            ref_end_token_id=151673,
            none_token_id=4064,
            **kwargs):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {'model_type': 'moonvit'}
            logger.info('vision_config is None. Initializing the MoonViTConfig with default values.')

        if text_config is None:
            text_config = {'architectures': ['Qwen2ForCausalLM']}
            logger.info('text_config is None. Initializing the Qwen2Config config with default values.')

        self.vision_config = self._build_vision_config(vision_config)
        self.text_config = self._build_text_config(text_config)

        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.mlp_checkpoint = mlp_checkpoint
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.loss_version = loss_version
        # Mirror the language model's setting on the composite config.
        self.tie_word_embeddings = self.text_config.tie_word_embeddings
        self.image_token_index = image_token_index
        self.box_start_token_id = box_start_token_id
        self.box_end_token_id = box_end_token_id
        self.coord_start_token_id = coord_start_token_id
        self.coord_end_token_id = coord_end_token_id
        self.ref_start_token_id = ref_start_token_id
        self.ref_end_token_id = ref_end_token_id
        self.none_token_id = none_token_id

    @staticmethod
    def _build_vision_config(vision_config):
        """Return a ``MoonViTConfig`` from a dict or an existing config object."""
        if isinstance(vision_config, MoonViTConfig):
            return vision_config
        if isinstance(vision_config, PretrainedConfig):
            # Foreign config object: validate it through the dict path below.
            vision_config = vision_config.to_dict()

        # A dict without a discriminator is treated like the ``None`` default.
        vision_model_type = vision_config.get('model_type', 'moonvit')
        if vision_model_type != 'moonvit':
            raise ValueError('Unsupported model_type: {}. Only moonvit is supported.'.format(vision_model_type))
        return MoonViTConfig(**vision_config)

    @staticmethod
    def _build_text_config(text_config):
        """Return a ``Qwen2Config``/``Qwen3Config`` from a dict or an existing config object."""
        if isinstance(text_config, (Qwen2Config, Qwen3Config)):
            return text_config
        if isinstance(text_config, PretrainedConfig):
            # Foreign config object: validate it through the dict path below.
            text_config = text_config.to_dict()

        architectures = text_config.get('architectures')
        if architectures:
            architecture = architectures[0]
        else:
            # ``architectures`` missing, None or empty: fall back to the
            # model_type, defaulting to Qwen2 like the ``None`` case.
            by_model_type = {
                Qwen2Config.model_type: 'Qwen2ForCausalLM',
                Qwen3Config.model_type: 'Qwen3ForCausalLM',
            }
            text_model_type = text_config.get('model_type', Qwen2Config.model_type)
            architecture = by_model_type.get(text_model_type, text_model_type)

        if architecture == 'Qwen2ForCausalLM':
            return Qwen2Config(**text_config)
        if architecture == 'Qwen3ForCausalLM':
            return Qwen3Config(**text_config)
        raise ValueError('Unsupported architecture: {}. Only Qwen2ForCausalLM and Qwen3ForCausalLM are supported.'.format(architecture))

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        The nested vision and text configs are replaced by their own
        ``to_dict()`` output so the result contains plain dicts only.

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        output = copy.deepcopy(self.__dict__)
        output['vision_config'] = self.vision_config.to_dict()
        output['text_config'] = self.text_config.to_dict()
        output['model_type'] = self.__class__.model_type

        # Explicitly (re)write the fields this config owns.
        for field_name in (
            'use_backbone_lora',
            'use_llm_lora',
            'downsample_ratio',
            'template',
            'image_token_index',
            'box_start_token_id',
            'box_end_token_id',
            'coord_start_token_id',
            'coord_end_token_id',
            'ref_start_token_id',
            'ref_end_token_id',
            'none_token_id',
        ):
            output[field_name] = getattr(self, field_name)

        output['_attn_implementation'] = self._attn_implementation
        if hasattr(self, '_attn_implementation_autoset'):
            output['_attn_implementation_autoset'] = self._attn_implementation_autoset
        return output
| 132 | |