# vision_tower_builder.py: UMT vision tower builder for VideoChat-Flash-Qwen2-7B_res448
from typing import Dict
from functools import partial, reduce

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from PIL import Image

from transformers.image_processing_utils import BatchFeature, get_size_dict
from transformers.image_transforms import (
    convert_to_rgb,
    normalize,
    rescale,
    resize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    ChannelDimension,
    PILImageResampling,
    to_numpy_array,
)

try:
    from flash_attn import flash_attn_qkvpacked_func
except ImportError:
    flash_attn_qkvpacked_func = None
    print("You need to install flash_attn to use the default `flash_v2` attention path")

from timm.models.layers import drop_path, to_2tuple, trunc_normal_
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
def extra_repr(self) -> str:
return 'p={}'.format(self.drop_prob)
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(
self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
proj_drop=0., attn_head_dim=None,
attn_type='flash_v2'):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
if attn_head_dim is not None:
head_dim = attn_head_dim
all_head_dim = head_dim * self.num_heads
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
if qkv_bias:
self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
else:
self.q_bias = None
self.v_bias = None
if attn_type not in ['origin', 'flash_v2']:
            raise NotImplementedError(f"Unsupported attn_type: {attn_type}")
# print('umt:', f'attn_type: {attn_type}')
self.attn_type = attn_type
if attn_type == 'flash_v2':
self.attn_drop = attn_drop
else:
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(all_head_dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv_bias = None
if self.q_bias is not None:
qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
# qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
if self.attn_type == 'flash_v2':
qkv = qkv.reshape(B, N, 3, self.num_heads, -1)
x = flash_attn_qkvpacked_func(qkv, dropout_p=self.attn_drop, softmax_scale=self.scale, causal=False).reshape(B, N, -1)
else:
qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
            q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
# B num_heads N head_dim
q = q * self.scale
attn = (q @ k.transpose(-2, -1))
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
x = self.proj(x)
x = self.proj_drop(x)
return x
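
# Shape walkthrough for Attention.forward above (illustrative comment, assuming the
# default `flash_v2` path with flash-attn available):
#   x: (B, N, C) --qkv linear--> (B, N, 3 * num_heads * head_dim)
#   flash path:  reshape to (B, N, 3, num_heads, head_dim), then
#                flash_attn_qkvpacked_func -> (B, N, num_heads * head_dim)
#   origin path: reshape/permute to (3, B, num_heads, N, head_dim), then
#                softmax(q @ k^T * scale) @ v -> (B, N, num_heads * head_dim)
# Both paths finish with the output projection back to (B, N, C).
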
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
attn_head_dim=None):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
attn_drop=attn_drop, proj_drop=drop, attn_head_dim=attn_head_dim)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
        if init_values is not None and init_values > 0:
self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)),requires_grad=True)
else:
self.gamma_1, self.gamma_2 = None, None
def forward(self, x):
if self.gamma_1 is None:
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, num_frames=16, tubelet_size=2):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.tubelet_size = int(tubelet_size)
num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * (num_frames // self.tubelet_size)
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv3d(
in_channels=in_chans, out_channels=embed_dim,
kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]),
stride=(self.tubelet_size, patch_size[0], patch_size[1])
)
# print('umt:', f'Num of patches: {num_patches}')
def forward(self, x, **kwargs):
B, C, T, H, W = x.shape
# FIXME look at relaxing size constraints
# assert H == self.img_size[0] and W == self.img_size[1], \
# f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x).flatten(2).transpose(1, 2)
return x
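
# Shape walkthrough for PatchEmbed.forward above (illustrative comment):
#   x: (B, C, T, H, W) --Conv3d(kernel=stride=(tubelet, p, p))--> (B, embed_dim, T//tubelet, H//p, W//p)
#   .flatten(2).transpose(1, 2) --> (B, (T//tubelet) * (H//p) * (W//p), embed_dim)
# For example, a 4-frame 448x448 clip with patch_size=16 and tubelet_size=1 yields 4 * 28 * 28 = 3136 tokens.
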
# sin-cos position encoding
# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31
def get_sinusoid_encoding_table(n_position, d_hid, ckpt_num_frame=-1, cur_frame=12):
''' Sinusoid position encoding table '''
# TODO: make it with torch instead of numpy
def get_position_angle_vec(position):
return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
if ckpt_num_frame != -1 and ckpt_num_frame != cur_frame:
# print('umt:', f"Interpolate position embedding")
# print('umt:', f"Testing frame: {cur_frame}")
# print('umt:', f"Checkpoint frame: {ckpt_num_frame}")
T = ckpt_num_frame # checkpoint frame
new_T = cur_frame # testing frame
n_position = n_position // new_T * T # generate checkpoint position embedding
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
# interpolate
P = int((n_position // T) ** 0.5)
C = d_hid
sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T) # BHW, C, T
sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3) # B, T, H, W, C
sinusoid_table = sinusoid_table.flatten(1, 3)
return sinusoid_table
else:
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
return torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
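
# Usage sketch for get_sinusoid_encoding_table above (illustrative comment):
#   get_sinusoid_encoding_table(4 * 14 * 14, 1024)                                 -> (1, 784, 1024)
# When ckpt_num_frame differs from cur_frame, the table is first built for the
# checkpoint frame count and then linearly interpolated along the temporal axis:
#   get_sinusoid_encoding_table(8 * 14 * 14, 1024, ckpt_num_frame=4, cur_frame=8)  -> (1, 1568, 1024)
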
def get_sinusoid_encoding_table2(n_position=784, d_hid=1024, cur_frame=8, ckpt_num_frame=4, pre_n_position=784):
''' Sinusoid position encoding table '''
# TODO: make it with torch instead of numpy
def get_position_angle_vec(position):
return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
# generate checkpoint position embedding
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
# print(f"n_position: {n_position}")
# print(f"pre_n_position: {pre_n_position}")
if n_position != pre_n_position:
T = ckpt_num_frame # checkpoint frame
P = 14 # checkpoint size
C = d_hid
new_P = int((n_position // cur_frame) ** 0.5) # testing size
# print(f'Pretraining uses 14x14, but current version is {new_P}x{new_P}')
# print(f'Interpolate the position embedding')
sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
sinusoid_table = sinusoid_table.reshape(-1, P, P, C).permute(0, 3, 1, 2)
sinusoid_table = torch.nn.functional.interpolate(
sinusoid_table, size=(new_P, new_P), mode='bicubic', align_corners=False)
# BT, C, H, W -> BT, H, W, C -> B, T, H, W, C
sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(-1, T, new_P, new_P, C)
sinusoid_table = sinusoid_table.flatten(1, 3) # B, THW, C
if cur_frame != ckpt_num_frame:
# print(f'Pretraining uses 4 frames, but current frame is {cur_frame}')
# print(f'Interpolate the position embedding')
T = ckpt_num_frame # checkpoint frame
new_T = cur_frame # testing frame
# interpolate
P = int((n_position // cur_frame) ** 0.5) # testing size
C = d_hid
sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T) # BHW, C, T
sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3) # B, T, H, W, C
sinusoid_table = sinusoid_table.flatten(1, 3) # B, THW, C
return sinusoid_table
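
# Usage sketch for get_sinusoid_encoding_table2 above (illustrative comment): this variant first
# interpolates the 14x14 checkpoint grid spatially (bicubic) and then, if needed, temporally (linear).
# For the 448-pixel tower (28x28 patches, 4 frames):
#   get_sinusoid_encoding_table2(4 * 28 * 28, 1024, cur_frame=4, ckpt_num_frame=4)  -> (1, 3136, 1024)
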
class PretrainVisionTransformerEncoder(nn.Module):
""" Vision Transformer with support for patch or hybrid CNN input stage
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, depth=12,
num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None, num_frames=8, tubelet_size=1,
use_learnable_pos_emb=False,
use_checkpoint=False, checkpoint_num=0,
ckpt_num_frame=-1, with_ln=True, return_index=-1
):
super().__init__()
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.patch_embed = PatchEmbed(
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim,
num_frames=num_frames, tubelet_size=tubelet_size
)
num_patches = self.patch_embed.num_patches
self.depth = depth + return_index + 1
self.use_checkpoint = use_checkpoint
self.checkpoint_num = checkpoint_num
# print('umt:', f"Use checkpoint: {use_checkpoint}")
# print('umt:', f"Checkpoint number: {checkpoint_num}")
        # print('UMT:', f"Real running depth: {self.depth}")
# TODO: Add the cls token
if use_learnable_pos_emb:
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
self.img_pos_embed = nn.Parameter(torch.zeros(1, num_patches//(num_frames//tubelet_size) + 1, embed_dim))
else:
# sine-cosine positional embeddings
if img_size != 224:
self.pos_embed = get_sinusoid_encoding_table2(num_patches, embed_dim, ckpt_num_frame=ckpt_num_frame, cur_frame=num_frames//tubelet_size)
self.img_pos_embed = get_sinusoid_encoding_table2(num_patches//(num_frames//tubelet_size), embed_dim, cur_frame=1, ckpt_num_frame=1, pre_n_position=14*14)
else:
self.pos_embed = get_sinusoid_encoding_table(num_patches, embed_dim, ckpt_num_frame=ckpt_num_frame, cur_frame=num_frames//tubelet_size)
self.img_pos_embed = get_sinusoid_encoding_table(num_patches//(num_frames//tubelet_size), embed_dim)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
self.blocks = nn.ModuleList([
Block(
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
init_values=init_values)
for i in range(self.depth)])
if with_ln:
self.vision_layernorm = nn.LayerNorm(embed_dim, eps=1e-12)
else:
self.vision_layernorm = nn.Identity()
if use_learnable_pos_emb:
trunc_normal_(self.pos_embed, std=.02)
@torch.jit.ignore
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
def forward_features(self, x, use_image=False):
x = self.patch_embed(x)
if use_image:
x = x + self.img_pos_embed.type_as(x).to(x.device).clone().detach()
else:
x = x + self.pos_embed.type_as(x).to(x.device).clone().detach()
B, _, C = x.shape
x_vis = x
for idx, blk in enumerate(self.blocks):
if self.use_checkpoint and idx < self.checkpoint_num:
x_vis = checkpoint.checkpoint(blk, x_vis)
else:
x_vis = blk(x_vis)
        # apply the final LayerNorm (or Identity when with_ln=False)
x_vis = self.vision_layernorm(x_vis)
return x_vis
def forward(self, x, use_image=False):
x_vis = self.forward_features(x, use_image)
return x_vis
class PretrainVisionTransformer(nn.Module):
""" Vision Transformer with support for patch or hybrid CNN input stage
"""
def __init__(self,
img_size=224,
patch_size=16,
encoder_in_chans=3,
encoder_embed_dim=768,
encoder_depth=12,
encoder_num_heads=12,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer=partial(nn.LayerNorm, eps=1e-6),
init_values=0.,
use_learnable_pos_emb=False,
num_frames=8,
tubelet_size=1,
use_checkpoint=False,
checkpoint_num=0,
ckpt_num_frame=4, # the pretrained model uses 4 frames
return_index=-1,
with_ln=False
):
super().__init__()
self.encoder = PretrainVisionTransformerEncoder(
img_size=img_size,
patch_size=patch_size,
in_chans=encoder_in_chans,
embed_dim=encoder_embed_dim,
depth=encoder_depth,
num_heads=encoder_num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=drop_path_rate,
norm_layer=norm_layer,
init_values=init_values,
num_frames=num_frames,
tubelet_size=tubelet_size,
use_learnable_pos_emb=use_learnable_pos_emb,
use_checkpoint=use_checkpoint,
checkpoint_num=checkpoint_num,
ckpt_num_frame=ckpt_num_frame,
with_ln=with_ln,
return_index=return_index
)
# print('umt:', f'With LN: {with_ln}')
# print('UMT:', f'Total {encoder_depth} layer')
# print('UMT:', f'Return {encoder_depth+return_index+1}-th layer')
self.apply(self._init_weights)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
@torch.jit.ignore
def no_weight_decay(self):
return {'pos_embed', 'cls_token', 'clip_pos_embed'}
def forward(self, x, use_image=False):
T = x.shape[2]
x_vis = self.encoder(x, use_image) # [B, N_vis, C_e]
B, TL, C = x_vis.shape
x_vis = x_vis.view(B, T, TL // T, C)
return x_vis
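
# Shape walkthrough for PretrainVisionTransformer.forward above (illustrative comment):
#   x: (B, C, T, H, W) --encoder--> (B, T * L, C_e), where L is the number of tokens per frame,
#   then .view(B, T, L, C_e) so downstream code can index or pool features frame by frame.
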
class UMTImageProcessor:
def __init__(self, image_mean=(0.485, 0.456, 0.406), image_std=(0.229, 0.224, 0.225), size=(224, 224), crop_size: Dict[str, int] = None, resample=PILImageResampling.BICUBIC, rescale_factor=1 / 255, data_format=ChannelDimension.FIRST):
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
self.image_mean = image_mean
self.image_std = image_std
self.size = size
self.resample = resample
self.rescale_factor = rescale_factor
self.data_format = data_format
self.crop_size = crop_size
def preprocess(self, images, return_tensors, target_size=None):
if isinstance(images, Image.Image):
images = [images]
else:
# to adapt video data
images = [to_numpy_array(image) for image in images]
assert isinstance(images, list)
if target_size is None:
target_size = self.size
transforms = [
convert_to_rgb,
to_numpy_array,
partial(resize, size=target_size, resample=self.resample, data_format=self.data_format),
partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
]
images = reduce(lambda x, f: [*map(f, x)], transforms, images)
data = {"pixel_values": images}
return BatchFeature(data=data, tensor_type=return_tensors)
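
# Usage sketch for UMTImageProcessor.preprocess (illustrative comment; `frames` below is a
# hypothetical list of PIL images):
#   processor = UMTImageProcessor(size=(448, 448))
#   batch = processor.preprocess(frames, return_tensors="pt")
#   batch["pixel_values"]  # stacked (num_images, 3, 448, 448) float tensor: resized,
#                          # rescaled by 1/255 and normalized with ImageNet mean/std
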
class UMTVisionConfig:
model_type = "umt_vision_model"
def __init__(
self,
num_frames=4,
hidden_size=1024,
num_hidden_layers=24,
num_attention_heads=16,
num_channels=3,
image_size=224,
patch_size=16,
return_idx=-2
# **kwargs,
):
# super().__init__(**kwargs)
self.num_frames = num_frames
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_channels = num_channels
self.patch_size = patch_size
self.image_size = image_size
self.return_idx = return_idx
def build_vit(config, pt_type='origin'):
model = PretrainVisionTransformer(
img_size=config.image_size,
patch_size=16,
encoder_embed_dim=1024,
encoder_depth=24,
encoder_num_heads=16,
drop_path_rate=0.,
num_frames=config.num_frames,
tubelet_size=1,
use_checkpoint=True,
checkpoint_num=24,
return_index=config.return_idx,
with_ln=True, # merge vision_layernorm in it
)
# no need to load pt
return model
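
# build_vit above instantiates a ViT-L/16 style encoder (24 layers, 1024-dim, 16 heads) with
# activation checkpointing on every block; only image_size, num_frames and the returned layer
# index come from the config. Pretrained weights are not loaded here, presumably because they
# arrive with the full model checkpoint.
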
class UMTVisionTower(nn.Module):
def __init__(self, vision_tower, vision_tower_cfg, delay_load=False, pt_type='origin', image_size=224):
super().__init__()
self.is_loaded = False
self.pt_type = pt_type
self.config = UMTVisionConfig(num_frames=vision_tower_cfg.mm_local_num_frames, return_idx=vision_tower_cfg.mm_vision_select_layer, image_size=image_size)
self.vision_tower_name = vision_tower
self.image_processor = UMTImageProcessor(size=(image_size, image_size))
if not delay_load:
print(f"Loading vision tower: {vision_tower}")
self.load_model()
elif getattr(vision_tower_cfg, "unfreeze_mm_vision_tower", False):
# TODO: better detector is needed.
print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
self.load_model()
elif hasattr(vision_tower_cfg, "mm_tunable_parts") and "mm_vision_tower" in vision_tower_cfg.mm_tunable_parts:
print(f"The checkpoint seems to contain `vision_tower` weights: `mm_tunable_parts` contains `mm_vision_tower`.")
self.load_model()
else:
self.cfg_only = self.config
def load_model(self, device_map=None):
if self.is_loaded:
print("{} is already loaded, `load_model` called again, skipping.".format(self.vision_tower_name))
return
self.vision_tower = build_vit(self.config, pt_type=self.pt_type)
self.vision_tower.requires_grad_(False)
self.is_loaded = True
def forward(self, images):
if type(images) is list:
raise NotImplementedError
else:
# input: B T C H W
# output: B T*L C
T = images.shape[1]
images = images.permute(0, 2, 1, 3, 4)
image_embeds = self.vision_tower(images, use_image=(T == 1))
B, T, L, C = image_embeds.shape
image_embeds = image_embeds.reshape(B, -1, C)
return image_embeds
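
    # Shape walkthrough for forward above (illustrative comment):
    #   images: (B, T, C, H, W) --permute--> (B, C, T, H, W) --vision_tower--> (B, T, L, C)
    #   --reshape--> (B, T * L, C); the single-image positional table is used when T == 1.
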
@property
def dummy_feature(self):
return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
@property
def dtype(self):
for p in self.vision_tower.parameters():
return p.dtype
@property
def device(self):
for p in self.vision_tower.parameters():
return p.device
@property
def hidden_size(self):
return self.config.hidden_size
@property
def num_patches(self):
return (self.config.image_size // self.config.patch_size) ** 2
@property
def num_patches_per_side(self):
return self.config.image_size // self.config.patch_size
@property
def image_size(self):
return self.config.image_size
def build_vision_tower(vision_tower_cfg, **kwargs):
vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
if "umt-hd" in vision_tower:
return UMTVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, image_size=448, **kwargs)
    elif "umt" in vision_tower:
        # Plain (non-HD) UMT towers are not wired up here; the default-size construction
        # below is unreachable and kept only for reference.
        raise NotImplementedError
        # return UMTVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
raise ValueError(f"Unknown vision tower: {vision_tower}")
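

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original file): it assumes flash-attn is
    # installed, a CUDA device with bf16 support is available, and that the multimodal
    # config exposes `mm_vision_tower`, `mm_local_num_frames` and `mm_vision_select_layer`.
    # `SimpleNamespace` is only a hypothetical stand-in for the real config object.
    from types import SimpleNamespace

    cfg = SimpleNamespace(
        mm_vision_tower="umt-hd-large",  # hypothetical name; only the "umt-hd" substring is dispatched on
        mm_local_num_frames=4,
        mm_vision_select_layer=-2,
    )
    tower = build_vision_tower(cfg).to(device="cuda", dtype=torch.bfloat16)

    # Fake 4-frame clip at 448x448 in the (B, T, C, H, W) layout expected by forward().
    clip = torch.randn(1, 4, 3, 448, 448, device="cuda", dtype=torch.bfloat16)
    with torch.no_grad():
        feats = tower(clip)
    print(feats.shape)  # expected: (1, 4 * (448 // 16) ** 2, 1024) == (1, 3136, 1024)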