from pytorch_lightning import (
    LightningModule,
    Trainer,
)
from pytorch_lightning.callbacks import (
    LearningRateMonitor,
)
from fengshen.models.clip import (
    TaiyiCLIPModel,
    TaiyiCLIPProcessor,
)
from fengshen.models.model_utils import (
    add_module_args,
    configure_optimizers,
    get_total_steps,
)
import torch
import torch.nn.functional as F
import argparse
import math
from fengshen.data.universal_datamodule import UniversalDataModule
from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data
from fengshen.utils.universal_checkpoint import UniversalCheckpoint
import os
import numpy as np
from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor

# Channel-wise statistics of OpenAI's CLIP training data, so images are
# normalized the way the pretrained vision tower expects.
OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)


class Collator:
    def __init__(self, args, processor):
        self.processor = processor
        self.seq_length = args.seq_length
        self.transforms = Compose([
            ToTensor(),
            RandomResizedCrop(args.resolution, scale=(0.9, 1.0),
                              interpolation=InterpolationMode.BICUBIC),
            Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
        ])

    def __call__(self, inputs):
        # Pad to the longest caption in the batch, capped at seq_length.
        max_length = min(self.seq_length, max(len(i['caption']) for i in inputs))
        images = []
        texts = []
        labels = []
        for i in inputs:
            # Images are stored on disk as pre-decoded numpy arrays.
            instance_image = np.load(i['npy_path'])
            images.append(self.transforms(instance_image))
            texts.append(i['caption'])
            labels.append(i['labels'] if 'labels' in i else -100)

        texts_input = self.processor(text=texts,
                                     max_length=max_length,
                                     padding='max_length',
                                     truncation=True,
                                     return_tensors='pt')

        return {'pixel_values': torch.stack(images)}, texts_input, labels

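# A sketch of the sample format the collator above assumes from load_data
# (field names taken from the collator code; the values are illustrative):
#   {'npy_path': '/path/to/image.npy', 'caption': 'some caption text', 'labels': 3}
# 'labels' is optional and only used by the validation retrieval metrics;
# samples without it fall back to -100.

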
class TaiyiCLIP(LightningModule):
    @staticmethod
    def add_module_specific_args(parent_parser):
        parser = parent_parser.add_argument_group('Taiyi CLIP')
        parser.add_argument('--loss_type', choices=['local', 'global'], default='local')
        parser.add_argument('--seq_length', type=int, default=77)
        parser.add_argument('--gather_with_grad', default=False, action='store_true')
        parser.add_argument('--freeze_image_tower', default=False, action='store_true')
        return parent_parser

    def __init__(self, args, **kwargs) -> None:
        super().__init__()
        self.save_hyperparameters(args)

        self.model = TaiyiCLIPModel.from_pretrained(args.model_path)
        self.processor = TaiyiCLIPProcessor.from_pretrained(args.model_path)

        self.local_loss = args.loss_type == 'local'

        if args.freeze_image_tower:
            for param in self.model.vision_model.parameters():
                param.requires_grad = False
            # requires_grad_() actually freezes the projection's parameters;
            # plain attribute assignment on a Module would have no effect.
            self.model.visual_projection.requires_grad_(False)

        # Cache the contrastive-loss targets between steps; they only depend
        # on the number of logits and the device.
        self.cache_labels = True
        self.prev_num_logits = 0
        self.labels = {}

    def setup(self, stage) -> None:
        if stage == 'fit':
            self.total_steps = get_total_steps(self.trainer, self.hparams)
            print('Total steps: {}'.format(self.total_steps))
        elif stage == 'validate':
            self.total_steps = 100

    def configure_optimizers(self):
        return configure_optimizers(self)

    def forward(self, image, text):
        assert image is not None
        assert text is not None
        image_features = self.model.get_image_features(**image)
        text_features = self.model.get_text_features(**text)

        # L2-normalize both towers so dot products become cosine similarities,
        # scaled by the learned temperature returned below.
        image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

        return image_features, text_features, self.model.logit_scale.exp()

    def gather_features(self, features):
        if self.trainer.world_size == 1:
            return features
        all_features = self.all_gather(
            features, sync_grads=self.hparams.gather_with_grad)
        if not self.local_loss and not self.hparams.gather_with_grad:
            # Gathered copies carry no gradient; re-insert this rank's own
            # features so it still receives a gradient through the loss.
            all_features[self.global_rank] = features
        all_features = all_features.view(-1, all_features.shape[-1])
        return all_features

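    # Note on the two loss layouts (see clip_loss below): with
    # loss_type='local', each rank scores its own B queries against all
    # W*B gathered candidates, giving (B, W*B) logit matrices with targets
    # offset by B * global_rank; with 'global', every rank rebuilds the
    # full (W*B, W*B) matrix and uses plain diagonal targets.
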
    def clip_loss(self, image_features, text_features, logit_scale):
        logits_per_image = None

        # With a frozen image tower and a local loss, only the text->image
        # direction trains anything, so skip gathering text features and fall
        # back to a single cross-entropy at the end.
        if self.hparams.freeze_image_tower and self.local_loss:
            all_text_features = None
        else:
            all_text_features = self.gather_features(
                text_features)
        all_image_features = self.gather_features(
            image_features)

        if self.local_loss:
            if all_text_features is not None:
                logits_per_image = logit_scale * image_features @ all_text_features.T
            logits_per_text = logit_scale * text_features @ all_image_features.T
        else:
            # Global loss: score the full gathered batch in both directions.
            logits_per_image = logit_scale * all_image_features @ all_text_features.T
            logits_per_text = logits_per_image.T

        # Build (and cache) the targets; with a local loss, each rank's
        # positives sit at an offset of num_logits * global_rank.
        num_logits = logits_per_text.shape[0]
        if self.prev_num_logits != num_logits or self.device not in self.labels:
            labels = torch.arange(num_logits, device=self.device, dtype=torch.long)
            if self.trainer.world_size > 1 and self.local_loss:
                labels = labels + num_logits * self.global_rank
            if self.cache_labels:
                self.labels[self.device] = labels
                self.prev_num_logits = num_logits
        else:
            labels = self.labels[self.device]

        total_loss = (
            F.cross_entropy(logits_per_image, labels) +
            F.cross_entropy(logits_per_text, labels)
        ) / 2 if logits_per_image is not None else F.cross_entropy(logits_per_text, labels)
        return total_loss

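    # For intuition, on a single GPU clip_loss reduces to the standard CLIP
    # InfoNCE objective; a minimal sketch (illustrative, not a separate code
    # path in this module):
    #   logits = logit_scale * image_features @ text_features.T   # (B, B)
    #   labels = torch.arange(B)
    #   loss = (F.cross_entropy(logits, labels)
    #           + F.cross_entropy(logits.T, labels)) / 2
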
    def training_step(self, batch, batch_idx):
        image, text, _ = batch
        image_features, text_features, logit_scale = self(image, text)
        total_loss = self.clip_loss(image_features, text_features, logit_scale)
        self.log('train_loss', total_loss, sync_dist=False)
        return total_loss

    def on_train_batch_end(self, outputs, batch, batch_idx: int) -> None:
        # Clamp the learnable temperature so exp(logit_scale) stays in
        # [1, 100], as in the original CLIP training recipe.
        with torch.no_grad():
            self.model.logit_scale.clamp_(0, math.log(100))

    def get_metrics(self, image_features, text_features, labels, logit_scale):
        # Retrieval metrics (R@1/5/10) over the whole validation set, in both
        # the image->text and text->image directions.
        metrics = {}
        logits_per_image = (logit_scale * image_features @ text_features.t()).detach().cpu()
        logits_per_text = logits_per_image.t().detach().cpu()

        logits = {"image_to_text": logits_per_image, "text_to_image": logits_per_text}

        # Group sample indices by label: several captions can describe the
        # same image, so a query may have multiple correct candidates.
        label2idx = {}
        repeat_id = []
        for i, label in enumerate(labels):
            if label not in label2idx:
                label2idx[label] = [i]
            else:
                # Duplicate caption of an already-seen image.
                label2idx[label].append(i)
                repeat_id.append(i)

        ground_truth = [label2idx[label] for label in labels]

        for name, logit in logits.items():
            if name == 'text_to_image':
                # Mask duplicate image columns so each image appears as a
                # candidate only once.
                logit[:, repeat_id] -= 1e8
            r_stat = {1: [], 5: [], 10: []}
            # Rank candidates per query and record a hit at every K for which
            # a ground-truth index appears within the top K.
            ranking = torch.argsort(logit, descending=True)
            for i, each_query in enumerate(ranking[:, :10]):
                for j, q in enumerate(each_query):
                    found = False
                    if q in ground_truth[i]:
                        for k, v in r_stat.items():
                            if j < k:
                                found = True
                                v.append(1)
                    if found:
                        break
            for k, v in r_stat.items():
                metrics[f'{name}_R@{k}'] = sum(v) / len(logit)
        return metrics

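    # Worked example for the R@K bookkeeping above: if ground_truth[i] == [4]
    # and index 4 appears at rank position j=2 (0-based) for query i, the
    # query counts as a hit for R@5 and R@10 but not R@1; each R@K is then
    # the number of hits divided by the number of queries.
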
    def validation_step(self, batch, batch_idx):
        image, text, label = batch
        image_features, text_features, logit_scale = self(image, text)
        return image_features, text_features, logit_scale, text['input_ids'].shape[0], label

    def validation_epoch_end(self, val_outputs):
        all_image_features = []
        all_text_features = []
        all_labels = []
        sample_size = 0
        for o in val_outputs:
            all_image_features.append(o[0])
            all_text_features.append(o[1])
            sample_size += o[3]
            all_labels += o[4]
        if len(all_image_features) == 0 or len(all_text_features) == 0:
            return
        all_image_features = torch.cat(all_image_features)
        all_text_features = torch.cat(all_text_features)
        logit_scale = val_outputs[0][2].mean()
        # Contrastive loss over the entire validation set treated as one batch.
        logits_per_image = logit_scale * all_image_features @ all_text_features.t()
        logits_per_text = logits_per_image.t()

        labels = torch.arange(sample_size, device=self.device).long()
        total_loss = (F.cross_entropy(logits_per_image, labels)
                      + F.cross_entropy(logits_per_text, labels)) / 2

        val_metrics = self.get_metrics(
            image_features=all_image_features,
            text_features=all_text_features,
            logit_scale=logit_scale,
            labels=all_labels)
        loss = total_loss / sample_size
        self.log('val_loss', loss, sync_dist=False)
        for k, v in val_metrics.items():
            self.log(f'val_{k}', v, sync_dist=False)

    def on_load_checkpoint(self, checkpoint) -> None:
        # Restore the step count so resumed runs continue their LR schedule.
        global_step_offset = checkpoint["global_step"]
        if 'global_samples' in checkpoint:
            self.consumed_samples = checkpoint['global_samples']
        self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset

    def on_save_checkpoint(self, checkpoint) -> None:
        # Alongside the Lightning checkpoint, export a HuggingFace-style
        # snapshot of the model and processor from rank 0 only.
        if self.global_rank == 0:
            dir_path = os.path.join(
                self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}')
            if not os.path.exists(dir_path):
                os.mkdir(dir_path)
            self.model.save_pretrained(dir_path)
            self.processor.save_pretrained(dir_path)


if __name__ == '__main__':
    args_parser = argparse.ArgumentParser()
    args_parser = add_module_args(args_parser)
    args_parser = add_data_args(args_parser)
    args_parser = UniversalDataModule.add_data_specific_args(args_parser)
    args_parser = Trainer.add_argparse_args(args_parser)
    args_parser = TaiyiCLIP.add_module_specific_args(args_parser)
    args_parser = UniversalCheckpoint.add_argparse_args(args_parser)
    args = args_parser.parse_args()

    lr_monitor = LearningRateMonitor(logging_interval='step')
    checkpoint_callback = UniversalCheckpoint(args)

    trainer = Trainer.from_argparse_args(args,
                                         callbacks=[
                                             lr_monitor,
                                             checkpoint_callback])

    model = TaiyiCLIP(args)
    processor = model.processor
    collate_fn = Collator(args, processor)
    datasets = load_data(args, global_rank=trainer.global_rank)

    # Use the Flickr30k-CNA (Chinese) test split as the validation retrieval set.
    from fengshen.examples.pretrain_taiyi_clip.flickr_datasets import flickr30k_CNA
    img_root = '/shared_space/ccnl/mm_data/Flickr30k-CNA/flickr30k/images'
    text_annot_path = '/shared_space/ccnl/mm_data/Flickr30k-CNA/test/flickr30k_cn_test.txt'

    datasets[args.val_datasets_field] = flickr30k_CNA(img_root, text_annot_path, collate_fn)

    datamodule = UniversalDataModule(
        tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets)

    trainer.fit(model, datamodule, ckpt_path=args.load_ckpt_path)
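# Example launch (illustrative only: the script name is hypothetical, the
# model path is a placeholder, and the full flag set comes from the
# add_*_args helpers above; --gpus/--strategy are standard pytorch_lightning
# Trainer argparse flags):
#   python finetune.py --model_path <path-or-hf-id-of-Taiyi-CLIP> \
#       --loss_type local --resolution 224 --gpus 2 --strategy ddp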