|
from mteb import MTEB |
|
import torch |
|
import clip |
|
|
|
import numpy as np |
|
|
|
# Select GPU when available; all tokenized text is moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP ResNet-50 checkpoint once at import time (downloads on first
# run). PREPROCESS is the image transform; only MODEL's text tower is used below.
MODEL, PREPROCESS = clip.load("RN50", device=DEVICE)
|
|
|
|
|
# English MTEB task names, grouped by task type.
TASK_LIST_CLASSIFICATION = [
    "AmazonCounterfactualClassification",
    "AmazonPolarityClassification",
    "AmazonReviewsClassification",
    "Banking77Classification",
    "EmotionClassification",
    "ImdbClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "ToxicConversationsClassification",
    "TweetSentimentExtractionClassification",
]

TASK_LIST_CLUSTERING = [
    "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    "BiorxivClusteringP2P",
    "BiorxivClusteringS2S",
    "MedrxivClusteringP2P",
    "MedrxivClusteringS2S",
    "RedditClustering",
    "RedditClusteringP2P",
    "StackExchangeClustering",
    "StackExchangeClusteringP2P",
    "TwentyNewsgroupsClustering",
]

TASK_LIST_PAIR_CLASSIFICATION = [
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    "TwitterURLCorpus",
]

TASK_LIST_RERANKING = [
    "AskUbuntuDupQuestions",
    "MindSmallReranking",
    "SciDocsRR",
    "StackOverflowDupQuestions",
]

TASK_LIST_RETRIEVAL = [
    "ArguAna",
    "ClimateFEVER",
    "CQADupstackAndroidRetrieval",
    "CQADupstackEnglishRetrieval",
    "CQADupstackGamingRetrieval",
    "CQADupstackGisRetrieval",
    "CQADupstackMathematicaRetrieval",
    "CQADupstackPhysicsRetrieval",
    "CQADupstackProgrammersRetrieval",
    "CQADupstackStatsRetrieval",
    "CQADupstackTexRetrieval",
    "CQADupstackUnixRetrieval",
    "CQADupstackWebmastersRetrieval",
    "CQADupstackWordpressRetrieval",
    "DBPedia",
    "FEVER",
    "FiQA2018",
    "HotpotQA",
    "MSMARCO",
    "NFCorpus",
    "NQ",
    "QuoraRetrieval",
    "SCIDOCS",
    "SciFact",
    "Touche2020",
    "TRECCOVID",
]

TASK_LIST_STS = [
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17",
    "STS22",
    "STSBenchmark",
    "SummEval",
]

# BUG FIX: the original wrote each "+ TASK_LIST_X" on its own line with no
# enclosing parentheses, so every continuation was parsed as an independent
# statement applying unary "+" to a list (a TypeError at import time), and
# TASK_LIST would otherwise have held only the classification tasks.
# Parentheses make the whole concatenation a single expression.
TASK_LIST = (
    TASK_LIST_CLASSIFICATION
    + TASK_LIST_CLUSTERING
    + TASK_LIST_PAIR_CLASSIFICATION
    + TASK_LIST_RERANKING
    + TASK_LIST_RETRIEVAL
    + TASK_LIST_STS
)
|
|
|
|
|
|
|
|
|
class ClipModel:
    """Wrapper exposing CLIP's text encoder through the MTEB ``encode`` API."""

    def encode(self, sentences, batch_size=1, **kwargs):
        """Return one embedding per input sentence.

        Args:
            sentences (`List[str]`): Sentences to encode.
            batch_size (`int`): Accepted for MTEB interface compatibility but
                not used — sentences are encoded one at a time, as in the
                original implementation.

        Returns:
            `List[np.ndarray]`: One squeezed embedding array per sentence.
        """
        embeddings = []
        for sentence in sentences:
            # BUG FIX: the original wrapped tokenization in a bare `except:`
            # and retried with the sentence cut to 154 *characters* — which can
            # still tokenize to more than 77 tokens and re-raise uncaught.
            # `truncate=True` tells clip.tokenize to clip to the model's
            # 77-token context directly, making both the bare except and the
            # fragile fallback unnecessary.
            tokens = clip.tokenize(sentence, truncate=True).to(DEVICE)
            # Inference only: disable autograd to save memory/time.
            with torch.no_grad():
                features = MODEL.encode_text(tokens)
            embeddings.append(features.cpu().numpy().squeeze())
        return embeddings
|
|
|
|
|
# Run every selected English MTEB task against the CLIP text encoder and
# write per-task result files under results/clip/.
model = ClipModel()
# F541 fix: the output_folder literal had an `f` prefix but no placeholders;
# the plain string is byte-identical at runtime.
evaluation = MTEB(tasks=TASK_LIST, output_folder="results/clip/", task_langs=["en"])
evaluation.run(model)
|
|