---
library_name: transformers
tags: []
---

# yujiepan/clip-vit-tiny-random-patch14-336

This tiny model is intended for debugging. It is randomly initialized, using a configuration shrunk down from [openai/clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336).

## Usage

```python
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import torch

model_id = "yujiepan/clip-vit-tiny-random-patch14-336"
model = CLIPModel.from_pretrained(model_id).cuda()
processor = CLIPProcessor.from_pretrained(model_id)

url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
image = Image.open(requests.get(url, stream=True).raw)
text = "A description of the image"

inputs = processor(text=[text], images=image, return_tensors="pt", padding=True).to("cuda")
with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image  # shape: [num_images, num_texts]
logits_per_text = outputs.logits_per_text    # shape: [num_texts, num_images]
probs = logits_per_image.softmax(dim=1)      # shape: [num_images, num_texts]
print(probs)
```

## Code to create this model

```python
from transformers import CLIPProcessor, CLIPModel, CLIPConfig
from PIL import Image
import requests
import torch

# Start from the original config and shrink it to a tiny size for debugging.
model_name = "openai/clip-vit-large-patch14-336"
config = CLIPConfig.from_pretrained(model_name)
config = config.to_dict()
config["projection_dim"] = 8
config["text_config"]["hidden_size"] = 8
config["text_config"]["projection_dim"] = 8
config["text_config"]["intermediate_size"] = 16
config["text_config"]["num_hidden_layers"] = 2
config["text_config"]["num_attention_heads"] = 2
config["vision_config"]["hidden_size"] = 8
config["vision_config"]["projection_dim"] = 8
config["vision_config"]["intermediate_size"] = 16
config["vision_config"]["num_hidden_layers"] = 2
config["vision_config"]["num_attention_heads"] = 2
config = CLIPConfig.from_dict(config)

# Randomly initialize the tiny model in fp16; reuse the original processor.
model = CLIPModel(config).half().cuda()
processor = CLIPProcessor.from_pretrained(model_name)

# Smoke test before uploading.
url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
image = Image.open(requests.get(url, stream=True).raw)
text = "A description of the image"
inputs = processor(text=[text], images=image, return_tensors="pt", padding=True).to("cuda")
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # shape: [num_images, num_texts]
logits_per_text = outputs.logits_per_text    # shape: [num_texts, num_images]
probs = logits_per_image.softmax(dim=1)      # shape: [num_images, num_texts]
print(probs)

model.push_to_hub("yujiepan/clip-vit-tiny-random-patch14-336")
processor.push_to_hub("yujiepan/clip-vit-tiny-random-patch14-336")
```
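
If you want to confirm how small the checkpoint actually is, a minimal sketch (counting parameters; the exact total depends on the vocabulary size and patch settings inherited from the base config, so the printed number is not stated here):

```python
from transformers import CLIPModel

# Load the tiny model and count its parameters.
model = CLIPModel.from_pretrained("yujiepan/clip-vit-tiny-random-patch14-336")
print(sum(p.numel() for p in model.parameters()))
```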