mirzaburanali committed
Commit · bfc21e7
1 Parent(s): 4aaac24
Create app.py
app.py
ADDED
@@ -0,0 +1,71 @@
# !pip install transformers

# Import the required libraries.
# VisionEncoderDecoderModel can be used to initialize an image-to-text model with any
# pretrained Transformer-based vision model as the encoder.
# ViTFeatureExtractor extracts pixel features from the input image.
# AutoTokenizer is responsible for preprocessing text into arrays of token ids for the model.
# PIL stands for Python Imaging Library; it handles loading and converting images.
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

# Pass the Hugging Face model id; this downloads the model from the Hub.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Download the feature extractor.
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# Download the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
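
# Note: in newer transformers releases, ViTFeatureExtractor is deprecated in favor of
# ViTImageProcessor; the calls above still work but may emit a deprecation warning.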

# Use the GPU if one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Generation parameters, collected into a dictionary for model.generate().
max_length = 16
num_beams = 4
num_return_sequences = 3
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
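# With beam search, num_return_sequences must not exceed num_beams (here 3 <= 4);
# each returned caption comes from a different beam.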

# This function uses a combination of computer vision and natural language
# processing techniques to generate textual descriptions (captions) of an input image.
def predict_step(image):
    i_image = Image.fromarray(image.astype('uint8'), 'RGB')
    pixel_values = feature_extractor(images=i_image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs, num_return_sequences=num_return_sequences)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return tuple(preds)
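
# Optional local sanity check (a sketch, not part of the original app): point the
# hypothetical path "test1.png" at any RGB image to try predict_step() without the UI.
#
# import numpy as np
# sample = np.array(Image.open("test1.png").convert("RGB"))
# for caption in predict_step(sample):
#     print(caption)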

# ! pip install gradio

import gradio as gr

# Create the Gradio interface.
inputs = gr.inputs.Image(type='numpy', label='Original Image')
outputs = [gr.outputs.Textbox(label=f'Caption {i+1}') for i in range(num_return_sequences)]

title = "Image Captioning using ViT + GPT2"
description = "Image captions are generated for the uploaded image using ViT and GPT2. The model was trained on the COCO dataset. If you notice any biases (gender, race, etc.) that we were unable to identify during our stress tests, please use the 'Flag' button to mark the offending image."
article = " <a href='https://huggingface.co/sachin/vit2distilgpt2'>Model Repo on Hugging Face Model Hub</a>"
examples = [["/content/test1.png"], ["/content/test2.png"], ["/content/test3.png"]]

iface = gr.Interface(
    predict_step,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
    theme="huggingface",
)
iface.launch(debug=True, enable_queue=True)
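
# Note: gr.inputs / gr.outputs, theme="huggingface", and enable_queue=True come from
# older Gradio releases. A rough modern equivalent (a sketch, untested here) would be:
# inputs = gr.Image(type="numpy", label="Original Image")
# outputs = [gr.Textbox(label=f"Caption {i+1}") for i in range(num_return_sequences)]
# gr.Interface(predict_step, inputs, outputs, title=title, description=description,
#              article=article, examples=examples).launch(debug=True)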