{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "overview-md",
   "metadata": {},
   "source": [
    "# Image captioning with Portuguese translation\n",
    "\n",
    "Downloads a sample image, generates an English caption with `microsoft/git-base`, and translates the caption to Portuguese with `facebook/mbart-large-50-many-to-many-mmt`.\n",
    "\n",
    "Run the cells top to bottom (Restart Kernel → Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports-config",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports: standard library first, then third-party\n",
    "import io\n",
    "import sys  # NOTE(review): not referenced by any cell in this notebook\n",
    "\n",
    "import cv2  # NOTE(review): not referenced by any cell in this notebook\n",
    "import requests\n",
    "import torch\n",
    "from PIL import Image\n",
    "from transformers import (\n",
    "    AutoModelForCausalLM,\n",
    "    AutoModelForSeq2SeqLM,\n",
    "    AutoProcessor,\n",
    "    AutoTokenizer,\n",
    ")\n",
    "\n",
    "# Configuration: everything a reader might want to tune\n",
    "CAPTION_MODEL_NAME = \"microsoft/git-base\"\n",
    "TRANSLATION_MODEL_NAME = \"facebook/mbart-large-50-many-to-many-mmt\"\n",
    "SOURCE_LANG = \"en_XX\"  # mBART-50 code of the caption language\n",
    "TARGET_LANG = \"pt_XX\"  # mBART-50 code of the translation language\n",
    "REQUEST_TIMEOUT_S = 30  # seconds to wait for the image download\n",
    "\n",
    "# Pick the device once, instead of attempting a CUDA transfer on every call.\n",
    "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "if DEVICE == \"cpu\":\n",
    "    print('No NVidia GPU, model performance may not be as good')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load Image to Text model\n",
    "image_processor = AutoProcessor.from_pretrained(CAPTION_MODEL_NAME)\n",
    "image_to_text_model = AutoModelForCausalLM.from_pretrained(CAPTION_MODEL_NAME).to(DEVICE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be52bb44374be3a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Translation model\n",
    "# translate() selects the output language through an mBART-style language\n",
    "# code ('pt_XX'), so an mBART-50 checkpoint is loaded here; the tokenizer of\n",
    "# the previously used google-t5/t5-small checkpoint has no such codes.\n",
    "tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL_NAME, src_lang=SOURCE_LANG)\n",
    "translation_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL_NAME).to(DEVICE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb994d7ef0dc73f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_caption(image, max_length=200):\n",
    "    \"\"\"Generate a caption for ``image`` with ``image_to_text_model``.\n",
    "\n",
    "    Args:\n",
    "        image: Image accepted by ``image_processor`` (e.g. a ``PIL.Image.Image``).\n",
    "        max_length: Value forwarded to ``generate`` as ``max_length``.\n",
    "\n",
    "    Returns:\n",
    "        The first decoded caption, with special tokens stripped.\n",
    "    \"\"\"\n",
    "    pixel_values = image_processor(images=image, return_tensors=\"pt\").pixel_values\n",
    "    # Keep the input on the same device as the model.\n",
    "    pixel_values = pixel_values.to(image_to_text_model.device)\n",
    "    generated_ids = image_to_text_model.generate(pixel_values=pixel_values, max_length=max_length)\n",
    "    generated_caption = image_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
    "\n",
    "    return generated_caption"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f9742a337b32cc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def translate(text, target_lang=TARGET_LANG):\n",
    "    \"\"\"Translate ``text`` into ``target_lang`` with ``translation_model``.\n",
    "\n",
    "    Args:\n",
    "        text: String in the tokenizer's source language (``SOURCE_LANG``).\n",
    "        target_lang: mBART-50 language code of the output, e.g. ``'pt_XX'``.\n",
    "\n",
    "    Returns:\n",
    "        The translated string with special tokens stripped, or ``''`` when\n",
    "        ``text`` is empty or whitespace-only.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``target_lang`` is not a token known to the tokenizer.\n",
    "    \"\"\"\n",
    "    if not text or not text.strip():\n",
    "        return ''\n",
    "\n",
    "    target_lang_id = tokenizer.convert_tokens_to_ids(target_lang)\n",
    "    if target_lang_id == tokenizer.unk_token_id:\n",
    "        raise ValueError(f'Unknown target language code: {target_lang!r}')\n",
    "\n",
    "    # Keep the inputs on the same device as the model.\n",
    "    inputs = tokenizer(text, return_tensors='pt').to(translation_model.device)\n",
    "\n",
    "    # forced_bos_token_id makes generation start with the target language token.\n",
    "    output = translation_model.generate(**inputs, forced_bos_token_id=target_lang_id)\n",
    "    translated = tokenizer.decode(output[0], skip_special_tokens=True)\n",
    "\n",
    "    return translated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97f3e60bca81b195",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load a sample image from a URL\n",
    "img_url = 'http://images.cocodataset.org/val2017/000000039769.jpg'\n",
    "# Alternative sample images:\n",
    "# img_url = 'https://farm4.staticflickr.com/3733/9000662079_ce3599d0d8_z.jpg'\n",
    "# img_url = 'https://farm4.staticflickr.com/3088/5793281956_2a15b2559c_z.jpg'\n",
    "# img_url = 'https://farm5.staticflickr.com/4073/4816939054_844feb0078_z.jpg'\n",
    "\n",
    "# Fail fast on network problems or HTTP errors instead of handing PIL a bad stream.\n",
    "response = requests.get(img_url, timeout=REQUEST_TIMEOUT_S)\n",
    "response.raise_for_status()\n",
    "image = Image.open(io.BytesIO(response.content))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a4c1ed0fc31fd60",
   "metadata": {},
   "outputs": [],
   "source": [
    "caption = generate_caption(image)\n",
    "\n",
    "print(caption)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4d4f92f2c0b3922",
   "metadata": {},
   "outputs": [],
   "source": [
    "translated_caption = translate(caption)\n",
    "\n",
    "print(translated_caption)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}