# app.py — Image captioning with audio narration (by Taizun)
# Revision: 360f58b
import tempfile

import gradio as gr
import scipy.io.wavfile as wavfile
import torch
from PIL import Image
from transformers import pipeline
# Pick the compute device: use the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load both pretrained pipelines once at startup.
# BLIP-large turns an input image into a text caption.
caption_image = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-large",
    device=device,
)
# VITS synthesizes speech from the caption text.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def generate_audio(text):
    """Synthesize speech for *text* and write it to a WAV file.

    Args:
        text: The text to narrate.

    Returns:
        Filesystem path of the written WAV file, suitable for gr.Audio.
    """
    narrated_text = narrator(text)
    # Write to a unique temp file instead of a fixed "output.wav": with a
    # shared filename, concurrent Gradio requests would overwrite each
    # other's audio before it is served back to the client.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    # The TTS pipeline returns {"audio": array, "sampling_rate": int};
    # presumably "audio" is shaped (1, n) so [0] yields the sample row —
    # TODO(review): confirm against the VITS pipeline output format.
    wavfile.write(out_path, rate=narrated_text["sampling_rate"], data=narrated_text["audio"][0])
    return out_path
def caption_my_image(pil_image):
    """Describe *pil_image* in words, then narrate that description.

    Args:
        pil_image: The PIL image supplied by the Gradio input widget.

    Returns:
        Path to a WAV file with the spoken caption.
    """
    caption = caption_image(images=pil_image)[0]["generated_text"]
    audio_path = generate_audio(caption)
    return audio_path
# Wire the captioning function into a simple Gradio UI.
app_description = (
    "Upload an image to generate a descriptive caption and listen to its narration.\n"
    "This app is brought to you by **Taizun**."
)

demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Upload Your Image", type="pil")],
    outputs=[gr.Audio(label="Generated Audio Caption")],
    title="Image Captioning and Narration",
    description=app_description,
    theme="compact",  # minimalistic theme
)

# Start the web server (module-level so hosted runtimes pick it up on import).
demo.launch()