import streamlit as st
import requests
from PIL import Image
import io
from huggingface_hub import InferenceClient

# Streamlit page setup
st.set_page_config(page_title="MTSS Image Accessibility Alt Text Generator", layout="centered")

# Add the logo image with a specified width
image_width = 300  # Set the desired width in pixels
st.image('MTSS.ai_Logo.png', width=image_width)

st.header('VisionTexts™ | Accessibility')
st.subheader('Image Alt Text Creator')

# Retrieve the Hugging Face API Key from secrets
huggingface_api_key = st.secrets["huggingface_api_key"]

# Initialize the Hugging Face inference client
client = InferenceClient(token=huggingface_api_key)
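
# `st.secrets` looks the key up in Streamlit's secrets store (typically a
# .streamlit/secrets.toml file, or the hosting platform's secrets mechanism);
# the token is then sent with every request the InferenceClient makes to the
# hosted Hugging Face Inference API.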

# File uploader allows user to add their own image
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])

if uploaded_file:
    # Display the uploaded image
    image = Image.open(uploaded_file).convert('RGB')
    image_width = 200  # Set the desired width in pixels
    with st.expander("Image", expanded=True):
        st.image(image, caption=uploaded_file.name, width=image_width, use_column_width=False)
else:
    st.warning("Please upload an image.")

# Option for adding additional details
show_details = st.checkbox("Add additional details about the image.", value=False)

if show_details:
    # Text input for additional details about the image
    additional_details = st.text_area(
        "Provide specific information that is important to include in the alt text or reflect why the image is being used:"
    )
else:
    additional_details = ""

# Button to trigger the analysis
analyze_button = st.button("Analyze the Image", type="secondary")

# Prompt for complex image description
complex_image_prompt_text = (
    "As an expert in image accessibility and alternative text, expand the image caption provided into a thorough description of the image. "
    "Provide a detailed description using no more than 500 characters that conveys the essential information in eight or fewer clear and concise sentences. "
    "Skip phrases like 'image of' or 'picture of.' "
    "Your description should form a clear, well-structured, and factual paragraph that avoids bullet points, focusing on creating a seamless narrative. "
    "Importantly, only describe what is visibly present in the image and avoid making assumptions or adding extraneous information. "
    "Stick to the facts and ensure the description is accurate and reliable."
)

# Functions to query the Hugging Face Inference API
def query_image_caption(image):
    # Convert PIL image to bytes
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    image_bytes = buffered.getvalue()

    # Use the InferenceClient's image_to_text method
    response = client.image_to_text(
        # model="Salesforce/blip-image-captioning-large",
        model="nlpconnect/vit-gpt2-image-captioning",
        image=image_bytes,
    )
    return response
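
# Depending on the installed huggingface_hub version, image_to_text returns either a
# plain string or an ImageToTextOutput object whose caption sits in `.generated_text`;
# the handling further down accepts both (a version-specific detail worth verifying
# against the installed release).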


def query_llm(prompt):
    # System prompt
    system_prompt = "You are an expert in image accessibility and alternative text."

    # Generate the response using the Hugging Face InferenceClient's
    # OpenAI-compatible chat completion interface
    response = client.chat.completions.create(
        model="meta-llama/Llama-2-7b-chat-hf",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        stream=True,
        temperature=0.5,
        max_tokens=1024,
        top_p=0.7
    )

    # Collect the streamed response; each chunk carries an incremental piece of
    # text in choices[0].delta.content (progress could be surfaced to the user here)
    response_content = ""
    for chunk in response:
        if chunk.choices and chunk.choices[0].delta.content:
            response_content += chunk.choices[0].delta.content

    return response_content.strip()
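
# `client.chat.completions.create(...)` is the OpenAI-compatible alias exposed by recent
# huggingface_hub releases; the same request can be written with the underlying method,
# e.g. a non-streaming sketch using the parameters above:
#     result = client.chat_completion(
#         messages=[...], model="meta-llama/Llama-2-7b-chat-hf",
#         max_tokens=1024, temperature=0.5, top_p=0.7,
#     )
#     text = result.choices[0].message.content
# Whether a given model (including gated ones such as Llama 2) is actually served
# depends on the Inference API and the account's access rights.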


# Check if an image has been uploaded and if the button has been pressed
if uploaded_file is not None and analyze_button:
    with st.spinner("Analyzing the image..."):
        # Get the caption from the image using the image captioning model;
        # the InferenceClient raises an exception if the API call fails
        try:
            caption_response = query_image_caption(image)
        except Exception as e:
            st.error(f"Error with image captioning model: {e}")
            st.stop()

        # Accept either a plain string or an ImageToTextOutput object
        image_caption = getattr(caption_response, "generated_text", caption_response)

        # Use the complex image prompt text
        prompt_text = complex_image_prompt_text

        # Include additional details if provided
        if additional_details:
            prompt_text += f"\n\nAdditional context provided by the user:\n{additional_details}"

        # Create the full prompt
        full_prompt = f"{prompt_text}\n\nImage Caption: {image_caption}"

        # Use the language model to generate the alt text description
        llm_response = query_llm(full_prompt)

        # Display the generated alt text
        st.markdown("### Generated Alt Text:")
        st.write(llm_response)

        st.success('Powered by MTSS GPT. AI can make mistakes. Consider checking important information.')
else:
    st.write("Please upload an image and click 'Analyze the Image' to generate alt text.")