Spaces:
PlayHT
/
Running on CPU Upgrade

File size: 3,074 Bytes
69617e8
 
1101b16
0ed5bd6
1101b16
69617e8
 
 
 
b326bed
 
1101b16
0ed5bd6
 
 
 
 
 
 
0b47c5d
69617e8
 
 
 
 
0ed5bd6
69617e8
 
 
 
 
 
 
 
 
 
 
0ed5bd6
 
 
 
 
69617e8
 
 
 
0ed5bd6
99f3aa9
 
 
69617e8
99f3aa9
69617e8
99f3aa9
1101b16
99f3aa9
 
1101b16
99f3aa9
1101b16
0ed5bd6
99f3aa9
 
1101b16
 
99f3aa9
1101b16
 
0ed5bd6
0b47c5d
0ed5bd6
 
 
 
69617e8
 
0ed5bd6
 
 
 
 
 
 
 
 
 
69617e8
0ed5bd6
 
1101b16
69617e8
0ed5bd6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import google.generativeai as genai
import gradio as gr
import requests

# Credentials are read from the environment; nothing secret lives in this file.

# Google Gemini SDK: configured once at import time.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY"))

# Play.ht credentials, sent as request headers by text_to_speech().
API_KEY = os.environ.get("PLAY_API_KEY")
USER_ID = os.environ.get("PLAY_USER_ID")

# UI theme: stock "emerald" hue for now; the branded colour (#b4fd83) may
# replace it later.
theme = gr.themes.Base(primary_hue="emerald")


# Function to upload a local file to Gemini
def upload_to_gemini(path, mime_type="image/jpeg"):
    """Upload the file at ``path`` to Gemini and return the uploaded-file object.

    ``mime_type`` is forwarded unchanged to ``genai.upload_file`` and
    defaults to ``"image/jpeg"``.
    """
    return genai.upload_file(path, mime_type=mime_type)

def generate_roast(image_path):
    """Upload the image at ``image_path`` to Gemini and return its roast text.

    The image is uploaded, placed in the chat history as a user turn, and the
    model is then prompted with "Roast this image!".

    Args:
        image_path: Filesystem path of the image to roast.

    Returns:
        The ``text`` of the model's reply.
    """
    import mimetypes  # local import keeps this fix self-contained

    # Uploaded images are not always JPEGs (PNG, WebP, ...). Derive the MIME
    # type from the file extension rather than relying on upload_to_gemini's
    # "image/jpeg" default; keep that default when the extension is unknown.
    guessed_type, _ = mimetypes.guess_type(image_path)
    uploaded_file = upload_to_gemini(
        image_path, mime_type=guessed_type or "image/jpeg"
    )

    generation_config = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }
    model = genai.GenerativeModel(
        model_name="gemini-1.5-flash-002",
        generation_config=generation_config,
        system_instruction="You are a professional satirist and fashion expert. You will be given a profile picture. Your duty is to roast whatever is given to you in the funniest way possible!",
    )

    # The uploaded image is seeded into the history as a user turn so the
    # follow-up text prompt refers to it.
    chat_session = model.start_chat(
        history=[{"role": "user", "parts": [uploaded_file]}]
    )
    response = chat_session.send_message("Roast this image!")
    return response.text

# Function to convert text to speech with Play.ht
def text_to_speech(text, timeout=120):
    """Synthesize ``text`` to an MP3 file through the Play.ht streaming API.

    Args:
        text: The text to speak.
        timeout: Seconds to wait for Play.ht before ``requests`` gives up.
            Without a timeout a stalled connection would block the request
            handler indefinitely.

    Returns:
        The path of the written MP3 file on success (HTTP 200); otherwise a
        string of the form ``"Error: <status> - <body>"``. Transport-level
        failures (including timeouts) propagate as ``requests`` exceptions.
    """
    import tempfile  # local import keeps this fix self-contained

    url = "https://api.play.ht/api/v2/tts/stream"
    payload = {
        "voice": "s3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json",
        "output_format": "mp3",
        "text": text,
    }
    headers = {
        "accept": "audio/mpeg",
        "content-type": "application/json",
        "Authorization": API_KEY,
        "X-User-ID": USER_ID
    }

    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return f"Error: {response.status_code} - {response.text}"

    # Write each result to its own temp file instead of a single shared
    # "output_audio.mp3", so overlapping requests cannot overwrite each
    # other's audio. delete=False: the caller still needs the file after
    # this function returns.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        audio_file.write(response.content)
    return audio_file.name

# Gradio Interface
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# Image to Text-to-Speech Roasting App")
    gr.Markdown("Upload an image, and the AI will roast it and convert the roast to audio.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="filepath", label="Upload Image")
        with gr.Column():
            output_text = gr.Textbox(label="Roast Text")
            audio_output = gr.Audio(label="Roast Audio")

    def process_image(image):
        """Roast the uploaded image and voice the roast.

        Args:
            image: Filesystem path of the uploaded image, or ``None`` when the
                button is clicked before anything was uploaded.

        Returns:
            ``(roast_text, audio_path)``; ``audio_path`` is ``None`` when
            speech synthesis did not produce a file.
        """
        # Nothing uploaded yet: show a clear message instead of failing
        # inside the Gemini upload.
        if image is None:
            raise gr.Error("Please upload an image before generating a roast.")

        roast_text = generate_roast(image)
        audio_path = text_to_speech(roast_text)

        # text_to_speech returns an "Error: ..." string rather than a file
        # path when Play.ht rejects the request. Do not hand that string to
        # gr.Audio as if it were a file: keep the roast text, leave the audio
        # empty, and log the details.
        if not os.path.isfile(audio_path):
            print(f"Text-to-speech failed: {audio_path}")
            gr.Warning("The roast was generated, but audio synthesis failed.")
            return roast_text, None

        return roast_text, audio_path

    submit_button = gr.Button("Generate Roast")
    submit_button.click(process_image, inputs=image_input, outputs=[output_text, audio_output])

# Launch the app
demo.launch(debug=True)