# Spaces: Running
import os, random, json | |
import numpy as np | |
from pydub import AudioSegment | |
from pydub.utils import make_chunks | |
from pydub.effects import compress_dynamic_range | |
from PIL import Image | |
import cv2 | |
from moviepy.editor import VideoClip, AudioFileClip | |
import gradio as gr | |
# Load configuration | |
def load_config(config_path):
    """Read the JSON configuration file and return the parsed object.

    Args:
        config_path: Path of the JSON file to read (anything ``open`` accepts).

    Returns:
        The value produced by ``json.load`` (a dict for the configs this
        script consumes).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; relying on the locale default breaks
    # non-ASCII configs on some platforms. "utf-8-sig" additionally skips a
    # leading BOM, which json.load would otherwise reject.
    with open(config_path, 'r', encoding='utf-8-sig') as config_file:
        return json.load(config_file)
def _load_frame_images(config):
    """Load the four sprite images and flatten each onto the configured background.

    Returns a dict mapping 'closed_mouth', 'open_mouth',
    'closed_mouth_blinking' and 'open_mouth_blinking' to RGB numpy arrays.
    """
    frame_paths = config['frame_paths']
    background_color = tuple(config['background_color'])
    background = None
    frames = {}
    for name in ('closed_mouth', 'open_mouth', 'closed_mouth_blinking', 'open_mouth_blinking'):
        # alpha_composite only accepts RGBA images, so convert explicitly;
        # convert() returns a loaded copy, letting the file handle close here.
        with Image.open(frame_paths[name]) as source_img:
            sprite = source_img.convert('RGBA')
        if background is None:
            # The background takes the size of the first (closed mouth) image;
            # alpha_composite raises ValueError if a later image differs in size.
            background = Image.new('RGBA', sprite.size, background_color)
        composited = Image.alpha_composite(background, sprite)
        frames[name] = cv2.cvtColor(np.array(composited), cv2.COLOR_RGBA2RGB)
    return frames


def _plan_blinks(config, duration, frame_rate):
    """Return the sorted list of blink start times (seconds) covering ``duration``."""
    min_delay = config['minimum_blinking_delay']
    max_delay = config['maximum_blinking_delay']
    # Advance by at least one frame per blink so the loop always terminates,
    # even if the configured delays are zero or negative.
    min_step = 1.0 / frame_rate
    blink_starts = [config['initial_blink_time']]
    while blink_starts[-1] < duration:
        delay = random.uniform(min_delay, max_delay)
        blink_starts.append(blink_starts[-1] + max(delay, min_step))
    return blink_starts


def process_audio_and_generate_video(config_path, audio_file):
    """Render a lip-synced, blinking character video for an audio file.

    The audio is compressed, normalised and cut into short chunks. For every
    video frame the loudness of the chunk at that timestamp selects the open
    or closed mouth image, and a blink schedule selects the blinking variant.
    The original audio file is attached as the sound track.

    Config keys read: 'frame_paths' (with 'closed_mouth', 'open_mouth',
    'closed_mouth_blinking', 'open_mouth_blinking'), 'background_color',
    'frame_rate', 'frame_duration_ms', 'dynamic_threshold',
    'decibel_threshold' (when the dynamic threshold is off), 'blink_duration',
    'initial_blink_time', 'minimum_blinking_delay', 'maximum_blinking_delay',
    'output_path', 'codec' and 'audio_codec'.

    Args:
        config_path: Path of the JSON configuration file.
        audio_file: Path of the audio file to animate.

    Returns:
        Path of the written ``.mp4`` file inside ``config['output_path']``.

    Raises:
        ValueError: If the frame rate is not positive or the audio is empty.
    """
    # Local import keeps this block self-contained; only the blink lookup needs it.
    from bisect import bisect_right

    config = load_config(config_path)
    frames = _load_frame_images(config)

    # Timing parameters
    frame_rate = config['frame_rate']
    if frame_rate <= 0:
        raise ValueError("config['frame_rate'] must be positive")
    # Chunk length is floored to whole milliseconds; clamp so make_chunks never gets 0.
    frame_duration_ms = max(1, config['frame_duration_ms'] // frame_rate)

    # Load, compress and normalise the audio used for loudness analysis
    audio = AudioSegment.from_file(audio_file)
    compressed_audio = compress_dynamic_range(audio, threshold=-20.0, ratio=8.0, attack=1.0, release=10.0)
    target_dBFS = -10.0
    silent_level = float('-inf')
    if compressed_audio.dBFS == silent_level:
        # Pure silence has no finite level to normalise; applying an infinite gain is meaningless.
        normalized_audio = compressed_audio
    else:
        normalized_audio = compressed_audio.apply_gain(target_dBFS - compressed_audio.dBFS)

    audio_chunks = make_chunks(normalized_audio, frame_duration_ms)
    if not audio_chunks:
        raise ValueError(f"Audio file contains no samples: {audio_file}")

    # Measure every chunk once instead of on every frame request.
    chunk_levels = [chunk.dBFS for chunk in audio_chunks]

    # Decide whether to use a dynamic threshold or the fixed one from the config
    if config["dynamic_threshold"] == 1:
        # Silent chunks report -inf and would drag the average to -inf, so skip them.
        audible_levels = [level for level in chunk_levels if level > silent_level]
        if audible_levels:
            decibel_threshold = sum(audible_levels) / len(audible_levels) + 4  # above average
        else:
            decibel_threshold = float('inf')  # nothing audible: the mouth stays closed
    else:
        decibel_threshold = config['decibel_threshold']

    # The video lasts as long as the audio itself. Deriving it from the chunk
    # count over-estimates whenever the floored chunk length is shorter than a frame.
    duration = len(normalized_audio) / 1000.0

    # Blink logic: draw each delay once up front so frames are a pure function
    # of t (re-drawing the delay on every frame biases blinks toward the minimum).
    blink_duration = config['blink_duration']
    blink_starts = _plan_blinks(config, duration, frame_rate)

    def make_frame(t):
        # Look the chunk up by timestamp so mouth movement cannot drift from the audio.
        chunk_index = int(t * 1000 / frame_duration_ms)
        mouth_open = chunk_index < len(chunk_levels) and chunk_levels[chunk_index] > decibel_threshold

        # Most recent blink that started at or before t, if any.
        blink_index = bisect_right(blink_starts, t) - 1
        blinking = blink_index >= 0 and (t - blink_starts[blink_index]) <= blink_duration

        if blinking:
            return frames['open_mouth_blinking'] if mouth_open else frames['closed_mouth_blinking']
        return frames['open_mouth'] if mouth_open else frames['closed_mouth']

    # Build the output path; splitext keeps dotted names such as "my.take.wav" intact.
    output_dir = config['output_path']
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(audio_file))[0]
    output_video_path = os.path.join(output_dir, f"{stem}.mp4")

    # Create the clip, attach the original audio and write the result
    video_clip = VideoClip(make_frame, duration=duration)
    audio_clip = AudioFileClip(audio_file)
    try:
        video_with_audio = video_clip.set_audio(audio_clip)
        video_with_audio.write_videofile(output_video_path, fps=frame_rate, codec=config['codec'], audio_codec=config["audio_codec"])
    finally:
        # Release the readers held by the clips even if encoding fails.
        audio_clip.close()
        video_clip.close()
    return output_video_path
# Static help text and sample frame images rendered at the top of the Gradio page.
html_content = """
<h3>How to Use</h3>
<p>Add 1-4 images in the <b>frames</b> folder and modify the paths in the <b>config.json</b> to use the images you want.<br>
Put the audios into the <b>audio</b> folder. It will create as many animations as there are audios.</p>
<h3>Frame Images:</h3>
<table>
<tr>
<th>Closed Mouth</th>
<th>Closed Mouth Blinking</th>
<th>Open Mouth</th>
<th>Open Mouth Blinking</th>
</tr>
<tr>
<td><img src="https://github.com/user-attachments/assets/3ed0c597-df0e-4165-98d4-cf978e1338bb" alt="closed_mouth" width="150"/></td>
<td><img src="https://github.com/user-attachments/assets/1296c2a7-4304-4935-b398-4ee5e1fe8a10" alt="closed_mouth_blinking" width="150"/></td>
<td><img src="https://github.com/user-attachments/assets/4715a73a-1a27-4ac9-a20b-954dde0aac0b" alt="open_mouth" width="150"/></td>
<td><img src="https://github.com/user-attachments/assets/b7d04648-9158-4dd2-889c-27c67a64e0b2" alt="open_mouth_blinking" width="150"/></td>
</tr>
</table>
<a href="https://github.com/user-attachments/assets/dcf3728c-0d3b-455d-b17e-5e9819be069b">Download the assets here</a>
"""
# Gradio interface | |
def gradio_interface(config_file, audio_file):
    """Gradio click handler: build the animation for the uploaded files.

    Args:
        config_file: Value delivered by the config ``gr.File`` component.
        audio_file: File path delivered by the ``gr.Audio`` component.

    Returns:
        Path of the generated video, as returned by
        ``process_audio_and_generate_video``.

    Raises:
        gr.Error: If either upload is missing, so the UI shows a readable
            message instead of a traceback.
    """
    if config_file is None:
        raise gr.Error("Please upload a config file (JSON).")
    if not audio_file:
        raise gr.Error("Please upload an audio file.")
    # Depending on the Gradio version, gr.File yields either a path string or
    # a temp-file wrapper whose path is in ``.name``; accept both.
    config_path = getattr(config_file, "name", config_file)
    return process_audio_and_generate_video(config_path, audio_file)
# Assemble the page: help text, the two upload widgets, the video output and
# a button that feeds both uploads to gradio_interface.
with gr.Blocks() as demo:
    gr.HTML(value=html_content)
    config_file_input = gr.File(label="Upload Config File (JSON)")
    audio_file_input = gr.Audio(label="Upload Audio File", type="filepath")
    output_video = gr.Video(label="Generated Video")
    generate_button = gr.Button(value="Generate Animation")
    generate_button.click(
        fn=gradio_interface,
        inputs=[config_file_input, audio_file_input],
        outputs=output_video,
    )

# Start the web server for the interface defined above.
demo.launch()