import os
import random
import json

import numpy as np
from pydub import AudioSegment
from pydub.utils import make_chunks
from pydub.effects import compress_dynamic_range
from PIL import Image
import cv2
from moviepy.editor import VideoClip, AudioFileClip
import gradio as gr


# Load configuration
def load_config(config_path):
    with open(config_path, 'r') as config_file:
        return json.load(config_file)


def process_audio_and_generate_video(config_path, audio_file):
    config = load_config(config_path)

    # Load the four character frames, converted to RGBA so they can be
    # alpha-composited onto the background below
    closed_mouth_img = Image.open(config['frame_paths']['closed_mouth']).convert('RGBA')
    open_mouth_img = Image.open(config['frame_paths']['open_mouth']).convert('RGBA')
    closed_mouth_blinking_img = Image.open(config['frame_paths']['closed_mouth_blinking']).convert('RGBA')
    open_mouth_blinking_img = Image.open(config['frame_paths']['open_mouth_blinking']).convert('RGBA')

    # Create a background with the color from the config
    background_color = tuple(config['background_color'])
    background = Image.new('RGBA', closed_mouth_img.size, background_color)

    # Composite the frames onto the background
    closed_mouth_img = Image.alpha_composite(background, closed_mouth_img)
    open_mouth_img = Image.alpha_composite(background, open_mouth_img)
    closed_mouth_blinking_img = Image.alpha_composite(background, closed_mouth_blinking_img)
    open_mouth_blinking_img = Image.alpha_composite(background, open_mouth_blinking_img)

    # Convert the frames to RGB arrays for the video
    closed_mouth_cv = cv2.cvtColor(np.array(closed_mouth_img), cv2.COLOR_RGBA2RGB)
    open_mouth_cv = cv2.cvtColor(np.array(open_mouth_img), cv2.COLOR_RGBA2RGB)
    closed_mouth_blinking_cv = cv2.cvtColor(np.array(closed_mouth_blinking_img), cv2.COLOR_RGBA2RGB)
    open_mouth_blinking_cv = cv2.cvtColor(np.array(open_mouth_blinking_img), cv2.COLOR_RGBA2RGB)

    # Timing parameters: the audio is sliced into one chunk per video frame
    frame_rate = config['frame_rate']
    frame_duration_ms = config['frame_duration_ms'] // frame_rate

    # Load the audio
    audio = AudioSegment.from_file(audio_file)

    # Apply compression to even out loud and quiet passages
    compressed_audio = compress_dynamic_range(audio, threshold=-20.0, ratio=8.0, attack=1.0, release=10.0)

    # Normalize the audio to a fixed loudness
    target_dBFS = -10.0
    change_in_dBFS = target_dBFS - compressed_audio.dBFS
    normalized_audio = compressed_audio.apply_gain(change_in_dBFS)

    # Split the audio into chunks of the same duration as the frames
    audio_chunks = make_chunks(normalized_audio, frame_duration_ms)

    # Loudness of a chunk in dBFS (-inf for a perfectly silent chunk)
    def calculate_decibels(chunk):
        return chunk.dBFS

    # Use a dynamic threshold derived from the average loudness, or the fixed
    # threshold from the config
    if config['dynamic_threshold'] == 1:
        average_dBFS = sum(chunk.dBFS for chunk in audio_chunks) / len(audio_chunks)
        decibel_threshold = average_dBFS + 4  # set the threshold above the average
    else:
        decibel_threshold = config['decibel_threshold']

    # Blink state
    blink_duration = config['blink_duration']
    last_blink_time = config['initial_blink_time']

    # Trigger a blink once a random delay has passed since the last one
    def should_blink(t, last_blink_time):
        return t - last_blink_time > random.uniform(config['minimum_blinking_delay'], config['maximum_blinking_delay'])

    # Generate the frame for time t: pick the open- or closed-mouth frame based
    # on the loudness of the matching audio chunk, and swap in the blinking
    # variants while a blink is in progress
    def make_frame(t):
        nonlocal last_blink_time
        frame_index = int(t * frame_rate)
        if should_blink(t, last_blink_time):
            last_blink_time = t
        if 0 <= (t - last_blink_time) <= blink_duration:
            if frame_index < len(audio_chunks):
                chunk = audio_chunks[frame_index]
                decibels = calculate_decibels(chunk)
                return open_mouth_blinking_cv if decibels > decibel_threshold else closed_mouth_blinking_cv
            else:
                return closed_mouth_blinking_cv
        if frame_index < len(audio_chunks):
            chunk = audio_chunks[frame_index]
            decibels = calculate_decibels(chunk)
            return open_mouth_cv if decibels > decibel_threshold else closed_mouth_cv
        else:
            return closed_mouth_cv
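    # Worked example of the timing math above (values assumed for
    # illustration): with frame_rate = 30 and config['frame_duration_ms'] =
    # 1000, each chunk is 1000 // 30 = 33 ms long and make_frame(t) reads
    # chunk int(t * 30), so frame i and chunk i cover roughly the same
    # [33*i, 33*(i+1)) ms slice of the audio.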
    # Create the video clip, one frame per audio chunk
    video_clip = VideoClip(make_frame, duration=len(audio_chunks) / frame_rate)

    # Attach the original audio to the video
    audio_clip = AudioFileClip(audio_file)
    video_with_audio = video_clip.set_audio(audio_clip)

    # Write the final video, named after the input audio file
    os.makedirs(config['output_path'], exist_ok=True)
    output_video_path = os.path.join(config['output_path'], f"{os.path.splitext(os.path.basename(audio_file))[0]}.mp4")
    video_with_audio.write_videofile(output_video_path, fps=frame_rate, codec=config['codec'], audio_codec=config['audio_codec'])

    return output_video_path
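
# A minimal batch sketch of the "one animation per audio file" usage described
# in the help text below (the "audio" folder name and "config.json" path are
# assumptions, not fixed by this script):
#
#     for name in sorted(os.listdir("audio")):
#         if name.lower().endswith((".wav", ".mp3", ".ogg")):
#             process_audio_and_generate_video("config.json", os.path.join("audio", name))
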
html_content = """
<h2>How to Use</h2>
<ul>
    <li>Add 1-4 images in the frames folder and modify the paths in config.json to use the images you want
        (an example config shape is sketched in the comments at the end of this script).</li>
    <li>Put the audio files into the audio folder. One animation is created per audio file.</li>
</ul>

<h3>Frame Images</h3>
<table>
    <tr><th>Closed Mouth</th><th>Closed Mouth Blinking</th><th>Open Mouth</th><th>Open Mouth Blinking</th></tr>
    <tr><td>closed_mouth</td><td>closed_mouth_blinking</td><td>open_mouth</td><td>open_mouth_blinking</td></tr>
</table>
<p>Download the assets here.</p>
"""


# Gradio interface
def gradio_interface(config_file, audio_file):
    video_path = process_audio_and_generate_video(config_file, audio_file)
    return video_path


with gr.Blocks() as demo:
    gr.HTML(html_content)
    config_file_input = gr.File(label="Upload Config File (JSON)")
    audio_file_input = gr.Audio(label="Upload Audio File", type="filepath")
    output_video = gr.Video(label="Generated Video")
    generate_button = gr.Button("Generate Animation")
    generate_button.click(gradio_interface, [config_file_input, audio_file_input], output_video)

demo.launch()
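
# For reference, a sketch of the config.json shape this script reads. The key
# names come from the code above; the values are illustrative assumptions:
#
#     {
#         "frame_paths": {
#             "closed_mouth": "frames/closed_mouth.png",
#             "open_mouth": "frames/open_mouth.png",
#             "closed_mouth_blinking": "frames/closed_mouth_blinking.png",
#             "open_mouth_blinking": "frames/open_mouth_blinking.png"
#         },
#         "background_color": [0, 255, 0, 255],
#         "frame_rate": 30,
#         "frame_duration_ms": 1000,
#         "dynamic_threshold": 1,
#         "decibel_threshold": -25.0,
#         "blink_duration": 0.1,
#         "initial_blink_time": 0.0,
#         "minimum_blinking_delay": 2.0,
#         "maximum_blinking_delay": 6.0,
#         "output_path": "output",
#         "codec": "libx264",
#         "audio_codec": "aac"
#     }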