fffiloni committed (verified)
Commit eb23cc1 · 1 parent: b4acdb1

Update hf_gradio_app.py

Files changed (1): hf_gradio_app.py (+32 -3)
hf_gradio_app.py CHANGED
@@ -1,5 +1,8 @@
 import os, random, time
 import uuid
+import tempfile
+from pydub import AudioSegment
+import gradio as gr
 from huggingface_hub import snapshot_download
 
 # Download models
@@ -70,8 +73,36 @@ with torch.inference_mode():
     pipeline = VideoPipeline(vae=vae, reference_net=reference_net, diffusion_net=diffusion_net, scheduler=noise_scheduler, image_proj=image_proj)
     pipeline.to(device=device, dtype=weight_dtype)
 
+def process_audio(file_path):
+    # Create a temporary directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Load the audio file
+        audio = AudioSegment.from_file(file_path)
+
+        # Check and cut the audio if longer than 4 seconds
+        max_duration = 4 * 1000  # 4 seconds in milliseconds
+        if len(audio) > max_duration:
+            audio = audio[:max_duration]
+
+        # Save the processed audio in the temporary directory
+        output_path = os.path.join(temp_dir, "trimmed_audio.wav")
+        audio.export(output_path, format="wav")
+
+        # Temporary file is available here for use
+        print(f"Processed audio saved at: {output_path}")
+
+        # Return the path for reference (optional)
+        return output_path
+
 @torch.inference_mode()
 def generate(input_video, input_audio, seed, progress=gr.Progress(track_tqdm=True)):
+
+    is_shared_ui = True if "fffiloni/MEMO" in os.environ['SPACE_ID'] else False
+
+    if is_shared_ui:
+        input_audio = process_audio(input_audio)
+        print(f"Processed file was stored temporarily at: {input_audio}")
+
     resolution = 512
     num_generated_frames_per_clip = 16
     fps = 30
@@ -157,8 +188,6 @@ def generate(input_video, input_audio, seed, progress=gr.Progress(track_tqdm=True)):
 
     return video_path
 
-import gradio as gr
-
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Column():
         gr.Markdown("# MEMO: Memory-Guided Diffusion for Expressive Talking Video Generation")
@@ -185,7 +214,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Row():
             with gr.Column():
                 input_video = gr.Image(label="Upload Input Image", type="filepath")
-                input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
+                input_audio = gr.Audio(label="Upload Input Audio", type="filepath", info="On shared UI, audio length is trimmed to max 4 seconds")
                 seed = gr.Number(label="Seed (0 for Random)", value=0, precision=0)
             with gr.Column():
                 video_output = gr.Video(label="Generated Video")
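
One caveat with the added process_audio helper: tempfile.TemporaryDirectory() removes the directory as soon as the "with" block exits, so the returned path may no longer exist by the time generate() hands it to the pipeline. A minimal sketch of the same trimming logic that keeps the clip on disk is shown below; it assumes pydub and ffmpeg are available, and trim_audio_persistent is a hypothetical name, not part of this commit.

import os
import tempfile

from pydub import AudioSegment

def trim_audio_persistent(file_path, max_seconds=4):
    # mkdtemp() creates a directory that is not removed automatically,
    # so the exported file outlives this function call.
    out_dir = tempfile.mkdtemp(prefix="memo_audio_")
    audio = AudioSegment.from_file(file_path)

    max_duration = max_seconds * 1000  # pydub slices in milliseconds
    if len(audio) > max_duration:
        audio = audio[:max_duration]

    output_path = os.path.join(out_dir, "trimmed_audio.wav")
    audio.export(output_path, format="wav")
    return output_path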
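
The shared-UI gate in generate() reads os.environ['SPACE_ID'] directly, which raises a KeyError when the script runs outside a Hugging Face Space. A hedged sketch of an equivalent check that degrades gracefully on local runs (is_shared_ui_safe is a hypothetical helper, not part of this commit):

import os

def is_shared_ui_safe(space_id_substring="fffiloni/MEMO"):
    # SPACE_ID is set by Hugging Face Spaces; .get() returns "" locally,
    # so the check simply evaluates to False outside a Space.
    return space_id_substring in os.environ.get("SPACE_ID", "")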