vumichien committed on
Commit
8702c0e
·
1 Parent(s): 223da19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -7
app.py CHANGED
@@ -42,6 +42,7 @@ from fairseq import checkpoint_utils, options, tasks, utils
42
  from fairseq.dataclass.configs import GenerationConfig
43
  from huggingface_hub import hf_hub_download
44
  import gradio as gr
 
45
 
46
  # os.chdir('/home/user/app/av_hubert/avhubert')
47
 
@@ -131,18 +132,44 @@ def predict(process_video):
131
 
132
 
133
  # ---- Gradio Layout -----
 
134
  video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
135
- video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
136
- text_output = gr.Textbox()
137
  demo = gr.Blocks()
138
  demo.encrypt = False
 
 
139
  with demo:
140
- examples = gr.Examples(examples=
141
- [ ["demo1.mp4", "roi1.mp4"],
142
- ["demo2.mp4", "roi2.mp4"],
143
- ["demo3.mp4", "roi3.mp4"],],
144
- label="Examples", inputs=[video_in, video_out])
 
145
  with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  video_in.render()
147
  video_out.render()
148
  with gr.Row():
 
42
  from fairseq.dataclass.configs import GenerationConfig
43
  from huggingface_hub import hf_hub_download
44
  import gradio as gr
45
+ from pytube import YouTube
46
 
47
  # os.chdir('/home/user/app/av_hubert/avhubert')
48
 
 
132
 
133
 
134
  # ---- Gradio Layout -----
135
+ youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
136
  video_in = gr.Video(label="Input Video", mirror_webcam=False, interactive=True)
137
+ video_out = gr.Video(label="Audio Visual Video", mirror_webcam=False, interactive=True)
 
138
  demo = gr.Blocks()
139
  demo.encrypt = False
140
+ text_output = gr.Textbox()
141
+
142
  with demo:
143
+ gr.Markdown('''
144
+ <div>
145
+ <h1 style='text-align: center'>Speech Recognition from Visual Lip Movement by Audio-Visual Hidden Unit BERT Model (Avhubert)</h1>
146
+ This space uses Avhubert models from <a href='https://github.com/facebookresearch' target='_blank'><b>Meta Research</b></a> to recognize speech from lip movement 🤗
147
+ </div>
148
+ ''')
149
  with gr.Row():
150
+ gr.Markdown('''
151
+ ### Reading Lip movement with youtube link using Avhubert
152
+ ##### Step 1a. Download a video from YouTube (Note: the video should be shorter than 10 seconds, otherwise it will be cut; the face should be stable for better results)
153
+ ##### Step 1b. You can also upload a video directly
154
+ ##### Step 2. Generating landmarks surrounding mouth area
155
+ ##### Step 3. Reading lip movement.
156
+ ''')
157
+ with gr.Row():
158
+ gr.Markdown('''
159
+ ### You can test with the following examples:
160
+ ''')
161
+ examples = gr.Examples(examples=
162
+ [ "https://www.youtube.com/watch?v=ZXVDnuepW2s",
163
+ "https://www.youtube.com/watch?v=X8_glJn1B8o",
164
+ "https://www.youtube.com/watch?v=80yqL2KzBVw"],
165
+ label="Examples", inputs=[youtube_url_in])
166
+ with gr.Column():
167
+ youtube_url_in.render()
168
+ download_youtube_btn = gr.Button("Download Youtube video")
169
+ download_youtube_btn.click(get_youtube, [youtube_url_in], [
170
+ video_in])
171
+ print(video_in)
172
+ with gr.Row():
173
  video_in.render()
174
  video_out.render()
175
  with gr.Row():