Spaces:
Running
Running
import os
import subprocess

import streamlit as st

from utils.demo import load_video, ctc_decode
from utils.two_stream_infer import load_model
from scripts.extract_lip_coordinates import generate_lip_coordinates
import options as opt
# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(layout="wide")


@st.cache_resource
def _load_cached_model():
    """Return the model built by ``load_model()``, constructing it only once.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, ``load_model()`` would run again on each
    rerun (e.g. every time a different video is chosen or the button is
    pressed). ``st.cache_resource`` keeps the single returned object alive
    for the lifetime of the server process.
    """
    return load_model()


model = _load_cached_model()

st.title("LipCoordNet Demo")
st.info(
    "The inference speed is very slow on Huggingface spaces due to it being processed entirely on CPU. For a quicker inference, please clone the repository and change the “device” under options.py to “cuda” for local inference using GPU",
    icon="ℹ️",
)
# Build the list of selectable videos from the bundled "app_input" directory.
# os.listdir() returns entries in arbitrary order, so sort for a stable menu.
try:
    options = sorted(os.listdir("app_input"))
except FileNotFoundError:
    options = []

if not options:
    # With no options st.selectbox returns None and the path handling below
    # would raise; report the problem and end this script run instead.
    st.error("No input videos found in the 'app_input' directory.")
    st.stop()

selected_video = st.selectbox("Choose video", options)
col1, col2 = st.columns(2)

with col1:
    file_path = os.path.join("app_input", selected_video)
    # Base name is everything before the FIRST dot. Kept as-is because later
    # steps build "{video_name}.mp4" and "{video_name}_samples" from it.
    # NOTE(review): names containing extra dots are truncated here -- confirm
    # against how load_video derives its output directory before changing.
    video_name = selected_video.split(".")[0]
    mp4_path = f"{video_name}.mp4"

    # Re-encode to H.264 in the working directory ("-y" overwrites an existing
    # file). Arguments are passed as a list so no shell is involved: file
    # names with spaces or shell metacharacters are handled literally.
    ffmpeg_cmd = ["ffmpeg", "-i", file_path, "-vcodec", "libx264", mp4_path, "-y"]
    try:
        subprocess.run(ffmpeg_cmd, check=True)
    except (OSError, subprocess.CalledProcessError) as err:
        # OSError: ffmpeg executable could not be started;
        # CalledProcessError: ffmpeg exited with a non-zero status.
        st.error(f"ffmpeg failed to convert {selected_video!r}: {err}")
        st.stop()

    # Rendering inside of the app; the context manager closes the file handle.
    with open(mp4_path, "rb") as video_file:
        video_bytes = video_file.read()
    st.video(video_bytes)
# Step 1: hand the transcoded clip to load_video while a spinner is shown in
# the left column. The first of its three return values is what the model is
# later called with; the third is echoed to the page.
with col1, st.spinner("Splitting video into frames"):
    mp4_source = f"{video_name}.mp4"
    prediction_video, img_p, files = load_video(mp4_source, opt.device)
    video = prediction_video
    st.markdown(f"Frames Generated:\n{files}")
    frames_generated = True

# Step 2: compute lip coordinates from the "{video_name}_samples" directory
# (presumably populated by load_video above -- TODO confirm) and echo them.
with col1, st.spinner("Generating Lip Landmark Coordinates"):
    samples_dir = f"{video_name}_samples"
    prediction_coordinates = generate_lip_coordinates(samples_dir)
    coordinates = prediction_coordinates
    st.markdown(f"Coordinates Generated:\n{coordinates}")
    coordinates_generated = True
# Right column: status banner plus the button that triggers inference.
with col2:
    st.info("Ready to make prediction!")
    generate = st.button("Generate")

# Inference runs only on the script run in which the button was pressed.
if generate:
    with col2, st.spinner("Generating..."):
        # Indexing with [None, ...] prepends one axis to each input
        # (presumably the batch dimension the model expects -- TODO confirm),
        # then both are moved to the configured device.
        video_batch = prediction_video[None, ...].to(opt.device)
        coords_batch = prediction_coordinates[None, ...].to(opt.device)
        y = model(video_batch, coords_batch)
        # Decode the first item of the model output and show the last entry
        # of the decoded result.
        txt = ctc_decode(y[0])
        st.text(txt[-1])