wissemkarous committed: init

Files changed:
- README.md +7 -7
- app.py +59 -0
- cvtransforms.py +14 -0
- dataset.py +155 -0
- options.py +20 -0
- packages.txt +1 -0
- requirements.txt +0 -0
README.md CHANGED
@@ -1,13 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk:
-sdk_version:
+title: SilentSpeak
+emoji: π
+colorFrom: blue
+colorTo: indigo
+sdk: streamlit
+sdk_version: 1.29.0
 app_file: app.py
 pinned: false
-license:
+license: mit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,59 @@
+import streamlit as st
+import os
+from utils.demo import load_video, ctc_decode
+from utils.two_stream_infer import load_model
+import os
+from scripts.extract_lip_coordinates import generate_lip_coordinates
+import options as opt
+
+st.set_page_config(layout="wide")
+
+model = load_model()
+
+st.title("LipCoordNet Demo")
+
+st.info(
+    "The inference speed is very slow on Huggingface spaces due to it being processed entirely on CPU. For a quicker inference, please clone the repository and change the “device” under options.py to “cuda” for local inference using GPU",
+    icon="ℹ️",
+)
+
+# Generating a list of options or videos
+options = os.listdir(os.path.join("app_input"))
+selected_video = st.selectbox("Choose video", options)
+
+col1, col2 = st.columns(2)
+
+
+with col1:
+    file_path = os.path.join("app_input", selected_video)
+    video_name = selected_video.split(".")[0]
+    os.system(f"ffmpeg -i {file_path} -vcodec libx264 {video_name}.mp4 -y")
+
+    # Rendering inside of the app
+    video = open(f"{video_name}.mp4", "rb")
+    video_bytes = video.read()
+    st.video(video_bytes)
+
+
+with col1, st.spinner("Splitting video into frames"):
+    video, img_p, files = load_video(f"{video_name}.mp4", opt.device)
+    prediction_video = video
+    st.markdown(f"Frames Generated:\n{files}")
+    frames_generated = True
+with col1, st.spinner("Generating Lip Landmark Coordinates"):
+    coordinates = generate_lip_coordinates(f"{video_name}_samples")
+    prediction_coordinates = coordinates
+    st.markdown(f"Coordinates Generated:\n{coordinates}")
+    coordinates_generated = True
+
+with col2:
+    st.info("Ready to make prediction!")
+    generate = st.button("Generate")
+    if generate:
+        with col2, st.spinner("Generating..."):
+            y = model(
+                prediction_video[None, ...].to(opt.device),
+                prediction_coordinates[None, ...].to(opt.device),
+            )
+            txt = ctc_decode(y[0])
+            st.text(txt[-1])
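For reference, the same pipeline can be exercised outside Streamlit, e.g. after cloning the Space for local GPU inference as the info banner suggests. Below is a minimal sketch that mirrors the calls app.py makes; the clip name "sample" and its .mpg extension under app_input/ are placeholders, and torch.no_grad() is added here as a standard inference wrapper.

import os
import torch
import options as opt
from utils.demo import load_video, ctc_decode
from utils.two_stream_infer import load_model
from scripts.extract_lip_coordinates import generate_lip_coordinates

model = load_model()

name = "sample"  # placeholder: any clip shipped under app_input/
# re-encode the clip to H.264 mp4, exactly as the app does before rendering
os.system(f"ffmpeg -i app_input/{name}.mpg -vcodec libx264 {name}.mp4 -y")

# split into frames, extract lip landmark coordinates, then run the two-stream model
video, img_p, files = load_video(f"{name}.mp4", opt.device)
coordinates = generate_lip_coordinates(f"{name}_samples")
with torch.no_grad():
    y = model(video[None, ...].to(opt.device), coordinates[None, ...].to(opt.device))
print(ctc_decode(y[0])[-1])

The Space itself launches with streamlit run app.py (ffmpeg is provided via packages.txt); with the device flag in options.py switched to "cuda", the same calls run on GPU as the banner advises.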
cvtransforms.py ADDED
@@ -0,0 +1,14 @@
+# coding: utf-8
+import random
+
+
+def HorizontalFlip(batch_img, p=0.5):
+    # (T, H, W, C)
+    if random.random() > p:
+        batch_img = batch_img[:, :, ::-1, ...]
+    return batch_img
+
+
+def ColorNormalize(batch_img):
+    batch_img = batch_img / 255.0
+    return batch_img
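A quick illustration of how these two transforms behave on a clip-shaped array, using a dummy batch of frames in the (T, H, W, C) layout that dataset.py produces:

import numpy as np
from cvtransforms import HorizontalFlip, ColorNormalize

# dummy clip: 75 frames of 64x128 BGR, matching MyDataset._load_vid output
clip = np.random.randint(0, 256, size=(75, 64, 128, 3)).astype(np.float32)

flipped = HorizontalFlip(clip)    # mirrors the width axis with probability 0.5
normed = ColorNormalize(flipped)  # scales pixel values into [0, 1]
print(normed.shape, normed.max() <= 1.0)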
dataset.py ADDED
@@ -0,0 +1,155 @@
+# encoding: utf-8
+import numpy as np
+import cv2
+import os
+from torch.utils.data import Dataset
+from cvtransforms import *
+import torch
+import editdistance
+
+
+class MyDataset(Dataset):
+    letters = [
+        " ",
+        "A",
+        "B",
+        "C",
+        "D",
+        "E",
+        "F",
+        "G",
+        "H",
+        "I",
+        "J",
+        "K",
+        "L",
+        "M",
+        "N",
+        "O",
+        "P",
+        "Q",
+        "R",
+        "S",
+        "T",
+        "U",
+        "V",
+        "W",
+        "X",
+        "Y",
+        "Z",
+    ]
+
+    def __init__(self, video_path, anno_path, file_list, vid_pad, txt_pad, phase):
+        self.anno_path = anno_path
+        self.vid_pad = vid_pad
+        self.txt_pad = txt_pad
+        self.phase = phase
+
+        with open(file_list, "r") as f:
+            self.videos = [
+                os.path.join(video_path, line.strip()) for line in f.readlines()
+            ]
+
+        self.data = []
+        for vid in self.videos:
+            # items = vid.split(os.path.sep)
+            items = vid.split("/")
+            self.data.append((vid, items[-4], items[-1]))
+
+    def __getitem__(self, idx):
+        (vid, spk, name) = self.data[idx]
+        vid = self._load_vid(vid)
+        anno = self._load_anno(
+            os.path.join(self.anno_path, spk, "align", name + ".align")
+        )
+
+        if self.phase == "train":
+            vid = HorizontalFlip(vid)
+
+        vid = ColorNormalize(vid)
+
+        vid_len = vid.shape[0]
+        anno_len = anno.shape[0]
+        vid = self._padding(vid, self.vid_pad)
+        anno = self._padding(anno, self.txt_pad)
+
+        return {
+            "vid": torch.FloatTensor(vid.transpose(3, 0, 1, 2)),
+            "txt": torch.LongTensor(anno),
+            "txt_len": anno_len,
+            "vid_len": vid_len,
+        }
+
+    def __len__(self):
+        return len(self.data)
+
+    def _load_vid(self, p):
+        files = os.listdir(p)
+        files = list(filter(lambda file: file.find(".jpg") != -1, files))
+        files = sorted(files, key=lambda file: int(os.path.splitext(file)[0]))
+        array = [cv2.imread(os.path.join(p, file)) for file in files]
+        array = list(filter(lambda im: not im is None, array))
+        array = [
+            cv2.resize(im, (128, 64), interpolation=cv2.INTER_LANCZOS4) for im in array
+        ]
+        array = np.stack(array, axis=0).astype(np.float32)
+        return array
+
+    def _load_anno(self, name):
+        with open(name, "r") as f:
+            lines = [line.strip().split(" ") for line in f.readlines()]
+            txt = [line[2] for line in lines]
+            txt = list(filter(lambda s: not s.upper() in ["SIL", "SP"], txt))
+        return MyDataset.txt2arr(" ".join(txt).upper(), 1)
+
+    def _padding(self, array, length):
+        array = [array[_] for _ in range(array.shape[0])]
+        size = array[0].shape
+        for i in range(length - len(array)):
+            array.append(np.zeros(size))
+        return np.stack(array, axis=0)
+
+    @staticmethod
+    def txt2arr(txt, start):
+        arr = []
+        for c in list(txt):
+            arr.append(MyDataset.letters.index(c) + start)
+        return np.array(arr)
+
+    @staticmethod
+    def arr2txt(arr, start):
+        txt = []
+        for n in arr:
+            if n >= start:
+                txt.append(MyDataset.letters[n - start])
+        return "".join(txt).strip()
+
+    @staticmethod
+    def ctc_arr2txt(arr, start):
+        pre = -1
+        txt = []
+        for n in arr:
+            if pre != n and n >= start:
+                if (
+                    len(txt) > 0
+                    and txt[-1] == " "
+                    and MyDataset.letters[n - start] == " "
+                ):
+                    pass
+                else:
+                    txt.append(MyDataset.letters[n - start])
+            pre = n
+        return "".join(txt).strip()
+
+    @staticmethod
+    def wer(predict, truth):
+        word_pairs = [(p[0].split(" "), p[1].split(" ")) for p in zip(predict, truth)]
+        wer = [1.0 * editdistance.eval(p[0], p[1]) / len(p[1]) for p in word_pairs]
+        return wer
+
+    @staticmethod
+    def cer(predict, truth):
+        cer = [
+            1.0 * editdistance.eval(p[0], p[1]) / len(p[1]) for p in zip(predict, truth)
+        ]
+        return cer
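The label helpers are static methods, so they can be sanity-checked without building a full dataset. A small example using a GRID-style command sentence (chosen purely for illustration):

from dataset import MyDataset

# encode to integer labels; start=1 keeps index 0 free as the CTC blank
arr = MyDataset.txt2arr("BIN BLUE AT F TWO NOW", 1)
print(MyDataset.arr2txt(arr, 1))  # BIN BLUE AT F TWO NOW

# ctc_arr2txt also collapses repeated symbols, so a frame-level argmax of
# B, B, <blank>, I, I, <blank>, N decodes to "BIN"
print(MyDataset.ctc_arr2txt([3, 3, 0, 10, 10, 0, 15], 1))

# word error rate against a reference transcript (one substitution over six words)
print(MyDataset.wer(["BIN BLUE AT F TWO NOW"], ["BIN BLUE BY F TWO NOW"]))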
options.py ADDED
@@ -0,0 +1,20 @@
+gpu = "0"
+random_seed = 0
+data_type = "unseen"
+video_path = "../lip/"
+train_list = f"data/{data_type}_train.txt"
+val_list = f"data/{data_type}_val.txt"
+anno_path = "../GRID_align_txt"
+vid_padding = 75
+txt_padding = 200
+batch_size = 8
+base_lr = 2e-5
+num_workers = 4
+max_epoch = 90
+display = 10
+test_step = 1000
+save_prefix = f"weights/LipNet_{data_type}"
+is_optimize = True
+device = "cpu"
+
+two_stream_weights = "pretrain/LipNet_coords_loss_0.025581153109669685_wer_0.01746208431890914_cer_0.006488426950253695.pt"
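As the banner in app.py notes, local GPU inference only requires changing the device flag in this file; a minimal sketch of that edit, assuming a CUDA-capable PyTorch build is installed locally:

# options.py, in a local clone of the Space
device = "cuda"  # the hosted Space pins this to "cpu" since it runs on CPU-only hardware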
packages.txt ADDED
@@ -0,0 +1 @@
+ffmpeg
requirements.txt ADDED
Binary file (3.37 kB).