Spaces:
Build error
Build error
Matyáš Boháček
committed on
Commit
·
a001524
1
Parent(s):
96a591f
Init commit
Browse files- README.md +6 -6
- app.py +173 -0
- examples/chair.mp4 +0 -0
- examples/computer.mp4 +0 -0
- examples/work.mp4 +0 -0
- flagged/log.csv +3 -0
- requirements.txt +7 -0
- spoter-checkpoint.pth +3 -0
- spoter/gaussian_noise.py +18 -0
- spoter/spoter_model.py +70 -0
- spoter/utils.py +81 -0
- spoter_mod/.idea/.gitignore +8 -0
- spoter_mod/.idea/inspectionProfiles/Project_Default.xml +24 -0
- spoter_mod/.idea/inspectionProfiles/profiles_settings.xml +6 -0
- spoter_mod/.idea/misc.xml +4 -0
- spoter_mod/.idea/modules.xml +8 -0
- spoter_mod/.idea/vcs.xml +6 -0
- spoter_mod/LICENSE +201 -0
- spoter_mod/README.md +61 -0
- spoter_mod/augmentations/__init__.py +231 -0
- spoter_mod/data_structurization/autsl.py +22 -0
- spoter_mod/data_structurization/wlasl.py +32 -0
- spoter_mod/datasets/czech_slr_dataset.py +153 -0
- spoter_mod/normalization/body_normalization.py +226 -0
- spoter_mod/normalization/hand_normalization.py +192 -0
- spoter_mod/normalization/main.py +43 -0
- spoter_mod/pose_model_identifier.py +103 -0
- spoter_mod/requirements.txt +7 -0
- spoter_mod/skeleton_extractor.py +60 -0
- spoter_mod/sweep-agent.sh +18 -0
- spoter_mod/sweep.yaml +50 -0
- spoter_mod/train.py +312 -0
- spoter_mod/utils.py +41 -0
- spoter_mod/wandb/debug-cli.log +0 -0
README.md
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 3.0.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces
|
|
|
1 |
---
|
2 |
+
title: Spoter Demo Test
|
3 |
+
emoji: 🧏
|
4 |
+
colorFrom: green
|
5 |
+
colorTo: green
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 3.0.6
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
|
app.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import numpy as np
|
5 |
+
import gradio as gr
|
6 |
+
from spoter_mod.skeleton_extractor import obtain_pose_data
|
7 |
+
from spoter_mod.normalization.body_normalization import normalize_single_dict as normalize_single_body_dict, BODY_IDENTIFIERS
|
8 |
+
from spoter_mod.normalization.hand_normalization import normalize_single_dict as normalize_single_hand_dict, HAND_IDENTIFIERS
|
9 |
+
|
10 |
+
|
11 |
+
# Restore the pickled SPOTER network onto the CPU and switch it to inference mode.
# NOTE(review): torch.load unpickles the checkpoint, so the file must come from a trusted source.
model = torch.load("spoter-checkpoint.pth", map_location=torch.device('cpu'))
model.eval()

# Expand the generic hand identifiers into a left-hand set followed by a right-hand set
HAND_IDENTIFIERS = [name + suffix for suffix in ("_Left", "_Right") for name in HAND_IDENTIFIERS]

# Class names; the position in this list is the class index used when labelling the model output
GLOSS = [
    'book', 'drink', 'computer', 'before', 'chair', 'go', 'clothes', 'who', 'candy', 'cousin', 'deaf', 'fine',
    'help', 'no', 'thin', 'walk', 'year', 'yes', 'all', 'black', 'cool', 'finish', 'hot', 'like', 'many', 'mother',
    'now', 'orange', 'table', 'thanksgiving', 'what', 'woman', 'bed', 'blue', 'bowling', 'can', 'dog', 'family',
    'fish', 'graduate', 'hat', 'hearing', 'kiss', 'language', 'later', 'man', 'shirt', 'study', 'tall', 'white',
    'wrong', 'accident', 'apple', 'bird', 'change', 'color', 'corn', 'cow', 'dance', 'dark', 'doctor', 'eat',
    'enjoy', 'forget', 'give', 'last', 'meet', 'pink', 'pizza', 'play', 'school', 'secretary', 'short', 'time',
    'want', 'work', 'africa', 'basketball', 'birthday', 'brown', 'but', 'cheat', 'city', 'cook', 'decide', 'full',
    'how', 'jacket', 'letter', 'medicine', 'need', 'paint', 'paper', 'pull', 'purple', 'right', 'same', 'son',
    'tell', 'thursday',
]

# Prefer the GPU whenever one is visible, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
28 |
+
|
29 |
+
|
30 |
+
def tensor_to_dictionary(landmarks_tensor: torch.Tensor) -> dict:
    """
    Split a landmark tensor into one array per landmark identifier.

    :param landmarks_tensor: Tensor indexed as [frame, landmark, ...]; the landmark axis is expected to follow the
                             order of BODY_IDENTIFIERS + HAND_IDENTIFIERS
    :return: Dictionary mapping each identifier to the slice ``[:, landmark_index]`` of the tensor's NumPy array
    """

    frames = landmarks_tensor.numpy()
    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS

    return {name: frames[:, position] for position, name in enumerate(identifiers)}
|
39 |
+
|
40 |
+
|
41 |
+
def dictionary_to_tensor(landmarks_dict: dict) -> torch.Tensor:
    """
    Stack per-landmark coordinate sequences back into a single tensor.

    :param landmarks_dict: Dictionary mapping every identifier in BODY_IDENTIFIERS + HAND_IDENTIFIERS to a sequence
                           of per-frame points, where index 0 and index 1 of each point are read
    :return: Tensor of shape (frames, landmarks, 2); the frame count is taken from the "leftEar" entry
    """

    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    frame_count = len(landmarks_dict["leftEar"])
    stacked = np.empty(shape=(frame_count, len(identifiers), 2))

    for position, name in enumerate(identifiers):
        track = landmarks_dict[name]
        for axis in (0, 1):
            stacked[:, position, axis] = [point[axis] for point in track]

    return torch.from_numpy(stacked)
|
50 |
+
|
51 |
+
|
52 |
+
def greet(label, video0, video1):
    """
    Classify the sign shown in a video and return the per-gloss probabilities.

    :param label: Selected input type; "Webcam" picks video0 and "Video" picks video1
    :param video0: Webcam recording handed over by the interface
    :param video1: Uploaded video handed over by the interface
    :return: Dictionary mapping the entries of GLOSS to their softmax probability; an empty dictionary when the
             input type is unknown or the selected video is missing
    """

    if label == "Webcam":
        video = video0
    elif label == "Video":
        video = video1
    elif label == "X":
        # Fixed placeholder output kept from the original implementation
        return {"A": 0.8, "B": 0.1, "C": 0.1}
    else:
        return {}

    # Nothing was recorded/uploaded for the selected input type, so there is nothing to classify
    if video is None:
        return {}

    pose_data = obtain_pose_data(video)
    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS

    # Collect the X/Y series of every landmark into a (frames, landmarks, 2) array
    depth_map = np.empty(shape=(len(pose_data.data_hub["nose_X"]), len(identifiers), 2))
    for index, identifier in enumerate(identifiers):
        depth_map[:, index, 0] = pose_data.data_hub[identifier + "_X"]
        depth_map[:, index, 1] = pose_data.data_hub[identifier + "_Y"]

    depth_map = tensor_to_dictionary(torch.from_numpy(depth_map))

    # The normalization helpers are called with "_0"/"_1" hand suffixes instead of "_Left"/"_Right"
    depth_map = {key.replace("_Left", "_0").replace("_Right", "_1"): value for key, value in depth_map.items()}

    depth_map = normalize_single_body_dict(depth_map)
    depth_map = normalize_single_hand_dict(depth_map)

    # Restore the "_Left"/"_Right" suffixes expected by dictionary_to_tensor
    depth_map = {key.replace("_0", "_Left").replace("_1", "_Right"): value for key, value in depth_map.items()}

    depth_map = dictionary_to_tensor(depth_map)
    depth_map = depth_map - 0.5

    # Run on whichever device the model's weights live on; sending the input elsewhere (e.g. to CUDA while the
    # checkpoint was loaded onto the CPU) would fail with a device mismatch
    model_device = next(model.parameters()).device
    inputs = depth_map.squeeze(0).to(model_device)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(inputs).expand(1, -1, -1)

    # Bring the result back to the CPU before converting to NumPy
    results = torch.nn.functional.softmax(outputs, dim=2).cpu().numpy()[0, 0]

    return {gloss: float(probability) for gloss, probability in zip(GLOSS, results)}
|
104 |
+
|
105 |
+
|
106 |
+
# Output widget listing the five most probable classes
label = gr.outputs.Label(num_top_classes=5, label="Top class probabilities")

# Input widgets, in the positional order expected by greet(label, video0, video1)
_input_widgets = [
    gr.Dropdown(["Webcam", "Video"], label="Please select the input type:", type="value"),
    gr.Video(source="webcam", label="Webcam recording", type="mp4"),
    gr.Video(source="upload", label="Video upload", type="mp4"),
]

# Credits rendered by the interface as its article text
_ARTICLE = "This is joint work of [Matyas Bohacek](https://scholar.google.cz/citations?user=wDy1xBwAAAAJ) and [Zhuo Cao](https://www.linkedin.com/in/zhuo-cao-b0787a1aa/?originalSubdomain=hk). For more info, visit [our website.](https://www.signlanguagerecognition.com)"

# Custom stylesheet: web fonts plus green button colours.
# NOTE(review): the declarations starting with "- " are not valid CSS and look like disabled rules — confirm
# whether they should be removed or re-enabled.
_CUSTOM_CSS = """
@font-face {
font-family: Graphik;
font-weight: regular;
src: url("https://www.signlanguagerecognition.com/supplementary/GraphikRegular.otf") format("opentype");
}

@font-face {
font-family: Graphik;
font-weight: bold;
src: url("https://www.signlanguagerecognition.com/supplementary/GraphikBold.otf") format("opentype");
}

@font-face {
font-family: MonumentExpanded;
font-weight: regular;
src: url("https://www.signlanguagerecognition.com/supplementary/MonumentExtended-Regular.otf") format("opentype");
}

@font-face {
font-family: MonumentExpanded;
font-weight: bold;
src: url("https://www.signlanguagerecognition.com/supplementary/MonumentExtended-Bold.otf") format("opentype");
}

html {
font-family: "Graphik";
}

h1 {
font-family: "MonumentExpanded";
}

#12 {
- background-image: linear-gradient(to left, #61D836, #6CB346) !important;
background-color: #61D836 !important;
}

#12:hover {
- background-image: linear-gradient(to left, #61D836, #6CB346) !important;
background-color: #6CB346 !important;
border: 0 !important;
border-color: 0 !important;
}

.dark .gr-button-primary {
--tw-gradient-from: #61D836;
--tw-gradient-to: #6CB346;
border: 0 !important;
border-color: 0 !important;
}

.dark .gr-button-primary:hover {
--tw-gradient-from: #64A642;
--tw-gradient-to: #58933B;
border: 0 !important;
border-color: 0 !important;
}
"""

# Assemble the demo around greet() and start serving it
demo = gr.Interface(
    fn=greet,
    inputs=_input_widgets,
    outputs=label,
    title="SPOTER Sign language recognition",
    description="",
    article=_ARTICLE,
    css=_CUSTOM_CSS,
    cache_examples=True
)

demo.launch(debug=True)
|
examples/chair.mp4
ADDED
Binary file (990 kB). View file
|
|
examples/computer.mp4
ADDED
Binary file (858 kB). View file
|
|
examples/work.mp4
ADDED
Binary file (785 kB). View file
|
|
flagged/log.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
'name','output','flag','username','timestamp'
|
2 |
+
'Hello','Hello Hello!!','','','2022-05-28 16:06:47.684383'
|
3 |
+
'Hello','Hello Hello!!','','','2022-05-28 16:06:49.325378'
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
opencv-python
|
2 |
+
mediapipe
|
3 |
+
pandas
|
4 |
+
torch==1.8.1
|
5 |
+
numpy
|
6 |
+
scikit-learn
|
7 |
+
protobuf==3.20.1
|
spoter-checkpoint.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b4f0927fbaddf11da6762ca76474a7bbc049565599e3fc6f081caa5cc00fb53a
|
3 |
+
size 23764668
|
spoter/gaussian_noise.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
class GaussianNoise(object):
    """
    Callable transform that adds Gaussian noise to a tensor.

    The noise is drawn from a standard normal distribution, scaled by ``std`` and shifted by ``mean``.
    """

    def __init__(self, mean=0., std=1.):
        """
        :param mean: Offset added to the sampled noise
        :param std: Factor the sampled noise is multiplied by
        """
        self.std = std
        self.mean = mean

    def __call__(self, tensor):
        """
        :param tensor: Tensor to perturb
        :return: New tensor equal to ``tensor + noise * std + mean``; the input is not modified
        """
        # Sample on the input's own device so the addition also works for tensors living outside the CPU
        noise = torch.randn(tensor.size(), device=tensor.device)

        return tensor + noise * self.std + self.mean

    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
|
15 |
+
|
16 |
+
|
17 |
+
if __name__ == "__main__":
|
18 |
+
pass
|
spoter/spoter_model.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import copy
|
3 |
+
import torch
|
4 |
+
|
5 |
+
import torch.nn as nn
|
6 |
+
from typing import Optional
|
7 |
+
|
8 |
+
|
9 |
+
def _get_clones(mod, n):
|
10 |
+
return nn.ModuleList([copy.deepcopy(mod) for _ in range(n)])
|
11 |
+
|
12 |
+
|
13 |
+
class SPOTERTransformerDecoderLayer(nn.TransformerDecoderLayer):
    """
    Variant of nn.TransformerDecoderLayer without the self-attention operation: the inherited ``self_attn`` module
    is removed and ``forward`` only applies the encoder-decoder attention followed by the feed-forward network.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout, activation):
        super().__init__(d_model, nhead, dim_feedforward, dropout, activation)

        # The self-attention module created by the parent class is never used by forward()
        del self.self_attn

    def forward(self, tgt: torch.Tensor, memory: torch.Tensor, tgt_mask: Optional[torch.Tensor] = None,
                memory_mask: Optional[torch.Tensor] = None, tgt_key_padding_mask: Optional[torch.Tensor] = None,
                memory_key_padding_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        :param tgt: Sequence fed to the decoder layer
        :param memory: Encoder output attended to by the layer
        :param tgt_mask: Accepted for signature compatibility; not used
        :param memory_mask: Attention mask passed to the encoder-decoder attention
        :param tgt_key_padding_mask: Accepted for signature compatibility; not used
        :param memory_key_padding_mask: Key padding mask passed to the encoder-decoder attention
        :return: Output of the layer
        """

        # First residual step: the self-attention output is replaced by the (dropped-out) input itself
        skipped = self.dropout1(tgt)
        tgt = self.norm1(tgt + skipped)

        # Encoder-decoder attention; only the attended values are kept, the attention weights are discarded
        attended, _ = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
                                          key_padding_mask=memory_key_padding_mask)
        tgt = self.norm2(tgt + self.dropout2(attended))

        # Position-wise feed-forward network
        hidden = self.activation(self.linear1(tgt))
        projected = self.linear2(self.dropout(hidden))

        return self.norm3(tgt + self.dropout3(projected))
|
39 |
+
|
40 |
+
|
41 |
+
class SPOTER(nn.Module):
    """
    SPOTER (Sign POse-based TransformER) architecture for sign language recognition from a sequence of skeletal
    data: a transformer whose decoder receives a single learned class query, followed by a linear classifier.
    """

    def __init__(self, num_classes, hidden_dim=55):
        """
        :param num_classes: Number of output classes of the final linear layer
        :param hidden_dim: Model width of the transformer; also the expected size of each flattened input frame

        NOTE(review): the transformer is built with 9 attention heads, so hidden_dim presumably has to be
        divisible by 9 — the default of 55 is not; confirm that callers always pass hidden_dim explicitly.
        """
        super().__init__()

        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim))

        # Separate (1, 1, hidden_dim) parameter initialised with a copy of the first row of row_embed
        first_row = self.row_embed[0].detach().clone()
        self.pos = nn.Parameter(first_row.reshape(1, 1, hidden_dim))

        self.class_query = nn.Parameter(torch.rand(1, hidden_dim))
        self.transformer = nn.Transformer(d_model=hidden_dim, nhead=9, num_encoder_layers=6, num_decoder_layers=6)
        self.linear_class = nn.Linear(hidden_dim, num_classes)

        # Swap every stock decoder layer for the variant without self-attention
        decoder = self.transformer.decoder
        template_layer = SPOTERTransformerDecoderLayer(self.transformer.d_model, self.transformer.nhead, 2048,
                                                       0.1, "relu")
        decoder.layers = _get_clones(template_layer, decoder.num_layers)

    def forward(self, inputs):
        """
        :param inputs: Tensor whose first axis is the sequence; all remaining axes are flattened per step
        :return: Output of the linear classifier applied to the decoded class query
        """
        # (sequence, features) -> (sequence, 1, features), cast to float
        sequence = inputs.flatten(start_dim=1).unsqueeze(1).float()
        decoded = self.transformer(self.pos + sequence, self.class_query.unsqueeze(0))

        return self.linear_class(decoded.transpose(0, 1))
|
67 |
+
|
68 |
+
|
69 |
+
if __name__ == "__main__":
|
70 |
+
pass
|
spoter/utils.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import logging
|
3 |
+
import torch
|
4 |
+
|
5 |
+
|
6 |
+
def train_epoch(model, dataloader, criterion, optimizer, device, scheduler=None):
    """
    Run one optimisation pass over the dataloader.

    :param model: Network to train; called with one squeezed input item at a time
    :param dataloader: Iterable yielding (inputs, labels) pairs; must support len() when a scheduler is given
    :param criterion: Loss function called as criterion(outputs[0], labels[0])
    :param optimizer: Optimizer stepped after every item
    :param device: Device the inputs and labels are moved to
    :param scheduler: Optional scheduler stepped once with the mean loss of the epoch
    :return: Tuple (summed loss, number of correct predictions, number of predictions, accuracy); the accuracy is
             0.0 when the dataloader yields nothing
    """

    pred_correct, pred_all = 0, 0
    running_loss = 0.0

    for inputs, labels in dataloader:
        inputs = inputs.squeeze(0).to(device)
        labels = labels.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(inputs).expand(1, -1, -1)

        loss = criterion(outputs[0], labels[0])
        loss.backward()
        optimizer.step()

        # Accumulate a detached value: summing the live loss tensors would keep every step attached to autograd
        running_loss += loss.detach()

        # Statistics
        if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == int(labels[0][0]):
            pred_correct += 1
        pred_all += 1

    # Skip the scheduler when nothing was processed (there is no mean loss to report)
    if scheduler and pred_all:
        scheduler.step(float(running_loss) / len(dataloader))

    accuracy = pred_correct / pred_all if pred_all else 0.0

    return running_loss, pred_correct, pred_all, accuracy
|
33 |
+
|
34 |
+
|
35 |
+
def evaluate(model, dataloader, device, print_stats=False):
    """
    Measure the top-1 accuracy of the model over the dataloader.

    :param model: Network to evaluate; called with one squeezed input item at a time
    :param dataloader: Iterable yielding (inputs, labels) pairs
    :param device: Device the inputs and labels are moved to
    :param print_stats: Whether to print and log the per-label accuracies
    :return: Tuple (number of correct predictions, number of predictions, accuracy); the accuracy is 0.0 when the
             dataloader yields nothing
    """

    pred_correct, pred_all = 0, 0

    # label -> [correct, total]; filled lazily so that any label value is supported
    stats = {}

    # Evaluation only: gradients are not needed
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.squeeze(0).to(device)
            labels = labels.to(device, dtype=torch.long)

            outputs = model(inputs).expand(1, -1, -1)
            target = int(labels[0][0])
            label_stats = stats.setdefault(target, [0, 0])

            # Statistics
            if int(torch.argmax(torch.nn.functional.softmax(outputs, dim=2))) == target:
                label_stats[0] += 1
                pred_correct += 1

            label_stats[1] += 1
            pred_all += 1

    if print_stats:
        # Report in ascending label order
        stats = {key: stats[key][0] / stats[key][1] for key in sorted(stats)}
        print("Label accuracies statistics:")
        print(str(stats) + "\n")
        logging.info("Label accuracies statistics:")
        logging.info(str(stats) + "\n")

    accuracy = pred_correct / pred_all if pred_all else 0.0

    return pred_correct, pred_all, accuracy
|
63 |
+
|
64 |
+
|
65 |
+
def evaluate_top_k(model, dataloader, device, k=5):
    """
    Measure the top-k accuracy of the model over the dataloader.

    :param model: Network to evaluate; called with one squeezed input item at a time
    :param dataloader: Iterable yielding (inputs, labels) pairs
    :param device: Device the inputs and labels are moved to
    :param k: Number of highest-scoring classes that count as a hit; capped at the number of classes
    :return: Tuple (number of correct predictions, number of predictions, accuracy); the accuracy is 0.0 when the
             dataloader yields nothing
    """

    pred_correct, pred_all = 0, 0

    # Evaluation only: gradients are not needed
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.squeeze(0).to(device)
            labels = labels.to(device, dtype=torch.long)

            outputs = model(inputs).expand(1, -1, -1)

            # topk returns indices with the same nesting as outputs; flatten them so that the membership test
            # compares class indices (a nested list would never contain the plain integer label)
            top_count = min(k, outputs.size(-1))
            top_indices = torch.topk(outputs, top_count).indices.flatten().tolist()

            if int(labels[0][0]) in top_indices:
                pred_correct += 1

            pred_all += 1

    accuracy = pred_correct / pred_all if pred_all else 0.0

    return pred_correct, pred_all, accuracy
|
spoter_mod/.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
4 |
+
# Editor-based HTTP Client requests
|
5 |
+
/httpRequests/
|
6 |
+
# Datasource local storage ignored files
|
7 |
+
/dataSources/
|
8 |
+
/dataSources.local.xml
|
spoter_mod/.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<profile version="1.0">
|
3 |
+
<option name="myName" value="Project Default" />
|
4 |
+
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
5 |
+
<option name="ignoredPackages">
|
6 |
+
<value>
|
7 |
+
<list size="11">
|
8 |
+
<item index="0" class="java.lang.String" itemvalue="pandas" />
|
9 |
+
<item index="1" class="java.lang.String" itemvalue="feedparser" />
|
10 |
+
<item index="2" class="java.lang.String" itemvalue="sklearn" />
|
11 |
+
<item index="3" class="java.lang.String" itemvalue="numpy" />
|
12 |
+
<item index="4" class="java.lang.String" itemvalue="coremltools" />
|
13 |
+
<item index="5" class="java.lang.String" itemvalue="h5py" />
|
14 |
+
<item index="6" class="java.lang.String" itemvalue="torch" />
|
15 |
+
<item index="7" class="java.lang.String" itemvalue="einops" />
|
16 |
+
<item index="8" class="java.lang.String" itemvalue="firebase-admin" />
|
17 |
+
<item index="9" class="java.lang.String" itemvalue="pyemd" />
|
18 |
+
<item index="10" class="java.lang.String" itemvalue="matplotlib" />
|
19 |
+
</list>
|
20 |
+
</value>
|
21 |
+
</option>
|
22 |
+
</inspection_tool>
|
23 |
+
</profile>
|
24 |
+
</component>
|
spoter_mod/.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
spoter_mod/.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (spoter)" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
spoter_mod/.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/spoter.iml" filepath="$PROJECT_DIR$/.idea/spoter.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
spoter_mod/.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
spoter_mod/LICENSE
ADDED
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Apache License
|
2 |
+
Version 2.0, January 2004
|
3 |
+
http://www.apache.org/licenses/
|
4 |
+
|
5 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
6 |
+
|
7 |
+
1. Definitions.
|
8 |
+
|
9 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
10 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
11 |
+
|
12 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
13 |
+
the copyright owner that is granting the License.
|
14 |
+
|
15 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
16 |
+
other entities that control, are controlled by, or are under common
|
17 |
+
control with that entity. For the purposes of this definition,
|
18 |
+
"control" means (i) the power, direct or indirect, to cause the
|
19 |
+
direction or management of such entity, whether by contract or
|
20 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
21 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
22 |
+
|
23 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
24 |
+
exercising permissions granted by this License.
|
25 |
+
|
26 |
+
"Source" form shall mean the preferred form for making modifications,
|
27 |
+
including but not limited to software source code, documentation
|
28 |
+
source, and configuration files.
|
29 |
+
|
30 |
+
"Object" form shall mean any form resulting from mechanical
|
31 |
+
transformation or translation of a Source form, including but
|
32 |
+
not limited to compiled object code, generated documentation,
|
33 |
+
and conversions to other media types.
|
34 |
+
|
35 |
+
"Work" shall mean the work of authorship, whether in Source or
|
36 |
+
Object form, made available under the License, as indicated by a
|
37 |
+
copyright notice that is included in or attached to the work
|
38 |
+
(an example is provided in the Appendix below).
|
39 |
+
|
40 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
41 |
+
form, that is based on (or derived from) the Work and for which the
|
42 |
+
editorial revisions, annotations, elaborations, or other modifications
|
43 |
+
represent, as a whole, an original work of authorship. For the purposes
|
44 |
+
of this License, Derivative Works shall not include works that remain
|
45 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
46 |
+
the Work and Derivative Works thereof.
|
47 |
+
|
48 |
+
"Contribution" shall mean any work of authorship, including
|
49 |
+
the original version of the Work and any modifications or additions
|
50 |
+
to that Work or Derivative Works thereof, that is intentionally
|
51 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
52 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
53 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
54 |
+
means any form of electronic, verbal, or written communication sent
|
55 |
+
to the Licensor or its representatives, including but not limited to
|
56 |
+
communication on electronic mailing lists, source code control systems,
|
57 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
58 |
+
Licensor for the purpose of discussing and improving the Work, but
|
59 |
+
excluding communication that is conspicuously marked or otherwise
|
60 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
61 |
+
|
62 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
63 |
+
on behalf of whom a Contribution has been received by Licensor and
|
64 |
+
subsequently incorporated within the Work.
|
65 |
+
|
66 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
67 |
+
this License, each Contributor hereby grants to You a perpetual,
|
68 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
69 |
+
copyright license to reproduce, prepare Derivative Works of,
|
70 |
+
publicly display, publicly perform, sublicense, and distribute the
|
71 |
+
Work and such Derivative Works in Source or Object form.
|
72 |
+
|
73 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
74 |
+
this License, each Contributor hereby grants to You a perpetual,
|
75 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
76 |
+
(except as stated in this section) patent license to make, have made,
|
77 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
78 |
+
where such license applies only to those patent claims licensable
|
79 |
+
by such Contributor that are necessarily infringed by their
|
80 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
81 |
+
with the Work to which such Contribution(s) was submitted. If You
|
82 |
+
institute patent litigation against any entity (including a
|
83 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
84 |
+
or a Contribution incorporated within the Work constitutes direct
|
85 |
+
or contributory patent infringement, then any patent licenses
|
86 |
+
granted to You under this License for that Work shall terminate
|
87 |
+
as of the date such litigation is filed.
|
88 |
+
|
89 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
90 |
+
Work or Derivative Works thereof in any medium, with or without
|
91 |
+
modifications, and in Source or Object form, provided that You
|
92 |
+
meet the following conditions:
|
93 |
+
|
94 |
+
(a) You must give any other recipients of the Work or
|
95 |
+
Derivative Works a copy of this License; and
|
96 |
+
|
97 |
+
(b) You must cause any modified files to carry prominent notices
|
98 |
+
stating that You changed the files; and
|
99 |
+
|
100 |
+
(c) You must retain, in the Source form of any Derivative Works
|
101 |
+
that You distribute, all copyright, patent, trademark, and
|
102 |
+
attribution notices from the Source form of the Work,
|
103 |
+
excluding those notices that do not pertain to any part of
|
104 |
+
the Derivative Works; and
|
105 |
+
|
106 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
107 |
+
distribution, then any Derivative Works that You distribute must
|
108 |
+
include a readable copy of the attribution notices contained
|
109 |
+
within such NOTICE file, excluding those notices that do not
|
110 |
+
pertain to any part of the Derivative Works, in at least one
|
111 |
+
of the following places: within a NOTICE text file distributed
|
112 |
+
as part of the Derivative Works; within the Source form or
|
113 |
+
documentation, if provided along with the Derivative Works; or,
|
114 |
+
within a display generated by the Derivative Works, if and
|
115 |
+
wherever such third-party notices normally appear. The contents
|
116 |
+
of the NOTICE file are for informational purposes only and
|
117 |
+
do not modify the License. You may add Your own attribution
|
118 |
+
notices within Derivative Works that You distribute, alongside
|
119 |
+
or as an addendum to the NOTICE text from the Work, provided
|
120 |
+
that such additional attribution notices cannot be construed
|
121 |
+
as modifying the License.
|
122 |
+
|
123 |
+
You may add Your own copyright statement to Your modifications and
|
124 |
+
may provide additional or different license terms and conditions
|
125 |
+
for use, reproduction, or distribution of Your modifications, or
|
126 |
+
for any such Derivative Works as a whole, provided Your use,
|
127 |
+
reproduction, and distribution of the Work otherwise complies with
|
128 |
+
the conditions stated in this License.
|
129 |
+
|
130 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
131 |
+
any Contribution intentionally submitted for inclusion in the Work
|
132 |
+
by You to the Licensor shall be under the terms and conditions of
|
133 |
+
this License, without any additional terms or conditions.
|
134 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
135 |
+
the terms of any separate license agreement you may have executed
|
136 |
+
with Licensor regarding such Contributions.
|
137 |
+
|
138 |
+
6. Trademarks. This License does not grant permission to use the trade
|
139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
140 |
+
except as required for reasonable and customary use in describing the
|
141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
142 |
+
|
143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
144 |
+
agreed to in writing, Licensor provides the Work (and each
|
145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
147 |
+
implied, including, without limitation, any warranties or conditions
|
148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
150 |
+
appropriateness of using or redistributing the Work and assume any
|
151 |
+
risks associated with Your exercise of permissions under this License.
|
152 |
+
|
153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
154 |
+
whether in tort (including negligence), contract, or otherwise,
|
155 |
+
unless required by applicable law (such as deliberate and grossly
|
156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
157 |
+
liable to You for damages, including any direct, indirect, special,
|
158 |
+
incidental, or consequential damages of any character arising as a
|
159 |
+
result of this License or out of the use or inability to use the
|
160 |
+
Work (including but not limited to damages for loss of goodwill,
|
161 |
+
work stoppage, computer failure or malfunction, or any and all
|
162 |
+
other commercial damages or losses), even if such Contributor
|
163 |
+
has been advised of the possibility of such damages.
|
164 |
+
|
165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
168 |
+
or other liability obligations and/or rights consistent with this
|
169 |
+
License. However, in accepting such obligations, You may act only
|
170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
171 |
+
of any other Contributor, and only if You agree to indemnify,
|
172 |
+
defend, and hold each Contributor harmless for any liability
|
173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
174 |
+
of your accepting any such warranty or additional liability.
|
175 |
+
|
176 |
+
END OF TERMS AND CONDITIONS
|
177 |
+
|
178 |
+
APPENDIX: How to apply the Apache License to your work.
|
179 |
+
|
180 |
+
To apply the Apache License to your work, attach the following
|
181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
182 |
+
replaced with your own identifying information. (Don't include
|
183 |
+
the brackets!) The text should be enclosed in the appropriate
|
184 |
+
comment syntax for the file format. We also recommend that a
|
185 |
+
file or class name and description of purpose be included on the
|
186 |
+
same "printed page" as the copyright notice for easier
|
187 |
+
identification within third-party archives.
|
188 |
+
|
189 |
+
Copyright 2021-2022 Matyáš Boháček
|
190 |
+
|
191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
192 |
+
you may not use this file except in compliance with the License.
|
193 |
+
You may obtain a copy of the License at
|
194 |
+
|
195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
196 |
+
|
197 |
+
Unless required by applicable law or agreed to in writing, software
|
198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
200 |
+
See the License for the specific language governing permissions and
|
201 |
+
limitations under the License.
|
spoter_mod/README.md
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
![Alt Text](http://spoter.signlanguagerecognition.com/img/GitHub_banner.png)
|
2 |
+
|
3 |
+
> by **[Matyáš Boháček](https://github.com/matyasbohacek)** and **[Marek Hrúz](https://github.com/mhruz)**, University of West Bohemia <br>
|
4 |
+
> Should you have any questions or inquiries, feel free to contact us [here](mailto:[email protected]).
|
5 |
+
|
6 |
+
[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sign-pose-based-transformer-for-word-level/sign-language-recognition-on-lsa64)](https://paperswithcode.com/sota/sign-language-recognition-on-lsa64?p=sign-pose-based-transformer-for-word-level)
|
7 |
+
|
8 |
+
Repository accompanying the [Sign Pose-based Transformer for Word-level Sign Language Recognition](https://openaccess.thecvf.com/content/WACV2022W/HADCV/html/Bohacek_Sign_Pose-Based_Transformer_for_Word-Level_Sign_Language_Recognition_WACVW_2022_paper.html) paper, where we present a novel architecture for word-level sign language recognition based on the Transformer model. We designed our solution with low computational cost in mind, since we see great potential in the usage of such a recognition system on hand-held devices. We introduce multiple original augmentation techniques tailored for the task of sign language recognition and propose a unique normalization scheme based on sign language linguistics.
|
9 |
+
|
10 |
+
![Alt Text](http://spoter.signlanguagerecognition.com/img/architecture_github.gif)
|
11 |
+
|
12 |
+
## Get Started
|
13 |
+
|
14 |
+
First, make sure to install all necessary dependencies using:
|
15 |
+
|
16 |
+
```shell
|
17 |
+
pip install -r requirements.txt
|
18 |
+
```
|
19 |
+
|
20 |
+
To train the model, simply specify the hyperparameters and run the following:
|
21 |
+
|
22 |
+
```
|
23 |
+
python -m train
|
24 |
+
--experiment_name [str; name of the experiment to name the output logs and plots]
|
25 |
+
|
26 |
+
--epochs [int; number of epochs]
|
27 |
+
--lr [float; learning rate]
|
28 |
+
|
29 |
+
--training_set_path [str; path to the csv file with training set's skeletal data]
|
30 |
+
--validation_set_path [str; path to the csv file with validation set's skeletal data]
|
31 |
+
--testing_set_path [str; path to the csv file with testing set's skeletal data]
|
32 |
+
```
|
33 |
+
|
34 |
+
If either the validation or testing sets' paths are left empty, the corresponding metrics will not be calculated. We also provide an out-of-the-box parameter to create the validation set as a desired split of the training set while preserving the label distribution, for datasets without author-specified splits. These and many other specific hyperparameters with their descriptions can be found in the [train.py](https://github.com/matyasbohacek/spoter/blob/main/train.py) file. All of them are provided with a default value we found to be working well in our experiments.
|
35 |
+
|
36 |
+
## Data
|
37 |
+
|
38 |
+
As SPOTER works on top of sequences of signers' skeletal data extracted from videos, we wanted to eliminate the computational demands of such annotation for each training run by pre-collecting this. For this reason and reproducibility, we are open-sourcing this data for WLASL100 and LSA64 datasets along with the repository. You can find the data [here](https://github.com/matyasbohacek/spoter/releases/tag/supplementary-data).
|
39 |
+
|
40 |
+
![Alt Text](http://spoter.signlanguagerecognition.com/img/datasets_overview.gif)
|
41 |
+
|
42 |
+
## License
|
43 |
+
|
44 |
+
The **code** is published under the [Apache License 2.0](https://github.com/matyasbohacek/spoter/blob/main/LICENSE) which allows for both academic and commercial use if relevant License and copyright notice is included, our work is cited and all changes are stated.
|
45 |
+
|
46 |
+
The accompanying skeletal data of the [WLASL](https://arxiv.org/pdf/1910.11006.pdf) and [LSA64](https://core.ac.uk/download/pdf/76495887.pdf) datasets used for experiments are, however, shared under the [Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)](https://creativecommons.org/licenses/by-nc/4.0/) license allowing only for non-commercial usage.
|
47 |
+
|
48 |
+
## Citation
|
49 |
+
|
50 |
+
If you find our work relevant, build upon it or compare your approaches with it, please cite our work as stated below:
|
51 |
+
|
52 |
+
```
|
53 |
+
@InProceedings{Bohacek_2022_WACV,
|
54 |
+
author = {Boh\'a\v{c}ek, Maty\'a\v{s} and Hr\'uz, Marek},
|
55 |
+
title = {Sign Pose-Based Transformer for Word-Level Sign Language Recognition},
|
56 |
+
booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV) Workshops},
|
57 |
+
month = {January},
|
58 |
+
year = {2022},
|
59 |
+
pages = {182-191}
|
60 |
+
}
|
61 |
+
```
|
spoter_mod/augmentations/__init__.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import math
|
3 |
+
import logging
|
4 |
+
import cv2
|
5 |
+
import random
|
6 |
+
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
from normalization.body_normalization import BODY_IDENTIFIERS
|
10 |
+
from normalization.hand_normalization import HAND_IDENTIFIERS
|
11 |
+
|
12 |
+
|
13 |
+
# Expand the imported base hand landmark names into per-hand names: all names with the "_0" suffix first, followed by
# all names with the "_1" suffix.
HAND_IDENTIFIERS = [name + suffix for suffix in ("_0", "_1") for name in HAND_IDENTIFIERS]

# Landmarks of one arm, ordered from the neck outwards. "$side$" is a placeholder that gets replaced with "left" or
# "right" before the names are looked up.
ARM_IDENTIFIERS_ORDER = ["neck", "$side$Shoulder", "$side$Elbow", "$side$Wrist"]
|
15 |
+
|
16 |
+
|
17 |
+
def __random_pass(prob):
    """
    Draws one number from random.random() and reports whether it is lower than the given threshold.

    :param prob: Threshold the drawn number is compared against (i.e. the probability of returning True)
    :return: True if the drawn number is lower than prob, False otherwise
    """

    drawn = random.random()
    return drawn < prob
|
19 |
+
|
20 |
+
|
21 |
+
def __numpy_to_dictionary(data_array: np.ndarray) -> dict:
    """
    Supplementary method turning a NumPy array of body landmark data into a dictionary keyed by landmark identifier.
    The second axis of the array must follow the order of the BODY_IDENTIFIERS list.

    :param data_array: Array indexed as [frame, landmark, ...]
    :return: Dictionary mapping each body identifier to the (list-converted) slice of its landmark across all frames
    """

    return {
        identifier: data_array[:, position].tolist()
        for position, identifier in enumerate(BODY_IDENTIFIERS)
    }
|
33 |
+
|
34 |
+
|
35 |
+
def __dictionary_to_numpy(landmarks_dict: dict) -> np.ndarray:
    """
    Supplementary method packing a dictionary of body landmark sequences into a single NumPy array. The landmark axis
    of the result follows the order of the BODY_IDENTIFIERS list.

    :param landmarks_dict: Dictionary mapping body identifiers to per-frame (x, y) sequences
    :return: Array of shape (frames, len(BODY_IDENTIFIERS), 2); the frame count is taken from the "leftEar" entry
    """

    frame_count = len(landmarks_dict["leftEar"])
    packed = np.empty(shape=(frame_count, len(BODY_IDENTIFIERS), 2))

    for position, identifier in enumerate(BODY_IDENTIFIERS):
        coordinates = np.array(landmarks_dict[identifier])
        packed[:, position, 0] = coordinates[:, 0]
        packed[:, position, 1] = coordinates[:, 1]

    return packed
|
48 |
+
|
49 |
+
|
50 |
+
def __rotate(origin: tuple, point: tuple, angle: float):
    """
    Rotates a point counterclockwise around a given origin.

    :param origin: (X, Y) coordinates of the center of rotation
    :param point: (X, Y) coordinates of the point to be rotated
    :param angle: Rotation angle; it is passed directly to math.cos / math.sin, i.e. expressed in radians
    :return: Tuple with the (X, Y) coordinates of the rotated point
    """

    origin_x, origin_y = origin
    point_x, point_y = point

    # Offset of the point relative to the center of rotation
    delta_x = point_x - origin_x
    delta_y = point_y - origin_y

    cosine = math.cos(angle)
    sine = math.sin(angle)

    rotated_x = origin_x + cosine * delta_x - sine * delta_y
    rotated_y = origin_y + sine * delta_x + cosine * delta_y

    return rotated_x, rotated_y
|
67 |
+
|
68 |
+
|
69 |
+
def __preprocess_row_sign(sign: dict) -> (dict, dict):
    """
    Supplementary method separating the skeletal data held in one dictionary into a body-landmark dictionary and a
    hand-landmark dictionary.

    Two input layouts are handled: if the key "nose_X" is present, every identifier is read from its separate
    "<identifier>_X" / "<identifier>_Y" sequences, which are zipped into (x, y) tuples; otherwise the values stored
    directly under the identifiers are taken over as they are.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :return: Tuple (body landmarks, hand landmarks), keyed by BODY_IDENTIFIERS and HAND_IDENTIFIERS respectively
    """

    flat_layout = "nose_X" in sign

    def collect(identifiers):
        if flat_layout:
            return {name: list(zip(sign[name + "_X"], sign[name + "_Y"])) for name in identifiers}
        return {name: sign[name] for name in identifiers}

    body_landmarks = collect(BODY_IDENTIFIERS)
    hand_landmarks = collect(HAND_IDENTIFIERS)

    return body_landmarks, hand_landmarks
|
88 |
+
|
89 |
+
|
90 |
+
def __wrap_sign_into_row(body_identifiers: dict, hand_identifiers: dict) -> dict:
    """
    Supplementary method combining the body and hand data into one new dictionary. Neither input is modified; should
    a key occur in both, the value from the hand dictionary is kept.
    """

    merged = dict(body_identifiers)
    merged.update(hand_identifiers)

    return merged
|
96 |
+
|
97 |
+
|
98 |
+
def augment_rotate(sign: dict, angle_range: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE. All the body and hand joint coordinates in every frame are rotated by one shared random
    angle around the center of the frame, which is equal to [0.5; 0.5].

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param angle_range: Tuple containing the angle range (minimal and maximal angle in degrees) from which the
                        rotation angle is drawn uniformly

    :return: Dictionary with augmented (by rotation) sequential skeletal data of the signing person
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    # A single angle is drawn per call, so the whole sequence is rotated consistently
    angle = math.radians(random.uniform(*angle_range))
    frame_center = (0.5, 0.5)

    def rotate_all(landmarks):
        return {
            key: [__rotate(frame_center, frame, angle) for frame in frames]
            for key, frames in landmarks.items()
        }

    rotated_body = rotate_all(body_landmarks)
    rotated_hands = rotate_all(hand_landmarks)

    return __wrap_sign_into_row(rotated_body, rotated_hands)
|
119 |
+
|
120 |
+
|
121 |
+
def augment_shear(sign: dict, type: str, squeeze_ratio: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE. Applies a perspective (homography) transformation of the unit frame to the body
    landmarks. The hand landmarks are returned as extracted from the input, without being transformed.

        - Squeeze. The frame is squeezed from both horizontal sides: the left and the right edge are each moved
          inwards by their own random proportion of the frame's width.

        - Perspective transformation. Simulates recording the sign video with a slight tilt: one random proportion
          is drawn, a side (left or right) is chosen with equal chance, and the vertical edge on that side is moved
          inwards and shortened from both of its corners by that proportion.

    Coordinates of the transformed body landmarks that are equal to the corresponding coordinate of the transformed
    [0; 0] point are reset to 0.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param type: Type of shear augmentation to perform (either 'squeeze' or 'perspective')
    :param squeeze_ratio: Tuple containing the range (minimum, maximum) from which the random proportions are drawn
                          uniformly

    :return: Dictionary with augmented (by squeezing or perspective transformation) sequential skeletal data of the
             signing person; an empty dictionary (after logging an error) if the type is not supported
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    # Corners of the unit frame; the destination corners below are listed in the same order
    unit_corners = ((0, 1), (1, 1), (0, 0), (1, 0))

    if type == "squeeze":
        move_left = random.uniform(*squeeze_ratio)
        move_right = random.uniform(*squeeze_ratio)

        target_corners = ((0 + move_left, 1), (1 - move_right, 1), (0 + move_left, 0), (1 - move_right, 0))

    elif type == "perspective":
        move_ratio = random.uniform(*squeeze_ratio)

        if __random_pass(0.5):
            # Tilt on the left side of the frame
            target_corners = ((0 + move_ratio, 1 - move_ratio), (1, 1), (0 + move_ratio, 0 + move_ratio), (1, 0))
        else:
            # Tilt on the right side of the frame
            target_corners = ((0, 1), (1 - move_ratio, 1 - move_ratio), (0, 0), (1 - move_ratio, 0 + move_ratio))

    else:
        logging.error("Unsupported shear type provided.")
        return {}

    src = np.array(unit_corners, dtype=np.float32)
    dest = np.array(target_corners, dtype=np.float32)
    mtx = cv2.getPerspectiveTransform(src, dest)

    landmarks_array = np.array(__dictionary_to_numpy(body_landmarks), dtype=np.float32)
    augmented_landmarks = cv2.perspectiveTransform(landmarks_array, mtx)

    # Where the [0; 0] point ends up after the transformation; matching coordinates are put back to zero
    augmented_zero_landmark = cv2.perspectiveTransform(np.array([[[0, 0]]], dtype=np.float32), mtx)[0][0]
    restored_frames = [np.where(frame == augmented_zero_landmark, [0, 0], frame) for frame in augmented_landmarks]

    body_landmarks = __numpy_to_dictionary(np.stack(restored_frames))

    return __wrap_sign_into_row(body_landmarks, hand_landmarks)
|
184 |
+
|
185 |
+
|
186 |
+
def augment_arm_joint_rotate(sign: dict, probability: float, angle_range: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE. The joints of both arms are visited successively in the ARM_IDENTIFIERS_ORDER order. Each
    visited joint is, with the given probability, used as a pivot: all joints that follow it on the same arm are
    rotated around it (frame by frame) by one random angle. This simulates slight variances in each execution of a
    sign, which do not change its semantic meaning.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param probability: Probability of each joint to act as a pivot (float from the range [0, 1])
    :param angle_range: Tuple containing the angle range (minimal and maximal angle in degrees) from which the
                        rotation angle is drawn uniformly

    :return: Dictionary with augmented (by arm joint rotation) sequential skeletal data of the signing person
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    # Process the left and the right arm one after the other
    for side in ("left", "right"):
        arm_chain = [template.replace("$side$", side) for template in ARM_IDENTIFIERS_ORDER]

        # Walk along the arm; every joint is a candidate pivot
        for chain_position, pivot_name in enumerate(arm_chain):

            # Stop processing this arm as soon as one of its joints is not present
            if pivot_name not in body_landmarks:
                break

            if not __random_pass(probability):
                continue

            angle = math.radians(random.uniform(*angle_range))

            for target_name in arm_chain[chain_position + 1:]:

                # Joints that are not present are simply left out
                if target_name not in body_landmarks:
                    continue

                pivot_frames = body_landmarks[pivot_name]
                body_landmarks[target_name] = [
                    __rotate(pivot_frames[frame_index], frame, angle)
                    for frame_index, frame in enumerate(body_landmarks[target_name])
                ]

    return __wrap_sign_into_row(body_landmarks, hand_landmarks)
|
228 |
+
|
229 |
+
|
230 |
+
if __name__ == "__main__":
|
231 |
+
pass
|
spoter_mod/data_structurization/autsl.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os
|
3 |
+
import tqdm
|
4 |
+
|
5 |
+
import pandas as pd
|
6 |
+
from shutil import copyfile
|
7 |
+
|
8 |
+
|
9 |
+
# Root directory of the AUTSL dataset and the name of the split to restructure. The script reads
# "<MAIN_PATH>/<BATCH>_labels.csv" and copies every listed video into "<MAIN_PATH>/<BATCH>_preprocessed/<label>/".
MAIN_PATH = "/Users/matyasbohacek/Documents/Academics/Projects/AUTSL"
BATCH = "test"

df = pd.read_csv(os.path.join(MAIN_PATH, BATCH + "_labels.csv"), encoding="utf-8", sep=";")

# makedirs(exist_ok=True) replaces the separate "exists" check followed by mkdir
output_root = os.path.join(MAIN_PATH, BATCH + "_preprocessed")
os.makedirs(output_root, exist_ok=True)

# iterrows() is a generator without a length, so the total is passed explicitly to get a real progress bar
for index_row, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    label_directory = os.path.join(output_root, str(row["label"]))
    os.makedirs(label_directory, exist_ok=True)

    video_filename = str(row["video"]) + "_color.mp4"
    copyfile(os.path.join(MAIN_PATH, BATCH, video_filename), os.path.join(label_directory, video_filename))
|
22 |
+
|
spoter_mod/data_structurization/wlasl.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import tqdm
|
5 |
+
|
6 |
+
from shutil import copyfile
|
7 |
+
|
8 |
+
|
9 |
+
# Root directory of the WLASL start kit and the name of the split to restructure. The script reads
# "<MAIN_PATH>/specs.json" and copies every video of the chosen split from "<MAIN_PATH>/videos/" into
# "<MAIN_PATH>/<BATCH>_preprocessed/<index of the item in specs.json>/".
MAIN_PATH = "/Users/matyasbohacek/Documents/Academics/Projects/WLASL/start_kit"
BATCH = "train"

# makedirs(exist_ok=True) replaces the separate "exists" check followed by mkdir
output_root = os.path.join(MAIN_PATH, BATCH + "_preprocessed")
os.makedirs(output_root, exist_ok=True)

with open(os.path.join(MAIN_PATH, "specs.json"), encoding="utf-8") as f:
    data = json.load(f)

# tqdm wraps the list itself (not the enumerate object), so it knows the total and can show real progress
for item_index, item in enumerate(tqdm.tqdm(data)):

    for video in item["instances"]:

        # Only the videos belonging to the selected split are copied
        if video["split"] != BATCH:
            continue

        class_directory = os.path.join(output_root, str(item_index))
        os.makedirs(class_directory, exist_ok=True)

        video_filename = str(video["video_id"]) + ".mp4"
        original_path = os.path.join(MAIN_PATH, "videos", video_filename)
        new_path = os.path.join(class_directory, video_filename)

        copyfile(original_path, new_path)
|
32 |
+
|
spoter_mod/datasets/czech_slr_dataset.py
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
import torch
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
import torch.utils.data as torch_data
|
6 |
+
|
7 |
+
from random import randrange
|
8 |
+
from augmentations import *
|
9 |
+
from normalization.body_normalization import BODY_IDENTIFIERS
|
10 |
+
from normalization.hand_normalization import HAND_IDENTIFIERS
|
11 |
+
from normalization.body_normalization import normalize_single_dict as normalize_single_body_dict
|
12 |
+
from normalization.hand_normalization import normalize_single_dict as normalize_single_hand_dict
|
13 |
+
|
14 |
+
# Expand the imported base hand landmark names into per-hand names: all names with the "_0" suffix first, followed by
# all names with the "_1" suffix.
HAND_IDENTIFIERS = [name + suffix for suffix in ("_0", "_1") for name in HAND_IDENTIFIERS]

# Default parameters of the individual augmentations, looked up by key in CzechSLRDataset.__getitem__.
DEFAULT_AUGMENTATIONS_CONFIG = {
    # Whole-skeleton rotation: the angle range is (-value, value)
    "rotate-angle": 13,
    # Perspective transformation: the ratio range is (0, value)
    "perspective-transform-ratio": 0.1,
    # Squeeze: the ratio range is (0, value)
    "squeeze-ratio": 0.15,
    # Arm joint rotation: the angle range is (-value, value)
    "arm-joint-rotate-angle": 4,
    # Arm joint rotation: probability passed to the augmentation
    "arm-joint-rotate-probability": 0.3,
}
|
23 |
+
|
24 |
+
|
25 |
+
def load_dataset(file_location: str):
    """
    Loads a CSV file with skeletal data and converts it into per-row landmark arrays and labels.

    Every "<identifier>_X" / "<identifier>_Y" cell is expected to hold the string representation of a list with one
    coordinate per frame. If the file has no "neck_X" column, zero-filled neck columns are added.

    :param file_location: Path to the CSV file
    :return: Tuple (data, labels). data is a list holding one array of shape (frames, landmarks, 2) per row, with
             the landmarks ordered as BODY_IDENTIFIERS + HAND_IDENTIFIERS. labels is the list of values from the
             "labels" column, each increased by one (CzechSLRDataset.__getitem__ subtracts the one again)
    """

    def parse_cell(cell):
        # Cells read from the CSV are string-encoded lists. The neck columns synthesized below hold plain zeros,
        # which ast.literal_eval rejects with a ValueError, so non-string values are passed through unchanged (a
        # scalar is then broadcast over all frames on assignment).
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    # Load the dataset csv file
    df = pd.read_csv(file_location, encoding="utf-8")

    # TO BE DELETED
    df.columns = [item.replace("_Left_", "_0_").replace("_Right_", "_1_") for item in list(df.columns)]
    if "neck_X" not in df.columns:
        df["neck_X"] = [0 for _ in range(df.shape[0])]
        df["neck_Y"] = [0 for _ in range(df.shape[0])]

    # TEMP
    labels = [label + 1 for label in df["labels"].to_list()]
    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    data = []

    for row_index, row in df.iterrows():
        # The number of frames of the sequence is taken from the "leftEar_X" column
        frame_count = len(parse_cell(row["leftEar_X"]))
        current_row = np.empty(shape=(frame_count, len(identifiers), 2))

        for index, identifier in enumerate(identifiers):
            current_row[:, index, 0] = parse_cell(row[identifier + "_X"])
            current_row[:, index, 1] = parse_cell(row[identifier + "_Y"])

        data.append(current_row)

    return data, labels
|
50 |
+
|
51 |
+
|
52 |
+
def tensor_to_dictionary(landmarks_tensor: torch.Tensor) -> dict:
    """
    Splits a landmark tensor into a dictionary keyed by landmark identifier.

    :param landmarks_tensor: Tensor indexed as [frame, landmark, ...], with the landmark axis ordered as
                             BODY_IDENTIFIERS + HAND_IDENTIFIERS
    :return: Dictionary mapping each identifier to the NumPy slice of its landmark across all frames
    """

    frames = landmarks_tensor.numpy()

    return {
        identifier: frames[:, position]
        for position, identifier in enumerate(BODY_IDENTIFIERS + HAND_IDENTIFIERS)
    }
|
61 |
+
|
62 |
+
|
63 |
+
def dictionary_to_tensor(landmarks_dict: dict) -> torch.Tensor:
    """
    Packs a dictionary of landmark sequences into a single tensor.

    :param landmarks_dict: Dictionary mapping identifiers to per-frame (x, y) sequences
    :return: Tensor of shape (frames, landmarks, 2) with the landmark axis ordered as
             BODY_IDENTIFIERS + HAND_IDENTIFIERS; the frame count is taken from the "leftEar" entry
    """

    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    packed = np.empty(shape=(len(landmarks_dict["leftEar"]), len(identifiers), 2))

    for position, identifier in enumerate(identifiers):
        frames = landmarks_dict[identifier]
        for axis in (0, 1):
            packed[:, position, axis] = [frame[axis] for frame in frames]

    return torch.from_numpy(packed)
|
72 |
+
|
73 |
+
|
74 |
+
class CzechSLRDataset(torch_data.Dataset):
    """Torch Dataset serving sequences of body and hand landmarks loaded through load_dataset, with optional
    augmentation and normalization applied every time an item is accessed"""

    data: [np.ndarray]
    labels: [np.ndarray]

    def __init__(self, dataset_filename: str, num_labels=5, transform=None, augmentations=False,
                 augmentations_prob=0.5, normalize=True, augmentations_config: dict = DEFAULT_AUGMENTATIONS_CONFIG):
        """
        Initiates the dataset with the data loaded from the given file by load_dataset.

        :param dataset_filename: Path to the file with the skeletal data (passed to load_dataset)
        :param num_labels: Number of labels (only stored on the instance here)
        :param transform: Any data transformation to be applied to the resulting tensor (default: None)
        :param augmentations: Whether items may be augmented on access
        :param augmentations_prob: Probability that an accessed item is augmented (if augmentations are enabled)
        :param normalize: Whether the body and hand normalization is applied on access
        :param augmentations_config: Dictionary with the augmentation parameters (see DEFAULT_AUGMENTATIONS_CONFIG
                                     for the keys)
        """

        data, labels = load_dataset(dataset_filename)

        self.data = data
        self.labels = labels
        self.targets = list(labels)
        self.num_labels = num_labels
        self.transform = transform

        self.augmentations = augmentations
        self.augmentations_prob = augmentations_prob
        self.augmentations_config = augmentations_config
        self.normalize = normalize

    def __getitem__(self, idx):
        """
        Fetches the item at the desired index and potentially augments, normalizes and transforms it.

        :param idx: Index of the item
        :return: Tuple containing the landmark tensor (shifted by -0.5) and a tensor with the stored label
                 decreased by one
        """

        sample = torch.from_numpy(np.copy(self.data[idx]))
        label = torch.Tensor([self.labels[idx] - 1])

        sample = tensor_to_dictionary(sample)

        # Apply at most one randomly chosen augmentation
        if self.augmentations and random.random() < self.augmentations_prob:
            config = self.augmentations_config
            selected_aug = randrange(4)

            if selected_aug == 0:
                limit = config["rotate-angle"]
                sample = augment_rotate(sample, (-limit, limit))

            elif selected_aug == 1:
                sample = augment_shear(sample, "perspective", (0, config["perspective-transform-ratio"]))

            elif selected_aug == 2:
                sample = augment_shear(sample, "squeeze", (0, config["squeeze-ratio"]))

            elif selected_aug == 3:
                limit = config["arm-joint-rotate-angle"]
                sample = augment_arm_joint_rotate(sample, config["arm-joint-rotate-probability"], (-limit, limit))

        if self.normalize:
            sample = normalize_single_body_dict(sample)
            sample = normalize_single_hand_dict(sample)

        sample = dictionary_to_tensor(sample)

        # Move the landmark position interval to improve performance
        sample = sample - 0.5

        if self.transform:
            sample = self.transform(sample)

        return sample, label

    def __len__(self):
        """Returns the number of items, i.e. the number of stored labels."""

        return len(self.labels)
|
150 |
+
|
151 |
+
|
152 |
+
if __name__ == "__main__":
|
153 |
+
pass
|
spoter_mod/normalization/body_normalization.py
ADDED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import logging
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
# Names of the body landmarks. Consumers index landmark arrays by the position in this list, so the order must stay
# as it is: nose and neck first, then every paired landmark as its right variant followed by its left variant.
BODY_IDENTIFIERS = ["nose", "neck"] + [
    side + part
    for part in ("Eye", "Ear", "Shoulder", "Elbow", "Wrist")
    for side in ("right", "left")
]
|
19 |
+
|
20 |
+
|
21 |
+
def normalize_body_full(df: pd.DataFrame) -> (pd.DataFrame, list):
    """
    Normalizes the body position data using the Bohacek-normalization algorithm.

    For every identifier in BODY_IDENTIFIERS, each row is expected to hold an "<identifier>_X" and an
    "<identifier>_Y" column with a mutable sequence of per-frame coordinates, where 0 marks a landmark that was not
    captured. For every frame, a bounding box is derived from the head metric (shoulder distance, or neck-nose
    distance as a fallback) and all captured body landmarks are expressed relative to that box. Frames without the
    necessary landmarks reuse the box of the last usable frame.

    A row is reported as invalid, and returned unchanged, if a frame has neither the necessary landmarks nor a
    previous box to reuse, or if a bounding box has zero width or height. The input DataFrame is not modified.

    :param df: pd.DataFrame to be normalized
    :return: Tuple of the pd.DataFrame with normalized values for body pose (freshly indexed from 0) and the list of
             index labels of the rows that could not be normalized
    """

    import copy

    landmark_columns = {identifier + axis for identifier in BODY_IDENTIFIERS for axis in ("_X", "_Y")}

    # Rows are collected and the frame is built once at the end (DataFrame.append was removed in pandas 2.0)
    output_rows = []
    invalid_row_indexes = []

    # Iterate over all of the records in the dataset
    for index, original_row in df.iterrows():

        # Work on copies of the coordinate sequences, so that the original row stays intact and can be returned
        # unchanged if the sequence turns out to be invalid
        row = {column: (copy.copy(value) if column in landmark_columns else value)
               for column, value in original_row.items()}

        sequence_size = len(row["leftEar_Y"])
        valid_sequence = True

        last_starting_point, last_ending_point = None, None

        # Treat each element of the sequence (analyzed frame) individually
        for sequence_index in range(sequence_size):

            shoulders_missing = (row["leftShoulder_X"][sequence_index] == 0
                                 or row["rightShoulder_X"][sequence_index] == 0)
            head_missing = row["neck_X"][sequence_index] == 0 or row["nose_X"][sequence_index] == 0

            # Prevent from even starting the analysis if some necessary elements are not present
            if shoulders_missing and head_missing:
                if last_starting_point is None:
                    valid_sequence = False
                    continue

                starting_point, ending_point = last_starting_point, last_ending_point

            else:

                # NOTE:
                #
                # While in the paper, it is written that the head metric is calculated by halving the shoulder
                # distance, this is meant for the distance between the very ends of one's shoulder, as literature
                # studying body metrics and ratios generally states. The Vision Pose Estimation API, however, seems
                # to be predicting rather the center of one's shoulder. Based on our experiments and manual reviews
                # of the data, employing this as just the plain shoulder distance seems to be more corresponding to
                # the desired metric.
                #
                # Please, review this if using other third-party pose estimation libraries.

                if not shoulders_missing:
                    left_shoulder = (row["leftShoulder_X"][sequence_index], row["leftShoulder_Y"][sequence_index])
                    right_shoulder = (row["rightShoulder_X"][sequence_index], row["rightShoulder_Y"][sequence_index])
                    shoulder_distance = ((((left_shoulder[0] - right_shoulder[0]) ** 2) + (
                            (left_shoulder[1] - right_shoulder[1]) ** 2)) ** 0.5)
                    head_metric = shoulder_distance
                else:
                    neck = (row["neck_X"][sequence_index], row["neck_Y"][sequence_index])
                    nose = (row["nose_X"][sequence_index], row["nose_Y"][sequence_index])
                    neck_nose_distance = ((((neck[0] - nose[0]) ** 2) + ((neck[1] - nose[1]) ** 2)) ** 0.5)
                    head_metric = neck_nose_distance

                # Set the starting and ending point of the normalization bounding box
                starting_point = [row["neck_X"][sequence_index] - 3 * head_metric,
                                  row["leftEye_Y"][sequence_index] + (head_metric / 2)]
                ending_point = [row["neck_X"][sequence_index] + 3 * head_metric, starting_point[1] - 6 * head_metric]

                last_starting_point, last_ending_point = starting_point, ending_point

            # Ensure that all of the bounding-box-defining coordinates are not out of the picture
            if starting_point[0] < 0: starting_point[0] = 0
            if starting_point[1] < 0: starting_point[1] = 0
            if ending_point[0] < 0: ending_point[0] = 0
            if ending_point[1] < 0: ending_point[1] = 0

            box_width = ending_point[0] - starting_point[0]
            box_height = starting_point[1] - ending_point[1]

            # A degenerate bounding box cannot be used for normalization (it would divide by zero)
            if box_width == 0 or box_height == 0:
                valid_sequence = False
                continue

            # Normalize individual landmarks and save the results
            for identifier in BODY_IDENTIFIERS:
                key = identifier + "_"

                # Prevent from trying to normalize incorrectly captured points
                if row[key + "X"][sequence_index] == 0:
                    continue

                normalized_x = (row[key + "X"][sequence_index] - starting_point[0]) / box_width
                normalized_y = (row[key + "Y"][sequence_index] - ending_point[1]) / box_height

                row[key + "X"][sequence_index] = normalized_x
                row[key + "Y"][sequence_index] = normalized_y

        if valid_sequence:
            output_rows.append(row)
        else:
            logging.warning(" BODY LANDMARKS: One video instance could not be normalized.")
            output_rows.append(original_row.to_dict())
            invalid_row_indexes.append(index)

    normalized_df = pd.DataFrame(output_rows, columns=df.columns)

    print("The normalization of body is finished.")
    print("\t-> Original size:", df.shape[0])
    print("\t-> Normalized size:", normalized_df.shape[0])
    print("\t-> Problematic videos:", len(invalid_row_indexes))

    return normalized_df, invalid_row_indexes
|
126 |
+
|
127 |
+
|
128 |
+
def normalize_single_dict(row: dict):
    """
    Normalizes the skeletal data for a given sequence of frames with signer's body pose data. The normalization follows
    the definition from our paper.

    Every frame is rescaled relative to a bounding box derived from the "head metric" (the shoulder distance or, if a
    shoulder is missing, the neck-nose distance). Frames lacking the joints needed to build the box reuse the box of
    the most recent frame that had them. Landmarks are overwritten in place; should the sequence turn out not to be
    normalizable, every overwritten landmark is restored so that the raw data is handed back.

    :param row: Dictionary containing key-value pairs with joint identifiers and corresponding lists (sequences) of
                that particular joints coordinates
    :return: Dictionary with normalized skeletal data (following the same schema as input data). It is the very same
             object as `row` and holds the raw values if the sequence could not be normalized
    """

    sequence_size = len(row["leftEar"])
    valid_sequence = True

    # Raw values of every landmark overwritten so far. A plain alias of `row` cannot serve as the "original" data,
    # because the normalization below mutates `row` in place; this log allows undoing it.
    overwritten = []

    last_starting_point, last_ending_point = None, None

    # Treat each element of the sequence (analyzed frame) individually
    for sequence_index in range(sequence_size):
        shoulders_present = (row["leftShoulder"][sequence_index][0] != 0
                             and row["rightShoulder"][sequence_index][0] != 0)
        neck_and_nose_present = (row["neck"][sequence_index][0] != 0
                                 and row["nose"][sequence_index][0] != 0)

        # Prevent from even starting the analysis if some necessary elements are not present
        if not shoulders_present and not neck_and_nose_present:
            if last_starting_point is None:
                # No earlier frame to borrow a bounding box from
                valid_sequence = False
                continue

            starting_point, ending_point = last_starting_point, last_ending_point

        else:

            # NOTE:
            #
            # While in the paper, it is written that the head metric is calculated by halving the shoulder distance,
            # this is meant for the distance between the very ends of one's shoulder, as literature studying body
            # metrics and ratios generally states. The Vision Pose Estimation API, however, seems to be predicting
            # rather the center of one's shoulder. Based on our experiments and manual reviews of the data, employing
            # this as just the plain shoulder distance seems to be more corresponding to the desired metric.
            #
            # Please, review this if using other third-party pose estimation libraries.

            if shoulders_present:
                left_shoulder = (row["leftShoulder"][sequence_index][0], row["leftShoulder"][sequence_index][1])
                right_shoulder = (row["rightShoulder"][sequence_index][0], row["rightShoulder"][sequence_index][1])
                head_metric = ((((left_shoulder[0] - right_shoulder[0]) ** 2) + (
                        (left_shoulder[1] - right_shoulder[1]) ** 2)) ** 0.5)
            else:
                neck = (row["neck"][sequence_index][0], row["neck"][sequence_index][1])
                nose = (row["nose"][sequence_index][0], row["nose"][sequence_index][1])
                head_metric = ((((neck[0] - nose[0]) ** 2) + ((neck[1] - nose[1]) ** 2)) ** 0.5)

            # Set the starting and ending point of the normalization bounding box
            starting_point = [row["neck"][sequence_index][0] - 1 * head_metric,
                              row["leftEye"][sequence_index][1] - head_metric / 2]
            ending_point = [row["neck"][sequence_index][0] + 1 * head_metric,
                            starting_point[1] + 3 * head_metric]

            last_starting_point, last_ending_point = starting_point, ending_point

        # Ensure that all of the bounding-box-defining coordinates are not out of the picture
        if starting_point[0] < 0: starting_point[0] = 0
        if starting_point[1] > 1: starting_point[1] = 1
        if ending_point[0] < 0: ending_point[0] = 0
        if ending_point[1] > 1: ending_point[1] = 1

        # Normalize individual landmarks and save the results
        for identifier in BODY_IDENTIFIERS:
            point = row[identifier][sequence_index]

            # Prevent from trying to normalize incorrectly captured points
            if point[0] == 0:
                continue

            # A bounding box without any extent would lead to a division by zero
            if (ending_point[0] - starting_point[0]) == 0 or (starting_point[1] - ending_point[1]) == 0:
                logging.info("Problematic normalization")
                valid_sequence = False
                break

            normalized_x = (point[0] - starting_point[0]) / (ending_point[0] - starting_point[0])
            normalized_y = (point[1] - starting_point[1]) / (ending_point[1] - starting_point[1])

            # Copy the raw values first, so the undo log stays independent of the slot being overwritten
            raw_point = list(point)
            overwritten.append((identifier, sequence_index, raw_point))

            normalized_point = list(raw_point)
            normalized_point[0] = normalized_x
            normalized_point[1] = normalized_y
            row[identifier][sequence_index] = normalized_point

    if not valid_sequence:
        # Undo the partial normalization, so that the raw (not a half-normalized) sequence is returned
        for identifier, sequence_index, raw_point in overwritten:
            row[identifier][sequence_index] = raw_point

    return row
|
223 |
+
|
224 |
+
|
225 |
+
if __name__ == "__main__":
|
226 |
+
pass
|
spoter_mod/normalization/hand_normalization.py
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import logging
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
# Names of the hand landmarks: the wrist first, then every finger from its tip towards the palm.
# The order is significant, as the normalization functions iterate over this list.
HAND_IDENTIFIERS = [
    "wrist",
    "indexTip", "indexDIP", "indexPIP", "indexMCP",
    "middleTip", "middleDIP", "middlePIP", "middleMCP",
    "ringTip", "ringDIP", "ringPIP", "ringMCP",
    "littleTip", "littleDIP", "littlePIP", "littleMCP",
    "thumbTip", "thumbIP", "thumbMP", "thumbCMC",
]
|
28 |
+
|
29 |
+
|
30 |
+
def normalize_hands_full(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalizes the hands position data using the Bohacek-normalization algorithm.

    For every record, hand and frame, the captured landmarks are enclosed in a square bounding box (the tight box
    extended by a 10 % margin of its longer side) and expressed relative to it. Landmarks equal to 0 are treated as
    not captured and are left untouched, as are frames whose bounding box has no extent.

    Side effects: the columns of the passed DataFrame are renamed in place ("_left_" -> "_0_", "_right_" -> "_1_")
    and the coordinate lists stored in its cells are overwritten with the normalized values.

    :param df: pd.DataFrame to be normalized
    :return: pd.DataFrame with normalized values for hand pose (fresh 0..n-1 index, renamed columns)
    """

    df.columns = [item.replace("_left_", "_0_").replace("_right_", "_1_") for item in list(df.columns)]

    # Determine how many hands are present in the dataset
    range_hand_size = 2 if "wrist_1_X" in df.columns else 1

    # Construct the relevant identifiers
    hand_landmarks = {
        axis: {
            hand_index: [identifier + "_" + str(hand_index) + "_" + axis for identifier in HAND_IDENTIFIERS]
            for hand_index in range(range_hand_size)
        }
        for axis in ("X", "Y")
    }

    # The rows are collected and turned into a DataFrame once at the end: appending to a DataFrame row by row
    # copies the whole frame every time, and `DataFrame.append` no longer exists in pandas 2.x
    normalized_rows = []

    # Iterate over all of the records in the dataset
    for index, row in df.iterrows():
        # Treat each hand individually
        for hand_index in range(range_hand_size):

            sequence_size = len(row["wrist_" + str(hand_index) + "_X"])

            # Treat each element of the sequence (analyzed frame) individually
            for sequence_index in range(sequence_size):

                # Retrieve all of the X and Y values of the current frame
                landmarks_x_values = [row[key][sequence_index] for key in hand_landmarks["X"][hand_index]
                                      if row[key][sequence_index] != 0]
                landmarks_y_values = [row[key][sequence_index] for key in hand_landmarks["Y"][hand_index]
                                      if row[key][sequence_index] != 0]

                # Prevent from even starting the analysis if some necessary elements are not present
                if not landmarks_x_values or not landmarks_y_values:
                    logging.warning(
                        " HAND LANDMARKS: One frame could not be normalized as there is no data present. Record: " + str(index) +
                        ", Frame: " + str(sequence_index))
                    continue

                # Calculate the deltas
                width, height = max(landmarks_x_values) - min(landmarks_x_values), max(landmarks_y_values) - min(
                    landmarks_y_values)
                if width > height:
                    delta_x = 0.1 * width
                    delta_y = delta_x + ((width - height) / 2)
                else:
                    delta_y = 0.1 * height
                    delta_x = delta_y + ((height - width) / 2)

                # Set the starting and ending point of the normalization bounding box
                starting_point = (min(landmarks_x_values) - delta_x, min(landmarks_y_values) - delta_y)
                ending_point = (max(landmarks_x_values) + delta_x, max(landmarks_y_values) + delta_y)

                # A bounding box without any extent (e.g. a single captured landmark) would cause a division by
                # zero, so such a frame is left as it is
                if (ending_point[0] - starting_point[0]) == 0 or (starting_point[1] - ending_point[1]) == 0:
                    continue

                # Normalize individual landmarks and save the results
                for identifier in HAND_IDENTIFIERS:
                    key = identifier + "_" + str(hand_index) + "_"

                    # Prevent from trying to normalize incorrectly captured points
                    if row[key + "X"][sequence_index] == 0:
                        continue

                    normalized_x = (row[key + "X"][sequence_index] - starting_point[0]) / (ending_point[0] -
                                                                                            starting_point[0])
                    # The Y axis is flipped: the ending point maps to 0 and the starting point to 1
                    normalized_y = (row[key + "Y"][sequence_index] - ending_point[1]) / (starting_point[1] -
                                                                                          ending_point[1])

                    row[key + "X"][sequence_index] = normalized_x
                    row[key + "Y"][sequence_index] = normalized_y

        normalized_rows.append(row)

    if not normalized_rows:
        return pd.DataFrame(columns=df.columns)

    return pd.DataFrame(normalized_rows, columns=df.columns).reset_index(drop=True)
|
110 |
+
|
111 |
+
|
112 |
+
def normalize_single_dict(row: dict):
    """
    Normalizes the skeletal data for a given sequence of frames with signer's hand pose data. The normalization follows
    the definition from our paper.

    Each hand is processed frame by frame: the captured landmarks are enclosed in a square bounding box (the tight
    box extended by a 10 % margin of its longer side) and expressed relative to it. The dictionary is modified in
    place.

    :param row: Dictionary containing key-value pairs with joint identifiers and corresponding lists (sequences) of
                that particular joints coordinates
    :return: Dictionary with normalized skeletal data (following the same schema as input data)
    """

    # A second hand is considered only if its wrist is part of the data
    hand_count = 2 if "wrist_1" in row.keys() else 1

    for hand_index in range(hand_count):
        suffix = "_" + str(hand_index)
        joint_keys = [identifier + suffix for identifier in HAND_IDENTIFIERS]

        for sequence_index in range(len(row["wrist" + suffix])):

            # Coordinates equal to 0 mark landmarks that were not captured in this frame
            xs = [row[key][sequence_index][0] for key in joint_keys if row[key][sequence_index][0] != 0]
            ys = [row[key][sequence_index][1] for key in joint_keys if row[key][sequence_index][1] != 0]

            # Nothing to normalize in a frame without any captured landmark
            if not (xs and ys):
                continue

            min_x, max_x = min(xs), max(xs)
            min_y, max_y = min(ys), max(ys)
            width, height = max_x - min_x, max_y - min_y

            # Margin of 10 % of the longer side; the shorter side is additionally padded to make the box a square
            if width > height:
                delta_x = 0.1 * width
                delta_y = delta_x + ((width - height) / 2)
            else:
                delta_y = 0.1 * height
                delta_x = delta_y + ((height - width) / 2)

            starting_point = [min_x - delta_x, min_y - delta_y]
            ending_point = [max_x + delta_x, max_y + delta_y]

            # Ensure that all of the bounding-box-defining coordinates are not out of the picture
            if starting_point[0] < 0:
                starting_point[0] = 0
            if starting_point[1] > 1:
                starting_point[1] = 1
            if ending_point[0] < 0:
                ending_point[0] = 0
            if ending_point[1] > 1:
                ending_point[1] = 1

            box_width = ending_point[0] - starting_point[0]
            box_height = ending_point[1] - starting_point[1]

            # A box without any extent cannot be used as a reference; the frame is kept as it is
            if box_width == 0 or box_height == 0:
                continue

            for key in joint_keys:
                point = row[key][sequence_index]

                # Prevent from trying to normalize incorrectly captured points
                if point[0] == 0:
                    continue

                normalized_point = list(point)
                normalized_point[0] = (point[0] - starting_point[0]) / box_width
                normalized_point[1] = (point[1] - starting_point[1]) / box_height
                row[key][sequence_index] = normalized_point

    return row
|
189 |
+
|
190 |
+
|
191 |
+
if __name__ == "__main__":
|
192 |
+
pass
|
spoter_mod/normalization/main.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import ast
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from normalization.hand_normalization import normalize_hands_full
|
6 |
+
from normalization.body_normalization import normalize_body_full
|
7 |
+
|
8 |
+
|
9 |
+
# Load the dataset
df = pd.read_csv("/Users/matyasbohacek/Documents/WLASL_test_15fps.csv", encoding="utf-8")

# Take the video dimensions out of the table: they are not landmark data
video_size_heights = df.pop("video_size_height").to_list()
video_size_widths = df.pop("video_size_width").to_list()

# Set the remaining metadata aside while the landmark columns are processed; it is put back below
labels = df.pop("labels").to_list()
video_fps = df.pop("video_fps").to_list()


def convert(x):
    """Parse the textual form of a CSV cell back into the Python literal (list) it represents."""
    return ast.literal_eval(str(x))


# Convert the strings into lists
for column in df.columns:
    df[column] = df[column].apply(convert)

# Perform the normalizations
df = normalize_hands_full(df)
df, invalid_row_indexes = normalize_body_full(df)

# The metadata lists are attached unfiltered: no entries are removed for `invalid_row_indexes`

# Return the metadata back to the dataset
df["labels"] = labels
df["video_fps"] = video_fps

df.to_csv("/Users/matyasbohacek/Desktop/WLASL_test_15fps_normalized.csv", encoding="utf-8", index=False)
|
spoter_mod/pose_model_identifier.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
# Body joint name -> index into `pose_landmarks.landmark`. The order of the keys is significant, because the
# keys are iterated when the data is collected. "neck" carries the placeholder -1, as it is not read by index but
# computed from the shoulders and the nose.
BODY_IDENTIFIERS = {
    "nose": 0, "neck": -1,
    "rightEye": 5, "leftEye": 2,
    "rightEar": 8, "leftEar": 7,
    "rightShoulder": 12, "leftShoulder": 11,
    "rightElbow": 14, "leftElbow": 13,
    "rightWrist": 16, "leftWrist": 15,
}

# Hand joint name -> index into the `landmark` list of a detected hand (wrist, then finger by finger from the tip)
HAND_IDENTIFIERS = {
    "wrist": 0,
    "indexTip": 8, "indexDIP": 7, "indexPIP": 6, "indexMCP": 5,
    "middleTip": 12, "middleDIP": 11, "middlePIP": 10, "middleMCP": 9,
    "ringTip": 16, "ringDIP": 15, "ringPIP": 14, "ringMCP": 13,
    "littleTip": 20, "littleDIP": 19, "littlePIP": 18, "littleMCP": 17,
    "thumbTip": 4, "thumbIP": 3, "thumbMP": 2, "thumbCMC": 1,
}
|
40 |
+
|
41 |
+
|
42 |
+
class mp_holistic_data:
    """
    Collects the body and hand landmark coordinates of one video, frame by frame.

    The values are stored in `data_hub`, a dictionary mapping a column name (e.g. "nose_X" or "wrist_Right_Y") to the
    list of that coordinate over the processed frames.
    """

    def __init__(self, column_names):
        # The first and the last column name are left out, as they hold no landmark data
        # (presumably the "video_id" and "labels" columns - confirm against the caller)
        self.data_hub = {name: [] for name in column_names[1:-1]}

    def hand_append_zero(self, handedness):
        """Append 0 to every coordinate list of the given hand ("Right"/"Left"), marking it as not detected."""
        marker = "_" + handedness + "_"
        for column, values in self.data_hub.items():
            if marker in column:
                values.append(0)

    def hand_append_value(self, handedness, hand_landmarks):
        """Append the x and y coordinate of every tracked joint of the detected hand."""
        for name, lm_idx in HAND_IDENTIFIERS.items():
            lm = hand_landmarks.landmark[lm_idx]
            column_prefix = name + '_' + handedness
            self.data_hub[column_prefix + '_X'].append(lm.x)
            self.data_hub[column_prefix + '_Y'].append(lm.y)

    def get_series(self):
        """Return the collected data as a pd.Series indexed by column name (each value is a list over frames)."""
        return pd.Series(self.data_hub)

    @staticmethod
    def _neck_position(pose_landmarks):
        """Estimate the neck as the point 30 % of the way from the shoulder midpoint towards the nose."""
        left_shoulder = pose_landmarks.landmark[11]
        right_shoulder = pose_landmarks.landmark[12]
        nose = pose_landmarks.landmark[0]

        all_visible = (left_shoulder.visibility > 0.5 and right_shoulder.visibility > 0.5
                       and nose.visibility > 0.5)
        if not all_visible:
            return [0, 0]

        # This indicates the neck better than the plain shoulder midpoint. But it does not affect the result.
        center_x = (left_shoulder.x + right_shoulder.x) / 2
        center_y = (left_shoulder.y + right_shoulder.y) / 2
        return [center_x + 0.3 * (nose.x - center_x), center_y + 0.3 * (nose.y - center_y)]

    def extract_data(self, holistic_results):
        """
        Append the landmarks of a single processed frame.

        A frame without any detected pose is skipped entirely (nothing is appended, not even for the hands). Body
        joints with a visibility below 0.5 and hands that were not detected are stored as zeros.
        """
        pose_landmarks = holistic_results.pose_landmarks

        # for the frame that can not extract skeleton from
        if not pose_landmarks:
            return

        for name, lm_idx in BODY_IDENTIFIERS.items():
            if name == "neck":
                x_value, y_value = self._neck_position(pose_landmarks)
            else:
                lm = pose_landmarks.landmark[lm_idx]
                # Multiplying by 0.0 / 1.0 zeroes the coordinates of joints that are not visible enough
                visible = float(lm.visibility >= 0.5)
                x_value, y_value = lm.x * visible, lm.y * visible

            self.data_hub[name + '_X'].append(x_value)
            self.data_hub[name + '_Y'].append(y_value)

        detected_hands = (
            ('Right', holistic_results.right_hand_landmarks),
            ('Left', holistic_results.left_hand_landmarks),
        )
        for handedness, lm in detected_hands:
            if lm:
                self.hand_append_value(handedness, lm)
            else:
                self.hand_append_zero(handedness)
|
spoter_mod/requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas==1.1.5
|
2 |
+
tqdm==4.54.1
|
3 |
+
matplotlib
|
4 |
+
torch==1.8.1
|
5 |
+
torchvision
|
6 |
+
scikit-learn
|
7 |
+
opencv-python
|
spoter_mod/skeleton_extractor.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import warnings
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
from os import path
|
5 |
+
import cv2
|
6 |
+
import mediapipe as mp
|
7 |
+
import json
|
8 |
+
from spoter_mod.pose_model_identifier import BODY_IDENTIFIERS, HAND_IDENTIFIERS, mp_holistic_data
|
9 |
+
|
10 |
+
# Shortcuts to the MediaPipe solution modules
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
mp_drawing_styles = mp.solutions.drawing_styles

# One holistic model instance, created at import time and shared by all calls of obtain_pose_data
holistic = mp_holistic.Holistic()

# Column layout of the extracted data: the video identifier, X/Y of each body joint, X/Y of each joint of the right
# and then of the left hand, and finally the label
column_names = (
    ['video_id']
    + [id_name + xy for id_name in BODY_IDENTIFIERS.keys() for xy in ["_X", "_Y"]]
    + [id_name + lr + xy
       for lr in ["_Right", "_Left"]
       for id_name in HAND_IDENTIFIERS.keys()
       for xy in ["_X", "_Y"]]
    + ['labels']
)
|
28 |
+
|
29 |
+
|
30 |
+
def create_df(flnm, column_names):
    """
    Creates an empty DataFrame with the given columns.

    :param flnm: Not used by this function
    :param column_names: Names of the columns of the new DataFrame
    :return: Empty pd.DataFrame with the given columns
    """
    return pd.DataFrame(columns=column_names)
|
33 |
+
|
34 |
+
|
35 |
+
def save_data(df, data, flnm):
    """
    Adds the collected landmark data as a new row to the given DataFrame and pickles the result.

    The passed DataFrame itself is not modified; the extended copy is only written to the file.

    :param df: pd.DataFrame the new row is added to
    :param data: Object whose `get_series()` returns the row as a pd.Series indexed by column name
    :param flnm: Path of the pickle file to be written
    """
    # `DataFrame.append` was removed in pandas 2.0; concatenating a one-row frame is the equivalent operation
    new_row = data.get_series().to_frame().T
    df = pd.concat([df, new_row], ignore_index=True)
    df.to_pickle(flnm)
|
38 |
+
|
39 |
+
|
40 |
+
def obtain_pose_data(path):
    """
    Runs the holistic model over every frame of a video and collects the detected landmarks.

    :param path: Path of the video file to be processed
    :return: mp_holistic_data instance holding the landmarks of the frames handed to its `extract_data`
    """
    cap = cv2.VideoCapture(path)
    data = mp_holistic_data(column_names)

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # No further frame could be read
                break

            # Recolor image to RGB
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Make detection
            holistic_results = holistic.process(image)

            # Extract feature and save to mp_pose_data class
            data.extract_data(holistic_results)
    finally:
        # Release the capture even if the processing of a frame fails
        cap.release()

    return data
|
57 |
+
|
58 |
+
|
59 |
+
if __name__ == '__main__':
|
60 |
+
pass
|
spoter_mod/sweep-agent.sh
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
#PBS -N spoter-zhuo-sweep
#PBS -q gpu
#PBS -l walltime=24:00:00

#PBS -l select=1:ncpus=1:ngpus=1:cluster=adan:mem=10gb
#PBS -j oe
#PBS -m ae

echo "Experiment starting..."

# Stop here if the project directory is not available, instead of starting the agent from a wrong directory
cd /storage/plzen4-ntis/home/mbohacek/spoter-zhuo || exit 1

module add conda-modules
conda activate cslr-transformers

wandb agent matyasbohacek/Zhuo-collab-SPOTER-Sweep/bh6fc056
|
18 |
+
|
spoter_mod/sweep.yaml
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Bayesian hyperparameter sweep over the augmentation settings of train.py
program: train.py
method: bayes
project: Zhuo-collab-SPOTER-Sweep
metric:
  name: best-accuracy
  goal: maximize
parameters:
  augmentations_probability:
    min: 0.20
    max: 0.75
  rotate_angle:
    min: 5
    max: 20
  perspective_transform_ratio:
    min: 0.05
    max: 0.2
  squeeze_ratio:
    min: 0.05
    max: 0.4
  arm_joint_rotate_angle:
    min: 1
    max: 10
  arm_joint_rotate_probability:
    min: 0.2
    max: 0.5
command:
  - python3
  - "-m"
  - train
  - "--epochs"
  - 130
  - "--num_classes"
  - 100
  - "--lr"
  - 0.001
  - "--experiment_name"
  - "zhuo-repro"
  - "--training_set_path"
  - "/storage/plzen4-ntis/home/mbohacek/spoter-zhuo/WLASL100_zhuo_train.csv"
  - "--testing_set_path"
  - "/storage/plzen4-ntis/home/mbohacek/spoter-zhuo/WLASL100_zhuo_test.csv"
  - "--validation_set_path"
  - "/storage/plzen4-ntis/home/mbohacek/spoter-zhuo/WLASL100_zhuo_val.csv"
  - "--validation_set"
  - "from-file"
  # SECURITY: the API key must not be stored in the repository. It is read from the WANDB_API_KEY environment
  # variable of the machine running the agent. The key that used to be committed here is public now and has to be
  # revoked in the W&B account settings.
  - "--wandb_key"
  - ${envvar:WANDB_API_KEY}
  - "--wandb_entity"
  - "matyasbohacek"
  - ${args}
|
spoter_mod/train.py
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import os
|
3 |
+
import argparse
|
4 |
+
import random
|
5 |
+
import logging
|
6 |
+
import torch
|
7 |
+
import wandb
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import torch.nn as nn
|
11 |
+
import torch.optim as optim
|
12 |
+
import matplotlib.pyplot as plt
|
13 |
+
import matplotlib.ticker as ticker
|
14 |
+
from torchvision import transforms
|
15 |
+
from torch.utils.data import DataLoader
|
16 |
+
from pathlib import Path
|
17 |
+
|
18 |
+
from utils import __balance_val_split, __split_of_train_sequence
|
19 |
+
from datasets.czech_slr_dataset import CzechSLRDataset
|
20 |
+
from spoter.spoter_model import SPOTER
|
21 |
+
from spoter.utils import train_epoch, evaluate
|
22 |
+
from spoter.gaussian_noise import GaussianNoise
|
23 |
+
|
24 |
+
|
25 |
+
def _str2bool(value):
    """Parse a textual command-line boolean such as "True", "false", "1" or "no"."""
    if isinstance(value, bool):
        return value

    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False

    raise argparse.ArgumentTypeError("Expected a boolean value, got %r" % value)


def get_default_args():
    """
    Builds the argument parser holding all the options of the training script together with their defaults.

    The parser is created with `add_help=False`.

    :return: argparse.ArgumentParser with the training options registered
    """
    parser = argparse.ArgumentParser(add_help=False)

    parser.add_argument("--experiment_name", type=str, default="lsa_64_spoter",
                        help="Name of the experiment after which the logs and plots will be named")
    parser.add_argument("--num_classes", type=int, default=64, help="Number of classes to be recognized by the model")
    parser.add_argument("--hidden_dim", type=int, default=108,
                        help="Hidden dimension of the underlying Transformer model")
    parser.add_argument("--seed", type=int, default=379,
                        help="Seed with which to initialize all the random components of the training")

    # Data
    parser.add_argument("--training_set_path", type=str, default="", help="Path to the training dataset CSV file")
    parser.add_argument("--testing_set_path", type=str, default="", help="Path to the testing dataset CSV file")
    parser.add_argument("--experimental_train_split", type=float, default=None,
                        help="Determines how big a portion of the training set should be employed (intended for the "
                             "gradually enlarging training set experiment from the paper)")

    parser.add_argument("--validation_set", type=str, choices=["from-file", "split-from-train", "none"],
                        default="from-file", help="Type of validation set construction. See README for further rederence")
    parser.add_argument("--validation_set_size", type=float,
                        help="Proportion of the training set to be split as validation set, if 'validation_set' is set"
                             " to 'split-from-train'")
    parser.add_argument("--validation_set_path", type=str, default="", help="Path to the validation dataset CSV file")

    # Training hyperparameters
    parser.add_argument("--epochs", type=int, default=100, help="Number of epochs to train the model for")
    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate for the model training")
    parser.add_argument("--log_freq", type=int, default=1,
                        help="Log frequency (frequency of printing all the training info)")

    # Checkpointing
    # NOTE: `type=bool` would turn any non-empty string (including "False") into True, hence the explicit parser
    parser.add_argument("--save_checkpoints", type=_str2bool, default=True,
                        help="Determines whether to save weights checkpoints")

    # Scheduler
    # The factor is a fraction, so it has to be parsed as a float (an int parser rejects e.g. "0.5")
    parser.add_argument("--scheduler_factor", type=float, default=0.1, help="Factor for the ReduceLROnPlateau scheduler")
    parser.add_argument("--scheduler_patience", type=int, default=5,
                        help="Patience for the ReduceLROnPlateau scheduler")

    # Gaussian noise normalization
    # Both parameters are real-valued, so they are parsed as floats as well
    parser.add_argument("--gaussian_mean", type=float, default=0, help="Mean parameter for Gaussian noise layer")
    parser.add_argument("--gaussian_std", type=float, default=0.001,
                        help="Standard deviation parameter for Gaussian noise layer")

    parser.add_argument("--augmentations_probability", type=float, default=0.5, help="")  # 0.462
    parser.add_argument("--rotate_angle", type=int, default=17, help="")  # 17
    parser.add_argument("--perspective_transform_ratio", type=float, default=0.2, help="")  # 0.1682
    parser.add_argument("--squeeze_ratio", type=float, default=0.4, help="")  # 0.3971
    parser.add_argument("--arm_joint_rotate_angle", type=int, default=4, help="")  # 3
    parser.add_argument("--arm_joint_rotate_probability", type=float, default=0.4, help="")  # 0.3596

    # Visualization
    parser.add_argument("--plot_stats", type=_str2bool, default=True,
                        help="Determines whether continuous statistics should be plotted at the end")
    parser.add_argument("--plot_lr", type=_str2bool, default=True,
                        help="Determines whether the LR should be plotted at the end")

    # WANDB
    parser.add_argument("--wandb_key", type=str, default="", help="")
    parser.add_argument("--wandb_entity", type=str, default="", help="")

    return parser
|
88 |
+
|
89 |
+
|
90 |
+
def train(args):
    """Run one full SPOTER experiment: training, checkpoint testing and plotting.

    The function seeds all random number generators, builds the model, loss,
    optimizer and data loaders, trains for ``args.epochs`` epochs while saving
    the best checkpoints of every 10-epoch window, evaluates the saved
    checkpoints on the testing set (if one is given) and finally plots the
    collected statistics.

    Side effects: writes a ``.log`` file into the working directory, model
    checkpoints into ``out-checkpoints/<experiment_name>/`` and figures into
    ``out-img/``. ``args.experiment_name`` is extended with the W&B run id when
    W&B logging is enabled.

    :param args: Parsed command-line namespace. Attributes read here include
        ``experiment_name``, ``seed``, ``num_classes``, ``hidden_dim``, ``lr``,
        ``epochs``, ``log_freq``, the ``scheduler_*``, ``gaussian_*`` and
        augmentation options, the dataset paths, ``validation_set``,
        ``experimental_train_split``, ``save_checkpoints``, ``plot_stats``,
        ``plot_lr``, ``wandb_key`` and ``wandb_entity``.
    """
    # Local stdlib import so this function does not depend on a module-level change
    import copy

    # W&B is only initialised when a key was supplied; every later W&B call is
    # guarded by this flag, because ``wandb.run`` is None without ``wandb.init``
    use_wandb = bool(args.wandb_key)

    if use_wandb:
        wandb.login(key=args.wandb_key)
        wandb.init(project=args.experiment_name, entity=args.wandb_entity)
        wandb.config.update(args)

    # MARK: TRAINING PREPARATION AND MODULES
    if use_wandb:
        args.experiment_name = args.experiment_name + "_lr" + wandb.run.id

    # Initialize all the random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    os.environ["PYTHONHASHSEED"] = str(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True
    g = torch.Generator()
    g.manual_seed(args.seed)

    # Save the log messages into a LOG file (console output is produced by the explicit prints below)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            logging.FileHandler(args.experiment_name + "_" + str(args.experimental_train_split).replace(".", "") + ".log")
        ]
    )

    # Set device to CUDA only if applicable
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda")

    # Construct the model
    slrt_model = SPOTER(num_classes=args.num_classes, hidden_dim=args.hidden_dim)
    slrt_model.train(True)
    slrt_model.to(device)

    # Construct the other modules
    cel_criterion = nn.CrossEntropyLoss()
    sgd_optimizer = optim.SGD(slrt_model.parameters(), lr=args.lr)
    # NOTE(review): the scheduler is constructed but never stepped in this function and is not
    # passed to train_epoch, so the LR presumably stays constant - confirm whether this is intended
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(sgd_optimizer, factor=args.scheduler_factor, patience=args.scheduler_patience)

    # Ensure that the path for checkpointing and for images both exist
    checkpoint_dir = "out-checkpoints/" + args.experiment_name + "/"
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    Path("out-img/").mkdir(parents=True, exist_ok=True)

    # MARK: DATA

    # Training set
    transform = transforms.Compose([GaussianNoise(args.gaussian_mean, args.gaussian_std)])
    augmentations_config = {
        "rotate-angle": args.rotate_angle,
        "perspective-transform-ratio": args.perspective_transform_ratio,
        "squeeze-ratio": args.squeeze_ratio,
        "arm-joint-rotate-angle": args.arm_joint_rotate_angle,
        "arm-joint-rotate-probability": args.arm_joint_rotate_probability
    }

    train_set = CzechSLRDataset(args.training_set_path, transform=transform, augmentations=True,
                                augmentations_prob=args.augmentations_probability, augmentations_config=augmentations_config)

    # Validation set
    if args.validation_set == "from-file":
        val_set = CzechSLRDataset(args.validation_set_path)
        val_loader = DataLoader(val_set, shuffle=True, generator=g)

    elif args.validation_set == "split-from-train":
        train_set, val_set = __balance_val_split(train_set, 0.2)

        # Both subsets wrap the very same dataset object, and attributes set on the Subset wrapper
        # itself are ignored when items are fetched. Give the validation subset its own shallow copy
        # of the dataset so the noise transform and the augmentations can be switched off for it only.
        # (Assumes the dataset stores these options as `transform` / `augmentations` - TODO confirm.)
        val_set.dataset = copy.copy(val_set.dataset)
        val_set.dataset.transform = None
        val_set.dataset.augmentations = False
        val_loader = DataLoader(val_set, shuffle=True, generator=g)

    else:
        val_loader = None

    # Testing set
    if args.testing_set_path:
        eval_set = CzechSLRDataset(args.testing_set_path)
        eval_loader = DataLoader(eval_set, shuffle=True, generator=g)

    else:
        eval_loader = None

    # Final training set refinements
    if args.experimental_train_split:
        train_set = __split_of_train_sequence(train_set, args.experimental_train_split)

    train_loader = DataLoader(train_set, shuffle=True, generator=g)

    # MARK: TRAINING
    train_acc, val_acc = 0, 0
    losses, train_accs, val_accs = [], [], []
    lr_progress = []
    top_train_acc, top_val_acc = 0, 0
    checkpoint_index = 0

    if args.experimental_train_split:
        start_message = "Starting " + args.experiment_name + "_" + str(args.experimental_train_split).replace(".", "") + "...\n\n"
    else:
        start_message = "Starting " + args.experiment_name + "...\n\n"

    print(start_message)
    logging.info(start_message)

    for epoch in range(args.epochs):
        train_loss, _, _, train_acc = train_epoch(slrt_model, train_loader, cel_criterion, sgd_optimizer, device)
        epoch_loss = train_loss.item() / len(train_loader)
        losses.append(epoch_loss)
        train_accs.append(train_acc)

        if val_loader:
            slrt_model.train(False)
            _, _, val_acc = evaluate(slrt_model, val_loader, device)
            slrt_model.train(True)
            val_accs.append(val_acc)

        # Save checkpoints if they are best in the current subset
        if args.save_checkpoints:
            if train_acc > top_train_acc:
                top_train_acc = train_acc
                torch.save(slrt_model, checkpoint_dir + "checkpoint_t_" + str(checkpoint_index) + ".pth")

            if val_acc > top_val_acc:
                top_val_acc = val_acc
                torch.save(slrt_model, checkpoint_dir + "checkpoint_v_" + str(checkpoint_index) + ".pth")

        if epoch % args.log_freq == 0:
            train_message = "[" + str(epoch + 1) + "] TRAIN loss: " + str(epoch_loss) + " acc: " + str(train_acc)
            print(train_message)
            logging.info(train_message)

            if use_wandb:
                wandb.log({
                    "epoch": int(epoch + 1),
                    "train-loss": float(epoch_loss),
                    "train-accuracy": train_acc
                })

            if val_loader:
                val_message = "[" + str(epoch + 1) + "] VALIDATION acc: " + str(val_acc)
                print(val_message)
                logging.info(val_message)

                if use_wandb:
                    wandb.log({
                        "validation-accuracy": val_acc
                    })

            print("")
            logging.info("")

        # Reset the top accuracies on static subsets
        if epoch % 10 == 0:
            top_train_acc, top_val_acc = 0, 0
            checkpoint_index += 1

        lr_progress.append(sgd_optimizer.param_groups[0]["lr"])

    # MARK: TESTING

    print("\nTesting checkpointed models starting...\n")
    logging.info("\nTesting checkpointed models starting...\n")

    top_result, top_result_name = 0, ""

    if eval_loader:
        # `checkpoint_index` is the window that was still open when training stopped, so it has to be
        # included as well; windows (or the "v" variant) for which nothing was saved are skipped below
        for i in range(checkpoint_index + 1):
            for checkpoint_id in ["t", "v"]:
                checkpoint_name = "checkpoint_" + checkpoint_id + "_" + str(i)

                # NOTE(review): whole pickled modules are loaded here; recent PyTorch releases default
                # to `weights_only=True` in torch.load - verify against the pinned torch version
                try:
                    tested_model = torch.load(checkpoint_dir + checkpoint_name + ".pth")
                except FileNotFoundError:
                    # No checkpoint of this kind was written for this window
                    continue

                tested_model.train(False)
                _, _, eval_acc = evaluate(tested_model, eval_loader, device, print_stats=True)

                if eval_acc > top_result:
                    top_result = eval_acc
                    top_result_name = args.experiment_name + "/" + checkpoint_name

                print(checkpoint_name + " -> " + str(eval_acc))
                logging.info(checkpoint_name + " -> " + str(eval_acc))

        summary_message = "\nThe top result was recorded at " + str(top_result) + " testing accuracy. The best checkpoint is " + top_result_name + "."
        print(summary_message)
        logging.info(summary_message)

        if use_wandb:
            wandb.run.summary["best-accuracy"] = top_result
            wandb.run.summary["best-checkpoint"] = top_result_name

    # PLOT 0: Performance (loss, accuracies) chart plotting
    if args.plot_stats:
        fig, ax = plt.subplots()
        ax.plot(range(1, len(losses) + 1), losses, c="#D64436", label="Training loss")
        ax.plot(range(1, len(train_accs) + 1), train_accs, c="#00B09B", label="Training accuracy")

        if val_loader:
            ax.plot(range(1, len(val_accs) + 1), val_accs, c="#E0A938", label="Validation accuracy")

        ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

        ax.set(xlabel="Epoch", ylabel="Accuracy / Loss", title="")
        plt.legend(loc="upper center", bbox_to_anchor=(0.5, 1.05), ncol=4, fancybox=True, shadow=True, fontsize="xx-small")
        ax.grid()

        fig.savefig("out-img/" + args.experiment_name + "_loss.png")

    # PLOT 1: Learning rate progress
    if args.plot_lr:
        fig1, ax1 = plt.subplots()
        ax1.plot(range(1, len(lr_progress) + 1), lr_progress, label="LR")
        ax1.set(xlabel="Epoch", ylabel="LR", title="")
        ax1.grid()

        fig1.savefig("out-img/" + args.experiment_name + "_lr.png")

    print("\nAny desired statistics have been plotted.\nThe experiment is finished.")
    logging.info("\nAny desired statistics have been plotted.\nThe experiment is finished.")
|
307 |
+
|
308 |
+
|
309 |
+
if __name__ == "__main__":
    # Script entry point: build the CLI on top of the shared default arguments,
    # parse the command line and hand the resulting namespace to the training routine
    cli = argparse.ArgumentParser("", parents=[get_default_args()], add_help=False)
    train(cli.parse_args())
|
spoter_mod/utils.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
from collections import Counter
|
5 |
+
from torch.utils.data import Subset
|
6 |
+
from sklearn.model_selection import train_test_split
|
7 |
+
|
8 |
+
|
9 |
+
def __balance_val_split(dataset, val_split=0.):
    """Split a dataset into class-stratified training and validation subsets.

    :param dataset: Dataset exposing a ``targets`` sequence holding one class label per sample.
    :param val_split: Fraction of the samples to place into the validation subset. With ``0``
        (the default) nothing is held out: the training subset covers the whole dataset and
        the validation subset is empty.
    :return: Tuple ``(train_subset, val_subset)`` of ``torch.utils.data.Subset`` objects that
        both wrap the given ``dataset``.
    """
    targets = np.array(dataset.targets)
    all_indices = np.arange(targets.shape[0])

    # scikit-learn rejects a test size of zero, so the "no validation data" case is handled here
    if val_split == 0:
        return Subset(dataset, indices=all_indices), Subset(dataset, indices=all_indices[:0])

    train_indices, val_indices = train_test_split(
        all_indices,
        test_size=val_split,
        stratify=targets
    )

    train_dataset = Subset(dataset, indices=train_indices)
    val_dataset = Subset(dataset, indices=val_indices)

    return train_dataset, val_dataset
|
21 |
+
|
22 |
+
|
23 |
+
def __split_of_train_sequence(subset: Subset, train_split=1.0):
    """Keep only a class-stratified fraction of the training data.

    :param subset: Either a ``torch.utils.data.Subset`` (its ``dataset.targets`` and ``indices``
        are used) or a plain dataset exposing a ``targets`` sequence with one label per sample.
    :param train_split: Fraction of the samples to keep. With ``1`` the input is returned untouched.
    :return: The unchanged input when ``train_split == 1``, otherwise a new ``Subset`` of the
        underlying dataset holding the selected samples.
    """
    if train_split == 1:
        return subset

    if hasattr(subset, "indices"):
        base_dataset, base_indices = subset.dataset, subset.indices
    else:
        # A plain (unsplit) dataset is treated as a subset covering every one of its samples
        base_dataset = subset
        base_indices = range(len(subset.targets))

    targets = np.array([base_dataset.targets[i] for i in base_indices])
    train_indices, _ = train_test_split(
        np.arange(targets.shape[0]),
        test_size=1 - train_split,
        stratify=targets
    )

    # `train_indices` are positions within `base_indices`; map them back to dataset indices
    train_dataset = Subset(base_dataset, indices=[base_indices[i] for i in train_indices])

    return train_dataset
|
37 |
+
|
38 |
+
|
39 |
+
def __log_class_statistics(subset: Subset):
    """Print how many samples of each class the given data contains.

    The counts are printed to standard output as a ``{label: count}`` dictionary.

    :param subset: Either a ``torch.utils.data.Subset`` (labels are looked up through
        ``dataset.targets`` and ``indices``) or a plain dataset exposing a ``targets`` sequence.
    """
    if hasattr(subset, "indices"):
        train_classes = [subset.dataset.targets[i] for i in subset.indices]
    else:
        # Plain dataset: every sample takes part in the statistics
        train_classes = list(subset.targets)

    print(dict(Counter(train_classes)))
|
spoter_mod/wandb/debug-cli.log
ADDED
File without changes
|