Matyáš Boháček committed on
Commit
a001524
·
1 Parent(s): 96a591f

Init commit

Browse files
README.md CHANGED
@@ -1,12 +1,12 @@
1
  ---
2
- title: SPOTER Sign Language Recognition
3
- emoji: 🏢
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 3.0.19
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Spoter Demo Test
3
+ emoji: 🧏
4
+ colorFrom: green
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.0.6
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+
3
+ import torch
4
+ import numpy as np
5
+ import gradio as gr
6
+ from spoter_mod.skeleton_extractor import obtain_pose_data
7
+ from spoter_mod.normalization.body_normalization import normalize_single_dict as normalize_single_body_dict, BODY_IDENTIFIERS
8
+ from spoter_mod.normalization.hand_normalization import normalize_single_dict as normalize_single_hand_dict, HAND_IDENTIFIERS
9
+
10
+
11
# Inference runs on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The checkpoint holds the whole model object (it is used directly below), so it is mapped straight onto the
# inference device; this keeps the model and the inputs that are later moved to `device` on the same device.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
model = torch.load("spoter-checkpoint.pth", map_location=device)
model.train(False)

# Hand landmark names are suffixed per hand; this rebinding replaces the imported, unsuffixed list.
HAND_IDENTIFIERS = [identifier + "_Left" for identifier in HAND_IDENTIFIERS] + \
                   [identifier + "_Right" for identifier in HAND_IDENTIFIERS]

# Class names; position i is the label reported for the model's i-th output score.
GLOSS = ['book', 'drink', 'computer', 'before', 'chair', 'go', 'clothes', 'who', 'candy', 'cousin', 'deaf', 'fine',
         'help', 'no', 'thin', 'walk', 'year', 'yes', 'all', 'black', 'cool', 'finish', 'hot', 'like', 'many', 'mother',
         'now', 'orange', 'table', 'thanksgiving', 'what', 'woman', 'bed', 'blue', 'bowling', 'can', 'dog', 'family',
         'fish', 'graduate', 'hat', 'hearing', 'kiss', 'language', 'later', 'man', 'shirt', 'study', 'tall', 'white',
         'wrong', 'accident', 'apple', 'bird', 'change', 'color', 'corn', 'cow', 'dance', 'dark', 'doctor', 'eat',
         'enjoy', 'forget', 'give', 'last', 'meet', 'pink', 'pizza', 'play', 'school', 'secretary', 'short', 'time',
         'want', 'work', 'africa', 'basketball', 'birthday', 'brown', 'but', 'cheat', 'city', 'cook', 'decide', 'full',
         'how', 'jacket', 'letter', 'medicine', 'need', 'paint', 'paper', 'pull', 'purple', 'right', 'same', 'son',
         'tell', 'thursday']
28
+
29
+
30
def tensor_to_dictionary(landmarks_tensor: torch.Tensor) -> dict:
    """
    Split a landmark tensor into a dictionary keyed by landmark identifier.

    The slice at position ``i`` of the second axis is stored under the i-th name of
    ``BODY_IDENTIFIERS + HAND_IDENTIFIERS``. Values are NumPy slices of the tensor's data.
    """

    frames = landmarks_tensor.numpy()
    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS

    return {name: frames[:, position] for position, name in enumerate(identifiers)}
39
+
40
+
41
def dictionary_to_tensor(landmarks_dict: dict) -> torch.Tensor:
    """
    Stack a dictionary of per-landmark coordinate sequences into one tensor.

    The result has one row per frame (the frame count is taken from the "leftEar" entry), one column per name in
    ``BODY_IDENTIFIERS + HAND_IDENTIFIERS`` (in that order) and two channels holding element 0 and element 1 of
    every frame entry.
    """

    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    frame_count = len(landmarks_dict["leftEar"])
    stacked = np.empty(shape=(frame_count, len(identifiers), 2))

    for column, name in enumerate(identifiers):
        track = landmarks_dict[name]
        for channel in (0, 1):
            stacked[:, column, channel] = [point[channel] for point in track]

    return torch.from_numpy(stacked)
50
+
51
+
52
def greet(label, video0, video1):
    """
    Gradio callback: classify the sign shown in a video and return per-class probabilities.

    :param label: Selected input type; "Webcam" picks ``video0``, "Video" picks ``video1``
    :param video0: Webcam recording passed on to ``obtain_pose_data``
    :param video1: Uploaded video passed on to ``obtain_pose_data``
    :return: Dictionary mapping each entry of ``GLOSS`` to its softmax probability; an empty dictionary when the
        input type is not recognised or the selected video is missing
    """

    if label == "Webcam":
        video = video0
    elif label == "Video":
        video = video1
    elif label == "X":
        # NOTE(review): fixed dummy distribution, presumably a debugging stub; the dropdown does not offer "X".
        return {"A": 0.8, "B": 0.1, "C": 0.1}
    else:
        return {}

    # Nothing was recorded/uploaded for the selected input type.
    if video is None:
        return {}

    pose_data = obtain_pose_data(video)
    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS

    # Gather the per-landmark X/Y series into one (frames, landmarks, 2) array.
    depth_map = np.empty(shape=(len(pose_data.data_hub["nose_X"]), len(identifiers), 2))
    for index, identifier in enumerate(identifiers):
        depth_map[:, index, 0] = pose_data.data_hub[identifier + "_X"]
        depth_map[:, index, 1] = pose_data.data_hub[identifier + "_Y"]

    depth_map = tensor_to_dictionary(torch.from_numpy(depth_map))

    # The normalization helpers are called with "_0"/"_1" hand suffixes, so rename the keys for the duration of the
    # normalization and restore the "_Left"/"_Right" names afterwards.
    depth_map = {key.replace("_Left", "_0").replace("_Right", "_1"): value for key, value in depth_map.items()}
    depth_map = normalize_single_body_dict(depth_map)
    depth_map = normalize_single_hand_dict(depth_map)
    depth_map = {key.replace("_0", "_Left").replace("_1", "_Right"): value for key, value in depth_map.items()}

    depth_map = dictionary_to_tensor(depth_map) - 0.5

    # Run on whichever device the model's weights live on, so inputs and weights can never be on different devices.
    model_device = next(model.parameters()).device
    inputs = depth_map.squeeze(0).to(model_device)

    with torch.no_grad():
        outputs = model(inputs).expand(1, -1, -1)

    # Move the result back to the CPU before converting it to NumPy (required for non-CPU tensors).
    probabilities = torch.nn.functional.softmax(outputs, dim=2)[0, 0].cpu().numpy()

    return {gloss: float(probability) for gloss, probability in zip(GLOSS, probabilities)}
104
+
105
+
106
+ label = gr.outputs.Label(num_top_classes=5, label="Top class probabilities")
107
+ demo = gr.Interface(fn=greet, inputs=[gr.Dropdown(["Webcam", "Video"], label="Please select the input type:", type="value"), gr.Video(source="webcam", label="Webcam recording", type="mp4"), gr.Video(source="upload", label="Video upload", type="mp4")], outputs=label,
108
+ title="SPOTER Sign language recognition",
109
+ description="",
110
+ article="This is joint work of [Matyas Bohacek](https://scholar.google.cz/citations?user=wDy1xBwAAAAJ) and [Zhuo Cao](https://www.linkedin.com/in/zhuo-cao-b0787a1aa/?originalSubdomain=hk). For more info, visit [our website.](https://www.signlanguagerecognition.com)",
111
+ css="""
112
+ @font-face {
113
+ font-family: Graphik;
114
+ font-weight: regular;
115
+ src: url("https://www.signlanguagerecognition.com/supplementary/GraphikRegular.otf") format("opentype");
116
+ }
117
+
118
+ @font-face {
119
+ font-family: Graphik;
120
+ font-weight: bold;
121
+ src: url("https://www.signlanguagerecognition.com/supplementary/GraphikBold.otf") format("opentype");
122
+ }
123
+
124
+ @font-face {
125
+ font-family: MonumentExpanded;
126
+ font-weight: regular;
127
+ src: url("https://www.signlanguagerecognition.com/supplementary/MonumentExtended-Regular.otf") format("opentype");
128
+ }
129
+
130
+ @font-face {
131
+ font-family: MonumentExpanded;
132
+ font-weight: bold;
133
+ src: url("https://www.signlanguagerecognition.com/supplementary/MonumentExtended-Bold.otf") format("opentype");
134
+ }
135
+
136
+ html {
137
+ font-family: "Graphik";
138
+ }
139
+
140
+ h1 {
141
+ font-family: "MonumentExpanded";
142
+ }
143
+
144
+ #12 {
145
+ - background-image: linear-gradient(to left, #61D836, #6CB346) !important;
146
+ background-color: #61D836 !important;
147
+ }
148
+
149
+ #12:hover {
150
+ - background-image: linear-gradient(to left, #61D836, #6CB346) !important;
151
+ background-color: #6CB346 !important;
152
+ border: 0 !important;
153
+ border-color: 0 !important;
154
+ }
155
+
156
+ .dark .gr-button-primary {
157
+ --tw-gradient-from: #61D836;
158
+ --tw-gradient-to: #6CB346;
159
+ border: 0 !important;
160
+ border-color: 0 !important;
161
+ }
162
+
163
+ .dark .gr-button-primary:hover {
164
+ --tw-gradient-from: #64A642;
165
+ --tw-gradient-to: #58933B;
166
+ border: 0 !important;
167
+ border-color: 0 !important;
168
+ }
169
+ """,
170
+ cache_examples=True
171
+ )
172
+
173
+ demo.launch(debug=True)
examples/chair.mp4 ADDED
Binary file (990 kB). View file
 
examples/computer.mp4 ADDED
Binary file (858 kB). View file
 
examples/work.mp4 ADDED
Binary file (785 kB). View file
 
flagged/log.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ 'name','output','flag','username','timestamp'
2
+ 'Hello','Hello Hello!!','','','2022-05-28 16:06:47.684383'
3
+ 'Hello','Hello Hello!!','','','2022-05-28 16:06:49.325378'
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ opencv-python
2
+ mediapipe
3
+ pandas
4
+ torch==1.8.1
5
+ numpy
6
+ scikit-learn
7
+ protobuf==3.20.1
spoter-checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4f0927fbaddf11da6762ca76474a7bbc049565599e3fc6f081caa5cc00fb53a
3
+ size 23764668
spoter/gaussian_noise.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+
4
+
5
class GaussianNoise(object):
    """
    Callable transform that adds Gaussian noise to a tensor.

    The noise is drawn from a standard normal distribution, scaled by ``std`` and shifted by ``mean``.
    """

    def __init__(self, mean=0., std=1.):
        self.std = std
        self.mean = mean

    def __call__(self, tensor):
        # Sample the noise on the input's own device; noise allocated on the default (CPU) device cannot be added
        # to a tensor living elsewhere.
        noise = torch.randn(tensor.size(), device=tensor.device)

        return tensor + noise * self.std + self.mean

    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
15
+
16
+
17
+ if __name__ == "__main__":
18
+ pass
spoter/spoter_model.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import copy
3
+ import torch
4
+
5
+ import torch.nn as nn
6
+ from typing import Optional
7
+
8
+
9
def _get_clones(mod, n):
    """Return an ``nn.ModuleList`` holding ``n`` independent deep copies of ``mod``."""

    clones = nn.ModuleList()
    for _ in range(n):
        clones.append(copy.deepcopy(mod))

    return clones
11
+
12
+
13
class SPOTERTransformerDecoderLayer(nn.TransformerDecoderLayer):
    """
    Edited TransformerDecoderLayer implementation omitting the redundant self-attention operation as opposed to the
    standard implementation.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout, activation):
        super(SPOTERTransformerDecoderLayer, self).__init__(d_model, nhead, dim_feedforward, dropout, activation)

        # The parent constructor builds a self-attention module; this layer never uses it, so drop it.
        del self.self_attn

    def forward(self, tgt: torch.Tensor, memory: torch.Tensor, tgt_mask: Optional[torch.Tensor] = None,
                memory_mask: Optional[torch.Tensor] = None, tgt_key_padding_mask: Optional[torch.Tensor] = None,
                memory_key_padding_mask: Optional[torch.Tensor] = None, tgt_is_causal: bool = False,
                memory_is_causal: bool = False) -> torch.Tensor:
        """
        Pass the target through cross-attention over ``memory`` followed by the feed-forward block.

        :param tgt: Target (query) sequence
        :param memory: Encoder output attended to by the target
        :param tgt_mask: Accepted for interface compatibility; unused because there is no self-attention
        :param memory_mask: Attention mask forwarded to the cross-attention
        :param tgt_key_padding_mask: Accepted for interface compatibility; unused
        :param memory_key_padding_mask: Key padding mask forwarded to the cross-attention
        :param tgt_is_causal: Accepted (and ignored) because newer ``nn.TransformerDecoder`` versions pass it to
            every layer
        :param memory_is_causal: Accepted (and ignored) for the same reason
        :return: Transformed target sequence
        """

        # In place of self-attention, the target is only added to its own dropout and normalized.
        tgt = tgt + self.dropout1(tgt)
        tgt = self.norm1(tgt)

        # Cross-attention: the target queries the encoder memory.
        tgt2 = self.multihead_attn(tgt, memory, memory, attn_mask=memory_mask,
                                   key_padding_mask=memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)

        # Position-wise feed-forward block with residual connection.
        tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)

        return tgt
39
+
40
+
41
class SPOTER(nn.Module):
    """
    Implementation of the SPOTER (Sign POse-based TransformER) architecture for sign language recognition from sequence
    of skeletal data.
    """

    def __init__(self, num_classes, hidden_dim=55):
        # NOTE(review): the default hidden_dim (55) is not a multiple of the 9 attention heads configured below;
        # confirm callers always pass a compatible value.
        super().__init__()

        self.row_embed = nn.Parameter(torch.rand(50, hidden_dim))
        # Positional term: an independent (1, 1, hidden_dim) copy of the first row of `row_embed`.
        first_row = self.row_embed[0]
        self.pos = nn.Parameter(first_row.detach().clone().view(1, 1, hidden_dim))
        self.class_query = nn.Parameter(torch.rand(1, hidden_dim))
        self.transformer = nn.Transformer(hidden_dim, 9, 6, 6)
        self.linear_class = nn.Linear(hidden_dim, num_classes)

        # Swap every stock decoder layer for the variant without the initial self-attention
        decoder = self.transformer.decoder
        layer_template = SPOTERTransformerDecoderLayer(self.transformer.d_model, self.transformer.nhead, 2048,
                                                       0.1, "relu")
        decoder.layers = _get_clones(layer_template, decoder.num_layers)

    def forward(self, inputs):
        """
        Classify one sequence.

        :param inputs: Tensor whose dimensions from 1 onward are flattened into the feature vector of each step
            (presumably (frames, landmarks, 2) - TODO confirm against callers)
        :return: Output of the classification layer applied to the decoded class query
        """

        flattened = inputs.flatten(start_dim=1)
        sequence = flattened.unsqueeze(1).float()
        decoded = self.transformer(self.pos + sequence, self.class_query.unsqueeze(0))

        return self.linear_class(decoded.transpose(0, 1))
67
+
68
+
69
+ if __name__ == "__main__":
70
+ pass
spoter/utils.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import torch
4
+
5
+
6
def train_epoch(model, dataloader, criterion, optimizer, device, scheduler=None):
    """
    Train the model for one pass over the dataloader, one sample per optimizer step.

    :param model: Callable model; its output is expanded to three dimensions before use
    :param dataloader: Iterable yielding ``(inputs, labels)`` pairs
    :param criterion: Loss function called as ``criterion(outputs[0], labels[0])``
    :param optimizer: Optimizer stepped after every sample
    :param device: Device the inputs and labels are moved to
    :param scheduler: Optional scheduler stepped once with the mean loss of the epoch
    :return: Tuple of (summed loss, number of correct predictions, number of samples, accuracy); the accuracy is
        0.0 when the dataloader yields no samples
    """

    pred_correct, pred_all = 0, 0
    running_loss = 0.0

    for inputs, labels in dataloader:
        inputs = inputs.squeeze(0).to(device)
        labels = labels.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(inputs).expand(1, -1, -1)

        loss = criterion(outputs[0], labels[0])
        loss.backward()
        optimizer.step()

        # Accumulate a detached value; summing the attached loss would keep every step's autograd graph alive
        # until the end of the epoch.
        running_loss += loss.detach()

        # Statistics (softmax is monotonic, so the arg-max of the raw scores is the predicted class)
        if int(torch.argmax(outputs)) == int(labels[0][0]):
            pred_correct += 1
        pred_all += 1

    # With no samples there is no loss to report to the scheduler.
    if scheduler and pred_all:
        scheduler.step(running_loss.item() / len(dataloader))

    accuracy = pred_correct / pred_all if pred_all else 0.0

    return running_loss, pred_correct, pred_all, accuracy
33
+
34
+
35
def evaluate(model, dataloader, device, print_stats=False):
    """
    Compute the top-1 accuracy of the model over the dataloader.

    :param model: Callable model; its output is expanded to three dimensions before use
    :param dataloader: Iterable yielding ``(inputs, labels)`` pairs
    :param device: Device the inputs and labels are moved to
    :param print_stats: Whether to print and log the per-label accuracies
    :return: Tuple of (number of correct predictions, number of samples, accuracy); the accuracy is 0.0 when the
        dataloader yields no samples
    """

    pred_correct, pred_all = 0, 0
    # label -> [correct, seen]; entries are created on demand so any number of classes is supported
    stats = {}

    # No gradients are needed for evaluation
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.squeeze(0).to(device)
            labels = labels.to(device, dtype=torch.long)

            outputs = model(inputs).expand(1, -1, -1)

            label = int(labels[0][0])
            label_stats = stats.setdefault(label, [0, 0])

            # Statistics (softmax is monotonic, so the arg-max of the raw scores is the predicted class)
            if int(torch.argmax(outputs)) == label:
                label_stats[0] += 1
                pred_correct += 1

            label_stats[1] += 1
            pred_all += 1

    if print_stats:
        # Report in ascending label order; every recorded label has been seen at least once
        stats = {key: stats[key][0] / stats[key][1] for key in sorted(stats)}
        print("Label accuracies statistics:")
        print(str(stats) + "\n")
        logging.info("Label accuracies statistics:")
        logging.info(str(stats) + "\n")

    accuracy = pred_correct / pred_all if pred_all else 0.0

    return pred_correct, pred_all, accuracy
63
+
64
+
65
def evaluate_top_k(model, dataloader, device, k=5):
    """
    Compute the top-k accuracy of the model over the dataloader.

    :param model: Callable model; its output is expanded to three dimensions before use
    :param dataloader: Iterable yielding ``(inputs, labels)`` pairs
    :param device: Device the inputs and labels are moved to
    :param k: Number of highest-scoring classes among which the label counts as correct
    :return: Tuple of (number of correct predictions, number of samples, accuracy); the accuracy is 0.0 when the
        dataloader yields no samples
    """

    pred_correct, pred_all = 0, 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.squeeze(0).to(device)
            labels = labels.to(device, dtype=torch.long)

            outputs = model(inputs).expand(1, -1, -1)

            # The indices come back with the same nesting as `outputs`; flatten them to a plain list of class ids,
            # otherwise the membership test compares the label against nested lists and never matches.
            top_classes = torch.topk(outputs, k).indices.flatten().tolist()

            if int(labels[0][0]) in top_classes:
                pred_correct += 1

            pred_all += 1

    accuracy = pred_correct / pred_all if pred_all else 0.0

    return pred_correct, pred_all, accuracy
spoter_mod/.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
spoter_mod/.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <profile version="1.0">
3
+ <option name="myName" value="Project Default" />
4
+ <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
5
+ <option name="ignoredPackages">
6
+ <value>
7
+ <list size="11">
8
+ <item index="0" class="java.lang.String" itemvalue="pandas" />
9
+ <item index="1" class="java.lang.String" itemvalue="feedparser" />
10
+ <item index="2" class="java.lang.String" itemvalue="sklearn" />
11
+ <item index="3" class="java.lang.String" itemvalue="numpy" />
12
+ <item index="4" class="java.lang.String" itemvalue="coremltools" />
13
+ <item index="5" class="java.lang.String" itemvalue="h5py" />
14
+ <item index="6" class="java.lang.String" itemvalue="torch" />
15
+ <item index="7" class="java.lang.String" itemvalue="einops" />
16
+ <item index="8" class="java.lang.String" itemvalue="firebase-admin" />
17
+ <item index="9" class="java.lang.String" itemvalue="pyemd" />
18
+ <item index="10" class="java.lang.String" itemvalue="matplotlib" />
19
+ </list>
20
+ </value>
21
+ </option>
22
+ </inspection_tool>
23
+ </profile>
24
+ </component>
spoter_mod/.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
spoter_mod/.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (spoter)" project-jdk-type="Python SDK" />
4
+ </project>
spoter_mod/.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/spoter.iml" filepath="$PROJECT_DIR$/.idea/spoter.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
spoter_mod/.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
spoter_mod/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2021-2022 Matyáš Boháček
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
spoter_mod/README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ![Alt Text](http://spoter.signlanguagerecognition.com/img/GitHub_banner.png)
2
+
3
+ > by **[Matyáš Boháček](https://github.com/matyasbohacek)** and **[Marek Hrúz](https://github.com/mhruz)**, University of West Bohemia <br>
4
+ > Should you have any questions or inquiries, feel free to contact us [here](mailto:[email protected]).
5
+
6
+ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/sign-pose-based-transformer-for-word-level/sign-language-recognition-on-lsa64)](https://paperswithcode.com/sota/sign-language-recognition-on-lsa64?p=sign-pose-based-transformer-for-word-level)
7
+
8
+ Repository accompanying the [Sign Pose-based Transformer for Word-level Sign Language Recognition](https://openaccess.thecvf.com/content/WACV2022W/HADCV/html/Bohacek_Sign_Pose-Based_Transformer_for_Word-Level_Sign_Language_Recognition_WACVW_2022_paper.html) paper, where we present a novel architecture for word-level sign language recognition based on the Transformer model. We designed our solution with low computational cost in mind, since we see great potential in the use of such a recognition system on hand-held devices. We introduce multiple original augmentation techniques tailored for the task of sign language recognition and propose a unique normalization scheme based on sign language linguistics.
9
+
10
+ ![Alt Text](http://spoter.signlanguagerecognition.com/img/architecture_github.gif)
11
+
12
+ ## Get Started
13
+
14
+ First, make sure to install all necessary dependencies using:
15
+
16
+ ```shell
17
+ pip install -r requirements.txt
18
+ ```
19
+
20
+ To train the model, simply specify the hyperparameters and run the following:
21
+
22
+ ```
23
+ python -m train
24
+ --experiment_name [str; name of the experiment to name the output logs and plots]
25
+
26
+ --epochs [int; number of epochs]
27
+ --lr [float; learning rate]
28
+
29
+ --training_set_path [str; path to the csv file with training set's skeletal data]
30
+ --validation_set_path [str; path to the csv file with validation set's skeletal data]
31
+ --testing_set_path [str; path to the csv file with testing set's skeletal data]
32
+ ```
33
+
34
+ If either the validation or testing set's path is left empty, the corresponding metrics will not be calculated. We also provide an out-of-the-box parameter to split off the validation set as a desired fraction of the training set while preserving the label distribution, for datasets without author-specified splits. These and many other specific hyperparameters with their descriptions can be found in the [train.py](https://github.com/matyasbohacek/spoter/blob/main/train.py) file. All of them are provided with a default value we found to work well in our experiments.
35
+
36
+ ## Data
37
+
38
+ As SPOTER works on top of sequences of signers' skeletal data extracted from videos, we wanted to eliminate the computational demands of such annotation for each training run by pre-collecting this. For this reason and reproducibility, we are open-sourcing this data for WLASL100 and LSA64 datasets along with the repository. You can find the data [here](https://github.com/matyasbohacek/spoter/releases/tag/supplementary-data).
39
+
40
+ ![Alt Text](http://spoter.signlanguagerecognition.com/img/datasets_overview.gif)
41
+
42
+ ## License
43
+
44
+ The **code** is published under the [Apache License 2.0](https://github.com/matyasbohacek/spoter/blob/main/LICENSE) which allows for both academic and commercial use if relevant License and copyright notice is included, our work is cited and all changes are stated.
45
+
46
+ The accompanying skeletal data of the [WLASL](https://arxiv.org/pdf/1910.11006.pdf) and [LSA64](https://core.ac.uk/download/pdf/76495887.pdf) datasets used for experiments are, however, shared under the [Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)](https://creativecommons.org/licenses/by-nc/4.0/) license allowing only for non-commercial usage.
47
+
48
+ ## Citation
49
+
50
+ If you find our work relevant, build upon it or compare your approaches with it, please cite our work as stated below:
51
+
52
+ ```
53
+ @InProceedings{Bohacek_2022_WACV,
54
+ author = {Boh\'a\v{c}ek, Maty\'a\v{s} and Hr\'uz, Marek},
55
+ title = {Sign Pose-Based Transformer for Word-Level Sign Language Recognition},
56
+ booktitle = {Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV) Workshops},
57
+ month = {January},
58
+ year = {2022},
59
+ pages = {182-191}
60
+ }
61
+ ```
spoter_mod/augmentations/__init__.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import math
3
+ import logging
4
+ import cv2
5
+ import random
6
+
7
+ import numpy as np
8
+
9
+ from normalization.body_normalization import BODY_IDENTIFIERS
10
+ from normalization.hand_normalization import HAND_IDENTIFIERS
11
+
12
+
13
+ HAND_IDENTIFIERS = [id + "_0" for id in HAND_IDENTIFIERS] + [id + "_1" for id in HAND_IDENTIFIERS]
14
+ ARM_IDENTIFIERS_ORDER = ["neck", "$side$Shoulder", "$side$Elbow", "$side$Wrist"]
15
+
16
+
17
def __random_pass(prob):
    """
    Draws one number from random.random() and reports whether it fell strictly below the given threshold.

    :param prob: Threshold the drawn number is compared against (used by the callers as a probability)
    :return: True if the drawn number is lower than prob, False otherwise
    """

    drawn = random.random()
    return drawn < prob
19
+
20
+
21
def __numpy_to_dictionary(data_array: np.ndarray) -> dict:
    """
    Supplementary method turning a NumPy array of body landmark data into a dictionary keyed by landmark identifier.
    The second axis of the array must follow the order of the BODY_IDENTIFIERS list.

    :param data_array: Array indexed as [frame, landmark, ...]
    :return: Dictionary mapping each body identifier to the (list-converted) slice of its landmark
    """

    return {
        identifier: data_array[:, position].tolist()
        for position, identifier in enumerate(BODY_IDENTIFIERS)
    }
33
+
34
+
35
def __dictionary_to_numpy(landmarks_dict: dict) -> np.ndarray:
    """
    Supplementary method packing a dictionary of body landmark data into a single NumPy array. The landmark axis of
    the result follows the order of the BODY_IDENTIFIERS list.

    :param landmarks_dict: Dictionary mapping body identifiers to per-frame sequences of (X, Y) coordinates
    :return: Array of the shape (frames, len(BODY_IDENTIFIERS), 2); the number of frames is taken from the "leftEar"
             entry
    """

    frame_count = len(landmarks_dict["leftEar"])
    packed = np.empty(shape=(frame_count, len(BODY_IDENTIFIERS), 2))

    for position, identifier in enumerate(BODY_IDENTIFIERS):
        coordinates = np.array(landmarks_dict[identifier])
        packed[:, position, 0] = coordinates[:, 0]
        packed[:, position, 1] = coordinates[:, 1]

    return packed
48
+
49
+
50
def __rotate(origin: tuple, point: tuple, angle: float):
    """
    Rotates a point counterclockwise around a given origin.

    :param origin: Landmark in the (X, Y) format serving as the center of rotation
    :param point: Landmark in the (X, Y) format to be rotated
    :param angle: Angle of rotation (passed directly to math.cos and math.sin, i.e. in radians)
    :return: Tuple with the rotated (X, Y) coordinates
    """

    ox, oy = origin
    px, py = point

    # Offset of the point relative to the center of rotation
    dx = px - ox
    dy = py - oy

    cosine = math.cos(angle)
    sine = math.sin(angle)

    rotated_x = ox + cosine * dx - sine * dy
    rotated_y = oy + sine * dx + cosine * dy

    return rotated_x, rotated_y
67
+
68
+
69
def __preprocess_row_sign(sign: dict) -> (dict, dict):
    """
    Supplementary method separating single-dictionary skeletal data into one dictionary of body landmarks and one of
    hand landmarks.

    Two input layouts are handled: if the key "nose_X" is present, the coordinates are read from separate
    "<identifier>_X" / "<identifier>_Y" sequences and zipped into (X, Y) tuples; otherwise the values stored directly
    under each identifier are taken over as they are.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :return: Tuple of the body landmarks dictionary and the hand landmarks dictionary
    """

    split_axes = "nose_X" in sign

    def collect(identifiers):
        if split_axes:
            return {name: list(zip(sign[name + "_X"], sign[name + "_Y"])) for name in identifiers}

        return {name: sign[name] for name in identifiers}

    return collect(BODY_IDENTIFIERS), collect(HAND_IDENTIFIERS)
88
+
89
+
90
def __wrap_sign_into_row(body_identifiers: dict, hand_identifiers: dict) -> dict:
    """
    Supplementary method combining body and hand data into one new dictionary. Should a key occur in both inputs, the
    value from the hand dictionary is kept.

    :param body_identifiers: Dictionary with body landmarks
    :param hand_identifiers: Dictionary with hand landmarks
    :return: New dictionary holding the entries of both inputs
    """

    merged = dict(body_identifiers)
    merged.update(hand_identifiers)

    return merged
96
+
97
+
98
def augment_rotate(sign: dict, angle_range: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE. All the joint coordinates (body and hands) in every frame are rotated by one common random
    angle around the point [0.5; 0.5], which corresponds to the center of the frame for coordinates relative to it.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param angle_range: Tuple containing the angle range (minimal and maximal angle in degrees) from which the angle
                        of rotation is drawn uniformly at random

    :return: Dictionary with augmented (by rotation) sequential skeletal data of the signing person
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    center = (0.5, 0.5)
    # A single angle is drawn per call, so the whole sequence is rotated consistently
    rotation = math.radians(random.uniform(*angle_range))

    def rotate_all(landmarks):
        rotated = {}
        for identifier, frames in landmarks.items():
            rotated[identifier] = [__rotate(center, frame, rotation) for frame in frames]
        return rotated

    rotated_body = rotate_all(body_landmarks)
    rotated_hands = rotate_all(hand_landmarks)

    return __wrap_sign_into_row(rotated_body, rotated_hands)
119
+
120
+
121
def augment_shear(sign: dict, type: str, squeeze_ratio: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE. Applies a perspective (homography) transformation, constructed with OpenCV, to the body
    landmarks. Note that only the body landmarks are transformed; the hand landmarks are passed through unchanged.

        - Squeeze. All the frames are squeezed from both horizontal sides. Two independent random proportions of the
        frame's width, each drawn uniformly from squeeze_ratio, are cut from the left and right side respectively.

        - Perspective transformation. The joint coordinates are projected onto a new plane, which simulates recording
        the sign video with a slight tilt. Each time, the affected side (left or right, with equal chance) is chosen
        randomly and a single proportion is drawn uniformly from squeeze_ratio. The new plane is delineated by moving
        both corners of the chosen side inwards by this proportion, horizontally as well as vertically.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param type: Type of shear augmentation to perform (either 'squeeze' or 'perspective')
    :param squeeze_ratio: Tuple containing the relative range from what the proportion of the original width will be
                          randomly chosen. These proportions will either be cut from both sides or used to construct the
                          new projection

    :return: Dictionary with augmented (by squeezing or perspective transformation) sequential skeletal data of the
             signing person. If an unsupported type is provided, an error is logged and an empty dictionary is returned
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    if type == "squeeze":
        move_left = random.uniform(*squeeze_ratio)
        move_right = random.uniform(*squeeze_ratio)

        # Corners of the unit frame and their new positions after cutting the left and right margins
        src = np.array(((0, 1), (1, 1), (0, 0), (1, 0)), dtype=np.float32)
        dest = np.array(((0 + move_left, 1), (1 - move_right, 1), (0 + move_left, 0), (1 - move_right, 0)),
                        dtype=np.float32)
        mtx = cv2.getPerspectiveTransform(src, dest)

    elif type == "perspective":

        move_ratio = random.uniform(*squeeze_ratio)
        src = np.array(((0, 1), (1, 1), (0, 0), (1, 0)), dtype=np.float32)

        # Choose with equal chance which vertical edge (X = 0 or X = 1) is pulled inwards
        if __random_pass(0.5):
            dest = np.array(((0 + move_ratio, 1 - move_ratio), (1, 1), (0 + move_ratio, 0 + move_ratio), (1, 0)),
                            dtype=np.float32)
        else:
            dest = np.array(((0, 1), (1 - move_ratio, 1 - move_ratio), (0, 0), (1 - move_ratio, 0 + move_ratio)),
                            dtype=np.float32)

        mtx = cv2.getPerspectiveTransform(src, dest)

    else:

        logging.error("Unsupported shear type provided.")
        return {}

    landmarks_array = __dictionary_to_numpy(body_landmarks)
    augmented_landmarks = cv2.perspectiveTransform(np.array(landmarks_array, dtype=np.float32), mtx)

    # Coordinates equal to the transformed image of the point (0, 0) are set back to 0. Presumably this restores
    # missing landmarks, which appear to be encoded as (0, 0) -- TODO confirm. NOTE(review): the comparison is
    # element-wise, so a single matching X or Y coordinate is reset on its own.
    augmented_zero_landmark = cv2.perspectiveTransform(np.array([[[0, 0]]], dtype=np.float32), mtx)[0][0]
    augmented_landmarks = np.stack([np.where(sub == augmented_zero_landmark, [0, 0], sub) for sub in augmented_landmarks])

    body_landmarks = __numpy_to_dictionary(augmented_landmarks)

    return __wrap_sign_into_row(body_landmarks, hand_landmarks)
184
+
185
+
186
def augment_arm_joint_rotate(sign: dict, probability: float, angle_range: tuple) -> dict:
    """
    AUGMENTATION TECHNIQUE. The joints of both arms are visited in the order given by ARM_IDENTIFIERS_ORDER. Each
    visited joint is, with the given probability, used as a pivot around which all the subsequent joints of the same
    arm are rotated by a random angle. This simulates slight variances in the execution of a sign, which do not change
    its semantic meaning.

    :param sign: Dictionary with sequential skeletal data of the signing person
    :param probability: Probability of each joint to be used as a pivot (float from the range [0, 1])
    :param angle_range: Tuple containing the angle range (minimal and maximal angle in degrees) from which the angle
                        of rotation is drawn uniformly at random

    :return: Dictionary with augmented (by arm joint rotation) sequential skeletal data of the signing person
    """

    body_landmarks, hand_landmarks = __preprocess_row_sign(sign)

    for side in ("left", "right"):
        # Resolve the side-specific names of the joints along the arm
        arm_chain = [template.replace("$side$", side) for template in ARM_IDENTIFIERS_ORDER]

        for position, pivot_name in enumerate(arm_chain):
            # Stop processing this arm once a joint of the chain is missing
            if pivot_name not in body_landmarks:
                break

            if not __random_pass(probability):
                continue

            rotation = math.radians(random.uniform(*angle_range))

            for dependent_name in arm_chain[position + 1:]:
                # Joints absent from the data are left out
                if dependent_name not in body_landmarks:
                    continue

                pivot_frames = body_landmarks[pivot_name]
                body_landmarks[dependent_name] = [
                    __rotate(pivot_frames[frame_index], frame, rotation)
                    for frame_index, frame in enumerate(body_landmarks[dependent_name])
                ]

    return __wrap_sign_into_row(body_landmarks, hand_landmarks)
228
+
229
+
230
+ if __name__ == "__main__":
231
+ pass
spoter_mod/data_structurization/autsl.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import tqdm
4
+
5
+ import pandas as pd
6
+ from shutil import copyfile
7
+
8
+
9
# Root folder of the AUTSL dataset and the split ("batch") to be restructured.
# NOTE: this is a machine-specific path; adjust it before running the script.
MAIN_PATH = "/Users/matyasbohacek/Documents/Academics/Projects/AUTSL"
BATCH = "test"

# Locations derived from the two settings above
labels_file = os.path.join(MAIN_PATH, BATCH + "_labels.csv")
source_dir = os.path.join(MAIN_PATH, BATCH)
output_dir = os.path.join(MAIN_PATH, BATCH + "_preprocessed")

df = pd.read_csv(labels_file, encoding="utf-8", sep=";")

# exist_ok avoids the separate existence check and makes re-running the script safe
os.makedirs(output_dir, exist_ok=True)

# Copy every video into a sub-folder named after its label
for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    label_dir = os.path.join(output_dir, str(row["label"]))
    os.makedirs(label_dir, exist_ok=True)

    video_filename = str(row["video"]) + "_color.mp4"
    copyfile(os.path.join(source_dir, video_filename), os.path.join(label_dir, video_filename))
22
+
spoter_mod/data_structurization/wlasl.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import json
4
+ import tqdm
5
+
6
+ from shutil import copyfile
7
+
8
+
9
# Root folder of the WLASL start kit and the split ("batch") to be restructured.
# NOTE: this is a machine-specific path; adjust it before running the script.
MAIN_PATH = "/Users/matyasbohacek/Documents/Academics/Projects/WLASL/start_kit"
BATCH = "train"

# Locations derived from the two settings above
videos_dir = os.path.join(MAIN_PATH, "videos")
output_dir = os.path.join(MAIN_PATH, BATCH + "_preprocessed")

# exist_ok avoids the separate existence check and makes re-running the script safe
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(MAIN_PATH, "specs.json"), encoding="utf-8") as f:
    data = json.load(f)

# The position of each item in the specification file serves as the name of its class folder
for item_index, item in tqdm.tqdm(enumerate(data), total=len(data)):

    for video in item["instances"]:

        # Only the videos belonging to the selected split are copied
        if video["split"] != BATCH:
            continue

        # The class folder is created lazily, i.e. only if the class has a video in this split
        class_dir = os.path.join(output_dir, str(item_index))
        os.makedirs(class_dir, exist_ok=True)

        video_filename = str(video["video_id"]) + ".mp4"
        original_path = os.path.join(videos_dir, video_filename)
        new_path = os.path.join(class_dir, video_filename)

        copyfile(original_path, new_path)
32
+
spoter_mod/datasets/czech_slr_dataset.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import torch
3
+
4
+ import pandas as pd
5
+ import torch.utils.data as torch_data
6
+
7
+ from random import randrange
8
+ from augmentations import *
9
+ from normalization.body_normalization import BODY_IDENTIFIERS
10
+ from normalization.hand_normalization import HAND_IDENTIFIERS
11
+ from normalization.body_normalization import normalize_single_dict as normalize_single_body_dict
12
+ from normalization.hand_normalization import normalize_single_dict as normalize_single_hand_dict
13
+
14
+ HAND_IDENTIFIERS = [id + "_0" for id in HAND_IDENTIFIERS] + [id + "_1" for id in HAND_IDENTIFIERS]
15
+
16
+ DEFAULT_AUGMENTATIONS_CONFIG = {
17
+ "rotate-angle": 13,
18
+ "perspective-transform-ratio": 0.1,
19
+ "squeeze-ratio": 0.15,
20
+ "arm-joint-rotate-angle": 4,
21
+ "arm-joint-rotate-probability": 0.3
22
+ }
23
+
24
+
25
def load_dataset(file_location: str):
    """
    Loads a CSV file with skeletal data and converts it into per-video landmark arrays and their labels.

    Every landmark is read from the columns "<identifier>_X" and "<identifier>_Y", whose cells hold the textual form
    of a list with one value per frame (parsed using ast.literal_eval). Columns using the "_Left_" / "_Right_" naming
    are renamed to the "_0_" / "_1_" convention. If the file has no "neck_X" column, both neck coordinates are
    filled with zeros.

    :param file_location: Path to the CSV file with the dataset
    :return: Tuple (data, labels). The data is a list with one NumPy array of the shape (frames, landmarks, 2) per
             row, with landmarks ordered as BODY_IDENTIFIERS + HAND_IDENTIFIERS. The labels are the values of the
             "labels" column increased by one (CzechSLRDataset.__getitem__ subtracts the one again)
    """

    def parse_cell(cell):
        # Cells coming from the CSV are strings; the neck placeholders added below are plain numbers, which
        # ast.literal_eval would reject with a ValueError, so they are passed through untouched
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    # Load the dataset csv file
    df = pd.read_csv(file_location, encoding="utf-8")

    # Compatibility with files using the older column naming and with files lacking the neck landmark
    df.columns = [item.replace("_Left_", "_0_").replace("_Right_", "_1_") for item in list(df.columns)]
    if "neck_X" not in df.columns:
        df["neck_X"] = [0 for _ in range(df.shape[0])]
        df["neck_Y"] = [0 for _ in range(df.shape[0])]

    labels = [label + 1 for label in df["labels"].to_list()]

    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    data = []

    for _, row in df.iterrows():
        frame_count = len(parse_cell(row["leftEar_X"]))
        current_row = np.empty(shape=(frame_count, len(identifiers), 2))

        for index, identifier in enumerate(identifiers):
            # A scalar placeholder is broadcast over all the frames by the slice assignment
            current_row[:, index, 0] = parse_cell(row[identifier + "_X"])
            current_row[:, index, 1] = parse_cell(row[identifier + "_Y"])

        data.append(current_row)

    return data, labels
50
+
51
+
52
def tensor_to_dictionary(landmarks_tensor: torch.Tensor) -> dict:
    """
    Converts a tensor of landmark data into a dictionary keyed by landmark identifier.

    :param landmarks_tensor: Tensor indexed as [frame, landmark, ...], with the landmark axis ordered as
                             BODY_IDENTIFIERS + HAND_IDENTIFIERS
    :return: Dictionary mapping each identifier to the NumPy slice of its landmark
    """

    as_array = landmarks_tensor.numpy()

    return {
        identifier: as_array[:, position]
        for position, identifier in enumerate(BODY_IDENTIFIERS + HAND_IDENTIFIERS)
    }
61
+
62
+
63
def dictionary_to_tensor(landmarks_dict: dict) -> torch.Tensor:
    """
    Packs a dictionary of landmark data into a single tensor.

    :param landmarks_dict: Dictionary mapping landmark identifiers to per-frame sequences of (X, Y) coordinates
    :return: Tensor of the shape (frames, landmarks, 2) with the landmark axis ordered as
             BODY_IDENTIFIERS + HAND_IDENTIFIERS; the number of frames is taken from the "leftEar" entry
    """

    identifiers = BODY_IDENTIFIERS + HAND_IDENTIFIERS
    frame_count = len(landmarks_dict["leftEar"])
    packed = np.empty(shape=(frame_count, len(identifiers), 2))

    for position, identifier in enumerate(identifiers):
        frames = landmarks_dict[identifier]

        # Axis 0 holds the X coordinates, axis 1 the Y coordinates
        for axis in (0, 1):
            packed[:, position, axis] = [frame[axis] for frame in frames]

    return torch.from_numpy(packed)
72
+
73
+
74
class CzechSLRDataset(torch_data.Dataset):
    """Torch Dataset serving sequences of body and hand landmarks, optionally augmented and normalized, together
    with their labels as loaded by load_dataset"""

    data: [np.ndarray]
    labels: [np.ndarray]

    def __init__(self, dataset_filename: str, num_labels=5, transform=None, augmentations=False,
                 augmentations_prob=0.5, normalize=True, augmentations_config: dict = DEFAULT_AUGMENTATIONS_CONFIG):
        """
        Initiates the dataset with the data loaded from the given file.

        :param dataset_filename: Path to the dataset file (passed to load_dataset)
        :param num_labels: Number of labels (only stored on the instance)
        :param transform: Any data transformation to be applied (default: None)
        :param augmentations: Whether the augmentations may be applied to the returned items
        :param augmentations_prob: Probability with which an item gets augmented when augmentations are enabled
        :param normalize: Whether the body and hand normalization shall be applied
        :param augmentations_config: Dictionary with the parameters of the individual augmentations
        """

        loaded = load_dataset(dataset_filename)

        self.data = loaded[0]
        self.labels = loaded[1]
        self.targets = list(self.labels)
        self.num_labels = num_labels
        self.transform = transform

        self.augmentations = augmentations
        self.augmentations_prob = augmentations_prob
        self.augmentations_config = augmentations_config
        self.normalize = normalize

    def __getitem__(self, idx):
        """
        Retrieves, potentially augments, normalizes and transforms the item at the desired index.

        :param idx: Index of the item
        :return: Tuple containing the landmark tensor and the label tensor (the stored label decreased by one)
        """

        # Work on a copy so that the stored data is never altered by the processing below
        landmarks = torch.from_numpy(np.copy(self.data[idx]))
        label = torch.Tensor([self.labels[idx] - 1])

        landmarks = tensor_to_dictionary(landmarks)

        # Apply at most one randomly selected augmentation
        if self.augmentations and random.random() < self.augmentations_prob:
            selected_aug = randrange(4)

            if selected_aug == 0:
                max_angle = self.augmentations_config["rotate-angle"]
                landmarks = augment_rotate(landmarks, (-max_angle, max_angle))

            elif selected_aug == 1:
                landmarks = augment_shear(landmarks, "perspective",
                                          (0, self.augmentations_config["perspective-transform-ratio"]))

            elif selected_aug == 2:
                landmarks = augment_shear(landmarks, "squeeze", (0, self.augmentations_config["squeeze-ratio"]))

            elif selected_aug == 3:
                max_angle = self.augmentations_config["arm-joint-rotate-angle"]
                landmarks = augment_arm_joint_rotate(landmarks,
                                                     self.augmentations_config["arm-joint-rotate-probability"],
                                                     (-max_angle, max_angle))

        if self.normalize:
            landmarks = normalize_single_body_dict(landmarks)
            landmarks = normalize_single_hand_dict(landmarks)

        landmarks = dictionary_to_tensor(landmarks)

        # Move the landmark position interval to improve performance
        landmarks = landmarks - 0.5

        if self.transform:
            landmarks = self.transform(landmarks)

        return landmarks, label

    def __len__(self):
        """Returns the number of items, which equals the number of loaded labels."""

        return len(self.labels)
150
+
151
+
152
+ if __name__ == "__main__":
153
+ pass
spoter_mod/normalization/body_normalization.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import pandas as pd
4
+
5
BODY_IDENTIFIERS = [
    "nose",
    "neck",
    "rightEye",
    "leftEye",
    "rightEar",
    "leftEar",
    "rightShoulder",
    "leftShoulder",
    "rightElbow",
    "leftElbow",
    "rightWrist",
    "leftWrist"
]


def normalize_body_full(df: pd.DataFrame) -> (pd.DataFrame, list):
    """
    Normalizes the body position data using the Bohacek-normalization algorithm.

    For every frame, a bounding box is derived from the signer's head metric (the shoulder distance or, if a shoulder
    is missing, the neck-nose distance) and all the body landmarks are expressed relative to this box. Landmarks with
    the X coordinate equal to 0 are treated as not captured and are left as they are. Frames lacking the landmarks
    needed to build the box reuse the box of the preceding frame; if there is no such box, the frame is skipped and
    the whole sequence is reported as invalid. The same holds for frames whose bounding box has a zero width or
    height, which would otherwise result in a division by zero.

    The coordinate lists held in the cells of the DataFrame are modified in place. Rows reported as invalid are
    still included in the output, possibly with a part of their frames already normalized.

    :param df: pd.DataFrame to be normalized; each "<identifier>_X" / "<identifier>_Y" cell holds a list with one
               value per frame
    :return: Tuple of the pd.DataFrame with normalized values for body pose and the list of the indexes of the rows
             that could not be fully normalized
    """

    # Rows are collected in a list and turned into a DataFrame at once, since DataFrame.append is no longer
    # available in pandas 2.0 and later
    collected_rows = []
    invalid_row_indexes = []

    # Iterate over all of the records in the dataset
    for index, row in df.iterrows():

        sequence_size = len(row["leftEar_Y"])
        valid_sequence = True

        last_starting_point, last_ending_point = None, None

        # Treat each element of the sequence (analyzed frame) individually
        for sequence_index in range(sequence_size):

            # Prevent from even starting the analysis if some necessary elements are not present
            if (row["leftShoulder_X"][sequence_index] == 0 or row["rightShoulder_X"][sequence_index] == 0) and (
                    row["neck_X"][sequence_index] == 0 or row["nose_X"][sequence_index] == 0):
                if not last_starting_point:
                    valid_sequence = False
                    continue

                starting_point, ending_point = last_starting_point, last_ending_point

            else:

                # NOTE:
                #
                # While in the paper, it is written that the head metric is calculated by halving the shoulder distance,
                # this is meant for the distance between the very ends of one's shoulder, as literature studying body
                # metrics and ratios generally states. The Vision Pose Estimation API, however, seems to be predicting
                # rather the center of one's shoulder. Based on our experiments and manual reviews of the data, employing
                # this as just the plain shoulder distance seems to be more corresponding to the desired metric.
                #
                # Please, review this if using other third-party pose estimation libraries.

                if row["leftShoulder_X"][sequence_index] != 0 and row["rightShoulder_X"][sequence_index] != 0:
                    left_shoulder = (row["leftShoulder_X"][sequence_index], row["leftShoulder_Y"][sequence_index])
                    right_shoulder = (row["rightShoulder_X"][sequence_index], row["rightShoulder_Y"][sequence_index])
                    shoulder_distance = ((((left_shoulder[0] - right_shoulder[0]) ** 2) + (
                            (left_shoulder[1] - right_shoulder[1]) ** 2)) ** 0.5)
                    head_metric = shoulder_distance
                else:
                    neck = (row["neck_X"][sequence_index], row["neck_Y"][sequence_index])
                    nose = (row["nose_X"][sequence_index], row["nose_Y"][sequence_index])
                    neck_nose_distance = ((((neck[0] - nose[0]) ** 2) + ((neck[1] - nose[1]) ** 2)) ** 0.5)
                    head_metric = neck_nose_distance

                # Set the starting and ending point of the normalization bounding box
                starting_point = [row["neck_X"][sequence_index] - 3 * head_metric,
                                  row["leftEye_Y"][sequence_index] + (head_metric / 2)]
                ending_point = [row["neck_X"][sequence_index] + 3 * head_metric, starting_point[1] - 6 * head_metric]

                last_starting_point, last_ending_point = starting_point, ending_point

            # Ensure that all of the bounding-box-defining coordinates are not out of the picture
            if starting_point[0] < 0: starting_point[0] = 0
            if starting_point[1] < 0: starting_point[1] = 0
            if ending_point[0] < 0: ending_point[0] = 0
            if ending_point[1] < 0: ending_point[1] = 0

            # A bounding box without any width or height cannot serve as the reference for the normalization
            if (ending_point[0] - starting_point[0]) == 0 or (starting_point[1] - ending_point[1]) == 0:
                logging.info("Problematic normalization")
                valid_sequence = False
                continue

            # Normalize individual landmarks and save the results
            for identifier in BODY_IDENTIFIERS:
                key = identifier + "_"

                # Prevent from trying to normalize incorrectly captured points
                if row[key + "X"][sequence_index] == 0:
                    continue

                normalized_x = (row[key + "X"][sequence_index] - starting_point[0]) / (ending_point[0] -
                                                                                        starting_point[0])
                normalized_y = (row[key + "Y"][sequence_index] - ending_point[1]) / (starting_point[1] -
                                                                                      ending_point[1])

                row[key + "X"][sequence_index] = normalized_x
                row[key + "Y"][sequence_index] = normalized_y

        if not valid_sequence:
            logging.warning(" BODY LANDMARKS: One video instance could not be normalized.")
            invalid_row_indexes.append(index)

        # The row is kept in the output regardless of its validity
        collected_rows.append(row)

    normalized_df = pd.DataFrame(collected_rows, columns=df.columns).reset_index(drop=True)

    print("The normalization of body is finished.")
    print("\t-> Original size:", df.shape[0])
    print("\t-> Normalized size:", normalized_df.shape[0])
    print("\t-> Problematic videos:", len(invalid_row_indexes))

    return normalized_df, invalid_row_indexes
126
+
127
+
128
def normalize_single_dict(row: dict):
    """
    Normalizes the skeletal data for a given sequence of frames with signer's body pose data. The normalization follows
    the definition from our paper.

    For every frame, a bounding box is derived from the head metric (the shoulder distance or, if a shoulder is
    missing, the neck-nose distance): it spans one head metric to each side of the neck horizontally, and vertically
    from half a head metric before the left eye's Y coordinate to three head metrics past that point. All the body
    landmarks whose X coordinate is not 0 are then expressed relative to this box. Frames lacking the landmarks
    needed to build the box reuse the box of the preceding frame, if there is one.

    NOTE(review): the input dictionary is modified in place and `original_row` refers to the very same object, so
    both return statements hand back the same (possibly partially normalized) data.

    :param row: Dictionary containing key-value pairs with joint identifiers and corresponding lists (sequences) of
                that particular joints coordinates
    :return: Dictionary with normalized skeletal data (following the same schema as input data)
    """

    sequence_size = len(row["leftEar"])
    valid_sequence = True
    # Plain reference, not a copy (see the note in the docstring)
    original_row = row

    last_starting_point, last_ending_point = None, None

    # Treat each element of the sequence (analyzed frame) individually
    for sequence_index in range(sequence_size):

        # Prevent from even starting the analysis if some necessary elements are not present
        if (row["leftShoulder"][sequence_index][0] == 0 or row["rightShoulder"][sequence_index][0] == 0) and (
                row["neck"][sequence_index][0] == 0 or row["nose"][sequence_index][0] == 0):
            if not last_starting_point:
                # No earlier bounding box to fall back to: the frame is left untouched
                valid_sequence = False
                continue

            else:
                # Reuse the bounding box of the most recent frame for which it could be computed
                starting_point, ending_point = last_starting_point, last_ending_point

        else:

            # NOTE:
            #
            # While in the paper, it is written that the head metric is calculated by halving the shoulder distance,
            # this is meant for the distance between the very ends of one's shoulder, as literature studying body
            # metrics and ratios generally states. The Vision Pose Estimation API, however, seems to be predicting
            # rather the center of one's shoulder. Based on our experiments and manual reviews of the data, employing
            # this as just the plain shoulder distance seems to be more corresponding to the desired metric.
            #
            # Please, review this if using other third-party pose estimation libraries.

            if row["leftShoulder"][sequence_index][0] != 0 and row["rightShoulder"][sequence_index][0] != 0:
                left_shoulder = (row["leftShoulder"][sequence_index][0], row["leftShoulder"][sequence_index][1])
                right_shoulder = (row["rightShoulder"][sequence_index][0], row["rightShoulder"][sequence_index][1])
                shoulder_distance = ((((left_shoulder[0] - right_shoulder[0]) ** 2) + (
                        (left_shoulder[1] - right_shoulder[1]) ** 2)) ** 0.5)
                head_metric = shoulder_distance
            else:
                neck = (row["neck"][sequence_index][0], row["neck"][sequence_index][1])
                nose = (row["nose"][sequence_index][0], row["nose"][sequence_index][1])
                neck_nose_distance = ((((neck[0] - nose[0]) ** 2) + ((neck[1] - nose[1]) ** 2)) ** 0.5)
                head_metric = neck_nose_distance

            # Set the starting and ending point of the normalization bounding box
            # (the commented-out lines below are a previous, wider definition of the starting point)
            #starting_point = [row["neck"][sequence_index][0] - 3 * head_metric,
            #                  row["leftEye"][sequence_index][1] + (head_metric / 2)]
            starting_point = [row["neck"][sequence_index][0] - 1 * head_metric,
                              row["leftEye"][sequence_index][1] - head_metric/2]
            ending_point = [row["neck"][sequence_index][0] + 1 * head_metric,
                            starting_point[1] + 3 * head_metric]

            # The remembered points are the same list objects, so the clamping below affects them as well
            last_starting_point, last_ending_point = starting_point, ending_point

        # Ensure that all of the bounding-box-defining coordinates are not out of the picture
        # NOTE(review): the X coordinates are only limited from below (0) and the Y coordinates only from above (1)
        # -- confirm this is intended before changing it, as the outputs presumably feed an already trained model
        if starting_point[0] < 0: starting_point[0] = 0
        if starting_point[1] > 1: starting_point[1] = 1
        if ending_point[0] < 0: ending_point[0] = 0
        if ending_point[1] > 1: ending_point[1] = 1

        # Normalize individual landmarks and save the results
        for identifier in BODY_IDENTIFIERS:
            key = identifier

            # Prevent from trying to normalize incorrectly captured points
            if row[key][sequence_index][0] == 0:
                continue

            # A bounding box with zero width or height would cause a division by zero; the rest of this frame is
            # skipped (the break leaves only the loop over the identifiers)
            if (ending_point[0] - starting_point[0]) == 0 or (starting_point[1] - ending_point[1]) == 0:
                logging.info("Problematic normalization")
                valid_sequence = False
                break

            normalized_x = (row[key][sequence_index][0] - starting_point[0]) / (ending_point[0] - starting_point[0])
            normalized_y = (row[key][sequence_index][1] - starting_point[1]) / (ending_point[1] - starting_point[1])

            # Rewrite the frame entry from a list so that its items can be assigned (it may arrive as a tuple)
            row[key][sequence_index] = list(row[key][sequence_index])

            row[key][sequence_index][0] = normalized_x
            row[key][sequence_index][1] = normalized_y

    if valid_sequence:
        return row

    else:
        return original_row
223
+
224
+
225
+ if __name__ == "__main__":
226
+ pass
spoter_mod/normalization/hand_normalization.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import logging
3
+ import pandas as pd
4
+
5
HAND_IDENTIFIERS = [
    "wrist",
    "indexTip",
    "indexDIP",
    "indexPIP",
    "indexMCP",
    "middleTip",
    "middleDIP",
    "middlePIP",
    "middleMCP",
    "ringTip",
    "ringDIP",
    "ringPIP",
    "ringMCP",
    "littleTip",
    "littleDIP",
    "littlePIP",
    "littleMCP",
    "thumbTip",
    "thumbIP",
    "thumbMP",
    "thumbCMC"
]


def normalize_hands_full(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalizes the hands position data using the Bohacek-normalization algorithm.

    For every hand and frame, a square bounding box is built around the captured landmarks (those with non-zero
    values): the tight box is padded by 10 % of its longer side and its shorter side is extended to match. The
    landmarks are then expressed relative to this box. Landmarks with the X coordinate equal to 0 are left as they
    are, as are the frames with no captured landmarks and the frames whose box has a zero width or height.

    The columns of the given DataFrame are renamed from the "_left_" / "_right_" naming to "_0_" / "_1_", and the
    coordinate lists held in its cells are modified in place.

    :param df: pd.DataFrame to be normalized; each "<identifier>_<hand>_X" / "<identifier>_<hand>_Y" cell holds a
               list with one value per frame
    :return: pd.DataFrame with normalized values for hand pose
    """

    df.columns = [item.replace("_left_", "_0_").replace("_right_", "_1_") for item in list(df.columns)]

    # Rows are collected in a list and turned into a DataFrame at once, since DataFrame.append is no longer
    # available in pandas 2.0 and later
    collected_rows = []

    hand_landmarks = {"X": {0: [], 1: []}, "Y": {0: [], 1: []}}

    # Determine how many hands are present in the dataset
    range_hand_size = 1
    if "wrist_1_X" in df.columns:
        range_hand_size = 2

    # Construct the relevant identifiers
    for identifier in HAND_IDENTIFIERS:
        for hand_index in range(range_hand_size):
            hand_landmarks["X"][hand_index].append(identifier + "_" + str(hand_index) + "_X")
            hand_landmarks["Y"][hand_index].append(identifier + "_" + str(hand_index) + "_Y")

    # Iterate over all of the records in the dataset
    for index, row in df.iterrows():
        # Treat each hand individually
        for hand_index in range(range_hand_size):

            sequence_size = len(row["wrist_" + str(hand_index) + "_X"])

            # Treat each element of the sequence (analyzed frame) individually
            for sequence_index in range(sequence_size):

                # Retrieve all of the X and Y values of the current frame
                landmarks_x_values = [row[key][sequence_index] for key in hand_landmarks["X"][hand_index]
                                      if row[key][sequence_index] != 0]
                landmarks_y_values = [row[key][sequence_index] for key in hand_landmarks["Y"][hand_index]
                                      if row[key][sequence_index] != 0]

                # Prevent from even starting the analysis if some necessary elements are not present
                if not landmarks_x_values or not landmarks_y_values:
                    logging.warning(
                        " HAND LANDMARKS: One frame could not be normalized as there is no data present. Record: " + str(index) +
                        ", Frame: " + str(sequence_index))
                    continue

                # Calculate the deltas, i.e. the padding turning the tight box into a square one
                width, height = max(landmarks_x_values) - min(landmarks_x_values), max(landmarks_y_values) - min(
                    landmarks_y_values)
                if width > height:
                    delta_x = 0.1 * width
                    delta_y = delta_x + ((width - height) / 2)
                else:
                    delta_y = 0.1 * height
                    delta_x = delta_y + ((height - width) / 2)

                # Set the starting and ending point of the normalization bounding box
                starting_point = (min(landmarks_x_values) - delta_x, min(landmarks_y_values) - delta_y)
                ending_point = (max(landmarks_x_values) + delta_x, max(landmarks_y_values) + delta_y)

                # Normalize individual landmarks and save the results
                for identifier in HAND_IDENTIFIERS:
                    key = identifier + "_" + str(hand_index) + "_"

                    # Prevent from trying to normalize incorrectly captured points (and from dividing by zero)
                    if row[key + "X"][sequence_index] == 0 or (ending_point[0] - starting_point[0]) == 0 or (
                            starting_point[1] - ending_point[1]) == 0:
                        continue

                    normalized_x = (row[key + "X"][sequence_index] - starting_point[0]) / (ending_point[0] -
                                                                                            starting_point[0])
                    normalized_y = (row[key + "Y"][sequence_index] - ending_point[1]) / (starting_point[1] -
                                                                                          ending_point[1])

                    row[key + "X"][sequence_index] = normalized_x
                    row[key + "Y"][sequence_index] = normalized_y

        collected_rows.append(row)

    return pd.DataFrame(collected_rows, columns=df.columns).reset_index(drop=True)
110
+
111
+
112
def normalize_single_dict(row: dict):
    """
    Rescales the hand landmarks of a single sequence of frames relative to a per-frame, per-hand bounding box.
    The dictionary is modified in place.

    :param row: Dictionary whose keys are joint identifiers suffixed with the hand index ("_0", optionally "_1") and
                whose values are sequences holding one (x, y) coordinate pair per frame
    :return: The same dictionary, with the coordinates of every normalizable hand landmark replaced
    """

    # The second hand is only processed when its wrist is part of the input
    hand_count = 2 if "wrist_1" in row else 1

    for hand_index in range(hand_count):
        suffix = "_" + str(hand_index)
        joint_keys = [identifier + suffix for identifier in HAND_IDENTIFIERS]

        # The wrist sequence determines how many frames are analyzed for this hand
        for frame in range(len(row["wrist" + suffix])):

            # Coordinates equal to zero are treated as "not captured" and left out of the bounding box
            xs = [row[key][frame][0] for key in joint_keys if row[key][frame][0] != 0]
            ys = [row[key][frame][1] for key in joint_keys if row[key][frame][1] != 0]

            # Nothing to normalize against in this frame
            if not (xs and ys):
                continue

            min_x, max_x = min(xs), max(xs)
            min_y, max_y = min(ys), max(ys)
            width, height = max_x - min_x, max_y - min_y

            # Pad the longer side by 10 % and the shorter side so that the resulting box is square
            if width > height:
                delta_x = 0.1 * width
                delta_y = delta_x + ((width - height) / 2)
            else:
                delta_y = 0.1 * height
                delta_x = delta_y + ((height - width) / 2)

            start_x, start_y = min_x - delta_x, min_y - delta_y
            end_x, end_y = max_x + delta_x, max_y + delta_y

            # Keep the bounding box inside the picture
            # NOTE(review): only "start x < 0" and "end y > 1" can realistically trigger; the other two conditions
            # look mirrored. Left untouched on purpose, since changing it would alter the produced features -
            # confirm against the data the model was trained on.
            if start_x < 0:
                start_x = 0
            if start_y > 1:
                start_y = 1
            if end_x < 0:
                end_x = 0
            if end_y > 1:
                end_y = 1

            box_width = end_x - start_x
            box_height = end_y - start_y

            # A degenerate box cannot be used for any landmark of this frame
            if box_width == 0 or box_height == 0:
                continue

            for key in joint_keys:
                point = row[key][frame]

                # Skip landmarks which were not captured correctly
                if point[0] == 0:
                    continue

                normalized_x = (point[0] - start_x) / box_width
                normalized_y = (point[1] - start_y) / box_height

                # Store the result as a list so that immutable (tuple) inputs can be replaced as well
                updated = list(point)
                updated[0] = normalized_x
                updated[1] = normalized_y
                row[key][frame] = updated

    return row
189
+
190
+
191
+ if __name__ == "__main__":
192
+ pass
spoter_mod/normalization/main.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import ast
3
+ import pandas as pd
4
+
5
+ from normalization.hand_normalization import normalize_hands_full
6
+ from normalization.body_normalization import normalize_body_full
7
+
8
+
9
# Load the dataset
df = pd.read_csv("/Users/matyasbohacek/Documents/WLASL_test_15fps.csv", encoding="utf-8")

# Pull the video size metadata out of the frame; it takes no part in the normalization
video_size_heights = df.pop("video_size_height").to_list()
video_size_widths = df.pop("video_size_width").to_list()

# Set the remaining metadata aside for the duration of the normalization
labels = df.pop("labels").to_list()
video_fps = df.pop("video_fps").to_list()


def convert(x):
    """Parse the textual form of a cell back into a Python literal."""
    return ast.literal_eval(str(x))


# Convert the strings into lists
for column in df.columns:
    df[column] = df[column].apply(convert)

# Perform the normalizations
df = normalize_hands_full(df)
df, invalid_row_indexes = normalize_body_full(df)

# NOTE(review): invalid_row_indexes is not used to filter `labels` / `video_fps` below (the filtering was commented
# out in the original script) - confirm that the body normalization keeps every row.

# Return the metadata back to the dataset
df["labels"] = labels
df["video_fps"] = video_fps

df.to_csv("/Users/matyasbohacek/Desktop/WLASL_test_15fps_normalized.csv", encoding="utf-8", index=False)
spoter_mod/pose_model_identifier.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# Body joint name -> index into the pose landmark list ("neck" has no landmark of its own and is computed)
BODY_IDENTIFIERS = {
    "nose": 0,
    "neck": -1,
    "rightEye": 5,
    "leftEye": 2,
    "rightEar": 8,
    "leftEar": 7,
    "rightShoulder": 12,
    "leftShoulder": 11,
    "rightElbow": 14,
    "leftElbow": 13,
    "rightWrist": 16,
    "leftWrist": 15,
}

# Hand joint name -> index into the hand landmark list
HAND_IDENTIFIERS = {
    "wrist": 0,
    "indexTip": 8,
    "indexDIP": 7,
    "indexPIP": 6,
    "indexMCP": 5,
    "middleTip": 12,
    "middleDIP": 11,
    "middlePIP": 10,
    "middleMCP": 9,
    "ringTip": 16,
    "ringDIP": 15,
    "ringPIP": 14,
    "ringMCP": 13,
    "littleTip": 20,
    "littleDIP": 19,
    "littlePIP": 18,
    "littleMCP": 17,
    "thumbTip": 4,
    "thumbIP": 3,
    "thumbMP": 2,
    "thumbCMC": 1,
}


class mp_holistic_data:
    """Collects, frame by frame, the X/Y coordinates of the body and hand joints into per-column lists."""

    def __init__(self, column_names):
        """
        :param column_names: Column names; the first and the last entry are skipped, every other name gets its own
                             (initially empty) list of values
        """
        self.data_hub = {name: [] for name in column_names[1:-1]}

    def hand_append_zero(self, handedness):
        """Appends a zero to every column belonging to the given hand ("Right" / "Left")."""
        marker = "_" + handedness + "_"
        for column, values in self.data_hub.items():
            if marker in column:
                values.append(0)

    def hand_append_value(self, handedness, hand_landmarks):
        """Appends the coordinates of all the hand joints of the given hand for the current frame."""
        for name, lm_idx in HAND_IDENTIFIERS.items():
            landmark = hand_landmarks.landmark[lm_idx]
            x_value, y_value = landmark.x, landmark.y
            column_prefix = name + '_' + handedness
            self.data_hub[column_prefix + '_X'].append(x_value)
            self.data_hub[column_prefix + '_Y'].append(y_value)

    def get_series(self):
        """Returns the collected data as a pandas Series indexed by the column names."""
        return pd.Series(self.data_hub)

    def extract_data(self, holistic_results):
        """
        Appends the data of a single analyzed frame. Frames for which no pose landmarks are available are skipped
        entirely, i.e. nothing is appended for them.
        """
        # for the frame that can not extract skeleton from
        if not holistic_results.pose_landmarks:
            return

        def neck_position():
            landmarks = holistic_results.pose_landmarks.landmark
            left_shoulder, right_shoulder, nose = landmarks[11], landmarks[12], landmarks[0]

            all_visible = (left_shoulder.visibility > 0.5 and right_shoulder.visibility > 0.5
                           and nose.visibility > 0.5)
            if not all_visible:
                return [0, 0]

            # Start at the midpoint between the shoulders and move 30 % of the way towards the nose
            center_x = (left_shoulder.x + right_shoulder.x) / 2
            center_y = (left_shoulder.y + right_shoulder.y) / 2
            return [center_x + 0.3 * (nose.x - center_x), center_y + 0.3 * (nose.y - center_y)]

        for name, lm_idx in BODY_IDENTIFIERS.items():
            if name == "neck":
                x_value, y_value = neck_position()
            else:
                landmark = holistic_results.pose_landmarks.landmark[lm_idx]
                # Joints with visibility below 0.5 are stored as zeros
                visible = float(landmark.visibility >= 0.5)
                x_value, y_value = landmark.x * visible, landmark.y * visible

            self.data_hub[name + '_X'].append(x_value)
            self.data_hub[name + '_Y'].append(y_value)

        hands = (
            ('Right', holistic_results.right_hand_landmarks),
            ('Left', holistic_results.left_hand_landmarks),
        )
        for handedness, hand_landmarks in hands:
            if hand_landmarks:
                self.hand_append_value(handedness, hand_landmarks)
            else:
                self.hand_append_zero(handedness)
        return
spoter_mod/requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ pandas==1.1.5
2
+ tqdm==4.54.1
3
+ matplotlib
4
+ torch==1.8.1
5
+ torchvision
6
+ scikit-learn
7
+ opencv-python
spoter_mod/skeleton_extractor.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+
3
+ import pandas as pd
4
+ from os import path
5
+ import cv2
6
+ import mediapipe as mp
7
+ import json
8
+ from spoter_mod.pose_model_identifier import BODY_IDENTIFIERS, HAND_IDENTIFIERS, mp_holistic_data
9
+
10
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic
mp_drawing_styles = mp.solutions.drawing_styles

# A single Holistic instance is created at import time and shared by every call in this module
holistic = mp_holistic.Holistic()

# Column layout: video id, body joints (X/Y), right hand joints (X/Y), left hand joints (X/Y), labels
column_names = ['video_id']
column_names += [id_name + xy for id_name in BODY_IDENTIFIERS.keys() for xy in ("_X", "_Y")]
column_names += [
    id_name + lr + xy
    for lr in ("_Right", "_Left")
    for id_name in HAND_IDENTIFIERS.keys()
    for xy in ("_X", "_Y")
]
column_names.append('labels')
28
+
29
+
30
def create_df(flnm, column_names):
    """
    Builds an empty DataFrame with the given columns.

    :param flnm: Unused; kept so that existing callers do not break
    :param column_names: Names of the columns of the new DataFrame
    :return: Empty DataFrame with the requested columns
    """
    return pd.DataFrame(columns=column_names)
33
+
34
+
35
def save_data(df, data, flnm):
    """
    Appends the collected pose data as one new row to the DataFrame and pickles the result.

    :param df: DataFrame the new row is appended to (the passed object itself is not modified)
    :param data: Object exposing `get_series()`, which returns the row as a pandas Series
    :param flnm: Path of the pickle file to write
    """
    # DataFrame.append was removed in pandas 2.0; concatenating a one-row frame is the equivalent which works
    # across pandas versions
    new_row = data.get_series().to_frame().T
    df = pd.concat([df, new_row], ignore_index=True)
    df.to_pickle(flnm)
38
+
39
+
40
def obtain_pose_data(path):
    """
    Runs the holistic pose estimation over every frame of a video and collects the resulting landmarks.

    :param path: Path of the video file, passed to cv2.VideoCapture
    :return: mp_holistic_data instance holding the data of all the frames which could be read
    """
    cap = cv2.VideoCapture(path)
    data = mp_holistic_data(column_names)

    # Release the capture even if decoding or pose estimation raises, so the handle never leaks
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Recolor image to RGB
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Make detection
            holistic_results = holistic.process(image)

            # Extract feature and save to mp_pose_data class
            data.extract_data(holistic_results)
    finally:
        cap.release()

    return data
57
+
58
+
59
+ if __name__ == '__main__':
60
+ pass
spoter_mod/sweep-agent.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
#PBS -N spoter-zhuo-sweep
#PBS -q gpu
#PBS -l walltime=24:00:00

#PBS -l select=1:ncpus=1:ngpus=1:cluster=adan:mem=10gb
#PBS -j oe
#PBS -m ae

echo "Experiment starting..."

# Stop right away if the project directory is not available; otherwise the agent would start in the wrong directory
cd /storage/plzen4-ntis/home/mbohacek/spoter-zhuo || exit 1

module add conda-modules
conda activate cslr-transformers

wandb agent matyasbohacek/Zhuo-collab-SPOTER-Sweep/bh6fc056
18
+
spoter_mod/sweep.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Bayesian hyperparameter sweep over the augmentation settings of train.py
program: train.py
method: bayes
project: Zhuo-collab-SPOTER-Sweep
metric:
  name: best-accuracy
  goal: maximize
parameters:
  augmentations_probability:
    min: 0.20
    max: 0.75
  rotate_angle:
    min: 5
    max: 20
  perspective_transform_ratio:
    min: 0.05
    max: 0.2
  squeeze_ratio:
    min: 0.05
    max: 0.4
  arm_joint_rotate_angle:
    min: 1
    max: 10
  arm_joint_rotate_probability:
    min: 0.2
    max: 0.5
command:
  - python3
  - "-m"
  - train
  - "--epochs"
  - 130
  - "--num_classes"
  - 100
  - "--lr"
  - 0.001
  - "--experiment_name"
  - "zhuo-repro"
  - "--training_set_path"
  - "/storage/plzen4-ntis/home/mbohacek/spoter-zhuo/WLASL100_zhuo_train.csv"
  - "--testing_set_path"
  - "/storage/plzen4-ntis/home/mbohacek/spoter-zhuo/WLASL100_zhuo_test.csv"
  - "--validation_set_path"
  - "/storage/plzen4-ntis/home/mbohacek/spoter-zhuo/WLASL100_zhuo_val.csv"
  - "--validation_set"
  - "from-file"
  - "--wandb_key"
  # SECURITY: never commit the API key. It is read from the WANDB_API_KEY environment variable of the agent.
  # The key which used to be hard-coded here has been published and should be revoked.
  - "${envvar:WANDB_API_KEY}"
  - "--wandb_entity"
  - "matyasbohacek"
  - ${args}
spoter_mod/train.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import argparse
4
+ import random
5
+ import logging
6
+ import torch
7
+ import wandb
8
+
9
+ import numpy as np
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.ticker as ticker
14
+ from torchvision import transforms
15
+ from torch.utils.data import DataLoader
16
+ from pathlib import Path
17
+
18
+ from utils import __balance_val_split, __split_of_train_sequence
19
+ from datasets.czech_slr_dataset import CzechSLRDataset
20
+ from spoter.spoter_model import SPOTER
21
+ from spoter.utils import train_epoch, evaluate
22
+ from spoter.gaussian_noise import GaussianNoise
23
+
24
+
25
def get_default_args():
    """
    Builds the argument parser holding every option of the training script.

    The parser is created with add_help=False so that it can be used as a parent of another parser.

    :return: argparse.ArgumentParser with all the training options registered
    """

    def str_to_bool(value):
        # argparse's type=bool treats every non-empty string (including "False") as True, so booleans are
        # parsed explicitly. The empty string stays falsy, as it was with type=bool.
        if isinstance(value, bool):
            return value

        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False

        raise argparse.ArgumentTypeError("Expected a boolean value, got: " + repr(value))

    parser = argparse.ArgumentParser(add_help=False)

    parser.add_argument("--experiment_name", type=str, default="lsa_64_spoter",
                        help="Name of the experiment after which the logs and plots will be named")
    parser.add_argument("--num_classes", type=int, default=64, help="Number of classes to be recognized by the model")
    parser.add_argument("--hidden_dim", type=int, default=108,
                        help="Hidden dimension of the underlying Transformer model")
    parser.add_argument("--seed", type=int, default=379,
                        help="Seed with which to initialize all the random components of the training")

    # Data
    parser.add_argument("--training_set_path", type=str, default="", help="Path to the training dataset CSV file")
    parser.add_argument("--testing_set_path", type=str, default="", help="Path to the testing dataset CSV file")
    parser.add_argument("--experimental_train_split", type=float, default=None,
                        help="Determines how big a portion of the training set should be employed (intended for the "
                             "gradually enlarging training set experiment from the paper)")

    parser.add_argument("--validation_set", type=str, choices=["from-file", "split-from-train", "none"],
                        default="from-file", help="Type of validation set construction. See README for further rederence")
    parser.add_argument("--validation_set_size", type=float,
                        help="Proportion of the training set to be split as validation set, if 'validation_set' is set"
                             " to 'split-from-train'")
    parser.add_argument("--validation_set_path", type=str, default="", help="Path to the validation dataset CSV file")

    # Training hyperparameters
    parser.add_argument("--epochs", type=int, default=100, help="Number of epochs to train the model for")
    parser.add_argument("--lr", type=float, default=0.001, help="Learning rate for the model training")
    parser.add_argument("--log_freq", type=int, default=1,
                        help="Log frequency (frequency of printing all the training info)")

    # Checkpointing
    parser.add_argument("--save_checkpoints", type=str_to_bool, default=True,
                        help="Determines whether to save weights checkpoints")

    # Scheduler (the factor is fractional, hence float rather than int)
    parser.add_argument("--scheduler_factor", type=float, default=0.1, help="Factor for the ReduceLROnPlateau scheduler")
    parser.add_argument("--scheduler_patience", type=int, default=5,
                        help="Patience for the ReduceLROnPlateau scheduler")

    # Gaussian noise normalization (both parameters are real-valued, hence float rather than int)
    parser.add_argument("--gaussian_mean", type=float, default=0, help="Mean parameter for Gaussian noise layer")
    parser.add_argument("--gaussian_std", type=float, default=0.001,
                        help="Standard deviation parameter for Gaussian noise layer")

    # Augmentations
    parser.add_argument("--augmentations_probability", type=float, default=0.5, help="")  # 0.462
    parser.add_argument("--rotate_angle", type=int, default=17, help="")  # 17
    parser.add_argument("--perspective_transform_ratio", type=float, default=0.2, help="")  # 0.1682
    parser.add_argument("--squeeze_ratio", type=float, default=0.4, help="")  # 0.3971
    parser.add_argument("--arm_joint_rotate_angle", type=int, default=4, help="")  # 3
    parser.add_argument("--arm_joint_rotate_probability", type=float, default=0.4, help="")  # 0.3596

    # Visualization
    parser.add_argument("--plot_stats", type=str_to_bool, default=True,
                        help="Determines whether continuous statistics should be plotted at the end")
    parser.add_argument("--plot_lr", type=str_to_bool, default=True,
                        help="Determines whether the LR should be plotted at the end")

    # WANDB
    parser.add_argument("--wandb_key", type=str, default="", help="")
    parser.add_argument("--wandb_entity", type=str, default="", help="")

    return parser
88
+
89
+
90
def train(args):
    """
    Trains a SPOTER model, evaluates the stored checkpoints on the testing set and plots the training statistics.

    Weights & Biases logging is optional: a run is only started when `args.wandb_key` is given, and nothing is sent
    to W&B otherwise.

    :param args: Parsed arguments as produced by the parser from `get_default_args`. `args.experiment_name` is
                 extended by the W&B run id when a run is active.
    """

    if args.wandb_key:
        wandb.login(key=args.wandb_key)
        wandb.init(project=args.experiment_name, entity=args.wandb_entity)
        wandb.config.update(args)

    # Every later W&B call is guarded by this flag, so that training also works without a W&B run
    wandb_active = wandb.run is not None

    # MARK: TRAINING PREPARATION AND MODULES
    if wandb_active:
        args.experiment_name = args.experiment_name + "_lr" + wandb.run.id

    # Initialize all the random seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    os.environ["PYTHONHASHSEED"] = str(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True
    g = torch.Generator()
    g.manual_seed(args.seed)

    # Set the output format to print into the console and save into LOG file
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            logging.FileHandler(args.experiment_name + "_" + str(args.experimental_train_split).replace(".", "") + ".log")
        ]
    )

    # Set device to CUDA only if applicable
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda")

    # Construct the model
    slrt_model = SPOTER(num_classes=args.num_classes, hidden_dim=args.hidden_dim)
    slrt_model.train(True)
    slrt_model.to(device)

    # Construct the other modules
    cel_criterion = nn.CrossEntropyLoss()
    sgd_optimizer = optim.SGD(slrt_model.parameters(), lr=args.lr)
    # NOTE(review): the scheduler is constructed but never stepped within this function - confirm whether that
    # is intended before relying on --scheduler_factor / --scheduler_patience.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(sgd_optimizer, factor=args.scheduler_factor, patience=args.scheduler_patience)

    # Ensure that the path for checkpointing and for images both exist
    checkpoint_dir = "out-checkpoints/" + args.experiment_name + "/"
    Path(checkpoint_dir).mkdir(parents=True, exist_ok=True)
    Path("out-img/").mkdir(parents=True, exist_ok=True)

    # MARK: DATA

    # Training set
    transform = transforms.Compose([GaussianNoise(args.gaussian_mean, args.gaussian_std)])
    augmentations_config = {
        "rotate-angle": args.rotate_angle,
        "perspective-transform-ratio": args.perspective_transform_ratio,
        "squeeze-ratio": args.squeeze_ratio,
        "arm-joint-rotate-angle": args.arm_joint_rotate_angle,
        "arm-joint-rotate-probability": args.arm_joint_rotate_probability
    }

    train_set = CzechSLRDataset(args.training_set_path, transform=transform, augmentations=True,
                                augmentations_prob=args.augmentations_probability, augmentations_config=augmentations_config)

    # Validation set
    if args.validation_set == "from-file":
        val_set = CzechSLRDataset(args.validation_set_path)
        val_loader = DataLoader(val_set, shuffle=True, generator=g)

    elif args.validation_set == "split-from-train":
        # Honor --validation_set_size; 0.2 stays the fallback when the option is not given
        val_split = args.validation_set_size if args.validation_set_size is not None else 0.2
        train_set, val_set = __balance_val_split(train_set, val_split)

        # NOTE(review): val_set is a wrapper around the training dataset here; presumably these attributes are
        # meant to disable the transform/augmentations for validation - confirm they reach the wrapped dataset.
        val_set.transform = None
        val_set.augmentations = False
        val_loader = DataLoader(val_set, shuffle=True, generator=g)

    else:
        val_loader = None

    # Testing set
    if args.testing_set_path:
        eval_set = CzechSLRDataset(args.testing_set_path)
        eval_loader = DataLoader(eval_set, shuffle=True, generator=g)

    else:
        eval_loader = None

    # Final training set refinements
    if args.experimental_train_split:
        train_set = __split_of_train_sequence(train_set, args.experimental_train_split)

    train_loader = DataLoader(train_set, shuffle=True, generator=g)

    # MARK: TRAINING
    train_acc, val_acc = 0, 0
    losses, train_accs, val_accs = [], [], []
    lr_progress = []
    top_train_acc, top_val_acc = 0, 0
    checkpoint_index = 0

    if args.experimental_train_split:
        print("Starting " + args.experiment_name + "_" + str(args.experimental_train_split).replace(".", "") + "...\n\n")
        logging.info("Starting " + args.experiment_name + "_" + str(args.experimental_train_split).replace(".", "") + "...\n\n")

    else:
        print("Starting " + args.experiment_name + "...\n\n")
        logging.info("Starting " + args.experiment_name + "...\n\n")

    for epoch in range(args.epochs):
        train_loss, _, _, train_acc = train_epoch(slrt_model, train_loader, cel_criterion, sgd_optimizer, device)
        losses.append(train_loss.item() / len(train_loader))
        train_accs.append(train_acc)

        if val_loader:
            slrt_model.train(False)
            _, _, val_acc = evaluate(slrt_model, val_loader, device)
            slrt_model.train(True)
            val_accs.append(val_acc)

        # Save checkpoints if they are best in the current subset
        if args.save_checkpoints:
            if train_acc > top_train_acc:
                top_train_acc = train_acc
                torch.save(slrt_model, "out-checkpoints/" + args.experiment_name + "/checkpoint_t_" + str(checkpoint_index) + ".pth")

            if val_acc > top_val_acc:
                top_val_acc = val_acc
                torch.save(slrt_model, "out-checkpoints/" + args.experiment_name + "/checkpoint_v_" + str(checkpoint_index) + ".pth")

        if epoch % args.log_freq == 0:
            print("[" + str(epoch + 1) + "] TRAIN  loss: " + str(train_loss.item() / len(train_loader)) + " acc: " + str(train_acc))
            logging.info("[" + str(epoch + 1) + "] TRAIN  loss: " + str(train_loss.item() / len(train_loader)) + " acc: " + str(train_acc))

            if wandb_active:
                wandb.log({
                    "epoch": int(epoch + 1),
                    "train-loss": float(train_loss.item() / len(train_loader)),
                    "train-accuracy": train_acc
                })

            if val_loader:
                print("[" + str(epoch + 1) + "] VALIDATION  acc: " + str(val_acc))
                logging.info("[" + str(epoch + 1) + "] VALIDATION  acc: " + str(val_acc))

                if wandb_active:
                    wandb.log({
                        "validation-accuracy": val_acc
                    })

            print("")
            logging.info("")

        # Reset the top accuracies on static subsets
        if epoch % 10 == 0:
            top_train_acc, top_val_acc = 0, 0
            checkpoint_index += 1

        lr_progress.append(sgd_optimizer.param_groups[0]["lr"])

    # MARK: TESTING

    print("\nTesting checkpointed models starting...\n")
    logging.info("\nTesting checkpointed models starting...\n")

    top_result, top_result_name = 0, ""

    if eval_loader:
        # checkpoint_index itself is included: checkpoints written after the last reset carry this index.
        for i in range(checkpoint_index + 1):
            for checkpoint_id in ["t", "v"]:
                checkpoint_path = "out-checkpoints/" + args.experiment_name + "/checkpoint_" + checkpoint_id + "_" + str(i) + ".pth"

                # A checkpoint only exists if the accuracy improved within its window (and checkpointing was on)
                if not os.path.isfile(checkpoint_path):
                    continue

                # tested_model = VisionTransformer(dim=2, mlp_dim=108, num_classes=100, depth=12, heads=8)
                tested_model = torch.load(checkpoint_path)
                tested_model.train(False)
                _, _, eval_acc = evaluate(tested_model, eval_loader, device, print_stats=True)

                if eval_acc > top_result:
                    top_result = eval_acc
                    top_result_name = args.experiment_name + "/checkpoint_" + checkpoint_id + "_" + str(i)

                print("checkpoint_" + checkpoint_id + "_" + str(i) + "  ->  " + str(eval_acc))
                logging.info("checkpoint_" + checkpoint_id + "_" + str(i) + "  ->  " + str(eval_acc))

        print("\nThe top result was recorded at " + str(top_result) + " testing accuracy. The best checkpoint is " + top_result_name + ".")
        logging.info("\nThe top result was recorded at " + str(top_result) + " testing accuracy. The best checkpoint is " + top_result_name + ".")

        if wandb_active:
            wandb.run.summary["best-accuracy"] = top_result
            wandb.run.summary["best-checkpoint"] = top_result_name

    # PLOT 0: Performance (loss, accuracies) chart plotting
    if args.plot_stats:
        fig, ax = plt.subplots()
        ax.plot(range(1, len(losses) + 1), losses, c="#D64436", label="Training loss")
        ax.plot(range(1, len(train_accs) + 1), train_accs, c="#00B09B", label="Training accuracy")

        if val_loader:
            ax.plot(range(1, len(val_accs) + 1), val_accs, c="#E0A938", label="Validation accuracy")

        ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))

        ax.set(xlabel="Epoch", ylabel="Accuracy / Loss", title="")
        plt.legend(loc="upper center", bbox_to_anchor=(0.5, 1.05), ncol=4, fancybox=True, shadow=True, fontsize="xx-small")
        ax.grid()

        fig.savefig("out-img/" + args.experiment_name + "_loss.png")

    # PLOT 1: Learning rate progress
    if args.plot_lr:
        fig1, ax1 = plt.subplots()
        ax1.plot(range(1, len(lr_progress) + 1), lr_progress, label="LR")
        ax1.set(xlabel="Epoch", ylabel="LR", title="")
        ax1.grid()

        fig1.savefig("out-img/" + args.experiment_name + "_lr.png")

    print("\nAny desired statistics have been plotted.\nThe experiment is finished.")
    logging.info("\nAny desired statistics have been plotted.\nThe experiment is finished.")
307
+
308
+
309
if __name__ == '__main__':
    # Wrap the shared option set in a fresh top-level parser and start the training with the parsed options
    cli_parser = argparse.ArgumentParser("", parents=[get_default_args()], add_help=False)
    train(cli_parser.parse_args())
spoter_mod/utils.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+
4
+ from collections import Counter
5
+ from torch.utils.data import Subset
6
+ from sklearn.model_selection import train_test_split
7
+
8
+
9
def __balance_val_split(dataset, val_split=0.):
    """
    Splits a dataset into a training and a validation subset, stratified by the class labels.

    :param dataset: Dataset exposing the class label of every sample in its `targets` attribute
    :param val_split: Size of the validation part, passed to train_test_split as `test_size`
    :return: Tuple (training Subset, validation Subset) over the given dataset
    """
    labels = np.array(dataset.targets)
    sample_positions = np.arange(labels.shape[0])

    train_indices, val_indices = train_test_split(sample_positions, test_size=val_split, stratify=labels)

    return Subset(dataset, indices=train_indices), Subset(dataset, indices=val_indices)
21
+
22
+
23
def __split_of_train_sequence(subset: Subset, train_split=1.0):
    """
    Reduces a training set to the given proportion of its samples, stratified by the class labels.

    :param subset: Either a Subset, or a plain dataset exposing the class labels in its `targets` attribute
                   (the training script passes a plain dataset when the validation set is loaded from a file)
    :param train_split: Proportion of the samples to keep; 1 returns the input unchanged
    :return: The input itself if `train_split` is 1, otherwise a Subset of the underlying dataset
    """
    if train_split == 1:
        return subset

    # Resolve the underlying dataset and the sample positions for both supported input kinds
    if isinstance(subset, Subset):
        dataset, indices = subset.dataset, list(subset.indices)
    else:
        dataset, indices = subset, list(range(len(subset.targets)))

    targets = np.array([dataset.targets[i] for i in indices])
    kept_positions, _ = train_test_split(
        np.arange(targets.shape[0]),
        test_size=1 - train_split,
        stratify=targets
    )

    # Map the positions within the input back to indices into the underlying dataset
    return Subset(dataset, indices=[indices[i] for i in kept_positions])
37
+
38
+
39
def __log_class_statistics(subset: Subset):
    """
    Prints a dictionary mapping each class label to the number of samples of that class in the subset.

    :param subset: Subset whose underlying dataset exposes the class labels in its `targets` attribute
    """
    class_counts = Counter(subset.dataset.targets[i] for i in subset.indices)
    print(dict(class_counts))
spoter_mod/wandb/debug-cli.log ADDED
File without changes