import argparse
import json
import os

import pandas as pd
from PIL import Image

# preprocessing adapted from https://github.com/ejcgt/attention-target-detection/blob/master/dataset.py
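# Converts the raw GazeFollow annotation files into per-frame JSON records: one
# entry per image, each with a list of heads; each head carries a pixel-space and
# normalized bbox, gaze target(s), and a precomputed feasible crop region.
# Writes train_preprocessed.json and test_preprocessed.json into --data_path.
# Run as, e.g.: python <this_script>.py --data_path ./data/gazefollow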
parser = argparse.ArgumentParser()
parser.add_argument("--data_path", type=str, default="./data/gazefollow")
args = parser.parse_args()


def main(DATA_PATH):
    # TRAIN
    train_csv_path = os.path.join(DATA_PATH, "train_annotations_release.txt")
    column_names = ['path', 'idx', 'body_bbox_x', 'body_bbox_y', 'body_bbox_w', 'body_bbox_h', 'eye_x', 'eye_y',
                    'gaze_x', 'gaze_y', 'bbox_x_min', 'bbox_y_min', 'bbox_x_max', 'bbox_y_max', 'inout', 'source', 'meta']
    df = pd.read_csv(train_csv_path, header=None, names=column_names, index_col=False)
    df = df[df['inout'] != -1]  # drop annotations with unknown in/out-of-frame label
    df = df.groupby("path").agg(list)  # aggregate annotations by image path
    multiperson_ex = 0
    TRAIN_FRAMES = []
    for path, row in df.iterrows():
        img_path = os.path.join(DATA_PATH, path)
        img = Image.open(img_path)
        width, height = img.size
        num_people = len(row['idx'])
        if num_people > 1:
            multiperson_ex += 1
        heads = []
        crop_constraint_xs = []
        crop_constraint_ys = []
        for i in range(num_people):
            xmin, ymin, xmax, ymax = row['bbox_x_min'][i], row['bbox_y_min'][i], row['bbox_x_max'][i], row['bbox_y_max'][i]
            gazex = row['gaze_x'][i] * float(width)
            gazey = row['gaze_y'][i] * float(height)
            gazex_norm = row['gaze_x'][i]
            gazey_norm = row['gaze_y'][i]
            if xmin > xmax:  # fix annotations with flipped corners
                xmin, xmax = xmax, xmin
            if ymin > ymax:
                ymin, ymax = ymax, ymin
            # clamp out-of-frame bbox annotations to the image bounds
            xmin = max(xmin, 0)
            ymin = max(ymin, 0)
            xmax = min(xmax, width)
            ymax = min(ymax, height)
            # precalculate feasible crop region (containing bbox and gaze target)
            crop_xmin = min(xmin, gazex)
            crop_ymin = min(ymin, gazey)
            crop_xmax = max(xmax, gazex)
            crop_ymax = max(ymax, gazey)
            crop_constraint_xs.extend([crop_xmin, crop_xmax])
            crop_constraint_ys.extend([crop_ymin, crop_ymax])
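            # note: the frame-level crop_region computed below is the bounding box
            # of all per-head constraints, so any crop containing it keeps every
            # head bbox and gaze target in view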
            heads.append({
                'bbox': [xmin, ymin, xmax, ymax],
                'bbox_norm': [xmin / float(width), ymin / float(height), xmax / float(width), ymax / float(height)],
                'inout': row['inout'][i],
                'gazex': [gazex],  # convert to list for consistency with multi-annotation format
                'gazey': [gazey],
                'gazex_norm': [gazex_norm],
                'gazey_norm': [gazey_norm],
                'crop_region': [crop_xmin, crop_ymin, crop_xmax, crop_ymax],
                'crop_region_norm': [crop_xmin / float(width), crop_ymin / float(height), crop_xmax / float(width), crop_ymax / float(height)],
                'head_id': i,
            })
        TRAIN_FRAMES.append({
            'path': path,
            'heads': heads,
            'num_heads': num_people,
            'width': width,
            'height': height,
            'crop_region': [min(crop_constraint_xs), min(crop_constraint_ys), max(crop_constraint_xs), max(crop_constraint_ys)],
        })
print("Train set: {} frames, {} multi-person".format(len(TRAIN_FRAMES), multiperson_ex))
out_file = open(os.path.join(DATA_PATH, "train_preprocessed.json"), "w")
json.dump(TRAIN_FRAMES, out_file)

    # TEST
    test_csv_path = os.path.join(DATA_PATH, "test_annotations_release.txt")
    column_names = ['path', 'idx', 'body_bbox_x', 'body_bbox_y', 'body_bbox_w', 'body_bbox_h', 'eye_x', 'eye_y',
                    'gaze_x', 'gaze_y', 'bbox_x_min', 'bbox_y_min', 'bbox_x_max', 'bbox_y_max', 'source', 'meta']
    df = pd.read_csv(test_csv_path, header=None, names=column_names, index_col=False)
    TEST_FRAME_DICT = {}
    df = df.groupby(["path", "eye_x"]).agg(list)  # aggregate the multiple annotations for each person
    for key, row in df.iterrows():  # then collect people by frame
        path, _ = key
        if path in TEST_FRAME_DICT:
            TEST_FRAME_DICT[path].append(row)
        else:
            TEST_FRAME_DICT[path] = [row]
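    # TEST_FRAME_DICT now maps image path -> list of per-person rows, where each
    # row holds the lists of gaze annotations for one person (GazeFollow test
    # images carry multiple gaze annotations per person)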
    multiperson_ex = 0
    TEST_FRAMES = []
    for path in TEST_FRAME_DICT.keys():
        img_path = os.path.join(DATA_PATH, path)
        img = Image.open(img_path)
        width, height = img.size
        item = TEST_FRAME_DICT[path]
        num_people = len(item)
        heads = []
        crop_constraint_xs = []
        crop_constraint_ys = []
        for i in range(num_people):
            row = item[i]
            assert row['bbox_x_min'].count(row['bbox_x_min'][0]) == len(row['bbox_x_min'])  # quick check that all bboxes are equivalent
            xmin, ymin, xmax, ymax = row['bbox_x_min'][0], row['bbox_y_min'][0], row['bbox_x_max'][0], row['bbox_y_max'][0]
            if xmin > xmax:  # fix annotations with flipped corners
                xmin, xmax = xmax, xmin
            if ymin > ymax:
                ymin, ymax = ymax, ymin
            # clamp out-of-frame bbox annotations to the image bounds
            xmin = max(xmin, 0)
            ymin = max(ymin, 0)
            xmax = min(xmax, width)
            ymax = min(ymax, height)
            gazex_norm = list(row['gaze_x'])
            gazey_norm = list(row['gaze_y'])
            gazex = [x * float(width) for x in row['gaze_x']]
            gazey = [y * float(height) for y in row['gaze_y']]
            # precalculate feasible crop region (containing bbox and all gaze targets)
            crop_xmin = min(xmin, *gazex)
            crop_ymin = min(ymin, *gazey)
            crop_xmax = max(xmax, *gazex)
            crop_ymax = max(ymax, *gazey)
            crop_constraint_xs.extend([crop_xmin, crop_xmax])
            crop_constraint_ys.extend([crop_ymin, crop_ymax])
            heads.append({
                'bbox': [xmin, ymin, xmax, ymax],
                'bbox_norm': [xmin / float(width), ymin / float(height), xmax / float(width), ymax / float(height)],
                'gazex': gazex,
                'gazey': gazey,
                'gazex_norm': gazex_norm,
                'gazey_norm': gazey_norm,
                'inout': 1,  # all test annotations are in-frame
                'num_annot': len(gazex),
                'crop_region': [crop_xmin, crop_ymin, crop_xmax, crop_ymax],
                'crop_region_norm': [crop_xmin / float(width), crop_ymin / float(height), crop_xmax / float(width), crop_ymax / float(height)],
                'head_id': i,
            })
        # visualize_heads(img_path, heads)
        TEST_FRAMES.append({
            'path': path,
            'heads': heads,
            'num_heads': num_people,
            'width': width,
            'height': height,
            'crop_region': [min(crop_constraint_xs), min(crop_constraint_ys), max(crop_constraint_xs), max(crop_constraint_ys)],
        })
        if num_people > 1:
            multiperson_ex += 1
print("Test set: {} frames, {} multi-person".format(len(TEST_FRAMES), multiperson_ex))
out_file = open(os.path.join(DATA_PATH, "test_preprocessed.json"), "w")
json.dump(TEST_FRAMES, out_file)


if __name__ == "__main__":
    main(args.data_path)
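
# Example of consuming the output (a sketch; path assumes the default --data_path,
# keys are those produced above):
#   import json
#   with open("./data/gazefollow/train_preprocessed.json") as f:
#       frames = json.load(f)
#   frame = frames[0]
#   head = frame['heads'][0]
#   print(frame['path'], head['bbox_norm'], head['gazex_norm'], head['gazey_norm'])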