"""Preprocess VideoAttentionTarget annotations into per-split JSON files."""

import argparse
import glob
import json
import os
from functools import reduce

import numpy as np
import pandas as pd
from PIL import Image

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", type=str, default="./data/videoattentiontarget")
args = parser.parse_args()


def merge_dfs(ls):
    # Suffix every column except the shared "path" key so each person's
    # annotations stay in distinct columns after the merge.
    for i, df in enumerate(ls):
        df.columns = [col if col == "path" else f"{col}_df{i}" for col in df.columns]
    # Outer-join all per-person DataFrames on "path" so frames where a person
    # is unannotated get NaN entries instead of being dropped.
    merged_df = reduce(
        lambda left, right: pd.merge(left, right, on=["path"], how="outer"), ls
    )
    merged_df = merged_df.sort_values(by=["path"])
    merged_df = merged_df.reset_index(drop=True)
    return merged_df
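
# A minimal sketch of what merge_dfs produces, with made-up frame names and
# coordinates; person 0 is annotated in both frames, person 1 only in the first:
#
#   df0: path=00001.jpg xmin=10 ...     df1: path=00001.jpg xmin=50 ...
#        path=00002.jpg xmin=12 ...
#
#   merged: path=00001.jpg xmin_df0=10 ... xmin_df1=50  ...
#           path=00002.jpg xmin_df0=12 ... xmin_df1=NaN ...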


def smooth_by_conv(window_size, df, col):
    """Temporal smoothing on labels to match the original VideoAttTarget evaluation.

    Adapted from https://github.com/ejcgt/attention-target-detection/blob/acd264a3c9e6002b71244dea8c1873e5c5818500/utils/myutils.py
    """
    values = df[col].values
    # Edge-pad both ends so the "valid" convolution returns exactly
    # len(values) samples for an odd window_size.
    padded_track = np.concatenate(
        [values[0].repeat(window_size // 2), values, values[-1].repeat(window_size // 2)]
    )
    # Moving average: convolve with a box filter of width window_size.
    smoothed_signals = np.convolve(
        padded_track.squeeze(), np.ones(window_size) / window_size, mode="valid"
    )
    return smoothed_signals
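
# Worked example of the smoothing above (made-up numbers): with window_size=3,
# a track [0, 0, 9, 0, 0] is edge-padded to [0, 0, 0, 9, 0, 0, 0], and the
# length-3 box filter gives [0, 3, 3, 3, 0] -- the spike is spread over its
# neighbors while the track length stays unchanged.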


def smooth_df(window_size, df):
    # Smooth only the head-box coordinates; the gaze columns are left as-is.
    df["xmin"] = smooth_by_conv(window_size, df, "xmin")
    df["ymin"] = smooth_by_conv(window_size, df, "ymin")
    df["xmax"] = smooth_by_conv(window_size, df, "xmax")
    df["ymax"] = smooth_by_conv(window_size, df, "ymax")
    return df


def main(PATH):
    splits = ["train", "test"]

    for split in splits:
        sequences = []
        max_num_ppl = 0
        for seq_path in glob.glob(os.path.join(PATH, "annotations", split, "*", "*")):
            # Mirror the annotation folder structure under images/.
            seq_img_path = os.path.join("images", *seq_path.split("/")[-2:])
            # Read the frame size from an arbitrary image in the sequence.
            sample_image = os.path.join(
                PATH, seq_img_path, os.listdir(os.path.join(PATH, seq_img_path))[0]
            )
            width, height = Image.open(sample_image).size
            seq_dict = {"path": seq_img_path, "width": width, "height": height}
            frames = []
            # One annotation file per person in the sequence.
            person_files = glob.glob(os.path.join(seq_path, "*"))
            num_ppl = len(person_files)
            if num_ppl > max_num_ppl:
                max_num_ppl = num_ppl
            person_dfs = [
                pd.read_csv(
                    file,
                    header=None,
                    index_col=False,
                    names=["path", "xmin", "ymin", "xmax", "ymax", "gazex", "gazey"],
                )
                for file in person_files
            ]

            # Smooth the head boxes over time with an 11-frame moving average,
            # then align all per-person tracks frame by frame.
            window_size = 11
            person_dfs = [smooth_df(window_size, df) for df in person_dfs]
            merged_df = merge_dfs(person_dfs)
            for _, row in merged_df.iterrows():
                frame_dict = {
                    "path": os.path.join(seq_img_path, row["path"]),
                    "heads": [],
                }
                # After the merge, each person occupies 6 consecutive columns:
                # xmin, ymin, xmax, ymax, gazex, gazey.
                for i in range(1, num_ppl * 6 + 1, 6):
                    # NaN means this person is unannotated in this frame.
                    if not np.isnan(row.iloc[i]):
                        xmin, ymin, xmax, ymax, gazex, gazey = row.iloc[i : i + 6].values.tolist()

                        # Negative gaze coordinates mark out-of-frame targets.
                        # If only one coordinate is negative, clamp it to 0 and
                        # treat the gaze as in-frame; if both are negative, the
                        # target is out of frame (inout = 0).
                        if gazex >= 0 and gazey < 0:
                            gazey = 0
                        elif gazey >= 0 and gazex < 0:
                            gazex = 0
                        inout = int(gazex >= 0 and gazey >= 0)
                        frame_dict["heads"].append(
                            {
                                "bbox": [xmin, ymin, xmax, ymax],
                                "bbox_norm": [
                                    xmin / float(width),
                                    ymin / float(height),
                                    xmax / float(width),
                                    ymax / float(height),
                                ],
                                "gazex": [gazex],
                                "gazex_norm": [gazex / float(width)],
                                "gazey": [gazey],
                                "gazey_norm": [gazey / float(height)],
                                "inout": inout,
                            }
                        )
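                        # Illustrative head entry (hypothetical numbers, for a
                        # 640x480 frame): bbox [100, 50, 200, 150] normalizes
                        # to roughly [0.156, 0.104, 0.312, 0.312], with
                        # "inout": 1 when the gaze target lies inside the frame.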

                frames.append(frame_dict)
            seq_dict["frames"] = frames
            sequences.append(seq_dict)

        print("{} max people per image {}".format(split, max_num_ppl))
        print("{} num unique video sequences {}".format(split, len(sequences)))

        # Use a context manager so the file is flushed and closed on exit.
        with open(os.path.join(PATH, "{}_preprocessed.json".format(split)), "w") as out_file:
            json.dump(sequences, out_file)


if __name__ == "__main__":
    main(args.data_path)
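
# Usage sketch (layout inferred from the code above; the example filename is
# hypothetical -- use whatever this file is saved as):
#   python preprocess_videoattentiontarget.py --data_path ./data/videoattentiontarget
# Expects annotation files under <data_path>/annotations/<split>/*/*/ (one text
# file per person) and the matching frames under <data_path>/images/ with the
# same two-level structure; writes train_preprocessed.json and
# test_preprocessed.json into <data_path>.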