import argparse
import glob
import json
import os
from functools import reduce

import numpy as np
import pandas as pd
from PIL import Image

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", type=str, default="./data/videoattentiontarget")
args = parser.parse_args()

# preprocessing adapted from https://github.com/ejcgt/attention-target-detection/blob/master/dataset.py


def merge_dfs(ls):
    # give columns unique names so per-person annotations survive the merge
    for i, df in enumerate(ls):
        df.columns = [col if col == "path" else f"{col}_df{i}" for col in df.columns]
    # outer-join all person tracks on the frame path
    merged_df = reduce(
        lambda left, right: pd.merge(left, right, on=["path"], how="outer"), ls
    )
    merged_df = merged_df.sort_values(by=["path"])
    merged_df = merged_df.reset_index(drop=True)
    return merged_df


def smooth_by_conv(window_size, df, col):
    """Temporal smoothing on labels to match the original VideoAttTarget evaluation.
    Adapted from https://github.com/ejcgt/attention-target-detection/blob/acd264a3c9e6002b71244dea8c1873e5c5818500/utils/myutils.py"""
    values = df[col].values
    # pad both ends with the edge values so the moving average preserves track length
    padded_track = np.concatenate(
        [values[0].repeat(window_size // 2), values, values[-1].repeat(window_size // 2)]
    )
    smoothed_signals = np.convolve(
        padded_track.squeeze(), np.ones(window_size) / window_size, mode="valid"
    )
    return smoothed_signals


def smooth_df(window_size, df):
    df["xmin"] = smooth_by_conv(window_size, df, "xmin")
    df["ymin"] = smooth_by_conv(window_size, df, "ymin")
    df["xmax"] = smooth_by_conv(window_size, df, "xmax")
    df["ymax"] = smooth_by_conv(window_size, df, "ymax")
    return df
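
# A minimal sketch of what smooth_by_conv computes (the numbers below are
# illustrative, not taken from the dataset): with window_size = 3, a track
# [1, 2, 3, 4] is edge-padded to [1, 1, 2, 3, 4, 4], and the "valid" moving
# average yields [1.33, 2.0, 3.0, 3.67] -- the same length as the input track.
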
def main(PATH):
    # preprocess by sequence and person track
    splits = ["train", "test"]
    for split in splits:
        sequences = []
        max_num_ppl = 0
        for seq_path in glob.glob(os.path.join(PATH, "annotations", split, "*", "*")):
            seq_img_path = os.path.join("images", *seq_path.split("/")[-2:])
            # read one sample image to get the sequence's frame dimensions
            sample_image = os.path.join(
                PATH, seq_img_path, os.listdir(os.path.join(PATH, seq_img_path))[0]
            )
            width, height = Image.open(sample_image).size
            seq_dict = {"path": seq_img_path, "width": width, "height": height}
            frames = []
            person_files = glob.glob(os.path.join(seq_path, "*"))
            num_ppl = len(person_files)
            max_num_ppl = max(max_num_ppl, num_ppl)
            person_dfs = [
                pd.read_csv(
                    file,
                    header=None,
                    index_col=False,
                    names=["path", "xmin", "ymin", "xmax", "ymax", "gazex", "gazey"],
                )
                for file in person_files
            ]
            # moving-average smoothing to match the original benchmark's evaluation
            window_size = 11
            person_dfs = [smooth_df(window_size, df) for df in person_dfs]
            # merge per-person annotations for the same frames
            merged_df = merge_dfs(person_dfs)
            for _, row in merged_df.iterrows():
                frame_dict = {
                    "path": os.path.join(seq_img_path, row["path"]),
                    "heads": [],
                }
                for i in range(1, num_ppl * 6 + 1, 6):
                    # NaN indicates a lack of continuity (a person leaving the frame
                    # for a period of time)
                    if not np.isnan(row.iloc[i]):
                        xmin, ymin, xmax, ymax, gazex, gazey = row.iloc[i : i + 6].values.tolist()
                        # match the original benchmark's preprocessing of annotations:
                        # clamp a single negative gaze coordinate to 0; both negative
                        # means the gaze target is out of frame
                        if gazex >= 0 and gazey < 0:
                            gazey = 0
                        elif gazey >= 0 and gazex < 0:
                            gazex = 0
                        inout = int(gazex >= 0 and gazey >= 0)
                        frame_dict["heads"].append(
                            {
                                "bbox": [xmin, ymin, xmax, ymax],
                                "bbox_norm": [
                                    xmin / float(width),
                                    ymin / float(height),
                                    xmax / float(width),
                                    ymax / float(height),
                                ],
                                "gazex": [gazex],
                                "gazex_norm": [gazex / float(width)],
                                "gazey": [gazey],
                                "gazey_norm": [gazey / float(height)],
                                "inout": inout,
                            }
                        )
                frames.append(frame_dict)
            seq_dict["frames"] = frames
            sequences.append(seq_dict)
        print("{} max people per image {}".format(split, max_num_ppl))
        print("{} num unique video sequences {}".format(split, len(sequences)))
        with open(os.path.join(PATH, "{}_preprocessed.json".format(split)), "w") as out_file:
            json.dump(sequences, out_file)


if __name__ == "__main__":
    main(args.data_path)
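
# Example invocation (the script filename is assumed; paths follow the standard
# VideoAttentionTarget release layout):
#   python preprocess_videoattentiontarget.py --data_path ./data/videoattentiontarget
#
# Illustrative shape of one entry in the resulting {split}_preprocessed.json
# (field values here are hypothetical):
#   {"path": "images/<clip>/<sequence>", "width": 1280, "height": 720,
#    "frames": [{"path": "images/<clip>/<sequence>/<frame>.jpg",
#                "heads": [{"bbox": [xmin, ymin, xmax, ymax], "bbox_norm": [...],
#                           "gazex": [gx], "gazex_norm": [...],
#                           "gazey": [gy], "gazey_norm": [...],
#                           "inout": 1}]}]}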