# TravelingBirds dataset needs to be downloaded from https://worksheets.codalab.org/bundles/0x518829de2aa440c79cd9d75ef6669f27
# as it comes from https://github.com/yewsiang/ConceptBottleneck
import os
from pathlib import Path

import numpy as np
import pandas as pd

from dataset_classes.cub200 import CUB200Class
from dataset_classes.utils import index_list_with_sorting, mask_list


class TravelingBirds(CUB200Class):
    """TravelingBirds variant of the CUB200 dataset.

    Images are read from ``root / init_base_folder / {train,test}``, with one
    sub-directory per class. Which images belong to the train or test split is
    taken from the metadata files of a CUB200 copy stored next to ``root``.
    """

    # Folder inside ``root`` that holds the ``train`` and ``test`` image trees.
    init_base_folder = 'CUB_fixed'
    # Location of the downloaded TravelingBirds data.
    root = Path.home() / "tmp/Datasets/TravelingBirds"
    # Location that replaces ``root`` once ``adapt_to_crop`` has been called.
    crop_root = Path.home() / "tmp/Datasets/PPTravelingBirds"

    def get_all_samples_dir(self, dir):
        """Point ``base_folder`` at split folder *dir* and list its samples.

        Args:
            dir: Name of the split folder below ``init_base_folder``
                (``_load_metadata`` passes ``"train"`` or ``"test"``).

        Returns:
            The list produced by ``get_all_sample`` for that folder.
        """
        self.base_folder = os.path.join(self.init_base_folder, dir)
        main_dir = Path(self.root) / self.init_base_folder / dir
        return self.get_all_sample(main_dir)

    def adapt_to_crop(self):
        """Switch ``root`` and ``base_folder`` to the cropped image tree."""
        self.root = self.crop_root
        folder_name = ("train" if self.train else "test") + "_cropped"
        # Derive the prefix from init_base_folder so it cannot drift from the
        # value used by get_all_samples_dir.
        self.base_folder = self.init_base_folder + '/' + folder_name

    def get_all_sample(self, dir):
        """List every image below *dir* together with its class label.

        Args:
            dir: Directory containing one sub-directory per class.

        Returns:
            A list of ``[relative_path, label]`` entries, where
            ``relative_path`` is ``Path(class_dir_name) / file_name`` and
            ``label`` is the 1-based position of the class directory in
            sorted order.
        """
        # Accept plain strings as well as Path objects.
        base_dir = Path(dir)
        answer = []
        for i, sub_dir in enumerate(sorted(os.listdir(base_dir))):
            class_dir = base_dir / sub_dir
            for single_img in os.listdir(class_dir):
                answer.append([Path(sub_dir) / single_img, i + 1])
        return answer

    def _load_metadata(self):
        """Build ``self.data`` with ``filepath`` and ``target`` columns.

        Reads ``train_test_split.txt`` and ``images.txt`` from the
        ``CUB200/CUB_200_2011`` directory located next to ``root``, orders the
        samples of the requested split folder by their CUB200 image id and
        keeps only those selected by the train/test mask.
        """
        cub_meta_dir = Path(self.root).parent / "CUB200" / 'CUB_200_2011'
        train_test_split = pd.read_csv(
            os.path.join(cub_meta_dir, 'train_test_split.txt'),
            sep=' ', names=['img_id', 'is_training_img'])
        data = pd.read_csv(
            os.path.join(cub_meta_dir, 'images.txt'),
            sep=' ', names=['img_id', "path"])
        # Maps the path column of images.txt to its image id.
        img_dict = {x[1]: x[0] for x in data.values}
        # TravelingBirds has all train+test images in both folders, just with different backgrounds.
        # They are separated by train_test_split of CUB200.
        split_name = "train" if self.train else "test"
        samples = self.get_all_samples_dir(split_name)
        mask = train_test_split["is_training_img"] == (1 if self.train else 0)
        # as_posix() keeps the lookup key independent of the OS path separator
        # (assumes images.txt stores paths with forward slashes).
        ids = np.array([img_dict[Path(x[0]).as_posix()] for x in samples])
        order = np.argsort(ids)
        # NOTE(review): presumably reorders samples by ascending image id so
        # they line up row-by-row with train_test_split - confirm in utils.
        samples = index_list_with_sorting(samples, order)
        samples = mask_list(samples, mask)
        filepaths = [x[0] for x in samples]
        labels = [x[1] for x in samples]
        self.data = pd.DataFrame({"filepath": filepaths, "target": labels})