import random
import os
import numpy as np
import argparse
import json
from collections import defaultdict
from matplotlib import pyplot as plt
from collections import Counter
from .data_utils import json_read

def set_random_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

class Reaction_Cluster:
    def __init__(self, root, reaction_filename, reverse_ratio=0.5):
        self.root = root
        self.reaction_data = json_read(os.path.join(self.root, reaction_filename))
        self.property_data = json_read(os.path.join(self.root, 'Abstract_property.json'))
        self.mol_property_map = {d['canon_smiles']: d for d in self.property_data}
        self.reverse_ratio = reverse_ratio
        self.rxn_mols_attr = defaultdict(lambda:{
            'freq': 0,
            'occurrence': 0,
            'in_caption': False,
        })

        self._read_reaction_mols() # add `valid_mols` in each rxn_dict
        self.mol_counter = Counter(mol for rxn_dict in self.reaction_data for mol in rxn_dict['valid_mols'])
        self._calculate_Pr() # calculate P(r), add `weight` in each rxn_dict
        self._calculate_Pir() # calculate P(i|r), add `mol_weight` in each rxn_dict

    def _read_reaction_mols(self):
        self.valid_rxn_indices = []
        for rxn_id, rxn_dict in enumerate(self.reaction_data):
            mol_role_map = {}
            for key in ['REACTANT', 'CATALYST', 'SOLVENT', 'PRODUCT']:
                for m in rxn_dict[key]:
                    if m in mol_role_map:
                        continue
                    if m in self.mol_property_map:
                        mol_role_map[m] = key
            valid_mols = []
            for mol in mol_role_map:
                assert mol in self.mol_property_map # this is garanteed by the above if statement
                if 'abstract' not in self.mol_property_map[mol]:
                    continue
                valid_mols.append(mol) # here the molecules should be in the R, C, S, P order.
            if len(valid_mols) > 0:
                self.valid_rxn_indices.append(rxn_id)
            rxn_dict['valid_mols'] = valid_mols
            rxn_dict['mol_role_map'] = mol_role_map

    def _calculate_Pr(self):
        total_weights = 0
        for rxn_dict in self.reaction_data:
            rxn_weight = sum([1/self.mol_counter[mol] for mol in rxn_dict['valid_mols']])
            rxn_dict['weight'] = rxn_weight
            total_weights += rxn_weight
        for rxn_dict in self.reaction_data:
            rxn_dict['weight'] = rxn_dict['weight'] / total_weights

    def _calculate_Pir(self):
        for rxn_dict in self.reaction_data:
            mol_weight = {}
            for mol in rxn_dict['valid_mols']:
                mol_weight[mol] = 1/self.mol_counter[mol]
            total_weight = sum(mol_weight.values())
            rxn_dict['mol_weight'] = {m:w/total_weight for m, w in mol_weight.items()}

    def choose_mol(self, valid_mols, k=4, weights=None):
        if k>=len(valid_mols):
            sampled_indices = list(range(len(valid_mols)))
        else:
            sampled_indices = np.random.choice(len(valid_mols), k, replace=False, p=weights)
            sampled_indices = list(sampled_indices)
        sampled_indices = sorted(sampled_indices)
        if random.random() < self.reverse_ratio: # reverse the indices with reverse_ratio chance.
            sampled_indices.reverse()
        sampled_mols = [valid_mols[i] for i in sampled_indices]
        return sampled_mols

    def sample_mol_batch(self, index=None, k=4):
        if index is None:
            index = self.sample_rxn_index(1)[0]
        assert index < len(self.reaction_data)
        rxn = self.reaction_data[index]
        valid_mols, weights = zip(*rxn['mol_weight'].items())

        sampled_mols = self.choose_mol(valid_mols, k=k, weights=weights)
        mol_property_batch = []
        for mol in sampled_mols:
            mol_property = self.mol_property_map[mol]
            mol_role = rxn['mol_role_map'][mol]
            mol_property['role'] = mol_role
            mol_property_batch.append(mol_property)
        if 'rsmiles_map' in rxn:
            rsmiles_map = random.choice(rxn['rsmiles_map'])
            for mol_property in mol_property_batch:
                canon_smiles = mol_property['canon_smiles']
                if canon_smiles in rsmiles_map:
                    mol_property['r_smiles'] = rsmiles_map[canon_smiles]
        return mol_property_batch

    def sample_rxn_index(self, num_samples):
        indices = range(len(self.reaction_data))
        weights = [d['weight'] for d in self.reaction_data]
        return np.random.choice(indices, num_samples, replace=False, p=weights)

    def __call__(self, rxn_num=1000, k=4):
        sampled_indices = self.sample_rxn_index(rxn_num)
        sampled_batch = [self.sample_mol_batch(idx, k=k) for idx in sampled_indices]
        return sampled_batch

    def generate_batch_uniform_rxn(self, rxn_num=1000, k=4):
        assert rxn_num <= len(self.valid_rxn_indices)
        sampled_rxn_indices = random.sample(self.valid_rxn_indices, rxn_num)
        sampled_batch = []
        for rxn_id in sampled_rxn_indices:
            rxn = self.reaction_data[rxn_id]
            sampled_mols = self.choose_mol(rxn['valid_mols'], k=k, weights=None)
            mol_property_batch = []
            for mol in sampled_mols:
                mol_property = self.mol_property_map[mol]
                mol_role = rxn['mol_role_map'][mol]
                mol_property['role'] = mol_role
                mol_property_batch.append(mol_property)
            sampled_batch.append(mol_property_batch)
        return sampled_batch

    def generate_batch_uniform_mol(self, rxn_num=1000, k=4):
        valid_mols = list(self.mol_counter.elements())
        assert rxn_num*k <= len(valid_mols)
        sampled_batch = []
        sampled_mol_ids = random.sample(range(len(valid_mols)), rxn_num*k)
        for i in range(rxn_num):
            sampled_batch.append([self.mol_property_map[valid_mols[mol_id]] for mol_id in sampled_mol_ids[i*k:(i+1)*k]])
        return sampled_batch

    def generate_batch_single(self, rxn_num=1000):
        valid_mols = list(self.mol_counter.elements())
        sampled_mols = random.sample(valid_mols, rxn_num)
        total_valid_mols = [[self.mol_property_map[mol]] for mol in sampled_mols]
        return total_valid_mols

    # visaulize probability for molecules in caption dataset.
    def visualize_mol_distribution(self):
        prob_dict = {mol:0.0 for mol in self.mol_property_map.keys()}
        N = len(prob_dict)
        M = len(self.reaction_data)
        assert N == len(self.mol_property_map)
        print(f'Number of molecules in Caption Dataset: {N}')
        print(f'Number of Reactions in Reaction Dataset: {M}')

        # prob distribution for molecules
        for rxn_dict in self.reaction_data:
            for mol, weight in rxn_dict['mol_weight'].items():
                prob_dict[mol] += weight * rxn_dict['weight']
        # sum of prob_dict.values() should already be 1.
        prob_values = np.array(list(prob_dict.values()))
        prob_values *= N

        # prob distribution for reactions
        rxn_weights = np.array([d['weight'] for d in self.reaction_data])
        # sum of rxn_weights should already be 1.
        rxn_weights *= M

        return prob_values, rxn_weights

    # visaulize the frequency for molecules in caption dataset.
    def visualize_mol_frequency(self, rxn_num=1000, k=4, epochs=100):
        sampled_mols_counter = Counter()
        sampled_rxns_counter = Counter()
        for _ in range(epochs):
            rxn_indices = self.sample_rxn_index(rxn_num)
            sampled_rxns_counter.update(rxn_indices)
            for index in rxn_indices:
                rxn = self.reaction_data[index]
                if len(rxn['valid_mols']) ==0:
                    continue
                valid_mols, weights = zip(*rxn['mol_weight'].items())
                mol_batch = self.choose_mol(valid_mols, k=k, weights=weights)
                sampled_mols_counter.update(mol_batch)
        sampled_mols_count = np.array([c for _, c in sorted(sampled_mols_counter.items())])
        sampled_rxns_count = np.array([c for _, c in sorted(sampled_rxns_counter.items())])
        return sampled_mols_count, sampled_rxns_count

    def _randomly(self, func, *args, **kwargs):
        # make fake weights and backup the weights
        for rxn_dict in self.reaction_data:
            rxn_dict['weight_bak'] = rxn_dict['weight']
            rxn_dict['weight'] = 1/len(self.reaction_data)
            rxn_dict['mol_weight_bak'] = rxn_dict['mol_weight']
            rxn_dict['mol_weight'] = {m:1/len(rxn_dict['mol_weight']) for m in rxn_dict['mol_weight']}

        # run the function
        result = func(*args, **kwargs)

        # weights recovery
        for rxn_dict in self.reaction_data:
            rxn_dict['weight'] = rxn_dict['weight_bak']
            del rxn_dict['weight_bak']
            rxn_dict['mol_weight'] = rxn_dict['mol_weight_bak']
            del rxn_dict['mol_weight_bak']

        return result