from vllm import LLM, SamplingParams
import os
import numpy as np
from tqdm import tqdm
import gc
import re
import sys
import subprocess
import random
from collections import Counter
import torch
import transformers
import time
from aimo_math import train_env

NOTEBOOK_START_TIME = time.time()

# True when running inside the Kaggle competition rerun (private test set);
# False when debugging locally against train.csv.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    PRIVATE = True
else:
    PRIVATE = False

# train_env (imported from aimo_math) mocks the Kaggle submission API locally:
# it replays train.csv through iter_test()/predict() and keeps the ground
# truth so accuracy can be scored at the end of the run.
env = train_env()
iter_test = env.iter_test()

QUANT = False
USE_PAST_KEY = True
SEED = 42
# MODEL_PATH = "/kaggle/input/deepseek-math"
MODEL_PATH = '/opt/tiger/deepseek-math-7b-rl/'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
N_REPETITIONS = 19 if PRIVATE else 4
MAX_NEW_TOKENS = 2048 if PRIVATE else 512
TIME_LIMIT = 31500 if PRIVATE else 300


def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    transformers.set_seed(seed)


if SEED:
    seed_everything(SEED)

torch.backends.cuda.enable_mem_efficient_sdp(False)

# Layer -> GPU placement for a two-GPU HF-transformers run (layers 0-17 on
# GPU 0, layers 18-31 on GPU 1). The vLLM path below ignores it (initialize_llm
# does its own placement); kept for a transformers fallback.
DEVICE_MAP = {'model.embed_tokens': 0,
              **{f'model.layers.{i}': 0 if i < 18 else 1 for i in range(32)},
              'model.norm': 1,
              'lm_head': 1}

TEMPERATURE = [0.9, 0.9]  # temperature, temperature_coding
TOP_P = [1.0, 1.0]        # top_p, top_p_coding


class StoppingCriteriaSub(transformers.StoppingCriteria):
    """Stop HF generation once the sequence ends with any stop-word's token ids.

    Only needed by the transformers fallback; the vLLM path passes the stop
    strings directly via SamplingParams(stop=...).
    """

    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = [stop.to(DEVICE) for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            last_tokens = input_ids[0][-len(stop):]
            if torch.all(torch.eq(stop, last_tokens)):
                return True
        return False
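# How the tail match above behaves, on made-up token ids (illustrative sketch
# only; real ids depend on the tokenizer):
#
#     stop = torch.tensor([12, 34])            # e.g. the ids of "```python"
#     ids = torch.tensor([[7, 7, 12, 34]])     # current sequence, batch of 1
#     torch.all(torch.eq(stop, ids[0][-len(stop):]))   # -> tensor(True)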
class LLM_SYSTEM:
    def __init__(self, model_path, device_map, temperature, top_p, prompt_options):
        # Initialize the vLLM engine and its tokenizer.
        self.model, self.tokenizer = self.initialize_llm(model_path, device_map)
        # Stop words marking the boundaries between model text and tool calls:
        # "```python" opens a code cell, ")\n```" / "``````output" close one.
        self.stop_words = ["```output", "```python", "```\nOutput", ")\n```", "``````output"]
        # TODO: self.stop_words = ["```output", "```\nOutput", ")\n```", "``````output"]
        self.stop_words_ids = [
            self.tokenizer(stop_word, return_tensors='pt',
                           add_special_tokens=False)['input_ids'].squeeze()
            for stop_word in self.stop_words]
        self.stopping_criteria = transformers.StoppingCriteriaList(
            [StoppingCriteriaSub(stops=self.stop_words_ids)])
        self.prompt_options = prompt_options
        self.temperature = temperature[0]
        self.top_p = top_p[0]
        self.temperature_coding = temperature[1]
        self.top_p_coding = top_p[1]
        # Per-problem bookkeeping, keyed by problem_count.
        self.total_results = {}
        self.total_answers = {}
        self.best_stats = {}
        self.total_outputs = {}
        self.question_type_counts = {}
        self.starting_counts = (2, 3)  # prior pseudo-counts for the template draw
        self.problem_count = 0
        self.already_generated_length = 0
        self.code_error = None
        self.code_error_count = 0
        self.code_output = -1

    # ====================================================================================#
    def initialize_llm(self, model_path, device_map):
        # model = LLM(model=model_path, tensor_parallel_size=2, dtype=torch.float16)
        model = LLM(model=model_path, dtype=torch.float16)
        tokenizer = model.get_tokenizer()
        return model, tokenizer
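    # The template draw at the top of predict(), in isolation: prompt_options[i]
    # is sampled with probability counts[i] / counts.sum(), where counts starts
    # from starting_counts = (2, 3) (counts[0] tracks text answers, counts[1]
    # code answers) and grows with each answer type's successes. A worked first
    # draw (sketch):
    #
    #     counts = np.array([2, 3])
    #     counts / counts.sum()    # -> array([0.4, 0.6])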
    # ====================================================================================#
    def predict(self, problem):
        self.problem_count += 1
        TIME_SPENT = time.time() - NOTEBOOK_START_TIME
        if TIME_SPENT > TIME_LIMIT:
            return -1
        for repetition in tqdm(range(N_REPETITIONS)):
            print(f"\n\n\nQUESTION {self.problem_count} - {repetition} - TIME_SPENT : {TIME_SPENT:.0f} secs")
            best, best_count = self.best_stats.get(self.problem_count, (-1, -1))
            # Stop resampling once the leader's vote count outgrows sqrt(repetition).
            if best_count > np.sqrt(repetition):
                print("SKIPPING CAUSE ALREADY FOUND BEST")
                continue
            outputs = self.total_outputs.get(self.problem_count, [])
            text_answers, code_answers = self.question_type_counts.get(
                self.problem_count, self.starting_counts)
            results = self.total_results.get(self.problem_count, [])
            answers = self.total_answers.get(self.problem_count, [])

            for _ in range(5):
                self.flush()
                time.sleep(0.2)

            try:
                self.already_generated_length = 0
                self.code_error = None
                self.code_error_count = 0
                self.code_output = -1

                # Draw a prompt template in proportion to past successes of each
                # answer type. The second .format argument re-inserts the literal
                # "{}" that the template's \boxed{} placeholder consumes.
                counts = np.array([text_answers, code_answers])
                draw = np.random.choice(self.prompt_options, 1, p=counts / counts.sum())
                initial_message = draw[0].format(problem, "{}")
                prompt = f"{initial_message}"
                # TODO: prompt = f"User: {initial_message}"
                prompt_original_length = len(prompt)
                print(f"{repetition}_{prompt}\n")

                model_inputs = self.tokenizer(prompt, return_tensors='pt')
                prompt_token_length = len(model_inputs['input_ids'][0])

                # vLLM strips matched stop strings from the output by default;
                # include_stop_str_in_output keeps them so the fence checks
                # below can see which stop word ended the generation.
                sampling_params = SamplingParams(
                    max_tokens=MAX_NEW_TOKENS - self.already_generated_length,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    stop=self.stop_words,
                    include_stop_str_in_output=True)
                generation_output = self.model.generate(prompt, sampling_params)
                decoded_output = generation_output[0].outputs[0].text
                print(f"{decoded_output}\n")
                prompt_original_length += len(decoded_output)

                cummulative_code = ""
                stop_word_cond = any(decoded_output.endswith(stop_word)
                                     for stop_word in self.stop_words)

                # Generate -> execute -> feed the result back, until the model
                # stops emitting code fences or the token budget is exhausted.
                while stop_word_cond and (self.already_generated_length < MAX_NEW_TOKENS):
                    if decoded_output.endswith("```python"):
                        # A code cell just opened: continue at the coding
                        # temperature. decoded_output joins the prompt exactly
                        # once, below, after the code has been run.
                        temperature_inner = self.temperature_coding
                        top_p_inner = self.top_p_coding
                    else:
                        temperature_inner = self.temperature
                        top_p_inner = self.top_p
                    try:
                        if decoded_output.endswith("``````output"):
                            code_text = decoded_output.split('```python')[-1].split("``````")[0]
                        else:
                            code_text = decoded_output.split('```python')[-1].split("```")[0]
                        cummulative_code += code_text
                        self.code_output, CODE_STATUS = self.process_code(
                            cummulative_code, return_shell_output=True)
                        print('CODE RESULTS', self.code_output)

                        if self.code_error == self.code_output:
                            self.code_error_count += 1
                        else:
                            self.code_error = self.code_output
                            self.code_error_count = 0

                        if not CODE_STATUS:
                            # Roll back the failing cell; give up on repeated errors.
                            cummulative_code = cummulative_code[:-len(code_text)]
                            if self.code_error_count >= 1:
                                print("REPEATED ERRORS")
                                break
                    except Exception as e:
                        print(e)
                        print('ERROR PARSING CODE')
                        self.code_output = -1

                    # Append the model's text plus the execution result, closing
                    # the output fence so generation can continue from it.
                    if self.code_output != -1:
                        if decoded_output.endswith(")\n```"):
                            prompt = prompt + decoded_output + '```output\n' + str(self.code_output) + '\n```\n'
                        else:
                            prompt = prompt + decoded_output + '\n' + str(self.code_output) + '\n```\n'
                    else:
                        prompt = prompt + decoded_output
                        cummulative_code = ""

                    model_inputs = self.tokenizer(prompt, return_tensors='pt')
                    self.already_generated_length = len(model_inputs['input_ids'][0]) - prompt_token_length
                    sampling_params = SamplingParams(
                        max_tokens=MAX_NEW_TOKENS - self.already_generated_length,
                        temperature=temperature_inner,  # coding temperature inside code cells
                        top_p=top_p_inner,
                        stop=self.stop_words,
                        include_stop_str_in_output=True)
                    generation_output = self.model.generate(prompt, sampling_params)
                    decoded_output = generation_output[0].outputs[0].text
                    print(f"{decoded_output}\n")
                    prompt_original_length += len(decoded_output)

                    stop_word_cond = any(decoded_output.endswith(stop_word)
                                         for stop_word in self.stop_words)

                result_output = self.process_text_output(decoded_output)
                try:
                    self.code_output = round(float(eval(self.code_output))) % 1000
                except Exception as e:
                    print(e, 'final_eval')
                    self.code_output = -1
            except Exception as e:
                print(e, "5")
                result_output, self.code_output = -1, -1

            if self.code_output != -1:
                outputs.append(self.code_output)
                code_answers += 1
            if result_output != -1:
                outputs.append(result_output)
                text_answers += 1
            if len(outputs) > 0:
                occurences = Counter(outputs).most_common()
                print(occurences)
                if occurences[0][1] > best_count:
                    print("GOOD ANSWER UPDATED!")
                    best = occurences[0][0]
                    best_count = occurences[0][1]
                if occurences[0][1] > 5:
                    print("ANSWER FOUND!")
                    break
            results.append(result_output)
            answers.append(self.code_output)
            self.best_stats[self.problem_count] = (best, best_count)
            self.question_type_counts[self.problem_count] = (text_answers, code_answers)
            self.total_outputs[self.problem_count] = outputs
            self.total_results[self.problem_count] = results
            self.total_answers[self.problem_count] = answers
        return self.best_stats[self.problem_count][0]

    # ====================================================================================#
    def flush(self):
        torch.cuda.empty_cache()
        gc.collect()
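    # The self-consistency vote in predict(), in isolation: every repetition
    # appends one candidate answer and the running mode wins. A minimal sketch:
    #
    #     outputs = [52, 52, 17, 52]
    #     Counter(outputs).most_common()   # -> [(52, 3), (17, 1)], so best = 52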
    # ====================================================================================#
    def naive_parse(self, answer):
        # Fallback extraction: scan backwards and keep the last run of digits.
        out = []
        start = False
        end = False
        for l in reversed(list(answer)):
            if l in '0123456789' and not end:
                start = True
                out.append(l)
            else:
                if start:
                    end = True
        out = reversed(out)
        return ''.join(out)

    # ====================================================================================#
    def return_last_print(self, output, n):
        lines = output.strip().split('\n')
        if lines:
            return lines[n]
        else:
            return ""

    # ====================================================================================#
    def repl(self, match):
        # Append real=True to sympy symbols(...) calls that do not set it already.
        if "real" not in match.group():
            return "{}{}".format(match.group()[:-1], ', real=True)')
        else:
            return "{}{}".format(match.group()[:-1], ')')

    # ====================================================================================#
    def process_code(self, code, return_shell_output=False):
        # Force sympy symbols to be real, avoiding spurious complex solutions.
        code = re.sub(r"symbols\([^)]+\)", self.repl, code)
        if return_shell_output:
            # Indent the generated code and wrap it in a try...except block so
            # a failing script still prints the error followed by 'FAIL'.
            code = code.replace('\n', '\n    ')
            code = ("\ntry:\n    from sympy import *\n{}\n"
                    "except Exception as e:\n    print(e)\n    print('FAIL')\n").format(code)
        if not return_shell_output:
            print(code)
        with open('code.py', 'w') as fout:
            fout.write(code)
        # Run the snippet in a fresh interpreter with a 7-second timeout.
        batcmd = 'timeout 7 ' + sys.executable + ' code.py'
        try:
            shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
            return_value = self.return_last_print(shell_output, -1)
            print(shell_output)
            if return_shell_output:
                if return_value == 'FAIL':
                    CODE_STATUS = False
                    return_value = self.return_last_print(shell_output, -2)
                    if "not defined" in return_value:
                        return_value += '\nTry checking the formatting and imports'
                else:
                    CODE_STATUS = True
                return return_value, CODE_STATUS
            self.code_output = round(float(eval(return_value))) % 1000
        except Exception as e:
            print(e, 'shell_output')
            self.code_output = -1
        if return_shell_output:
            CODE_STATUS = self.code_output != -1
            return self.code_output, CODE_STATUS
        return self.code_output

    # ====================================================================================#
    def process_text_output(self, output):
        # Prefer the last \boxed{...} answer; fall back to the last digit run.
        result = output
        try:
            result_output = re.findall(r'\\boxed\{(\d+)\}', result)
            print('BOXED', result_output)
            if not len(result_output):
                result_output = self.naive_parse(result)
            else:
                result_output = result_output[-1]
            print('BOXED FINAL', result_output)
            if not len(result_output):
                result_output = -1
            else:
                result_output = round(float(eval(result_output))) % 1000
        except Exception as e:
            print(e)
            print('ERROR PARSING TEXT')
            result_output = -1
        return result_output
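# What the symbols(...) rewrite in process_code() does to a concrete line of
# generated code (a sketch of the re.sub + repl substitution above):
#
#     re.sub(r"symbols\([^)]+\)", repl, "x, y = symbols('x y')")
#     # -> "x, y = symbols('x y', real=True)"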
# Prompt template for the sympy-first approach. The first {} receives the
# problem statement; the second .format argument restores the literal braces
# of \boxed{}.
code = """Below is a math problem you are to solve (positive numerical answer):
\"{}\"
To accomplish this, first determine a sympy-based approach for solving the problem by listing each step to take and what functions need to be called in each step. Be clear so even an idiot can follow your instructions, and remember, your final answer should be positive integer, not an algebraic expression!
Write the entire script covering all the steps (use comments and document it well) and print the result. After solving the problem, output the final numerical answer within \\boxed{}.

Approach:"""

# Chain-of-thought template.
cot = """Below is a math problem you are to solve (positive numerical answer!):
\"{}\"
Analyze this problem and think step by step to come to a solution with programs. After solving the problem, output the final numerical answer within \\boxed{}.\n\n"""

prompt_options = [code, cot]

llm = LLM_SYSTEM(MODEL_PATH, DEVICE_MAP, TEMPERATURE, TOP_P, prompt_options)

result = []
for test, sample_submission in tqdm(iter_test):
    res = llm.predict(test['problem'].values[0])
    if res == -1:
        # predict() returns -1 once TIME_LIMIT is exceeded; stop submitting.
        break
    sample_submission['answer'] = res
    result.append(env.predict(sample_submission))

print(f"total: {len(result)}, Corrects: {sum(result)}. Accuracy: {sum(result)/len(result):.2f}")
print(f"Time: {time.time()-NOTEBOOK_START_TIME:.2f} secs")
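# Competition answers are integers in [0, 999], hence the
# round(float(eval(...))) % 1000 normalization used throughout. Worked examples
# (note that Python's % returns a non-negative result here):
#
#     round(float(eval("52.0"))) % 1000   # -> 52
#     round(float(eval("-7"))) % 1000     # -> 993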