from vllm import LLM, SamplingParams
import os
import numpy as np
from tqdm import tqdm
import gc
import re
import sys
import subprocess
import random
from collections import Counter
import torch
import transformers
import time
from aimo_math import train_env

NOTEBOOK_START_TIME = time.time()

# True when running inside the Kaggle competition rerun (private test set);
# False when debugging locally against train.csv.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    PRIVATE = True
else:
    PRIVATE = False

# train_env (imported from aimo_math) mocks the Kaggle submission API locally:
# it replays train.csv through iter_test()/predict() and keeps the ground
# truth so accuracy can be scored at the end of the run.
env = train_env()
iter_test = env.iter_test()

QUANT = False
USE_PAST_KEY = True
SEED = 42
# MODEL_PATH = "/kaggle/input/deepseek-math"
MODEL_PATH = '/opt/tiger/deepseek-math-7b-rl/'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
N_REPETITIONS = 19 if PRIVATE else 4
MAX_NEW_TOKENS = 2048 if PRIVATE else 512
TIME_LIMIT = 31500 if PRIVATE else 300


def seed_everything(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    transformers.set_seed(seed)


if SEED:
    seed_everything(SEED)

torch.backends.cuda.enable_mem_efficient_sdp(False)

# Layer -> GPU placement for a two-GPU HF-transformers run (layers 0-17 on
# GPU 0, layers 18-31 on GPU 1). The vLLM path below ignores it (initialize_llm
# does its own placement); kept for a transformers fallback.
DEVICE_MAP = {'model.embed_tokens': 0,
              **{f'model.layers.{i}': 0 if i < 18 else 1 for i in range(32)},
              'model.norm': 1,
              'lm_head': 1}

TEMPERATURE = [0.9, 0.9]  # temperature, temperature_coding
TOP_P = [1.0, 1.0]        # top_p, top_p_coding


class StoppingCriteriaSub(transformers.StoppingCriteria):
    """Stop HF generation once the sequence ends with any stop-word's token ids.

    Only needed by the transformers fallback; the vLLM path passes the stop
    strings directly via SamplingParams(stop=...).
    """

    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = [stop.to(DEVICE) for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            last_tokens = input_ids[0][-len(stop):]
            if torch.all(torch.eq(stop, last_tokens)):
                return True
        return False
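# How the tail match above behaves, on made-up token ids (illustrative sketch
# only; real ids depend on the tokenizer):
#
#     stop = torch.tensor([12, 34])            # e.g. the ids of "```python"
#     ids = torch.tensor([[7, 7, 12, 34]])     # current sequence, batch of 1
#     torch.all(torch.eq(stop, ids[0][-len(stop):]))   # -> tensor(True)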
class LLM_SYSTEM:
    def __init__(self, model_path, device_map, temperature, top_p, prompt_options):
        # Initialize the vLLM engine and its tokenizer.
        self.model, self.tokenizer = self.initialize_llm(model_path, device_map)
        # Stop words marking the boundaries between model text and tool calls:
        # "```python" opens a code cell, ")\n```" / "``````output" close one.
        self.stop_words = ["```output", "```python", "```\nOutput", ")\n```", "``````output"]
        # TODO: self.stop_words = ["```output", "```\nOutput", ")\n```", "``````output"]
        self.stop_words_ids = [
            self.tokenizer(stop_word, return_tensors='pt',
                           add_special_tokens=False)['input_ids'].squeeze()
            for stop_word in self.stop_words]
        self.stopping_criteria = transformers.StoppingCriteriaList(
            [StoppingCriteriaSub(stops=self.stop_words_ids)])
        self.prompt_options = prompt_options
        self.temperature = temperature[0]
        self.top_p = top_p[0]
        self.temperature_coding = temperature[1]
        self.top_p_coding = top_p[1]
        # Per-problem bookkeeping, keyed by problem_count.
        self.total_results = {}
        self.total_answers = {}
        self.best_stats = {}
        self.total_outputs = {}
        self.question_type_counts = {}
        self.starting_counts = (2, 3)  # prior pseudo-counts for the template draw
        self.problem_count = 0
        self.already_generated_length = 0
        self.code_error = None
        self.code_error_count = 0
        self.code_output = -1

    # ====================================================================================#
    def initialize_llm(self, model_path, device_map):
        # model = LLM(model=model_path, tensor_parallel_size=2, dtype=torch.float16)
        model = LLM(model=model_path, dtype=torch.float16)
        tokenizer = model.get_tokenizer()
        return model, tokenizer
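    # The template draw at the top of predict(), in isolation: prompt_options[i]
    # is sampled with probability counts[i] / counts.sum(), where counts starts
    # from starting_counts = (2, 3) (counts[0] tracks text answers, counts[1]
    # code answers) and grows with each answer type's successes. A worked first
    # draw (sketch):
    #
    #     counts = np.array([2, 3])
    #     counts / counts.sum()    # -> array([0.4, 0.6])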
    # ====================================================================================#
    def predict(self, problem):
        self.problem_count += 1
        TIME_SPENT = time.time() - NOTEBOOK_START_TIME
        if TIME_SPENT > TIME_LIMIT:
            return -1
        for repetition in tqdm(range(N_REPETITIONS)):
            print(f"\n\n\nQUESTION {self.problem_count} - {repetition} - TIME_SPENT : {TIME_SPENT:.0f} secs")
            best, best_count = self.best_stats.get(self.problem_count, (-1, -1))
            # Stop resampling once the leader's vote count outgrows sqrt(repetition).
            if best_count > np.sqrt(repetition):
                print("SKIPPING CAUSE ALREADY FOUND BEST")
                continue
            outputs = self.total_outputs.get(self.problem_count, [])
            text_answers, code_answers = self.question_type_counts.get(
                self.problem_count, self.starting_counts)
            results = self.total_results.get(self.problem_count, [])
            answers = self.total_answers.get(self.problem_count, [])

            for _ in range(5):
                self.flush()
                time.sleep(0.2)

            try:
                self.already_generated_length = 0
                self.code_error = None
                self.code_error_count = 0
                self.code_output = -1

                # Draw a prompt template in proportion to past successes of each
                # answer type. The second .format argument re-inserts the literal
                # "{}" that the template's \boxed{} placeholder consumes.
                counts = np.array([text_answers, code_answers])
                draw = np.random.choice(self.prompt_options, 1, p=counts / counts.sum())
                initial_message = draw[0].format(problem, "{}")
                prompt = f"{initial_message}"
                # TODO: prompt = f"User: {initial_message}"
                prompt_original_length = len(prompt)
                print(f"{repetition}_{prompt}\n")

                model_inputs = self.tokenizer(prompt, return_tensors='pt')
                prompt_token_length = len(model_inputs['input_ids'][0])

                # vLLM strips matched stop strings from the output by default;
                # include_stop_str_in_output keeps them so the fence checks
                # below can see which stop word ended the generation.
                sampling_params = SamplingParams(
                    max_tokens=MAX_NEW_TOKENS - self.already_generated_length,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    stop=self.stop_words,
                    include_stop_str_in_output=True)
                generation_output = self.model.generate(prompt, sampling_params)
                decoded_output = generation_output[0].outputs[0].text
                print(f"{decoded_output}\n")
                prompt_original_length += len(decoded_output)

                cummulative_code = ""
                stop_word_cond = any(decoded_output.endswith(stop_word)
                                     for stop_word in self.stop_words)

                # Generate -> execute -> feed the result back, until the model
                # stops emitting code fences or the token budget is exhausted.
                while stop_word_cond and (self.already_generated_length < MAX_NEW_TOKENS):
                    if decoded_output.endswith("```python"):
                        # A code cell just opened: continue at the coding
                        # temperature. decoded_output joins the prompt exactly
                        # once, below, after the code has been run.
                        temperature_inner = self.temperature_coding
                        top_p_inner = self.top_p_coding
                    else:
                        temperature_inner = self.temperature
                        top_p_inner = self.top_p
                    try:
                        if decoded_output.endswith("``````output"):
                            code_text = decoded_output.split('```python')[-1].split("``````")[0]
                        else:
                            code_text = decoded_output.split('```python')[-1].split("```")[0]
                        cummulative_code += code_text
                        self.code_output, CODE_STATUS = self.process_code(
                            cummulative_code, return_shell_output=True)
                        print('CODE RESULTS', self.code_output)

                        if self.code_error == self.code_output:
                            self.code_error_count += 1
                        else:
                            self.code_error = self.code_output
                            self.code_error_count = 0

                        if not CODE_STATUS:
                            # Roll back the failing cell; give up on repeated errors.
                            cummulative_code = cummulative_code[:-len(code_text)]
                            if self.code_error_count >= 1:
                                print("REPEATED ERRORS")
                                break
                    except Exception as e:
                        print(e)
                        print('ERROR PARSING CODE')
                        self.code_output = -1

                    # Append the model's text plus the execution result, closing
                    # the output fence so generation can continue from it.
                    if self.code_output != -1:
                        if decoded_output.endswith(")\n```"):
                            prompt = prompt + decoded_output + '```output\n' + str(self.code_output) + '\n```\n'
                        else:
                            prompt = prompt + decoded_output + '\n' + str(self.code_output) + '\n```\n'
                    else:
                        prompt = prompt + decoded_output
                        cummulative_code = ""

                    model_inputs = self.tokenizer(prompt, return_tensors='pt')
                    self.already_generated_length = len(model_inputs['input_ids'][0]) - prompt_token_length
                    sampling_params = SamplingParams(
                        max_tokens=MAX_NEW_TOKENS - self.already_generated_length,
                        temperature=temperature_inner,  # coding temperature inside code cells
                        top_p=top_p_inner,
                        stop=self.stop_words,
                        include_stop_str_in_output=True)
                    generation_output = self.model.generate(prompt, sampling_params)
                    decoded_output = generation_output[0].outputs[0].text
                    print(f"{decoded_output}\n")
                    prompt_original_length += len(decoded_output)

                    stop_word_cond = any(decoded_output.endswith(stop_word)
                                         for stop_word in self.stop_words)

                result_output = self.process_text_output(decoded_output)
                try:
                    self.code_output = round(float(eval(self.code_output))) % 1000
                except Exception as e:
                    print(e, 'final_eval')
                    self.code_output = -1
            except Exception as e:
                print(e, "5")
                result_output, self.code_output = -1, -1

            if self.code_output != -1:
                outputs.append(self.code_output)
                code_answers += 1
            if result_output != -1:
                outputs.append(result_output)
                text_answers += 1
            if len(outputs) > 0:
                occurences = Counter(outputs).most_common()
                print(occurences)
                if occurences[0][1] > best_count:
                    print("GOOD ANSWER UPDATED!")
                    best = occurences[0][0]
                    best_count = occurences[0][1]
                if occurences[0][1] > 5:
                    print("ANSWER FOUND!")
                    break
            results.append(result_output)
            answers.append(self.code_output)
            self.best_stats[self.problem_count] = (best, best_count)
            self.question_type_counts[self.problem_count] = (text_answers, code_answers)
            self.total_outputs[self.problem_count] = outputs
            self.total_results[self.problem_count] = results
            self.total_answers[self.problem_count] = answers
        return self.best_stats[self.problem_count][0]

    # ====================================================================================#
    def flush(self):
        torch.cuda.empty_cache()
        gc.collect()
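    # The self-consistency vote in predict(), in isolation: every repetition
    # appends one candidate answer and the running mode wins. A minimal sketch:
    #
    #     outputs = [52, 52, 17, 52]
    #     Counter(outputs).most_common()   # -> [(52, 3), (17, 1)], so best = 52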
    # ====================================================================================#
    def naive_parse(self, answer):
        # Fallback extraction: scan backwards and keep the last run of digits.
        out = []
        start = False
        end = False
        for l in reversed(list(answer)):
            if l in '0123456789' and not end:
                start = True
                out.append(l)
            else:
                if start:
                    end = True
        out = reversed(out)
        return ''.join(out)

    # ====================================================================================#
    def return_last_print(self, output, n):
        lines = output.strip().split('\n')
        if lines:
            return lines[n]
        else:
            return ""

    # ====================================================================================#
    def repl(self, match):
        # Append real=True to sympy symbols(...) calls that do not set it already.
        if "real" not in match.group():
            return "{}{}".format(match.group()[:-1], ', real=True)')
        else:
            return "{}{}".format(match.group()[:-1], ')')

    # ====================================================================================#
    def process_code(self, code, return_shell_output=False):
        # Force sympy symbols to be real, avoiding spurious complex solutions.
        code = re.sub(r"symbols\([^)]+\)", self.repl, code)
        if return_shell_output:
            # Indent the generated code and wrap it in a try...except block so
            # a failing script still prints the error followed by 'FAIL'.
            code = code.replace('\n', '\n    ')
            code = ("\ntry:\n    from sympy import *\n{}\n"
                    "except Exception as e:\n    print(e)\n    print('FAIL')\n").format(code)
        if not return_shell_output:
            print(code)
        with open('code.py', 'w') as fout:
            fout.write(code)
        # Run the snippet in a fresh interpreter with a 7-second timeout.
        batcmd = 'timeout 7 ' + sys.executable + ' code.py'
        try:
            shell_output = subprocess.check_output(batcmd, shell=True).decode('utf8')
            return_value = self.return_last_print(shell_output, -1)
            print(shell_output)
            if return_shell_output:
                if return_value == 'FAIL':
                    CODE_STATUS = False
                    return_value = self.return_last_print(shell_output, -2)
                    if "not defined" in return_value:
                        return_value += '\nTry checking the formatting and imports'
                else:
                    CODE_STATUS = True
                return return_value, CODE_STATUS
            self.code_output = round(float(eval(return_value))) % 1000
        except Exception as e:
            print(e, 'shell_output')
            self.code_output = -1
        if return_shell_output:
            CODE_STATUS = self.code_output != -1
            return self.code_output, CODE_STATUS
        return self.code_output

    # ====================================================================================#
    def process_text_output(self, output):
        # Prefer the last \boxed{...} answer; fall back to the last digit run.
        result = output
        try:
            result_output = re.findall(r'\\boxed\{(\d+)\}', result)
            print('BOXED', result_output)
            if not len(result_output):
                result_output = self.naive_parse(result)
            else:
                result_output = result_output[-1]
            print('BOXED FINAL', result_output)
            if not len(result_output):
                result_output = -1
            else:
                result_output = round(float(eval(result_output))) % 1000
        except Exception as e:
            print(e)
            print('ERROR PARSING TEXT')
            result_output = -1
        return result_output
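# What the symbols(...) rewrite in process_code() does to a concrete line of
# generated code (a sketch of the re.sub + repl substitution above):
#
#     re.sub(r"symbols\([^)]+\)", repl, "x, y = symbols('x y')")
#     # -> "x, y = symbols('x y', real=True)"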
# Prompt template for the sympy-first approach. The first {} receives the
# problem statement; the second .format argument restores the literal braces
# of \boxed{}.
code = """Below is a math problem you are to solve (positive numerical answer):
\"{}\"
To accomplish this, first determine a sympy-based approach for solving the problem by listing each step to take and what functions need to be called in each step. Be clear so even an idiot can follow your instructions, and remember, your final answer should be positive integer, not an algebraic expression!
Write the entire script covering all the steps (use comments and document it well) and print the result. After solving the problem, output the final numerical answer within \\boxed{}.

Approach:"""

# Chain-of-thought template.
cot = """Below is a math problem you are to solve (positive numerical answer!):
\"{}\"
Analyze this problem and think step by step to come to a solution with programs. After solving the problem, output the final numerical answer within \\boxed{}.\n\n"""

prompt_options = [code, cot]

llm = LLM_SYSTEM(MODEL_PATH, DEVICE_MAP, TEMPERATURE, TOP_P, prompt_options)

result = []
for test, sample_submission in tqdm(iter_test):
    res = llm.predict(test['problem'].values[0])
    if res == -1:
        # predict() returns -1 once TIME_LIMIT is exceeded; stop submitting.
        break
    sample_submission['answer'] = res
    result.append(env.predict(sample_submission))

print(f"total: {len(result)}, Corrects: {sum(result)}. Accuracy: {sum(result)/len(result):.2f}")
print(f"Time: {time.time()-NOTEBOOK_START_TIME:.2f} secs")
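# Competition answers are integers in [0, 999], hence the
# round(float(eval(...))) % 1000 normalization used throughout. Worked examples
# (note that Python's % returns a non-negative result here):
#
#     round(float(eval("52.0"))) % 1000   # -> 52
#     round(float(eval("-7"))) % 1000     # -> 993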