Eval script

#9
by rawsh - opened

Hi! Do you have the evaluation scripts available? My numbers are a bit different locally

AIME: 8/90 = 8.88%, AIME2024: 4/30=13.33%
AMC: 32/83 = 38.55%

Is there anything finicky about the prompt? I'm running with vLLM.
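For context, here's roughly how I'm generating (a minimal sketch; the model path, sampling settings, and question below are placeholders, not my exact config):

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_path = "path/to/model"  # placeholder
question = "An AIME problem statement goes here."  # placeholder

tokenizer = AutoTokenizer.from_pretrained(model_path)
llm = LLM(model=model_path)

# No system prompt here; unsure whether one should be added.
messages = [{"role": "user", "content": question}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

params = SamplingParams(temperature=0.0, max_tokens=16384)  # guessed settings
outputs = llm.generate([prompt], params)
print(outputs[0].outputs[0].text)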

This is my evaluation script.

import json
import re
import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

def extract_final_answer(text):
    # Patterns ordered from most to least specific; each captures the numeric answer.
    patterns = [
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*\\boxed\s*{\s*([0-9.-]+)\s*}\s*\\\]",
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*([0-9.-]+)\s*\\\]",
        r"\*?Final\s+Answer\*?\s*[:=]\s*([0-9.-]+)",
        r"[Tt]he\s+[Ff]inal\s+[Aa]nswer\s+[Ii]s\s*[:=]?\s*([0-9.-]+)",
        r"[Ff]inal\s+[Aa]nswer\s*[:=]\s*([0-9.-]+)",
    ]

    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()

    print("No final answer pattern matched")
    return None

def evaluate_answer(predicted, actual):
    if predicted is None:
        return False
    try:
        # Compare numerically when both sides parse as floats.
        return abs(float(predicted) - float(actual)) < 1e-5
    except (ValueError, TypeError):
        # Fall back to exact string comparison.
        return str(predicted).strip() == str(actual).strip()

def run_inference(model, tokenizer, question, max_new_tokens=16384):
    prompt = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question},
    ]
    
    input_text = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    
    generated_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        use_cache=True
    )
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

def main():
    # 1. Load model and tokenizer
    model_path = "saves/qwen2-01/full/sft/checkpoint-44000"  # Replace with your model_path
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="cuda"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # 2. Load the dataset
    dataset = load_dataset("json", data_files="/home/syx/Qwen2.5-Math/evaluation/data/aime24/test.jsonl")  # Replace with the custom dataset
    eval_dataset = dataset["train"]
    
    # 3. Inference and validation
    results = {
        "correct": 0,
        "total": 0,
        "predictions": []
    }
    
    for item in tqdm(eval_dataset):
        question = item["question"]
        ground_truth = item["answer"] if "answer" in item else None
        
        # Inference
        start_time = time.time()
        response = run_inference(model, tokenizer, question)
        inference_time = time.time() - start_time
        
        # Extract the predicted answer from the model response
        predicted_answer = extract_final_answer(response)

        # Evaluate against the ground truth, if available
        is_correct = None
        if ground_truth is not None:
            is_correct = evaluate_answer(predicted_answer, ground_truth)
            results["correct"] += int(is_correct)
            results["total"] += 1
            
        # Save
        results["predictions"].append({
            "question": question,
            "response": response,
            "extracted_answer": predicted_answer,
            "ground_truth": ground_truth,
            "is_correct": is_correct,
            "inference_time": inference_time
        })
        
    # 4. Output 
    if results["total"] > 0:
        accuracy = results["correct"] / results["total"] * 100
        print(f"\nAccuracy: {accuracy:.2f}%")
        print(f"Correct: {results['correct']}/{results['total']}")
    
    # Save full results to disk as a single JSON file
    with open("aime24.json", "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

if __name__ == "__main__":
    main()

I ran the evaluation with transformers, using generation settings consistent with the generation_config.json of the released model.
You can refer to the result folder for the evaluation logs.
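If it helps to double-check your vLLM sampling settings against mine, the generation settings can be read directly from the checkpoint, e.g. (a small sketch, reusing the same checkpoint path as in the script above):

from transformers import GenerationConfig

# Print the checkpoint's generation settings (same path as in the script above).
gen_config = GenerationConfig.from_pretrained("saves/qwen2-01/full/sft/checkpoint-44000")
print(gen_config)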

Thanks! It might be the answer parsing.
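For example, a simpler parser that just grabs the last \boxed{...} in the response (a rough sketch, not necessarily what either of us is running) would behave differently from the "Final Answer" patterns above:

import re

def extract_boxed_answer(text):
    # Take the last \boxed{...} occurrence in the response, if any.
    # Note: this simple pattern does not handle nested braces like \boxed{\frac{1}{2}}.
    matches = re.findall(r"\\boxed\{([^{}]+)\}", text)
    if matches:
        return matches[-1].strip()
    return None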

For the full finetune, did the messages include a system prompt? Should I be evaluating with "You are a helpful assistant"?

PowerInfer org

For the full finetune, no system prompt was included, but I did use a system prompt during evaluation.
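Concretely, the difference is only the system turn (question here is just a placeholder for the problem statement):

question = "An AIME problem statement goes here."  # placeholder

# Training (full finetune): no system prompt, only the user turn.
train_messages = [
    {"role": "user", "content": question},
]

# Evaluation (matches run_inference in the script above): system prompt included.
eval_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": question},
]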
