|
from datasets import load_dataset |
|
from openai import AsyncOpenAI |
|
from tqdm import tqdm |
|
import asyncio |
|
import json |
|
import os |
|
|
|
# Async client pointed at a local OpenAI-compatible endpoint (port 8000,
# e.g. a vLLM server); the key is a placeholder since the local server
# does not authenticate — TODO confirm the serving stack.
client = AsyncOpenAI(api_key="no-need", base_url="http://localhost:8000/v1")
|
|
|
async def generate_answer(messages):
    """Request one chat completion for *messages* from the local server.

    Args:
        messages: conversation as a list of ``{'role', 'content'}`` dicts.

    Returns:
        The assistant's reply text, or the sentinel string ``'error'`` if
        the request failed for any reason (best-effort semantics: callers
        store the sentinel instead of crashing the batch).
    """
    try:
        response = await client.chat.completions.create(
            model="outputs/out-alpha",
            messages=messages,
            max_tokens=2048,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Don't swallow failures silently: report what went wrong so bad
        # batches are diagnosable, but keep the 'error' sentinel so the
        # pipeline continues (same contract as before).
        print(f"generate_answer failed: {e!r}")
        return 'error'
|
|
|
async def process_batch(questions, batch_num, all_qa_pairs):
    """Answer one batch of conversations concurrently and checkpoint to disk.

    Args:
        questions: list of conversations (each a list of message dicts).
            Each conversation is mutated in place: the generated assistant
            message is appended to it.
        batch_num: index of this batch (kept for interface compatibility;
            no longer used for a progress bar).
        all_qa_pairs: accumulator of completed conversations; extended in
            place and re-serialized after every batch so progress survives
            an interruption.

    Returns:
        The list of generated answer strings for this batch.
    """
    # Kick off all requests concurrently. The original wrapped the task-list
    # build in a tqdm bar, but creating coroutines is instantaneous — the
    # bar measured nothing, so a plain generator is clearer.
    answers = await asyncio.gather(*(generate_answer(q) for q in questions))

    for conversation, answer in zip(questions, answers):
        conversation.append({'role': 'assistant', 'content': answer})
        all_qa_pairs.append({"conversations": conversation})

    # Checkpoint: rewrite the full output file after every batch.
    with open('qa_pairs_all-alpha1b_2.json', 'w') as f:
        json.dump(all_qa_pairs, f, indent=2)

    return answers
|
|
|
async def main(start_index=21600, batch_size=200):
    """Generate assistant answers for the tail of the sft-r1 dataset.

    Loads any existing checkpoint file so completed pairs are preserved,
    then processes the remaining conversations in concurrent batches.

    Args:
        start_index: dataset rows before this index are skipped (default
            matches the original hard-coded resume point of 21600).
        batch_size: number of conversations sent concurrently per batch
            (default matches the original hard-coded 200).
    """
    dataset = load_dataset('qnguyen3/sft-r1')

    # Resume from an existing checkpoint file if one is present.
    all_qa_pairs = []
    if os.path.exists('qa_pairs_all-alpha1b_2.json'):
        with open('qa_pairs_all-alpha1b_2.json', 'r') as f:
            all_qa_pairs = json.load(f)

    print("Preparing questions...")
    # Drop each conversation's final (assistant) message so the model
    # regenerates it; only rows from start_index onward are kept.
    question_list = [
        item['messages'][:-1]
        for i, item in tqdm(enumerate(dataset['train']), desc="Loading dataset")
        if i >= start_index
    ]

    for i in tqdm(range(0, len(question_list), batch_size), desc="Processing batches"):
        batch_questions = question_list[i:i + batch_size]
        batch_num = i // batch_size
        await process_batch(batch_questions, batch_num, all_qa_pairs)
|
|
|
# Script entry point: run the full async pipeline to completion.
if __name__ == "__main__":

    asyncio.run(main())