# tangled-llama-l-128k-v0.1 / scripts / contrain_datasets.py
# mtasic85: contrain instruct datasets
# 5281790
# Lookup table from a dataset's speaker label (the 'from' value of a turn) to
# the chat role name emitted in the converted messages. Grouped by target role;
# insertion order is system, user, human, assistant, gpt, AI.
roles_map = {
    label: role
    for role, labels in (
        ('system', ('system',)),
        ('user', ('user', 'human')),
        ('assistant', ('assistant', 'gpt', 'AI')),
    )
    for label in labels
}
def _sharegpt_to_messages(msgs):
    """Turn ShareGPT-style turns into ``{'role', 'content'}`` chat messages.

    Args:
        msgs: iterable of mappings, each read through its ``'from'`` (speaker
            label) and ``'value'`` (text) keys.

    Returns:
        A list with one ``{'role': ..., 'content': ...}`` dict per turn, in
        the same order as ``msgs``.

    Raises:
        KeyError: if a turn lacks ``'from'``/``'value'`` or its speaker label
            is not a key of ``roles_map``.
    """
    # roles_map is the module-level label -> role table, resolved at call time.
    return [{'role': roles_map[m['from']], 'content': m['value']} for m in msgs]


def _train_splits(step=20):
    """Return ``train[a%:b%]`` split strings covering 0-100% in ``step``% slices.

    With the default ``step`` this yields ``train[0%:20%]`` ... ``train[80%:100%]``.
    The upper bound is clamped to 100 so a ``step`` that does not divide 100
    still produces a valid final slice.
    """
    return [f'train[{lo}%:{min(lo + step, 100)}%]' for lo in range(0, 100, step)]


def _system_prompt_response(r):
    """Build a system/user/assistant conversation from one row.

    Reads the row's ``'system'``, ``'prompt'`` and ``'response'`` keys.
    """
    return [
        {'role': 'system', 'content': r['system']},
        {'role': 'user', 'content': r['prompt']},
        {'role': 'assistant', 'content': r['response']},
    ]


# Dataset descriptors. Every entry has a 'path'; optional keys are 'split',
# 'data_files', 'field' and 'transform'. Transforms return a list of
# {'role', 'content'} messages.
# NOTE(review): presumably 'path'/'split'/'data_files' are forwarded to
# datasets.load_dataset and 'field' selects the column handed to 'transform'
# (the whole row when 'field' is absent) - confirm against the consumer.
contrain_datasets = [
    #
    # general instructs
    #
    # mlabonne/open-perfectblend - 1.48 GB, 1,420,909
    #   meta-math/MetaMathQA 395,000
    #   openbmb/UltraInteract_sft 288,579
    #   HuggingFaceH4/ultrachat_200k 207,865
    #   microsoft/orca-math-word-problems-200k 200,035
    #   HuggingFaceH4/ultrafeedback_binarized 187,405
    #   theblackcat102/evol-codealpaca-v1 111,272
    #   Post-training-Data-Flywheel/AutoIF-instruct-61k 61,492
    #   mlabonne/lmsys-arena-human-preference-55k-sharegpt 57,362
    *[
        {
            'path': 'mlabonne/open-perfectblend',
            'split': split,
            'field': 'conversations',
            'transform': _sharegpt_to_messages,
        }
        for split in _train_splits()
    ],
    # arcee-ai/The-Tome - 4.58 GB, 1,752,473
    #   - arcee-ai/infini-instruct-top-500k (BAAI/Infinity-Instruct)
    #   - TIGER-Lab/WebInstructSub (top-500k) - IGNORE
    #   - jondurbin/airoboros-3.2
    #   - gardner/glaive-function-calling-v2-sharegpt
    #   - arcee-ai/reasoning-sharegpt (SkunkworksAI/reasoning-0.01)
    #   - arcee-ai/self-instruct-sharegpt (bigcode/self-oss-instruct-sc2-exec-filter-50k)
    #   - cognitivecomputations/ultrainteract_trajectories_sharegpt
    #   - cognitivecomputations/SystemChat-2.0
    #   - arcee-ai/qwen2-72b-magpie-en
    *[
        {
            'path': 'arcee-ai/The-Tome',
            'split': split,
            'field': 'conversations',
            'transform': _sharegpt_to_messages,
        }
        for split in _train_splits()
    ],
    # rombodawg/Everything_Instruct_Multilingual - 2.48 GB, 5,808,694
    #   Science:
    #     antiven0m/physical-reasoning-dpoScience
    #     LawalAfeez/science-dataset
    #   Social media:
    #     Kyle1668/AG-Tweets
    #     euclaise/reddit-instruct-curated
    #   General Knowledge:
    #     NousResearch/CharacterCodex_Characters
    #     jstet/quotes-500k_Famous_Quotes
    #     FronkonGames/steam-games-dataset_Video_Games
    #     totuta_youtube_subs_howto100M_HowTo
    #   Multi-lingual:
    #     Amani27/massive_translation_dataset
    #     udmurtNLP/udmurt-russian-english-labse
    #     grosenthal/latin_english
    #     msarmi9/korean-english-multitarget-ted-talks-task
    #     HaiderSultanArc/MT-Urdu-English_Translate
    #     Garsa3112/ChineseEnglishTranslationDataset
    #   Cooking:
    #     andrewsiah/se_cooking_preference_sft
    #     Hieu-Phamkaggle/food_recipes
    #   Writing:
    #     shahules786/PoetryFoundationData
    #     euclaise/writingprompts
    #     qwedsacf/ivypanda-essaysEssay
    #   Medicine:
    #     keivalya/MedQuad-MedicalQnADataset
    #     nuvocare/MSD
    #   History:
    #     ambrosfitz10k/history_data_v4
    #   Law:
    #     dzunggg/legal-qa-v1
    #   Role-Play:
    #     roleplay4/fun_CoupleRP
    #     Undi95andrijdavid/roleplay-conversation-sharegpt
    #   News:
    #     RealTimeData/bbc_news_alltime
    #   Coding: (rombodawg/code_bagel)
    #     layoric/tiny-codes-alpaca
    #     glaiveai/glaive-code-assistant-v3
    #     ajibawa-2023/Code-290k-ShareGPT
    #     chargoddard/commitpack-ft-instruct-rated
    #     iamtarun/code_instructions_120k_alpaca
    #     ise-uiuc/Magicoder-Evol-Instruct-110K
    #     cognitivecomputations/dolphin-coder
    #     nickrosh/Evol-Instruct-Code-80k-v1
    #     coseal/CodeUltraFeedback_binarized
    #     CyberNative/Code_Vulnerability_Security_DPO
    #   Math: (rombodawg/code_bagel)
    #     TIGER-Lab/MathInstruct
    #   Function calling: (rombodawg/code_bagel)
    #     glaiveai/glaive-function-calling-v2
    #   General Instruct: (rombodawg/OpenHermes-2.5-Uncensored)
    #     teknium/OpenHermes-2.5
    *[
        {
            'path': 'rombodawg/Everything_Instruct_Multilingual',
            'split': split,
            # No 'field' here: the transform indexes the row by column name.
            'transform': lambda r: [
                {'role': 'system', 'content': r['instruction']},
                {'role': 'user', 'content': r['input']},
                {'role': 'assistant', 'content': r['output']},
            ],
        }
        for split in _train_splits()
    ],

    #
    # tool/function calling
    #
    # 65.7 MB, 11,578
    {
        'path': 'NousResearch/hermes-function-calling-v1',
        'field': 'conversations',
        'transform': _sharegpt_to_messages,
    },

    #
    # agent
    #
    # 1.51 GB, 485,874
    *[
        {
            'path': 'arcee-ai/agent-data',
            'split': split,
            'field': 'conversations',
            'transform': _sharegpt_to_messages,
        }
        for split in _train_splits()
    ],

    #
    # general reasoning
    #
    # 10.8 MB, 15,770
    {
        'path': 'AtlasUnified/Atlas-Reasoning',
        'data_files': 'reasoning.csv',
        'transform': lambda r: [
            {'role': 'user', 'content': r['Prompt']},
            {'role': 'assistant', 'content': r['Step-by-step reasoning'] + '\n' + r['Solution']},
        ],
    },

    #
    # math reasoning
    #
    # 8.99 MB, 6,914
    {
        'path': 'thesven/gsm8k-reasoning',
        'transform': lambda r: [
            {'role': 'user', 'content': r['question']},
            # 'generation' may be falsy (e.g. None); it is replaced by '' so
            # the concatenation does not fail.
            {'role': 'assistant', 'content': (r['generation'] or '') + '\n' + r['answer'] + '\n' + r['short_answer']},
        ],
    },
    # 1.79 MB, 3,963
    {
        'path': 'AlgorithmicResearchGroup/math_reasoning_autoformalization_track',
        'transform': lambda r: [
            {'role': 'user', 'content': r['informal_statement']},
            {'role': 'assistant', 'content': r['informal_proof'] + '\n' + r['formal_proof']},
        ],
    },
    # 307 MB, 19,944
    {
        'path': 'KingNish/reasoning-base-20k',
        'transform': lambda r: [
            {'role': 'user', 'content': r['user']},
            {'role': 'assistant', 'content': r['reasoning'] + '\n' + r['assistant']},
        ],
    },
    # 9.45 MB, 10,000
    {
        'path': 'Aarushhh/math-reasoning-10k',
        'transform': lambda r: [
            {'role': 'user', 'content': r['problem']},
            {'role': 'assistant', 'content': r['plan'] + '\n' + r['solution']},
        ],
    },

    #
    # reflection
    #
    # 4.17 MB, 1,000
    {'path': 'dvilasuero/reflection-v1-gpt-4o-judge', 'transform': _system_prompt_response},
    # 12.4 MB, 3,000
    {'path': 'dvilasuero/reflection-v1-openai-o-mini-judge', 'transform': _system_prompt_response},
    # 70.8 MB, 36,549
    {'path': 'dvilasuero/reflection-v1-final-dedup', 'transform': _system_prompt_response},
    # 30.6 MB, 25,391
    {
        'path': 'flozi00/reflection-qwen2.5-72b-260924',
        'transform': lambda r: [
            # The first element of the row's 'system' value is used as-is as
            # the opening message.
            # NOTE(review): presumably already a {'role', 'content'} dict - confirm.
            r['system'][0],
            {'role': 'user', 'content': r['input']},
            {'role': 'assistant', 'content': r['reflection'] + '\n' + r['output']},
        ],
    },

    #
    # general instructs
    #
    # 971 MB, 484,570
    {'path': 'HuggingFaceTB/smol-smoltalk', 'field': 'messages'},
]