Upload folder using huggingface_hub
- .gitattributes +5 -0
- README.md +3 -1
- adapters/code/README.md +32 -0
- adapters/code/adapter_config.json +26 -0
- adapters/code/adapter_model.bin +3 -0
- adapters/creative/README.md +20 -0
- adapters/creative/adapter_config.json +26 -0
- adapters/creative/adapter_model.bin +3 -0
- adapters/function/README.md +47 -0
- adapters/function/adapter_config.json +26 -0
- adapters/function/adapter_model.bin +3 -0
- routing_data/expert_code.jsonl +0 -0
- routing_data/expert_creative.jsonl +0 -0
- routing_data/expert_function.jsonl +0 -0
- routing_data/expert_general.jsonl +0 -0
- routing_data/expert_qa.jsonl +0 -0
- routing_data/expert_reasoning.jsonl +0 -0
- scripts/segment_dataset.py +84 -0
- scripts/tune.sh +47 -0
- training_data/expert_code.jsonl +3 -0
- training_data/expert_creative.jsonl +3 -0
- training_data/expert_function.jsonl +0 -0
- training_data/expert_general.jsonl +3 -0
- training_data/expert_qa.jsonl +3 -0
- training_data/expert_reasoning.jsonl +3 -0
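
The commit title says the folder was pushed with `huggingface_hub`. A minimal sketch of such an upload might look like the following; the local folder path and target repo id are assumptions for illustration, not taken from this commit.

```python
# Hypothetical example: pushing a prepared folder to the Hub.
# folder_path and repo_id are placeholders, not values from this commit.
from huggingface_hub import HfApi

api = HfApi()  # assumes a token is available via `huggingface-cli login` or HF_TOKEN
api.upload_folder(
    folder_path="./airoboros-lmoe-70b-2.1",
    repo_id="jondurbin/airoboros-lmoe-70b-2.1",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```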
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+training_data/expert_code.jsonl filter=lfs diff=lfs merge=lfs -text
+training_data/expert_creative.jsonl filter=lfs diff=lfs merge=lfs -text
+training_data/expert_general.jsonl filter=lfs diff=lfs merge=lfs -text
+training_data/expert_qa.jsonl filter=lfs diff=lfs merge=lfs -text
+training_data/expert_reasoning.jsonl filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,5 @@
 ---
-license:
+license: other
 ---
+
+https://github.com/jondurbin/airoboros#lmoe
adapters/code/README.md
ADDED
@@ -0,0 +1,32 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+- PEFT 0.4.0
+
+- PEFT 0.4.0
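
The settings listed in this adapter card describe a 4-bit NF4 QLoRA setup. A hedged sketch of the equivalent `transformers` quantization config, restating only the values shown above:

```python
# Sketch: the BitsAndBytesConfig implied by the quantization settings listed above.
import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_8bit=False,
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    llm_int8_enable_fp32_cpu_offload=False,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
```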
adapters/code/adapter_config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16.0,
+  "lora_dropout": 0.1,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
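
For anyone reproducing this setup with PEFT, the JSON above corresponds roughly to the following `LoraConfig`; the sketch only restates the values from the config, assuming nothing beyond the standard `peft` API.

```python
# Sketch: the LoraConfig equivalent of the adapter_config.json above.
from peft import LoraConfig

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)
```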
adapters/code/adapter_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3191b3c1cfe3af2377501ca83f683ca3c86576092039651dc7d08eafd2064eb6
+size 1657155077
adapters/creative/README.md
ADDED
@@ -0,0 +1,20 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+
+- PEFT 0.4.0
adapters/creative/adapter_config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16.0,
+  "lora_dropout": 0.1,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "up_proj",
+    "k_proj",
+    "v_proj",
+    "q_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
adapters/creative/adapter_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7706c60de732463422ae4fd2d2eaf39def04170a5f5481f65c5e3cb2d057906a
+size 1657155077
adapters/function/README.md
ADDED
@@ -0,0 +1,47 @@
+---
+library_name: peft
+---
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+
+The following `bitsandbytes` quantization config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+- PEFT 0.4.0
+
+- PEFT 0.4.0
+tion config was used during training:
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+### Framework versions
+
+- PEFT 0.4.0
+
+- PEFT 0.4.0
adapters/function/adapter_config.json
ADDED
@@ -0,0 +1,26 @@
+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "/workspace/llama-2-70b-hf",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16.0,
+  "lora_dropout": 0.1,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "down_proj",
+    "o_proj",
+    "k_proj",
+    "v_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
adapters/function/adapter_model.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2cce2878d176d9e64aaf65637e3303c8ba5d12c7a74bf86eeb7c9f68d6f1adb6
+size 1657155077
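
Each `adapter_model.bin` above is a Git LFS pointer to roughly 1.6 GB of LoRA weights. A minimal sketch of attaching one expert to the base model with PEFT; the paths are assumptions, and the 4-bit load mirrors the quantization settings in the adapter cards above.

```python
# Sketch: loading the base model in 4-bit and attaching one expert adapter.
# Paths are illustrative; point them at your local copies of these files.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base = AutoModelForCausalLM.from_pretrained(
    "/workspace/llama-2-70b-hf",   # base_model_name_or_path from the adapter configs
    load_in_4bit=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("/workspace/llama-2-70b-hf")
model = PeftModel.from_pretrained(base, "adapters/code")  # or adapters/creative, adapters/function
```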
routing_data/expert_code.jsonl
ADDED
The diff for this file is too large to render.

routing_data/expert_creative.jsonl
ADDED
The diff for this file is too large to render.

routing_data/expert_function.jsonl
ADDED
The diff for this file is too large to render.

routing_data/expert_general.jsonl
ADDED
The diff for this file is too large to render.

routing_data/expert_qa.jsonl
ADDED
The diff for this file is too large to render.

routing_data/expert_reasoning.jsonl
ADDED
The diff for this file is too large to render.
scripts/segment_dataset.py
ADDED
@@ -0,0 +1,84 @@
+import os
+import json
+import random
+from collections import defaultdict
+from smart_open import smart_open
+
+# URL to the dataset we're using.
+dataset_url = "https://huggingface.co/datasets/jondurbin/airoboros-2.1/resolve/main/instructions.jsonl"
+
+# Select the subset of data for each of our experts.
+experts = {
+    "qa": [
+        "quiz",
+        "multiple_choice",
+        "contextual",
+        "counterfactual_contextual"
+    ],
+    "creative": [
+        "card",
+        "writing",
+        "experience",
+        "song",
+        "roleplay",
+        "gtkm",
+        "rp",
+        "detailed_writing",
+        "joke"
+    ],
+    "code": [
+        "coding"
+    ],
+    "reasoning": [
+        "cot",
+        "theory_of_mind",
+        "riddle",
+        "orca"
+    ],
+    "function": [
+        "agent",
+        "plan"
+    ],
+    "general": [
+        "wordgame",
+        "trivia",
+        "general"
+    ]
+}
+
+# Map all of our training data into the categories per expert.
+categories = defaultdict(list)
+with smart_open(dataset_url, "r") as infile:
+    for line in infile.readlines():
+        item = json.loads(line)
+        if not item.get("category"):
+            continue
+        categories[item["category"]].append(item)
+
+# Include a random sampling of each expert's data in each other expert's dataset.
+samples = {}
+for expert, expert_cats in experts.items():
+    samples[expert] = []
+    for category in expert_cats:
+        samples[expert] += random.sample(categories[category], int(len(categories[category]) * 0.15) or 1)
+
+# Save the split datasets.
+if not os.path.exists("training_data"):
+    os.mkdir("training_data")
+if not os.path.exists("routing_data"):
+    os.mkdir("routing_data")
+for expert, expert_cats in experts.items():
+    with open(f"training_data/expert_{expert}.jsonl", "w") as outfile:
+        # Also, be sure to include stylized responses so it adapts to system prompt well.
+        for category in expert_cats + ["stylized_response"]:
+            for item in categories[category]:
+                outfile.write(json.dumps(item) + "\n")
+        for other in samples:
+            if other == expert:
+                continue
+            for item in samples[other]:
+                outfile.write(json.dumps(item) + "\n")
+    with open(f"routing_data/expert_{expert}.jsonl", "w") as outfile:
+        for category in expert_cats:
+            for item in categories[category]:
+                outfile.write(json.dumps({"instruction": item.get("system", "A chat.") + " " + item["instruction"]}) + "\n")
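
The routing files written by this script concatenate each item's system prompt and instruction, which lends itself to embedding-based routing. The sketch below is only one plausible way to use them to pick an expert at inference time; the embedding model and cosine-centroid scheme are assumptions for illustration, not this repository's actual router.

```python
# Illustrative router sketch (NOT the repo's actual routing code):
# build one embedding centroid per expert from routing_data and pick
# the expert whose centroid is most similar to the incoming prompt.
import glob
import json

import numpy as np
from sentence_transformers import SentenceTransformer  # assumed embedding model

encoder = SentenceTransformer("all-MiniLM-L6-v2")

centroids = {}
for path in glob.glob("routing_data/expert_*.jsonl"):
    expert = path.split("expert_")[-1].replace(".jsonl", "")
    with open(path) as infile:
        texts = [json.loads(line)["instruction"] for line in infile]
    centroids[expert] = encoder.encode(texts, normalize_embeddings=True).mean(axis=0)

def route(prompt: str) -> str:
    """Return the expert whose routing-data centroid is most similar to the prompt."""
    query = encoder.encode(prompt, normalize_embeddings=True)
    return max(centroids, key=lambda e: float(np.dot(query, centroids[e])))

# e.g. route("Write a python function that merges two sorted lists.")  # likely "code"
```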
scripts/tune.sh
ADDED
@@ -0,0 +1,47 @@
+export EXPERT=$1
+export MODEL_SIZE=$2
+export BATCH_SIZE=$3
+export CUDA_VISIBLE_DEVICES=$4
+
+export BASE_DIR=/workspace
+export WANDB_API_KEY=[redacted]
+export WANDB_PROJECT=airoboros-lmoe-$MODEL_SIZE-2.1-$EXPERT
+
+python qlora.py \
+    --model_name_or_path $BASE_DIR/llama-2-$MODEL_SIZE-hf \
+    --output_dir $BASE_DIR/$WANDB_PROJECT \
+    --num_train_epochs 3 \
+    --logging_steps 1 \
+    --save_strategy steps \
+    --save_steps 100 \
+    --save_total_limit 1 \
+    --data_seed 11422 \
+    --evaluation_strategy no \
+    --eval_dataset_size 2 \
+    --max_new_tokens 4096 \
+    --dataloader_num_workers 3 \
+    --logging_strategy steps \
+    --remove_unused_columns False \
+    --do_train \
+    --lora_r 64 \
+    --lora_alpha 16 \
+    --lora_modules all \
+    --bf16 \
+    --bits 4 \
+    --double_quant \
+    --quant_type nf4 \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type constant \
+    --dataset airoboros-lmoe-2.1/expert_$EXPERT.jsonl \
+    --dataset_format airoboros \
+    --model_max_len 4096 \
+    --per_device_train_batch_size $BATCH_SIZE \
+    --learning_rate 0.00017 \
+    --adam_beta2 0.999 \
+    --max_grad_norm 0.3 \
+    --lora_dropout 0.05 \
+    --weight_decay 0.0 \
+    --seed 11422 \
+    --report_to wandb \
+    --gradient_accumulation_steps 16 \
+    --gradient_checkpointing
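
The script takes its arguments positionally, so an invocation would presumably look something like `bash scripts/tune.sh code 70b 4 0,1,2,3` (expert name, model size, per-device batch size, visible GPU ids); those concrete values are illustrative assumptions, and `qlora.py` refers to the QLoRA fine-tuning script expected in the working directory.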
training_data/expert_code.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e2d2ee497fe2eb7ee9d8a53d8efe6c711174eadd3c593b447bfbf73f2c964ccf
+size 17716707
training_data/expert_creative.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c41f8aa2f90c066ba13fdc6e28458a4bfc4b6478c01f07b449c442f04093c3e0
+size 25482996
training_data/expert_function.jsonl
ADDED
The diff for this file is too large to render.
training_data/expert_general.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e658a5081fd87b3869033e0b5305c16f69a6c8d27a8ad903473db9b03b7914b1
+size 18641341
training_data/expert_qa.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37ec3201c55bfb43326bc002032ea73179feae8b661b9678f32871b370fe7b02
+size 12318163
training_data/expert_reasoning.jsonl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00f4c3813c9c231a54ac3279af97ffe7268424e6c175eba4b048f310839fccd9
+size 17556799