open orca support
Browse files
README.md
CHANGED
@@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
|
195 |
```json
|
196 |
{"message_1": "...", "message_2": "..."}
|
197 |
```
|
|
|
|
|
|
|
|
|
198 |
- `context_qa`: in-context question answering from an article
|
199 |
```json
|
200 |
{"article": "...", "question": "...", "answer": "..."}
|
|
|
195 |
```json
|
196 |
{"message_1": "...", "message_2": "..."}
|
197 |
```
|
198 |
+
- `alpaca_w_system.load_open_orca`: support for Open Orca datasets that include system prompts (uses the instruct prompt style)
|
199 |
+
```json
|
200 |
+
{"system_prompt": "...", "question": "...", "response": "..."}
|
201 |
+
```
|
202 |
- `context_qa`: in-context question answering from an article
|
203 |
```json
|
204 |
{"article": "...", "question": "...", "answer": "..."}
|
src/axolotl/prompt_strategies/alpaca_w_system.py
CHANGED
@@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter):
|
|
75 |
yield res
|
76 |
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
def load(tokenizer, cfg):
|
79 |
return InstructionWSystemPromptTokenizingStrategy(
|
80 |
SystemDataPrompter(PromptStyle.CHAT.value),
|
@@ -82,3 +96,12 @@ def load(tokenizer, cfg):
|
|
82 |
cfg.train_on_inputs,
|
83 |
cfg.sequence_len,
|
84 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
yield res
|
76 |
|
77 |
|
78 |
+
class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Tokenizing strategy for OpenOrca-style rows, which carry the keys
    ``system_prompt``, ``question`` and ``response``.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        """
        Pull the prompt fields out of a single OpenOrca row.

        Returns a 4-tuple of ``(question, "", response, system_prompt)``.
        NOTE(review): the slot order presumably matches the
        (instruction, input, output, system) layout expected by the parent
        strategy -- confirm against InstructionWSystemPromptTokenizingStrategy.

        Raises ``KeyError`` if the row lacks any of the three keys.
        """
        question = prompt["question"]
        response = prompt["response"]
        system_prompt = prompt["system_prompt"]
        # The second slot is always the empty string: no row key is mapped to it.
        return question, "", response, system_prompt
|
90 |
+
|
91 |
+
|
92 |
def load(tokenizer, cfg):
|
93 |
return InstructionWSystemPromptTokenizingStrategy(
|
94 |
SystemDataPrompter(PromptStyle.CHAT.value),
|
|
|
96 |
cfg.train_on_inputs,
|
97 |
cfg.sequence_len,
|
98 |
)
|
99 |
+
|
100 |
+
|
101 |
+
def load_open_orca(tokenizer, cfg):
    """
    Build the tokenizing strategy used for OpenOrca datasets.

    The strategy is paired with a ``SystemDataPrompter`` configured with the
    INSTRUCT prompt style, and receives ``cfg.train_on_inputs`` and
    ``cfg.sequence_len`` unchanged.
    """
    instruct_prompter = SystemDataPrompter(PromptStyle.INSTRUCT.value)
    return OpenOrcaPromptTokenizingStrategy(
        instruct_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
|