winglian committed on
Commit
78a1e1f
·
1 Parent(s): bc8a2e5

open orca support

Browse files
README.md CHANGED
@@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
195
  ```json
196
  {"message_1": "...", "message_2": "..."}
197
  ```
 
 
 
 
198
  - `context_qa`: in context question answering from an article
199
  ```json
200
  {"article": "...", "question": "...", "answer": "..."}
 
195
  ```json
196
  {"message_1": "...", "message_2": "..."}
197
  ```
198
+ - `alpaca_w_system.load_open_orca`: support for OpenOrca datasets with included system prompts, formatted with the instruct prompt style
199
+ ```json
200
+ {"system_prompt": "...", "question": "...", "response": "..."}
201
+ ```
202
  - `context_qa`: in context question answering from an article
203
  ```json
204
  {"article": "...", "question": "...", "answer": "..."}
src/axolotl/prompt_strategies/alpaca_w_system.py CHANGED
@@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter):
75
  yield res
76
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def load(tokenizer, cfg):
79
  return InstructionWSystemPromptTokenizingStrategy(
80
  SystemDataPrompter(PromptStyle.CHAT.value),
@@ -82,3 +96,12 @@ def load(tokenizer, cfg):
82
  cfg.train_on_inputs,
83
  cfg.sequence_len,
84
  )
 
 
 
 
 
 
 
 
 
 
75
  yield res
76
 
77
 
78
+ class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
79
+ """
80
+ Tokenizing strategy for OpenOrca datasets
81
+ """
82
+
83
+ def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
84
+ return (
85
+ prompt["question"],
86
+ "",
87
+ prompt["response"],
88
+ prompt["system_prompt"],
89
+ )
90
+
91
+
92
  def load(tokenizer, cfg):
93
  return InstructionWSystemPromptTokenizingStrategy(
94
  SystemDataPrompter(PromptStyle.CHAT.value),
 
96
  cfg.train_on_inputs,
97
  cfg.sequence_len,
98
  )
99
+
100
+
101
+ def load_open_orca(tokenizer, cfg):
102
+ return OpenOrcaPromptTokenizingStrategy(
103
+ SystemDataPrompter(PromptStyle.INSTRUCT.value),
104
+ tokenizer,
105
+ cfg.train_on_inputs,
106
+ cfg.sequence_len,
107
+ )