 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Module for plain input/output prompt pairs"""
from typing import Generator, Tuple

from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import IGNORE_TOKEN_ID, Prompter


class RawInputOutputStrategy(PromptTokenizingStrategy):
    """Prompt Strategy class for input/output pairs"""

    def __init__(self, *args, eos_token=None, **kwargs):
        """Initialize the strategy.

        Args:
            *args: positional arguments forwarded to PromptTokenizingStrategy.
            eos_token: explicit EOS token; falls back to the tokenizer's
                ``eos_token`` when not provided.
            **kwargs: keyword arguments forwarded to PromptTokenizingStrategy.
        """
        super().__init__(*args, **kwargs)
        # Fall back to the tokenizer's EOS token when none is given explicitly.
        self.eos_token = eos_token or self.tokenizer.eos_token

    def tokenize_prompt(self, prompt):
        """Tokenize each (label, text) segment of ``prompt["segments"]``.

        Segments whose label is falsy are masked out of ``labels`` with
        IGNORE_TOKEN_ID unless ``train_on_inputs`` is set, so the loss is
        only computed on the labeled (output) spans.
        """
        # pylint: disable=duplicate-code
        input_ids = []
        labels = []
        for train_segment, segment_text in self.prompter.build_prompt(prompt["segments"]):
            token_ids = self.tokenizer(
                segment_text, add_special_tokens=False, return_tensors=None
            )["input_ids"]
            input_ids.extend(token_ids)
            if train_segment or self.train_on_inputs:
                labels.extend(token_ids)
            else:
                # Masked positions contribute nothing to the training loss.
                labels.extend([IGNORE_TOKEN_ID] * len(token_ids))

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": [1] * len(input_ids),
        }


class RawInputOutputPrompter(Prompter):
    """prompter for raw i/o data"""

    def build_prompt(self, source) -> Generator[Tuple[bool, str], None, None]:
        """Yield ``(label, text)`` pairs straight from the raw segment dicts."""
        yield from ((seg["label"], seg["text"]) for seg in source)


def load(tokenizer, cfg):
    """Build the raw input/output tokenizing strategy from the run config.

    Args:
        tokenizer: the tokenizer used to encode each text segment.
        cfg: config object providing ``train_on_inputs`` and ``sequence_len``.

    Returns:
        A RawInputOutputStrategy wired to a RawInputOutputPrompter.
    """
    prompter = RawInputOutputPrompter()
    return RawInputOutputStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )