File size: 3,945 Bytes
cb9e677
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import dataclasses
from typing import Any, Iterator, List, Optional

import numpy as np
from mistral_common.tokens.tokenizers.sentencepiece import InstructTokenizerBase

from .args import DataArgs
from .dataset import build_dataset


@dataclasses.dataclass
class Batch:
    """Flat (1-D) batch of token ids with per-chunk sizes and an optional mask.

    The invariants below are enforced with assertions in `__post_init__`.
    """

    # 1-D int64 array of input ids.
    x: np.ndarray
    # 1-D int64 array of target ids; same shape as `x`.
    y: np.ndarray
    # List of ints that must sum to `x.size`
    # (presumably the lengths of the sequences packed into `x` - confirm with caller).
    sizes: List[int]
    # Optional bool array with as many elements as `y`. When given it must
    # contain at least one True and at least one False entry.
    y_mask: Optional[np.ndarray] = None
    # When True, `x` and `y` must be all zeros and `y_mask` must not be
    # passed; an all-False mask is created automatically.
    is_pad_only: bool = False

    def __post_init__(self):
        """Validate the fields and, for pad-only batches, create the mask."""
        assert self.x.ndim == 1
        assert self.x.shape == self.y.shape
        assert self.x.dtype == np.int64
        assert self.y.dtype == np.int64
        assert isinstance(self.sizes, list)
        assert sum(self.sizes) == self.x.size == self.y.size

        if self.y_mask is not None:
            assert self.y_mask.size == self.y.size, (self.y_mask.shape, self.y.shape)
            assert self.y_mask.dtype == bool
            assert sum(self.sizes) == self.y_mask.size
            # A mask that is entirely True or entirely False is rejected here.
            assert not self.y_mask.all()
            assert self.y_mask.any()

        if self.is_pad_only:
            assert np.sum(np.abs(self.y)) == 0
            assert np.sum(np.abs(self.x)) == 0
            assert self.y_mask is None
            # Create an all-False mask for pad samples. The dtype is forced to
            # bool so the mask has the same dtype that is required of every
            # user-supplied mask above (zeros_like would otherwise copy int64
            # from `x`).
            self.y_mask = np.zeros_like(self.x, dtype=bool)




@dataclasses.dataclass
class BatchList:
    """Accumulator of per-sample lists that can be flattened into a `Batch`.

    Each call to `add` appends one sample; `create_batch` concatenates all
    accumulated samples into flat arrays. The instance must be created empty.
    """

    # One entry per added sample, appended in lockstep by `add`.
    x: List[List[int]] = dataclasses.field(default_factory=list)
    y: List[List[int]] = dataclasses.field(default_factory=list)
    sizes: List[List[int]] = dataclasses.field(default_factory=list)
    y_mask: List[Optional[List[int]]] = dataclasses.field(default_factory=list)

    def __post_init__(self):
        """Reject construction with pre-populated fields."""
        assert self.x == [], "`BatchList` has to be empty at init."
        assert self.y == [], "`BatchList` has to be empty at init."
        assert self.sizes == [], "`BatchList` has to be empty at init."
        assert self.y_mask == [], "`BatchList` has to be empty at init."

    def __len__(self) -> int:
        """Return the number of samples added so far."""
        return len(self.x)

    def add(self, x: List[int], y: List[int], sizes: List[int], y_mask: Optional[List[int]] = None):
        """Append one sample.

        Args:
            x: Input ids of the sample.
            y: Target ids of the sample.
            sizes: Sizes associated with the sample.
            y_mask: Optional per-element mask for `y`; `None` is stored as-is
                and expanded to all-True when the batch is created.
        """
        self.x.append(x)
        self.y.append(y)
        self.sizes.append(sizes)
        self.y_mask.append(y_mask)

    def empty(self):
        """Drop all accumulated samples by rebinding every field to a new list."""
        self.x = []
        self.y = []
        self.sizes = []
        self.y_mask = []

    @staticmethod
    def flatten_to_numpy(list_of_lists: List[List[Any]], dtype: np.dtype) -> np.ndarray:
        """Concatenate the sub-lists into a single 1-D array of `dtype`."""
        return np.array([el for sublist in list_of_lists for el in sublist], dtype=dtype)

    def _flatten_mask(self) -> np.ndarray:
        """Return the concatenated bool mask of all accumulated samples.

        A sample whose mask is `None` contributes `len(x)` True entries, so
        samples added without a mask no longer make flattening fail.
        NOTE(review): this treats a missing mask as "every element is kept",
        mirroring how `create_batch` maps an all-True mask to `None` - confirm.
        """
        flat: List[Any] = []
        for sample_x, sample_mask in zip(self.x, self.y_mask):
            if sample_mask is None:
                flat.extend([True] * len(sample_x))
            else:
                flat.extend(sample_mask)
        return np.array(flat, dtype=bool)

    def create_batch(self) -> "Batch":
        """Flatten all accumulated samples into one `Batch`.

        The mask is passed as `None` when every entry is True (this includes
        the case where no sample has been added).
        """
        x_np: np.ndarray = self.flatten_to_numpy(self.x, dtype=np.int64)
        y_np: np.ndarray = self.flatten_to_numpy(self.y, dtype=np.int64)
        # Single-pass flatten; `sum(list_of_lists, [])` re-copies the
        # accumulator on every step.
        sizes = [size for sample_sizes in self.sizes for size in sample_sizes]

        y_mask_np: Optional[np.ndarray] = self._flatten_mask()
        y_mask_np = None if y_mask_np.all() else y_mask_np

        return Batch(x_np, y_np, sizes, y_mask_np)




def build_data_loader(
    instruct_tokenizer: InstructTokenizerBase,
    args: DataArgs,
    batch_size: int,
    seq_len: int,
    seed: Optional[int],
    rank: int,
    world_size: int,
    is_eval: bool,
) -> Iterator[Batch]:
    """Yield `Batch` objects assembled from the dataset built by `build_dataset`.

    In eval mode the pretrain source is the empty string and the instruct
    source is `args.eval_instruct_data`; otherwise `args.data` and
    `args.instruct_data` are used. Samples are accumulated and a batch is
    yielded each time exactly `batch_size` samples have been collected.
    Samples still pending when the dataset iterator ends are not yielded.
    """
    if not is_eval:
        pretrain_data = args.data
        instruct_data = args.instruct_data
    else:
        pretrain_data = ""
        instruct_data = args.eval_instruct_data

    dataset = build_dataset(
        pretrain_data=pretrain_data,
        instruct_data=instruct_data,
        instruct_args=args.instruct,
        instruct_tokenizer=instruct_tokenizer,
        seq_len=seq_len,
        seed=seed,
        rank=rank,
        world_size=world_size,
        is_eval=is_eval,
        shuffle_pretrain=args.shuffle,
    )

    pending = BatchList()
    for sample in dataset:
        # Every size reported for a sample must be non-negative.
        assert all(s >= 0 for s in sample.sizes)

        pending.add(sample.x, sample.y, sample.sizes, sample.mask)

        # Keep accumulating until the requested number of samples is reached.
        if len(pending) != batch_size:
            continue

        yield pending.create_batch()

        # Runs once the consumer resumes the generator: start a fresh batch.
        pending.empty()