Nanobit committed on
Commit
6abb7f6
·
1 Parent(s): de2406c

Lint datasets

Browse files
Files changed (1) hide show
  1. src/axolotl/datasets.py +11 -2
src/axolotl/datasets.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import logging
2
  from typing import List
3
 
@@ -14,7 +16,14 @@ from .prompt_tokenizers import PromptTokenizingStrategy, InvalidDataException
14
 
15
 
16
  class TokenizedPromptDataset(IterableDataset):
17
- def __init__(
 
 
 
 
 
 
 
18
  self,
19
  prompt_tokenizer: PromptTokenizingStrategy,
20
  dataset: IterableDataset,
@@ -42,7 +51,7 @@ class ConstantLengthDataset(IterableDataset):
42
  seq_length (int): Length of token sequences to return.
43
  """
44
 
45
- def __init__(
46
  self,
47
  tokenizer,
48
  datasets,
 
1
+ """Module containing Dataset functionality"""
2
+
3
  import logging
4
  from typing import List
5
 
 
16
 
17
 
18
  class TokenizedPromptDataset(IterableDataset):
19
+ """
20
+ Iterable dataset that returns tokenized prompts from a stream of text files.
21
+ Args:
22
+ prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.
23
+ dataset (dataset.Dataset): Dataset with text files.
24
+ """
25
+
26
+ def __init__( # pylint: disable=super-init-not-called
27
  self,
28
  prompt_tokenizer: PromptTokenizingStrategy,
29
  dataset: IterableDataset,
 
51
  seq_length (int): Length of token sequences to return.
52
  """
53
 
54
+ def __init__( # pylint: disable=super-init-not-called
55
  self,
56
  tokenizer,
57
  datasets,